diff --git a/Cargo.lock b/Cargo.lock index a4323be238..e8a2b73149 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1305,26 +1305,6 @@ dependencies = [ "syn 2.0.29", ] -[[package]] -name = "bindgen" -version = "0.66.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b84e06fc203107bfbad243f4aba2af864eb7db3b1cf46ea0a023b0b433d2a7" -dependencies = [ - "bitflags 2.4.0", - "cexpr", - "clang-sys", - "lazy_static", - "lazycell", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.29", -] - [[package]] name = "bit-set" version = "0.5.3" @@ -2751,51 +2731,150 @@ dependencies = [ name = "dozer-ingestion" version = "0.1.39" dependencies = [ - "base64 0.21.0", - "bson", "bytes", "chrono", "criterion", - "deltalake", "dozer-cli", - "dozer-log", + "dozer-ingestion-connector", + "dozer-ingestion-deltalake", + "dozer-ingestion-dozer", + "dozer-ingestion-ethereum", + "dozer-ingestion-grpc", + "dozer-ingestion-kafka", + "dozer-ingestion-mongodb", + "dozer-ingestion-mysql", + "dozer-ingestion-object-store", + "dozer-ingestion-postgres", + "dozer-ingestion-snowflake", "dozer-tracing", - "dozer-types", "dozer-utils", "env_logger", "futures", - "genawaiter", - "geozero", "hex", + "parquet", + "prost-reflect", + "rand 0.8.5", + "serial_test", + "tempdir", + "tokio", + "url", +] + +[[package]] +name = "dozer-ingestion-connector" +version = "0.1.0" +dependencies = [ + "dozer-types", + "futures", + "tokio", +] + +[[package]] +name = "dozer-ingestion-deltalake" +version = "0.1.0" +dependencies = [ + "deltalake", + "dozer-ingestion-connector", + "dozer-ingestion-object-store", +] + +[[package]] +name = "dozer-ingestion-dozer" +version = "0.1.0" +dependencies = [ + "dozer-ingestion-connector", + "dozer-log", +] + +[[package]] +name = "dozer-ingestion-ethereum" +version = "0.1.0" +dependencies = [ + "dozer-ingestion-connector", + "dozer-tracing", "hex-literal", - "include_dir", - "memchr", + "web3", +] + +[[package]] +name = "dozer-ingestion-grpc" +version = "0.1.0" +dependencies = [ + "dozer-ingestion-connector", + "tonic-reflection", + "tonic-web", + "tower-http", +] + +[[package]] +name = "dozer-ingestion-kafka" +version = "0.1.0" +dependencies = [ + "base64 0.21.0", + "dozer-ingestion-connector", + "rdkafka", + "schema_registry_converter", +] + +[[package]] +name = "dozer-ingestion-mongodb" +version = "0.1.0" +dependencies = [ + "bson", + "dozer-ingestion-connector", "mongodb", +] + +[[package]] +name = "dozer-ingestion-mysql" +version = "0.1.0" +dependencies = [ + "dozer-ingestion-connector", + "geozero", + "hex", "mysql_async", "mysql_common", + "rand 0.8.5", + "serial_test", +] + +[[package]] +name = "dozer-ingestion-object-store" +version = "0.1.0" +dependencies = [ + "deltalake", + "dozer-ingestion-connector", "object_store", - "odbc", - "parquet", + "url", +] + +[[package]] +name = "dozer-ingestion-postgres" +version = "0.1.0" +dependencies = [ + "dozer-ingestion-connector", "postgres-protocol", "postgres-types", - "prost-reflect", "rand 0.8.5", - "rdkafka", "regex", "rustls 0.21.7", "rustls-native-certs 0.6.3", - "schema_registry_converter", "serial_test", - "tempdir", - "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tonic-reflection", - "tonic-web", - "tower-http", - "url", "uuid", - "web3", +] + +[[package]] +name = "dozer-ingestion-snowflake" +version = "0.1.0" +dependencies = [ + "dozer-ingestion-connector", + "genawaiter", + "include_dir", + "memchr", + "odbc", + "rand 0.8.5", ] [[package]] @@ -4508,7 +4587,7 @@ version 
= "0.11.0+8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" dependencies = [ - "bindgen 0.65.1", + "bindgen", "bzip2-sys", "cc", "glob", @@ -5053,7 +5132,7 @@ checksum = "57349d5a326b437989b6ee4dc8f2f34b0cc131202748414712a8e7d98952fc8c" dependencies = [ "base64 0.21.0", "bigdecimal", - "bindgen 0.66.1", + "bindgen", "bitflags 2.4.0", "bitvec 1.0.1", "byteorder", diff --git a/dozer-cli/src/errors.rs b/dozer-cli/src/errors.rs index 4b4ef6ae3c..cb5ff4d37d 100644 --- a/dozer-cli/src/errors.rs +++ b/dozer-cli/src/errors.rs @@ -17,7 +17,6 @@ use dozer_api::{ use dozer_cache::dozer_log::storage; use dozer_cache::errors::CacheError; use dozer_core::errors::ExecutionError; -use dozer_ingestion::errors::ConnectorError; use dozer_sql::errors::PipelineError; use dozer_types::{constants::LOCK_FILE, thiserror::Error}; use dozer_types::{errors::internal::BoxedError, serde_json}; @@ -76,8 +75,6 @@ pub enum OrchestrationError { #[error(transparent)] ExecutionError(#[from] ExecutionError), #[error(transparent)] - ConnectorError(#[from] ConnectorError), - #[error(transparent)] PipelineError(#[from] PipelineError), #[error(transparent)] CliError(#[from] CliError), diff --git a/dozer-cli/src/lib.rs b/dozer-cli/src/lib.rs index f8d41e0861..eb27d03c77 100644 --- a/dozer-cli/src/lib.rs +++ b/dozer-cli/src/lib.rs @@ -24,8 +24,8 @@ mod tests; mod utils; // Re-exports pub use dozer_ingestion::{ - connectors::{get_connector, TableInfo}, errors::ConnectorError, + {get_connector, TableInfo}, }; pub use dozer_sql::builder::QueryContext; pub fn wrapped_statement_to_pipeline(sql: &str) -> Result { diff --git a/dozer-cli/src/pipeline/builder.rs b/dozer-cli/src/pipeline/builder.rs index a272dfb8d7..ff94000c9b 100644 --- a/dozer-cli/src/pipeline/builder.rs +++ b/dozer-cli/src/pipeline/builder.rs @@ -9,7 +9,7 @@ use dozer_core::app::AppPipeline; use dozer_core::app::PipelineEntryPoint; use dozer_core::node::SinkFactory; use dozer_core::DEFAULT_PORT_HANDLE; -use dozer_ingestion::connectors::{get_connector, get_connector_info_table}; +use dozer_ingestion::{get_connector, get_connector_info_table}; use dozer_sql::builder::statement_to_pipeline; use dozer_sql::builder::{OutputNodeInfo, QueryContext}; use dozer_tracing::LabelsAndProgress; @@ -26,6 +26,7 @@ use tokio::sync::Mutex; use crate::pipeline::dummy_sink::DummySinkFactory; use crate::pipeline::LogSinkFactory; +use super::connector_source::ConnectorSourceFactoryError; use super::source_builder::SourceBuilder; use crate::errors::OrchestrationError; use dozer_types::log::info; @@ -91,13 +92,17 @@ impl<'a> PipelineBuilder<'a> { let mut connector_map = HashMap::new(); for connection in self.connections { - let connector = get_connector(connection.clone())?; + let connector = get_connector(connection.clone()) + .map_err(|e| ConnectorSourceFactoryError::Connector(e.into()))?; if let Some(info_table) = get_connector_info_table(connection) { info!("[{}] Connection parameters\n{info_table}", connection.name); } - let connector_tables = connector.list_tables().await?; + let connector_tables = connector + .list_tables() + .await + .map_err(ConnectorSourceFactoryError::Connector)?; // override source name if specified let connector_tables: Vec = connector_tables diff --git a/dozer-cli/src/pipeline/connector_source.rs b/dozer-cli/src/pipeline/connector_source.rs index 5d0213893d..0eccbebd4e 100644 --- a/dozer-cli/src/pipeline/connector_source.rs +++ 
b/dozer-cli/src/pipeline/connector_source.rs @@ -3,11 +3,10 @@ use dozer_core::channels::SourceChannelForwarder; use dozer_core::node::{ OutputPortDef, OutputPortType, PortHandle, Source, SourceFactory, SourceState, }; -use dozer_ingestion::connectors::{ +use dozer_ingestion::{ get_connector, CdcType, Connector, TableIdentifier, TableInfo, TableToIngest, }; -use dozer_ingestion::errors::ConnectorError; -use dozer_ingestion::ingestion::{IngestionConfig, Ingestor}; +use dozer_ingestion::{IngestionConfig, Ingestor}; use dozer_tracing::LabelsAndProgress; use dozer_types::errors::internal::BoxedError; @@ -39,7 +38,7 @@ struct Table { #[derive(Debug, Error)] pub enum ConnectorSourceFactoryError { #[error("Connector error: {0}")] - Connector(#[from] ConnectorError), + Connector(#[source] BoxedError), #[error("Port not found for source: {0}")] PortNotFoundInSource(PortHandle), #[error("Schema not initialized")] @@ -75,14 +74,18 @@ impl ConnectorSourceFactory { ) -> Result { let connection_name = connection.name.clone(); - let connector = get_connector(connection)?; + let connector = get_connector(connection) + .map_err(|e| ConnectorSourceFactoryError::Connector(e.into()))?; // Fill column names if not provided. let table_identifiers = table_and_ports .iter() .map(|(table, _)| TableIdentifier::new(table.schema.clone(), table.name.clone())) .collect(); - let all_columns = connector.list_columns(table_identifiers).await?; + let all_columns = connector + .list_columns(table_identifiers) + .await + .map_err(ConnectorSourceFactoryError::Connector)?; for ((table, _), columns) in table_and_ports.iter_mut().zip(all_columns) { if table.column_names.is_empty() { table.column_names = columns.column_names; @@ -93,13 +96,16 @@ impl ConnectorSourceFactory { .iter() .map(|(table, _)| table.clone()) .collect(); - let source_schemas = connector.get_schemas(&tables).await?; + let source_schemas = connector + .get_schemas(&tables) + .await + .map_err(ConnectorSourceFactoryError::Connector)?; let mut tables = vec![]; for ((table, port), source_schema) in table_and_ports.into_iter().zip(source_schemas) { let name = table.name; let columns = table.column_names; - let source_schema = source_schema?; + let source_schema = source_schema.map_err(ConnectorSourceFactoryError::Connector)?; let schema = source_schema.schema; let cdc_type = source_schema.cdc_type; @@ -280,9 +286,6 @@ impl Source for ConnectorSource { .await; match result { Ok(Ok(_)) => {} - // If we get a channel error, it means the source sender thread has quit. - // Any error handling is done in that thread. 
- Ok(Err(ConnectorError::IngestorError)) => {} Ok(Err(e)) => std::panic::panic_any(e), // Aborted means we are shutting down Err(Aborted) => (), diff --git a/dozer-cli/src/pipeline/source_builder.rs b/dozer-cli/src/pipeline/source_builder.rs index 2d54f23e62..a20751ad72 100644 --- a/dozer-cli/src/pipeline/source_builder.rs +++ b/dozer-cli/src/pipeline/source_builder.rs @@ -2,7 +2,7 @@ use crate::pipeline::connector_source::ConnectorSourceFactory; use crate::OrchestrationError; use dozer_api::shutdown::ShutdownReceiver; use dozer_core::appsource::{AppSourceManager, AppSourceMappings}; -use dozer_ingestion::connectors::TableInfo; +use dozer_ingestion::TableInfo; use dozer_tracing::LabelsAndProgress; use dozer_types::models::connection::Connection; diff --git a/dozer-cli/src/simple/orchestrator.rs b/dozer-cli/src/simple/orchestrator.rs index 53b66a641e..2fcc841bf5 100644 --- a/dozer-cli/src/simple/orchestrator.rs +++ b/dozer-cli/src/simple/orchestrator.rs @@ -1,6 +1,7 @@ use super::executor::{run_dag_executor, Executor}; use super::Contract; use crate::errors::OrchestrationError; +use crate::pipeline::connector_source::ConnectorSourceFactoryError; use crate::pipeline::PipelineBuilder; use crate::simple::build; use crate::simple::helper::validate_config; @@ -30,7 +31,7 @@ use crate::console_helper::GREEN; use crate::console_helper::PURPLE; use crate::console_helper::RED; use dozer_core::errors::ExecutionError; -use dozer_ingestion::connectors::{get_connector, SourceSchema, TableInfo}; +use dozer_ingestion::{get_connector, SourceSchema, TableInfo}; use dozer_sql::builder::statement_to_pipeline; use dozer_sql::errors::PipelineError; use dozer_types::log::info; @@ -288,8 +289,12 @@ impl SimpleOrchestrator { ) -> Result, Vec)>, OrchestrationError> { let mut schema_map = HashMap::new(); for connection in &self.config.connections { - let connector = get_connector(connection.clone())?; - let schema_tuples = connector.list_all_schemas().await?; + let connector = get_connector(connection.clone()) + .map_err(|e| ConnectorSourceFactoryError::Connector(e.into()))?; + let schema_tuples = connector + .list_all_schemas() + .await + .map_err(ConnectorSourceFactoryError::Connector)?; schema_map.insert(connection.name.clone(), schema_tuples); } diff --git a/dozer-ingestion/Cargo.toml b/dozer-ingestion/Cargo.toml index 8e14c61c98..f363b23dea 100644 --- a/dozer-ingestion/Cargo.toml +++ b/dozer-ingestion/Cargo.toml @@ -7,87 +7,42 @@ authors = ["getdozer/dozer-dev"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -dozer-utils = { path = "../dozer-utils" } -dozer-types = { path = "../dozer-types" } -dozer-log = { path = "../dozer-log" } +dozer-ingestion-connector = { path = "./connector" } +dozer-ingestion-deltalake = { path = "./deltalake" } +dozer-ingestion-dozer = { path = "./dozer" } +dozer-ingestion-ethereum = { path = "./ethereum", optional = true } +dozer-ingestion-grpc = { path = "./grpc" } +dozer-ingestion-kafka = { path = "./kafka", optional = true } +dozer-ingestion-mongodb = { path = "./mongodb", optional = true } +dozer-ingestion-mysql = { path = "./mysql" } +dozer-ingestion-object-store = { path = "./object-store" } +dozer-ingestion-postgres = { path = "./postgres" } +dozer-ingestion-snowflake = { path = "./snowflake", optional = true } tokio = { version = "1", features = ["full"] } futures = "0.3.28" -# Postgres connector -postgres-protocol = "0.6.4" -postgres-types = { version = "0.2.4", features = [ - "with-serde_json-1", - 
"with-uuid-1", -] } -tokio-postgres = { version = "0.7.7", features = [ - "with-chrono-0_4", - "with-geo-types-0_7", - "with-uuid-1", -] } -# DataFusion connector -object_store = { version = "0.6.1", features = ["aws"] } -# Eth connector -web3 = { version = "0.18.0", optional = true } -# Kafka connector -rdkafka = { version = "0.34.0", optional = true } -# odbc connector -odbc = { version = "0.17.0", optional = true } -# Mongodb connector -mongodb = { version = "2.6.1", optional = true } -base64 = "0.21.0" -include_dir = { version = "0.7.3", optional = true } -schema_registry_converter = { version = "3.1.0", features = [ - "avro", -], optional = true } -regex = "1" -tonic-web = "0.10.2" -tonic-reflection = "0.10.0" -tower-http = { version = "0.4", features = ["full"] } prost-reflect = { version = "0.12.0", features = ["serde", "text-format"] } -deltalake = { version = "0.15.0", default-features = false, features = [ - "s3", - "datafusion", -] } -bson = "2.7.0" -uuid = { version = "1.3.1", features = ["serde", "v4"] } -rustls = { version = "0.21.7", features = ["dangerous_configuration"] } -tokio-postgres-rustls = "0.10.0" -rustls-native-certs = "0.6.3" rand = "0.8.5" url = "2.4.1" -mysql_async = { version = "0.32.2", default-features = false, features = [ - "default-rustls", -] } -mysql_common = { version = "0.30", default-features = false, features = [ - "chrono", - "rust_decimal", -] } chrono = "0.4.26" -geozero = { version = "0.11.0", default-features = false, features = [ - "with-wkb", -] } bytes = "1.4.0" -genawaiter = { version = "0.99.1", optional = true } -memchr = { version = "2.6.4", optional = true } [dev-dependencies] criterion = { version = "0.4.0", features = ["html_reports"] } serial_test = "1.0.0" -rand = "0.8.5" -hex-literal = "0.4.1" dozer-tracing = { path = "../dozer-tracing" } tempdir = "0.3.7" parquet = "45.0.0" env_logger = "0.10.0" hex = "0.4.3" +dozer-utils = { path = "../dozer-utils" } dozer-cli = { path = "../dozer-cli" } [features] -# Defines a feature named `odbc` that does not enable any other features. 
-snowflake = ["dep:odbc", "dep:include_dir", "dep:genawaiter", "dep:memchr"] -ethereum = ["dep:web3"] -kafka = ["dep:rdkafka", "dep:schema_registry_converter"] -mongodb = ["dep:mongodb"] +snowflake = ["dep:dozer-ingestion-snowflake"] +ethereum = ["dep:dozer-ingestion-ethereum"] +kafka = ["dep:dozer-ingestion-kafka"] +mongodb = ["dep:dozer-ingestion-mongodb"] [[bench]] name = "connectors" diff --git a/dozer-ingestion/benches/connectors.rs b/dozer-ingestion/benches/connectors.rs index b58cf08480..356965fcbf 100644 --- a/dozer-ingestion/benches/connectors.rs +++ b/dozer-ingestion/benches/connectors.rs @@ -1,6 +1,6 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use dozer_ingestion::test_util::create_test_runtime; -use dozer_types::serde_yaml; +use dozer_ingestion_connector::dozer_types::serde_yaml; use helper::TestConfig; mod helper; fn connectors(criter: &mut Criterion) { diff --git a/dozer-ingestion/benches/grpc.rs b/dozer-ingestion/benches/grpc.rs index 8c9fa1604d..04a73f0ce4 100644 --- a/dozer-ingestion/benches/grpc.rs +++ b/dozer-ingestion/benches/grpc.rs @@ -2,19 +2,17 @@ use std::{sync::Arc, thread}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use dozer_ingestion::test_util::create_test_runtime; -use dozer_types::tonic::transport::Channel; -use dozer_types::{ +use dozer_ingestion_connector::dozer_types::{ arrow::array::{Int32Array, StringArray}, + arrow::{datatypes as arrow_types, record_batch::RecordBatch}, + arrow_types::from_arrow::serialize_record_batch, grpc_types::ingest::{ingest_service_client::IngestServiceClient, IngestArrowRequest}, indicatif::{MultiProgress, ProgressBar}, serde_yaml, + tonic::transport::Channel, }; mod helper; use crate::helper::TestConfig; -use dozer_types::{ - arrow::{datatypes as arrow_types, record_batch::RecordBatch}, - arrow_types::from_arrow::serialize_record_batch, -}; const ARROW_PORT: u32 = 60056; const BATCH_SIZE: usize = 100; diff --git a/dozer-ingestion/benches/helper.rs b/dozer-ingestion/benches/helper.rs index df9a0abe01..0f225929ce 100644 --- a/dozer-ingestion/benches/helper.rs +++ b/dozer-ingestion/benches/helper.rs @@ -1,15 +1,18 @@ use std::sync::Arc; -use dozer_ingestion::connectors::{Connector, TableToIngest}; -use dozer_ingestion::ingestion::{IngestionIterator, Ingestor}; -use dozer_types::indicatif::{ProgressBar, ProgressStyle}; -use dozer_types::log::error; -use dozer_types::models::connection::Connection; -use dozer_types::serde::{self, Deserialize, Serialize}; +use dozer_ingestion_connector::{ + dozer_types::{ + indicatif::{ProgressBar, ProgressStyle}, + log::error, + models::connection::Connection, + serde::{Deserialize, Serialize}, + }, + Connector, IngestionIterator, Ingestor, TableToIngest, +}; use tokio::runtime::Runtime; #[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(crate = "self::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct TestConfig { pub connection: Connection, pub tables_filter: Option>, @@ -42,7 +45,7 @@ pub fn get_progress() -> ProgressBar { } pub fn get_connection_iterator(runtime: Arc, config: TestConfig) -> IngestionIterator { - let connector = dozer_ingestion::connectors::get_connector(config.connection).unwrap(); + let connector = dozer_ingestion::get_connector(config.connection).unwrap(); let tables = runtime.block_on(list_tables(&*connector)); let (ingestor, iterator) = Ingestor::initialize_channel(Default::default()); runtime.clone().spawn_blocking(move || async move { diff --git 
a/dozer-ingestion/connector/Cargo.toml b/dozer-ingestion/connector/Cargo.toml new file mode 100644 index 0000000000..f0b197b98f --- /dev/null +++ b/dozer-ingestion/connector/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "dozer-ingestion-connector" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-types = { path = "../../dozer-types" } +futures = "0.3.28" +tokio = "1.32.0" diff --git a/dozer-ingestion/src/ingestion/ingestor.rs b/dozer-ingestion/connector/src/ingestor.rs similarity index 93% rename from dozer-ingestion/src/ingestion/ingestor.rs rename to dozer-ingestion/connector/src/ingestor.rs index 5b4a551b53..0f893fe7fb 100644 --- a/dozer-ingestion/src/ingestion/ingestor.rs +++ b/dozer-ingestion/connector/src/ingestor.rs @@ -5,7 +5,18 @@ use tokio::{ time::timeout, }; -use super::IngestionConfig; +#[derive(Debug, Clone)] +pub struct IngestionConfig { + forwarder_channel_cap: usize, +} + +impl Default for IngestionConfig { + fn default() -> Self { + Self { + forwarder_channel_cap: 100000, + } + } +} #[derive(Debug)] /// `IngestionIterator` is the receiver side of a spsc channel. The sender side is `Ingestor`. diff --git a/dozer-ingestion/connector/src/lib.rs b/dozer-ingestion/connector/src/lib.rs new file mode 100644 index 0000000000..a1015e847b --- /dev/null +++ b/dozer-ingestion/connector/src/lib.rs @@ -0,0 +1,162 @@ +use std::fmt::Debug; + +use dozer_types::errors::internal::BoxedError; +use dozer_types::node::OpIdentifier; +use dozer_types::serde; +use dozer_types::serde::{Deserialize, Serialize}; +pub use dozer_types::tonic::async_trait; +use dozer_types::types::{FieldType, Schema}; + +mod ingestor; +pub mod test_util; +pub mod utils; + +pub use ingestor::{IngestionConfig, IngestionIterator, Ingestor}; + +pub use dozer_types; +pub use futures; +pub use tokio; + +#[derive(Clone, Copy, Serialize, Deserialize, Debug, Eq, PartialEq, Default)] +#[serde(crate = "dozer_types::serde")] +/// A source table's CDC event type. +pub enum CdcType { + /// Connector gets old record on delete/update operations. + FullChanges, + /// Connector only gets PK of old record on delete/update operations. + OnlyPK, + #[default] + /// Connector cannot get any info about old records. In other words, the table is append-only. + Nothing, +} + +#[derive(Clone, Serialize, Deserialize, Debug, Eq, PartialEq)] +#[serde(crate = "dozer_types::serde")] +/// A source table's schema and CDC type. +pub struct SourceSchema { + /// Dozer schema mapped from the source table. Columns are already filtered based on `TableInfo.column_names`. + pub schema: Schema, + #[serde(default)] + /// The source table's CDC type. + pub cdc_type: CdcType, +} + +impl SourceSchema { + pub fn new(schema: Schema, cdc_type: CdcType) -> Self { + Self { schema, cdc_type } + } +} + +/// Result of mapping one source table schema to Dozer schema. +pub type SourceSchemaResult = Result; + +#[async_trait] +pub trait Connector: Send + Sync + Debug { + /// Returns all the external types and their corresponding Dozer types. + /// If the external type is not supported, None should be returned. + fn types_mapping() -> Vec<(String, Option)> + where + Self: Sized; + + /// Validates the connector's connection level properties. + async fn validate_connection(&self) -> Result<(), BoxedError>; + + /// Lists all the table names in the connector. 
+ async fn list_tables(&self) -> Result<Vec<TableIdentifier>, BoxedError>; + + /// Validates the connector's table level properties for each table. + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError>; + + /// Lists all the column names for each table. + async fn list_columns( + &self, + tables: Vec<TableIdentifier>, + ) -> Result<Vec<TableInfo>, BoxedError>; + + /// Gets the schema for each table. Only requested columns need to be mapped. + /// + /// If this function fails at the connector level, such as a network error, it should return an outer level `Err`. + /// Otherwise the outer level `Ok` should always contain the same number of elements as `table_infos`. + /// + /// If it fails at the table or column level, such as an unsupported data type, one of the elements should be `Err`. + async fn get_schemas( + &self, + table_infos: &[TableInfo], + ) -> Result<Vec<SourceSchemaResult>, BoxedError>; + + /// Lists all tables and columns and gets the schema for each table. + async fn list_all_schemas(&self) -> Result<(Vec<TableInfo>, Vec<SourceSchema>), BoxedError> { + let tables = self.list_tables().await?; + let table_infos = self.list_columns(tables).await?; + let schemas = self + .get_schemas(&table_infos) + .await? + .into_iter() + .collect::<Result<Vec<_>, _>>()?; + Ok((table_infos, schemas)) + } + + /// Starts outputting data from `tables` to `ingestor`. This method should never return unless there is an unrecoverable error. + async fn start( + &self, + ingestor: &Ingestor, + tables: Vec<TableToIngest>, + ) -> Result<(), BoxedError>; +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +/// Unique identifier of a source table. A source table must have a `name`, optionally under a `schema` scope. +pub struct TableIdentifier { + /// The `schema` scope of the table. + /// + /// A connector that supports schema scopes must decide on a default schema; one that doesn't must assert that `schema.is_none()`. + pub schema: Option<String>, + /// The table name, must be unique under the `schema` scope, or global scope if `schema` is `None`. + pub name: String, +} + +impl TableIdentifier { + pub fn new(schema: Option<String>, name: String) -> Self { + Self { schema, name } + } + + pub fn from_table_name(name: String) -> Self { + Self { schema: None, name } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, Eq, PartialEq)] +#[serde(crate = "self::serde")] +/// `TableIdentifier` with column names. +pub struct TableInfo { + /// The `schema` scope of the table. + pub schema: Option<String>, + /// The table name, must be unique under the `schema` scope, or global scope if `schema` is `None`. + pub name: String, + /// The column names to be mapped. + pub column_names: Vec<String>, +} + +#[derive(Debug, Clone)] +/// `TableInfo` with optional checkpoint info. +pub struct TableToIngest { + /// The `schema` scope of the table. + pub schema: Option<String>, + /// The table name, must be unique under the `schema` scope, or global scope if `schema` is `None`. + pub name: String, + /// The column names to be mapped. + pub column_names: Vec<String>, + /// The checkpoint to start after.
+ pub checkpoint: Option, +} + +impl TableToIngest { + pub fn from_scratch(table_info: TableInfo) -> Self { + Self { + schema: table_info.schema, + name: table_info.name, + column_names: table_info.column_names, + checkpoint: None, + } + } +} diff --git a/dozer-ingestion/connector/src/test_util.rs b/dozer-ingestion/connector/src/test_util.rs new file mode 100644 index 0000000000..2b11876e08 --- /dev/null +++ b/dozer-ingestion/connector/src/test_util.rs @@ -0,0 +1,71 @@ +use std::sync::Arc; + +use dozer_types::{ + constants::DEFAULT_CONFIG_PATH, + log::error, + models::{config::Config, connection::ConnectionConfig}, +}; +use futures::stream::{AbortHandle, Abortable}; +use tokio::runtime::Runtime; + +use crate::{Connector, IngestionIterator, Ingestor, TableInfo, TableToIngest}; + +pub fn create_test_runtime() -> Arc { + Arc::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(), + ) +} + +pub fn spawn_connector( + runtime: Arc, + connector: impl Connector + 'static, + tables: Vec, +) -> (IngestionIterator, AbortHandle) { + let (ingestor, iterator) = Ingestor::initialize_channel(Default::default()); + let (abort_handle, abort_registration) = AbortHandle::new_pair(); + let tables = tables + .into_iter() + .map(TableToIngest::from_scratch) + .collect(); + runtime.clone().spawn_blocking(move || { + runtime.block_on(async move { + if let Ok(Err(e)) = + Abortable::new(connector.start(&ingestor, tables), abort_registration).await + { + error!("Connector `start` returned error: {e}") + } + }) + }); + (iterator, abort_handle) +} + +pub fn spawn_connector_all_tables( + runtime: Arc, + connector: impl Connector + 'static, +) -> (IngestionIterator, AbortHandle) { + let tables = runtime.block_on(list_all_table(&connector)); + spawn_connector(runtime, connector, tables) +} + +pub fn create_runtime_and_spawn_connector_all_tables( + connector: impl Connector + 'static, +) -> (IngestionIterator, AbortHandle) { + let runtime = create_test_runtime(); + spawn_connector_all_tables(runtime.clone(), connector) +} + +async fn list_all_table(connector: &impl Connector) -> Vec { + let tables = connector.list_tables().await.unwrap(); + connector.list_columns(tables).await.unwrap() +} + +pub fn load_test_connection_config() -> ConnectionConfig { + let config_path = std::path::PathBuf::from(format!("src/tests/{DEFAULT_CONFIG_PATH}")); + + let dozer_config = std::fs::read_to_string(config_path).unwrap(); + let mut dozer_config = dozer_types::serde_yaml::from_str::(&dozer_config).unwrap(); + dozer_config.connections.remove(0).config +} diff --git a/dozer-ingestion/src/utils.rs b/dozer-ingestion/connector/src/utils.rs similarity index 66% rename from dozer-ingestion/src/utils.rs rename to dozer-ingestion/connector/src/utils.rs index e1d1019a1f..ca1f19c9c0 100644 --- a/dozer-ingestion/src/utils.rs +++ b/dozer-ingestion/connector/src/utils.rs @@ -1,3 +1,35 @@ +use dozer_types::thiserror::Error; + +#[derive(Debug, Clone)] +pub struct ListOrFilterColumns { + pub schema: Option, + pub name: String, + pub columns: Option>, +} + +#[derive(Debug, Error)] +#[error("table not found: {}", table_name(schema.as_deref(), name))] +pub struct TableNotFound { + pub schema: Option, + pub name: String, +} + +fn table_name(schema: Option<&str>, name: &str) -> String { + match schema { + Some(schema) => format!("{}.{}", schema, name), + None => name.to_string(), + } +} + +pub fn warn_dropped_primary_index(table_name: &str) { + dozer_types::log::warn!( + "One or more primary index columns from the source 
table are \ + not part of the defined schema for table: '{0}'. \ + The primary index will therefore not be present in the Dozer table", + table_name + ); +} + #[macro_export] macro_rules! retry_on_network_failure { ($description:expr, $operation:expr, $network_error_predicate:expr $(, $reconnect:expr)? $(,)?) => diff --git a/dozer-ingestion/deltalake/Cargo.toml b/dozer-ingestion/deltalake/Cargo.toml new file mode 100644 index 0000000000..de74724834 --- /dev/null +++ b/dozer-ingestion/deltalake/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "dozer-ingestion-deltalake" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +dozer-ingestion-object-store = { path = "../object-store" } +deltalake = { version = "0.15.0", default-features = false, features = [ + "datafusion", +] } diff --git a/dozer-ingestion/src/connectors/delta_lake/connector.rs b/dozer-ingestion/deltalake/src/connector.rs similarity index 71% rename from dozer-ingestion/src/connectors/delta_lake/connector.rs rename to dozer-ingestion/deltalake/src/connector.rs index 251954e5fe..a50cdb4aa4 100644 --- a/dozer-ingestion/src/connectors/delta_lake/connector.rs +++ b/dozer-ingestion/deltalake/src/connector.rs @@ -1,14 +1,13 @@ -use crate::connectors::delta_lake::reader::DeltaLakeReader; -use crate::connectors::delta_lake::schema_helper::SchemaHelper; -use crate::connectors::delta_lake::ConnectorResult; -use crate::connectors::{ - table_name, Connector, ListOrFilterColumns, SourceSchemaResult, TableIdentifier, TableInfo, - TableToIngest, +use crate::reader::DeltaLakeReader; +use crate::schema_helper::SchemaHelper; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + errors::internal::BoxedError, models::ingestion_types::DeltaLakeConfig, types::FieldType, + }, + utils::{ListOrFilterColumns, TableNotFound}, + Connector, Ingestor, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, }; -use crate::errors::ConnectorError; -use crate::ingestion::Ingestor; -use dozer_types::models::ingestion_types::DeltaLakeConfig; -use dozer_types::tonic::async_trait; #[derive(Debug)] pub struct DeltaLakeConnector { @@ -23,18 +22,18 @@ impl DeltaLakeConnector { #[async_trait] impl Connector for DeltaLakeConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { Ok(self .config .tables @@ -43,7 +42,7 @@ impl Connector for DeltaLakeConnector { .collect()) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let mut delta_table_names = vec![]; // Collect delta table names in config, the validate table info for delta_table in self.config.tables.iter() { @@ -51,10 +50,11 @@ impl Connector for DeltaLakeConnector { } for table in tables.iter() { if !delta_table_names.contains(&table.name.as_str()) || table.schema.is_some() { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + 
.into()); } } Ok(()) @@ -63,7 +63,7 @@ impl Connector for DeltaLakeConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let table_infos = tables .into_iter() .map(|table| ListOrFilterColumns { @@ -95,7 +95,7 @@ impl Connector for DeltaLakeConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> ConnectorResult> { + ) -> Result, BoxedError> { let table_infos = table_infos .iter() .map(|table_info| ListOrFilterColumns { @@ -108,7 +108,11 @@ impl Connector for DeltaLakeConnector { schema_helper.get_schemas(&table_infos).await } - async fn start(&self, ingestor: &Ingestor, tables: Vec) -> ConnectorResult<()> { + async fn start( + &self, + ingestor: &Ingestor, + tables: Vec, + ) -> Result<(), BoxedError> { let reader = DeltaLakeReader::new(self.config.clone()); reader.read(&tables, ingestor).await } diff --git a/dozer-ingestion/src/connectors/delta_lake/mod.rs b/dozer-ingestion/deltalake/src/lib.rs similarity index 51% rename from dozer-ingestion/src/connectors/delta_lake/mod.rs rename to dozer-ingestion/deltalake/src/lib.rs index 63b44234b8..a03a7a8997 100644 --- a/dozer-ingestion/src/connectors/delta_lake/mod.rs +++ b/dozer-ingestion/deltalake/src/lib.rs @@ -1,10 +1,6 @@ -use crate::errors::ConnectorError; - mod connector; mod reader; mod schema_helper; mod test; pub use connector::DeltaLakeConnector; - -type ConnectorResult = Result; diff --git a/dozer-ingestion/src/connectors/delta_lake/reader.rs b/dozer-ingestion/deltalake/src/reader.rs similarity index 76% rename from dozer-ingestion/src/connectors/delta_lake/reader.rs rename to dozer-ingestion/deltalake/src/reader.rs index e241fdb54e..7a9f8d6dc8 100644 --- a/dozer-ingestion/src/connectors/delta_lake/reader.rs +++ b/dozer-ingestion/deltalake/src/reader.rs @@ -1,14 +1,19 @@ -use crate::connectors::delta_lake::ConnectorResult; -use crate::connectors::TableToIngest; -use crate::errors::ConnectorError; -use crate::ingestion::Ingestor; -use deltalake::datafusion::prelude::SessionContext; -use dozer_types::arrow_types::from_arrow::{map_schema_to_dozer, map_value_to_dozer_field}; -use dozer_types::models::ingestion_types::{DeltaLakeConfig, IngestionMessage}; -use dozer_types::types::{Operation, Record}; -use futures::StreamExt; use std::sync::Arc; +use deltalake::datafusion::prelude::SessionContext; +use dozer_ingestion_connector::{ + dozer_types::{ + arrow_types::from_arrow::{map_schema_to_dozer, map_value_to_dozer_field}, + errors::internal::BoxedError, + models::ingestion_types::{DeltaLakeConfig, IngestionMessage}, + types::{Operation, Record}, + }, + futures::StreamExt, + tokio, + utils::TableNotFound, + Ingestor, TableToIngest, +}; + pub struct DeltaLakeReader { config: DeltaLakeConfig, } @@ -18,7 +23,11 @@ impl DeltaLakeReader { Self { config } } - pub async fn read(&self, table: &[TableToIngest], ingestor: &Ingestor) -> ConnectorResult<()> { + pub async fn read( + &self, + table: &[TableToIngest], + ingestor: &Ingestor, + ) -> Result<(), BoxedError> { for (table_index, table) in table.iter().enumerate() { self.read_impl(table_index, table, ingestor).await?; } @@ -30,7 +39,7 @@ impl DeltaLakeReader { table_index: usize, table: &TableToIngest, ingestor: &Ingestor, - ) -> ConnectorResult<()> { + ) -> Result<(), BoxedError> { assert!(table.checkpoint.is_none()); let table_path = table_path(&self.config, &table.name)?; @@ -46,8 +55,7 @@ impl DeltaLakeReader { tokio::pin!(data); while let Some(Ok(batch)) = data.next().await { let batch_schema = batch.schema(); - 
let dozer_schema = map_schema_to_dozer(&batch_schema) - .map_err(|e| ConnectorError::InternalError(Box::new(e)))?; + let dozer_schema = map_schema_to_dozer(&batch_schema)?; for row in 0..batch.num_rows() { let fields = batch .columns() @@ -77,13 +85,14 @@ impl DeltaLakeReader { } } -pub fn table_path(config: &DeltaLakeConfig, table_name: &str) -> ConnectorResult { +pub fn table_path(config: &DeltaLakeConfig, table_name: &str) -> Result { for delta_table in config.tables.iter() { if delta_table.name == table_name { return Ok(delta_table.path.clone()); } } - Err(ConnectorError::TableNotFound(format!( - "Delta table: {table_name} can't find" - ))) + Err(TableNotFound { + schema: None, + name: table_name.to_string(), + }) } diff --git a/dozer-ingestion/src/connectors/delta_lake/schema_helper.rs b/dozer-ingestion/deltalake/src/schema_helper.rs similarity index 60% rename from dozer-ingestion/src/connectors/delta_lake/schema_helper.rs rename to dozer-ingestion/deltalake/src/schema_helper.rs index c6612c630a..cc20d3904a 100644 --- a/dozer-ingestion/src/connectors/delta_lake/schema_helper.rs +++ b/dozer-ingestion/deltalake/src/schema_helper.rs @@ -1,10 +1,12 @@ -use crate::connectors::delta_lake::reader::table_path; -use crate::connectors::delta_lake::ConnectorResult; -use crate::connectors::object_store::schema_mapper::map_schema; -use crate::connectors::{CdcType, ListOrFilterColumns, SourceSchema, SourceSchemaResult}; -use deltalake::arrow::datatypes::SchemaRef; -use deltalake::datafusion::prelude::SessionContext; -use dozer_types::models::ingestion_types::DeltaLakeConfig; +use deltalake::{arrow::datatypes::SchemaRef, datafusion::prelude::SessionContext}; +use dozer_ingestion_connector::{ + dozer_types::{errors::internal::BoxedError, models::ingestion_types::DeltaLakeConfig}, + utils::ListOrFilterColumns, + CdcType, SourceSchema, SourceSchemaResult, +}; +use dozer_ingestion_object_store::schema_mapper::map_schema; + +use crate::reader::table_path; use std::sync::Arc; pub struct SchemaHelper { @@ -19,7 +21,7 @@ impl SchemaHelper { pub async fn get_schemas( &self, tables: &[ListOrFilterColumns], - ) -> ConnectorResult> { + ) -> Result, BoxedError> { let mut schemas = vec![]; for table in tables.iter() { schemas.push(self.get_schemas_impl(table).await); @@ -27,7 +29,10 @@ impl SchemaHelper { Ok(schemas) } - async fn get_schemas_impl(&self, table: &ListOrFilterColumns) -> ConnectorResult { + async fn get_schemas_impl( + &self, + table: &ListOrFilterColumns, + ) -> Result { let table_path = table_path(&self.config, &table.name)?; let ctx = SessionContext::new(); let delta_table = deltalake::open_table(table_path).await?; diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/.part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet.crc b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/.part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet.crc similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/.part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet.crc rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/.part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet.crc diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/.part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet.crc b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/.part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet.crc similarity index 100% rename from 
dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/.part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet.crc rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/.part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet.crc diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/.part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet.crc b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/.part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet.crc similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/.part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet.crc rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/.part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet.crc diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_change_data/.gitkeep b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_change_data/.gitkeep similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_change_data/.gitkeep rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_change_data/.gitkeep diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_delta_index/.gitkeep b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_delta_index/.gitkeep similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_delta_index/.gitkeep rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_delta_index/.gitkeep diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_delta_log/00000000000000000000.json b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_delta_log/00000000000000000000.json similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_delta_log/00000000000000000000.json rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_delta_log/00000000000000000000.json diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_delta_log/00000000000000000001.json b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_delta_log/00000000000000000001.json similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/_delta_log/00000000000000000001.json rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/_delta_log/00000000000000000001.json diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe1-c000.snappy.parquet diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet rename to 
dozer-ingestion/deltalake/src/test/data/delta-0.8.0/part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a1-c000.snappy.parquet diff --git a/dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet b/dozer-ingestion/deltalake/src/test/data/delta-0.8.0/part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/data/delta-0.8.0/part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet rename to dozer-ingestion/deltalake/src/test/data/delta-0.8.0/part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet diff --git a/dozer-ingestion/src/connectors/delta_lake/test/deltalake_test.rs b/dozer-ingestion/deltalake/src/test/deltalake_test.rs similarity index 70% rename from dozer-ingestion/src/connectors/delta_lake/test/deltalake_test.rs rename to dozer-ingestion/deltalake/src/test/deltalake_test.rs index 764e4e7b8b..685aea7a13 100644 --- a/dozer-ingestion/src/connectors/delta_lake/test/deltalake_test.rs +++ b/dozer-ingestion/deltalake/src/test/deltalake_test.rs @@ -1,14 +1,16 @@ -use crate::connectors::delta_lake::DeltaLakeConnector; -use crate::connectors::Connector; -use crate::test_util::create_runtime_and_spawn_connector_all_tables; -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::models::ingestion_types::{DeltaLakeConfig, DeltaTable}; -use dozer_types::types::SourceDefinition::Dynamic; -use dozer_types::types::{Field, FieldType, Operation}; +use crate::DeltaLakeConnector; +use dozer_ingestion_connector::{ + dozer_types::{ + models::ingestion_types::{DeltaLakeConfig, DeltaTable, IngestionMessage}, + types::{Field, FieldType, Operation, SourceDefinition}, + }, + test_util::create_runtime_and_spawn_connector_all_tables, + tokio, Connector, +}; #[tokio::test] async fn get_schema_from_deltalake() { - let path = "src/connectors/delta_lake/test/data/delta-0.8.0"; + let path = "src/test/data/delta-0.8.0"; let table_name = "test_table"; let delta_table = DeltaTable { path: path.to_string(), @@ -24,12 +26,12 @@ async fn get_schema_from_deltalake() { assert_eq!(&field.name, "value"); assert_eq!(field.typ, FieldType::Int); assert!(field.nullable); - assert_eq!(field.source, Dynamic); + assert_eq!(field.source, SourceDefinition::Dynamic); } #[test] fn read_deltalake() { - let path = "src/connectors/delta_lake/test/data/delta-0.8.0"; + let path = "src/test/data/delta-0.8.0"; let table_name = "test_table"; let delta_table = DeltaTable { path: path.to_string(), diff --git a/dozer-ingestion/src/connectors/delta_lake/test/mod.rs b/dozer-ingestion/deltalake/src/test/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/delta_lake/test/mod.rs rename to dozer-ingestion/deltalake/src/test/mod.rs diff --git a/dozer-ingestion/dozer/Cargo.toml b/dozer-ingestion/dozer/Cargo.toml new file mode 100644 index 0000000000..237783c18e --- /dev/null +++ b/dozer-ingestion/dozer/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "dozer-ingestion-dozer" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +dozer-log = { path = "../../dozer-log" } diff --git a/dozer-ingestion/src/connectors/dozer/connector.rs b/dozer-ingestion/dozer/src/connector.rs similarity index 72% rename from dozer-ingestion/src/connectors/dozer/connector.rs rename to 
dozer-ingestion/dozer/src/connector.rs index 872611ac18..56bed4bd1d 100644 --- a/dozer-ingestion/src/connectors/dozer/connector.rs +++ b/dozer-ingestion/dozer/src/connector.rs @@ -1,37 +1,36 @@ use std::collections::HashMap; -use dozer_log::{ - reader::{LogReaderBuilder, LogReaderOptions}, - replication::LogOperation, -}; -use dozer_types::{ - errors::types::DeserializationError, - grpc_types::internal::{ - internal_pipeline_service_client::InternalPipelineServiceClient, - DescribeApplicationResponse, +use dozer_ingestion_connector::{ + dozer_types::{ + self, + errors::internal::BoxedError, + grpc_types::internal::{ + internal_pipeline_service_client::InternalPipelineServiceClient, + DescribeApplicationResponse, + }, + models::ingestion_types::{ + default_buffer_size, default_log_batch_size, default_timeout, IngestionMessage, + NestedDozerConfig, NestedDozerLogOptions, + }, + node::OpIdentifier, + serde_json, + tonic::{async_trait, transport::Channel}, + types::{FieldType, Operation, Record, Schema}, }, - models::ingestion_types::{ - default_buffer_size, default_log_batch_size, default_timeout, IngestionMessage, - NestedDozerConfig, NestedDozerLogOptions, + tokio::{ + sync::mpsc::{channel, Sender}, + task::JoinSet, }, - node::OpIdentifier, - serde_json, - tonic::{async_trait, transport::Channel}, - types::{Operation, Record, Schema}, + utils::warn_dropped_primary_index, + CdcType, Connector, Ingestor, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, + TableToIngest, }; -use tokio::{ - sync::mpsc::{channel, Sender}, - task::JoinSet, +use dozer_log::{ + reader::{LogReaderBuilder, LogReaderOptions}, + replication::LogOperation, }; -use crate::{ - connectors::{ - warn_dropped_primary_index, CdcType, Connector, SourceSchema, SourceSchemaResult, - TableIdentifier, TableInfo, TableToIngest, - }, - errors::{ConnectorError, NestedDozerConnectorError}, - ingestion::Ingestor, -}; +use crate::NestedDozerConnectorError; #[derive(Debug)] pub struct NestedDozerConnector { @@ -40,20 +39,20 @@ pub struct NestedDozerConnector { #[async_trait] impl Connector for NestedDozerConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { let _ = self.get_client().await?; Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { let mut tables = vec![]; let response = self.describe_application().await?; for (endpoint, _) in response.endpoints { @@ -63,7 +62,7 @@ impl Connector for NestedDozerConnector { Ok(tables) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { self.validate_connection().await?; for table in tables { @@ -75,17 +74,11 @@ impl Connector for NestedDozerConnector { async fn list_columns( &self, _tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let mut tables = vec![]; let response = self.describe_application().await?; for (endpoint, build) in response.endpoints { - let schema: SourceSchema = serde_json::from_str(&build.schema_string).map_err(|e| { - ConnectorError::TypeError( - dozer_types::errors::types::TypeError::DeserializationError( - DeserializationError::Json(e), - ), - ) - })?; + let schema: SourceSchema = serde_json::from_str(&build.schema_string)?; 
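            // Illustrative sketch (hypothetical helper name; assumes `BoxedError` is a boxed
            // `std::error::Error + Send + Sync`): because the connector trait now returns `BoxedError`,
            // a library error such as `serde_json::Error` can propagate through `?` via the standard
            // blanket `From` impl, which is what lets the line above drop the old
            // `ConnectorError::TypeError` wrapping:
            //
            //     fn parse_source_schema(raw: &str) -> Result<SourceSchema, BoxedError> {
            //         // `serde_json::Error` implements `std::error::Error`, so `?` boxes it automatically.
            //         Ok(serde_json::from_str(raw)?)
            //     }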
tables.push(TableInfo { schema: None, name: endpoint, @@ -103,23 +96,28 @@ impl Connector for NestedDozerConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let mut schemas = vec![]; for table_info in table_infos { let log_reader = self.get_reader_builder(table_info.name.clone()).await; - schemas.push(log_reader.and_then(|log_reader| { - let source_primary_index_len = log_reader.schema.schema.primary_index.len(); - let source_schema = log_reader.schema.schema; - let schema_mapper = SchemaMapper::new(source_schema, &table_info.column_names)?; - let mut schema = schema_mapper.map()?; - if schema.primary_index.len() < source_primary_index_len { - schema.primary_index.clear(); - warn_dropped_primary_index(&table_info.name); - } - - Ok(SourceSchema::new(schema, CdcType::FullChanges)) - })); + schemas.push( + log_reader + .and_then(|log_reader| { + let source_primary_index_len = log_reader.schema.schema.primary_index.len(); + let source_schema = log_reader.schema.schema; + let schema_mapper = + SchemaMapper::new(source_schema, &table_info.column_names)?; + let mut schema = schema_mapper.map()?; + if schema.primary_index.len() < source_primary_index_len { + schema.primary_index.clear(); + warn_dropped_primary_index(&table_info.name); + } + + Ok(SourceSchema::new(schema, CdcType::FullChanges)) + }) + .map_err(Into::into), + ); } Ok(schemas) @@ -129,7 +127,7 @@ impl Connector for NestedDozerConnector { &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { let mut joinset = JoinSet::new(); let (sender, mut receiver) = channel(100); @@ -141,12 +139,10 @@ impl Connector for NestedDozerConnector { let ingestor = ingestor.clone(); joinset.spawn(async move { while let Some(message) = receiver.recv().await { - ingestor - .handle_message(message) - .await - .map_err(|_| ConnectorError::IngestorError)?; + // If the other side of the channel is dropped, return Ok. 
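                // (Previously this send failure was surfaced as `ConnectorError::IngestorError` and
                // then swallowed by the caller in `connector_source.rs`; ignoring it here keeps that
                // behavior without needing a dedicated error variant.)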
+ let _ = ingestor.handle_message(message).await; } - Ok(()) + Ok::<_, NestedDozerConnectorError>(()) }); while let Some(result) = joinset.join_next().await { @@ -163,25 +159,24 @@ impl NestedDozerConnector { pub fn new(config: NestedDozerConfig) -> Self { Self { config } } - async fn get_client(&self) -> Result, ConnectorError> { + async fn get_client( + &self, + ) -> Result, NestedDozerConnectorError> { let client = InternalPipelineServiceClient::connect(self.config.url.clone()) .await - .map_err(|e| { - ConnectorError::NestedDozerConnectorError( - NestedDozerConnectorError::ConnectionError(self.config.url.clone(), e), - ) - })?; + .map_err(|e| NestedDozerConnectorError::ConnectionError(self.config.url.clone(), e))?; Ok(client) } - async fn describe_application(&self) -> Result { + async fn describe_application( + &self, + ) -> Result { let mut client = self.get_client().await?; - let response = client.describe_application(()).await.map_err(|e| { - ConnectorError::NestedDozerConnectorError( - NestedDozerConnectorError::DescribeEndpointsError(e), - ) - })?; + let response = client + .describe_application(()) + .await + .map_err(NestedDozerConnectorError::DescribeEndpointsError)?; Ok(response.into_inner()) } @@ -198,7 +193,7 @@ impl NestedDozerConnector { async fn get_reader_builder( &self, endpoint: String, - ) -> Result { + ) -> Result { let log_options = Self::get_log_options(endpoint, self.config.log_options.clone()); let log_reader_builder = LogReaderBuilder::new(self.config.url.clone(), log_options) .await @@ -212,7 +207,7 @@ async fn read_table( table_info: TableToIngest, reader_builder: LogReaderBuilder, sender: Sender, -) -> Result<(), ConnectorError> { +) -> Result<(), NestedDozerConnectorError> { let starting_point = table_info .checkpoint .map(|checkpoint| checkpoint.seq_in_tx + 1) @@ -221,9 +216,10 @@ async fn read_table( let schema = reader.schema.schema.clone(); let map = SchemaMapper::new(schema, &table_info.column_names)?; loop { - let op_and_pos = reader.read_one().await.map_err(|e| { - ConnectorError::NestedDozerConnectorError(NestedDozerConnectorError::ReaderError(e)) - })?; + let op_and_pos = reader + .read_one() + .await + .map_err(NestedDozerConnectorError::ReaderError)?; let op = match op_and_pos.op { LogOperation::Op { op } => op, LogOperation::Commit { .. } | LogOperation::SnapshottingDone { .. 
} => continue, @@ -271,7 +267,7 @@ impl SchemaMapper { fn new( source_schema: dozer_types::types::Schema, columns: &[String], - ) -> Result { + ) -> Result { let mut our_fields = Vec::with_capacity(columns.len()); let upstream_fields: HashMap = source_schema .fields @@ -293,9 +289,7 @@ impl SchemaMapper { primary_index.push(i); } } else { - return Err(ConnectorError::NestedDozerConnectorError( - NestedDozerConnectorError::ColumnNotFound(column.to_owned()), - )); + return Err(NestedDozerConnectorError::ColumnNotFound(column.to_owned())); } } @@ -306,7 +300,7 @@ impl SchemaMapper { }) } - fn map(self) -> Result { + fn map(self) -> Result { let field_definitions = reorder(&self.source_schema.fields, &self.fields) .map(|mut field| { field.source = Default::default(); @@ -351,7 +345,7 @@ mod tests { fn map( source_schema: Schema, output_fields: &[&'static str], - ) -> Result { + ) -> Result { let mapper = SchemaMapper::new(source_schema, &columns(output_fields))?; mapper.map() diff --git a/dozer-ingestion/dozer/src/lib.rs b/dozer-ingestion/dozer/src/lib.rs new file mode 100644 index 0000000000..5620508c91 --- /dev/null +++ b/dozer-ingestion/dozer/src/lib.rs @@ -0,0 +1,25 @@ +mod connector; +pub use connector::NestedDozerConnector; +use dozer_ingestion_connector::dozer_types::{ + self, + thiserror::{self, Error}, +}; +use dozer_log::errors::{ReaderBuilderError, ReaderError}; + +#[derive(Error, Debug)] +enum NestedDozerConnectorError { + #[error("Failed to connect to upstream dozer at {0}: {1:?}")] + ConnectionError(String, #[source] dozer_types::tonic::transport::Error), + + #[error("Failed to query endpoints from upstream dozer app: {0}")] + DescribeEndpointsError(#[source] dozer_types::tonic::Status), + + #[error(transparent)] + ReaderError(#[from] ReaderError), + + #[error(transparent)] + ReaderBuilderError(#[from] ReaderBuilderError), + + #[error("Column {0} not found")] + ColumnNotFound(String), +} diff --git a/dozer-ingestion/ethereum/Cargo.toml b/dozer-ingestion/ethereum/Cargo.toml new file mode 100644 index 0000000000..a5241ccea4 --- /dev/null +++ b/dozer-ingestion/ethereum/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "dozer-ingestion-ethereum" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +web3 = "0.18.0" +hex-literal = "0.4.1" +dozer-tracing = { path = "../../dozer-tracing" } diff --git a/dozer-ingestion/src/connectors/ethereum/README.md b/dozer-ingestion/ethereum/src/README.md similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/README.md rename to dozer-ingestion/ethereum/src/README.md diff --git a/dozer-ingestion/src/connectors/ethereum/helper.rs b/dozer-ingestion/ethereum/src/helper.rs similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/helper.rs rename to dozer-ingestion/ethereum/src/helper.rs diff --git a/dozer-ingestion/src/connectors/ethereum/mod.rs b/dozer-ingestion/ethereum/src/lib.rs similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/mod.rs rename to dozer-ingestion/ethereum/src/lib.rs diff --git a/dozer-ingestion/src/connectors/ethereum/log/connector.rs b/dozer-ingestion/ethereum/src/log/connector.rs similarity index 82% rename from dozer-ingestion/src/connectors/ethereum/log/connector.rs rename to dozer-ingestion/ethereum/src/log/connector.rs index 71a539c5e1..46b04c00ca 100644 --- a/dozer-ingestion/src/connectors/ethereum/log/connector.rs +++ 
b/dozer-ingestion/ethereum/src/log/connector.rs @@ -1,20 +1,21 @@ use std::collections::HashMap; use std::{str::FromStr, sync::Arc}; -use crate::connectors::{ - table_name, CdcType, Connector, SourceSchema, SourceSchemaResult, TableIdentifier, - TableToIngest, -}; -use crate::ingestion::Ingestor; -use crate::{connectors::TableInfo, errors::ConnectorError}; -use dozer_types::models::ingestion_types::{EthFilter, EthLogConfig}; - -use dozer_types::log::warn; -use dozer_types::serde_json; -use dozer_types::tonic::async_trait; - use super::helper; use super::sender::{run, EthDetails}; +use dozer_ingestion_connector::utils::TableNotFound; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + errors::internal::BoxedError, + log::warn, + models::ingestion_types::{EthFilter, EthLogConfig}, + serde_json, + types::FieldType, + }, + CdcType, Connector, Ingestor, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, + TableToIngest, +}; use web3::ethabi::{Contract, Event}; use web3::types::{Address, BlockNumber, Filter, FilterBuilder, H256, U64}; @@ -124,25 +125,22 @@ impl EthLogConnector { #[async_trait] impl Connector for EthLogConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { // Return contract parsing error for contract in &self.config.contracts { - let res: Result = serde_json::from_str(&contract.abi); - if let Err(e) = res { - return Err(ConnectorError::map_serialization_error(e)); - } + serde_json::from_str(&contract.abi)?; } Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { let event_schema_names = helper::get_contract_event_schemas(&self.contracts) .into_iter() .map(|(name, _)| TableIdentifier::from_table_name(name)); @@ -151,14 +149,15 @@ impl Connector for EthLogConnector { Ok(result) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let existing_tables = self.list_tables().await?; for table in tables { if !existing_tables.contains(table) || table.schema.is_some() { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into()); } } Ok(()) @@ -167,7 +166,7 @@ impl Connector for EthLogConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let event_schemas = helper::get_contract_event_schemas(&self.contracts); let mut result = vec![]; for table in tables { @@ -188,10 +187,11 @@ impl Connector for EthLogConnector { .map(|field| field.name.clone()) .collect() } else { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into()); }; result.push(TableInfo { schema: table.schema, @@ -205,7 +205,7 @@ impl Connector for EthLogConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let mut schemas = vec![( ETH_LOGS_TABLE.to_string(), SourceSchema::new(helper::get_eth_schema(), CdcType::Nothing), @@ -223,10 +223,11 @@ impl Connector for 
EthLogConnector { warn!("TODO: filter columns"); result.push(Ok(schema.clone())); } else { - result.push(Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - )))); + result.push(Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into())); } } @@ -237,7 +238,7 @@ impl Connector for EthLogConnector { &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { // Start a new thread that interfaces with ETH node let wss_url = self.config.wss_url.to_owned(); let filter = self.config.filter.to_owned().unwrap_or_default(); diff --git a/dozer-ingestion/src/connectors/ethereum/log/helper.rs b/dozer-ingestion/ethereum/src/log/helper.rs similarity index 98% rename from dozer-ingestion/src/connectors/ethereum/log/helper.rs rename to dozer-ingestion/ethereum/src/log/helper.rs index 3681150d99..c4ae561cb2 100644 --- a/dozer-ingestion/src/connectors/ethereum/log/helper.rs +++ b/dozer-ingestion/ethereum/src/log/helper.rs @@ -1,15 +1,14 @@ -use dozer_types::log::error; -use dozer_types::types::{ - Field, FieldDefinition, FieldType, Operation, Record, Schema, SourceDefinition, -}; use std::collections::HashMap; use std::sync::Arc; +use dozer_ingestion_connector::dozer_types::log::error; +use dozer_ingestion_connector::dozer_types::types::{ + Field, FieldDefinition, FieldType, Operation, Record, Schema, SourceDefinition, +}; +use dozer_ingestion_connector::{CdcType, SourceSchema, TableToIngest}; use web3::ethabi::RawLog; use web3::types::Log; -use crate::connectors::{CdcType, SourceSchema, TableToIngest}; - use super::connector::{ContractTuple, ETH_LOGS_TABLE}; use super::sender::EthDetails; diff --git a/dozer-ingestion/ethereum/src/log/mod.rs b/dozer-ingestion/ethereum/src/log/mod.rs new file mode 100644 index 0000000000..84ee673377 --- /dev/null +++ b/dozer-ingestion/ethereum/src/log/mod.rs @@ -0,0 +1,17 @@ +mod connector; +mod helper; +mod sender; +pub use connector::EthLogConnector; +use dozer_ingestion_connector::dozer_types::thiserror::{self, Error}; + +#[cfg(test)] +mod tests; + +#[derive(Debug, Error)] +enum Error { + #[error("Failed fetching after {0} recursions")] + EthTooManyRecurisions(usize), + + #[error("Received empty message in connector")] + EmptyMessage, +} diff --git a/dozer-ingestion/src/connectors/ethereum/log/sender.rs b/dozer-ingestion/ethereum/src/log/sender.rs similarity index 80% rename from dozer-ingestion/src/connectors/ethereum/log/sender.rs rename to dozer-ingestion/ethereum/src/log/sender.rs index 446ab212b7..2e28270d69 100644 --- a/dozer-ingestion/src/connectors/ethereum/log/sender.rs +++ b/dozer-ingestion/ethereum/src/log/sender.rs @@ -2,23 +2,21 @@ use core::time; use std::collections::HashMap; use std::sync::Arc; -use crate::connectors::ethereum::log::connector::EthLogConnector; -use crate::ingestion::Ingestor; -use crate::{ - connectors::{ethereum::helper as conn_helper, TableToIngest}, - errors::ConnectorError, +use dozer_ingestion_connector::dozer_types::errors::internal::BoxedError; +use dozer_ingestion_connector::dozer_types::log::{debug, info, trace, warn}; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::{ + EthFilter, IngestionMessage, }; -use dozer_types::log::{debug, info, trace, warn}; -use dozer_types::models::ingestion_types::{EthFilter, IngestionMessage}; - -use futures::StreamExt; - -use futures::future::{BoxFuture, FutureExt}; - +use dozer_ingestion_connector::futures::future::BoxFuture; +use 
dozer_ingestion_connector::futures::{FutureExt, StreamExt}; +use dozer_ingestion_connector::{tokio, Ingestor, TableToIngest}; use web3::transports::WebSocket; use web3::types::{Log, H256}; use web3::Web3; +use crate::log::Error; +use crate::{helper as conn_helper, EthLogConnector}; + use super::connector::ContractTuple; use super::helper; @@ -57,19 +55,11 @@ impl<'a> EthDetails<'a> { } } -#[allow(unreachable_code)] -pub async fn run(details: Arc>) -> Result<(), ConnectorError> { - let client = conn_helper::get_wss_client(&details.wss_url) - .await - .map_err(ConnectorError::EthError)?; +pub async fn run(details: Arc>) -> Result<(), BoxedError> { + let client = conn_helper::get_wss_client(&details.wss_url).await?; // Get current block no. - let latest_block_no = client - .eth() - .block_number() - .await - .map_err(ConnectorError::EthError)? - .as_u64(); + let latest_block_no = client.eth().block_number().await?.as_u64(); let block_end = match details.filter.to_block { None => latest_block_no, @@ -120,8 +110,7 @@ pub async fn run(details: Arc>) -> Result<(), ConnectorError> { let filter = client .eth_filter() .create_logs_filter(EthLogConnector::build_filter(&filter)) - .await - .map_err(ConnectorError::EthError)?; + .await?; let stream = filter.stream(time::Duration::from_secs(1)); @@ -130,11 +119,9 @@ pub async fn run(details: Arc>) -> Result<(), ConnectorError> { loop { let msg = stream.next().await; - let msg = msg - .map_or(Err(ConnectorError::EmptyMessage), Ok)? - .map_err(ConnectorError::EthError)?; + let msg = msg.ok_or(Error::EmptyMessage)??; - process_log(details.clone(), msg).await?; + process_log(details.clone(), msg).await; } } else { info!("[{}] Reading reached block_to limit", details.conn_name); @@ -149,7 +136,7 @@ pub fn fetch_logs( block_end: u64, depth: usize, retries_left: usize, -) -> BoxFuture<'_, Result<(), ConnectorError>> { +) -> BoxFuture<'_, Result<(), BoxedError>> { let filter = details.filter.clone(); let depth_str = (0..depth) .map(|_| " ".to_string()) @@ -168,11 +155,11 @@ pub fn fetch_logs( process_log( details.clone(), msg, - ).await?; + ).await; } Ok(()) }, - Err(e) => match &e { + Err(e) => match e { web3::Error::Rpc(rpc_error) => { // Infura returns a RpcError if the no of records are more than 10000 // { code: ServerError(-32005), message: "query returned more than 10000 results", data: None } @@ -180,7 +167,7 @@ pub fn fetch_logs( if rpc_error.code.code() == -32005 { debug!("[{}] {} More than 10000 records, block_start: {},block_end: {}, depth: {}", details.conn_name, depth_str, block_start, block_end, depth); if depth > 100 { - Err(ConnectorError::EthTooManyRecurisions(depth)) + Err(Error::EthTooManyRecurisions(depth).into()) } else { let middle = (block_start + block_end) / 2; debug!("[{}] {} Splitting in two calls block_start: {}, middle: {}, block_end: {}", details.conn_name, depth_str,block_start, block_end, middle); @@ -206,12 +193,12 @@ pub fn fetch_logs( Ok(()) } } else { - Err(ConnectorError::EthError(e)) + Err(rpc_error.into()) } } e => { if retries_left == 0 { - Err(ConnectorError::EthError(e.to_owned())) + Err(e.into()) } else { warn!("[{}] Retrying to fetch logs", details.conn_name); fetch_logs(details, client, block_start, block_end, depth, retries_left - 1).await?; @@ -224,15 +211,13 @@ pub fn fetch_logs( .boxed() } -async fn process_log(details: Arc>, msg: Log) -> Result<(), ConnectorError> { +async fn process_log(details: Arc>, msg: Log) { // Filter pending logs. 
log.log_index is None for pending State - if msg.log_index.is_none() { - Ok(()) - } else { + if msg.log_index.is_some() { if let Some((table_index, op)) = helper::map_log_to_event(msg.to_owned(), details.clone()) { trace!("Writing log : {:?}", op); // Write eth_log record - details + if details .ingestor .handle_message(IngestionMessage::OperationEvent { table_index, @@ -240,7 +225,11 @@ async fn process_log(details: Arc>, msg: Log) -> Result<(), Conne id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving end is closed, exit + return; + } } else { trace!("Ignoring log : {:?}", msg); } @@ -250,19 +239,17 @@ async fn process_log(details: Arc>, msg: Log) -> Result<(), Conne let op = helper::decode_event(msg, details.contracts.to_owned(), details.tables.clone()); if let Some((table_index, op)) = op { trace!("Writing event : {:?}", op); - details + // if receiving end is closed, ignore + let _ = details .ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: None, }) - .await - .map_err(|_| ConnectorError::IngestorError)?; + .await; } else { trace!("Writing event : {:?}", op); } - - Ok(()) } } diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/Dockerfile b/dozer-ingestion/ethereum/src/log/tests/Dockerfile similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/log/tests/Dockerfile rename to dozer-ingestion/ethereum/src/log/tests/Dockerfile diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/connector.rs b/dozer-ingestion/ethereum/src/log/tests/connector.rs similarity index 89% rename from dozer-ingestion/src/connectors/ethereum/log/tests/connector.rs rename to dozer-ingestion/ethereum/src/log/tests/connector.rs index 18e45b9ee4..60951b6d17 100644 --- a/dozer-ingestion/src/connectors/ethereum/log/tests/connector.rs +++ b/dozer-ingestion/ethereum/src/log/tests/connector.rs @@ -1,8 +1,9 @@ -use dozer_types::types::{Field, Operation}; +use dozer_ingestion_connector::{ + dozer_types::types::{Field, Operation}, + test_util::create_test_runtime, +}; use hex_literal::hex; -use crate::test_util::create_test_runtime; - use super::helper::run_eth_sample; #[test] diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/contracts/CustomEvent.code b/dozer-ingestion/ethereum/src/log/tests/contracts/CustomEvent.code similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/log/tests/contracts/CustomEvent.code rename to dozer-ingestion/ethereum/src/log/tests/contracts/CustomEvent.code diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/contracts/CustomEvent.json b/dozer-ingestion/ethereum/src/log/tests/contracts/CustomEvent.json similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/log/tests/contracts/CustomEvent.json rename to dozer-ingestion/ethereum/src/log/tests/contracts/CustomEvent.json diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/contracts/CustomEvent.sol b/dozer-ingestion/ethereum/src/log/tests/contracts/CustomEvent.sol similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/log/tests/contracts/CustomEvent.sol rename to dozer-ingestion/ethereum/src/log/tests/contracts/CustomEvent.sol diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/docker-compose.yml b/dozer-ingestion/ethereum/src/log/tests/docker-compose.yml similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/log/tests/docker-compose.yml rename to dozer-ingestion/ethereum/src/log/tests/docker-compose.yml diff --git 
a/dozer-ingestion/src/connectors/ethereum/log/tests/helper.rs b/dozer-ingestion/ethereum/src/log/tests/helper.rs similarity index 88% rename from dozer-ingestion/src/connectors/ethereum/log/tests/helper.rs rename to dozer-ingestion/ethereum/src/log/tests/helper.rs index 382acb36df..72127a5152 100644 --- a/dozer-ingestion/src/connectors/ethereum/log/tests/helper.rs +++ b/dozer-ingestion/ethereum/src/log/tests/helper.rs @@ -1,27 +1,24 @@ use std::{sync::Arc, time::Duration}; -use crate::{ - connectors::{ - ethereum::{helper, EthLogConnector}, - Connector, TableInfo, +use dozer_ingestion_connector::{ + dozer_types::{ + errors::internal::BoxedError, + log::info, + models::ingestion_types::{EthContract, EthFilter, EthLogConfig, IngestionMessage}, + types::Operation, }, - errors::ConnectorError, test_util::spawn_connector, + tokio::runtime::Runtime, + Connector, TableInfo, }; - -use dozer_types::{ - log::info, - models::ingestion_types::{EthContract, EthFilter, EthLogConfig, IngestionMessage}, - types::Operation, -}; - -use tokio::runtime::Runtime; use web3::{ contract::{Contract, Options}, transports::WebSocket, types::H160, }; +use crate::{helper, EthLogConnector}; + pub async fn deploy_contract(wss_url: String, my_account: H160) -> Contract { let web3 = helper::get_wss_client(&wss_url).await.unwrap(); // Get the contract bytecode for instance from Solidity compiler @@ -48,7 +45,7 @@ pub async fn deploy_contract(wss_url: String, my_account: H160) -> Contract, -) -> Result<(EthLogConnector, Vec), ConnectorError> { +) -> Result<(EthLogConnector, Vec), BoxedError> { let address = format!("{:?}", contract.address()); let eth_connector = EthLogConnector::new( EthLogConfig { diff --git a/dozer-ingestion/src/connectors/ethereum/log/tests/mod.rs b/dozer-ingestion/ethereum/src/log/tests/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/log/tests/mod.rs rename to dozer-ingestion/ethereum/src/log/tests/mod.rs diff --git a/dozer-ingestion/src/connectors/ethereum/trace/connector.rs b/dozer-ingestion/ethereum/src/trace/connector.rs similarity index 76% rename from dozer-ingestion/src/connectors/ethereum/trace/connector.rs rename to dozer-ingestion/ethereum/src/trace/connector.rs index 45041ce412..63fd7c7b4e 100644 --- a/dozer-ingestion/src/connectors/ethereum/trace/connector.rs +++ b/dozer-ingestion/ethereum/src/trace/connector.rs @@ -1,14 +1,18 @@ -use super::super::helper as conn_helper; -use super::helper::{self, get_block_traces, map_trace_to_ops}; -use crate::connectors::{ - table_name, CdcType, Connector, SourceSchema, SourceSchemaResult, TableIdentifier, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + errors::internal::BoxedError, + log::{error, info, warn}, + models::ingestion_types::{default_batch_size, EthTraceConfig, IngestionMessage}, + types::FieldType, + }, + utils::TableNotFound, + CdcType, Connector, Ingestor, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, }; -use crate::{connectors::TableInfo, errors::ConnectorError, ingestion::Ingestor}; -use dozer_types::log::{error, info, warn}; -use dozer_types::models::ingestion_types::{default_batch_size, EthTraceConfig, IngestionMessage}; -use dozer_types::tonic::async_trait; +use super::super::helper as conn_helper; +use super::helper::{self, get_block_traces, map_trace_to_ops}; #[derive(Debug)] pub struct EthTraceConnector { @@ -26,30 +30,31 @@ impl EthTraceConnector { #[async_trait] impl Connector for EthTraceConnector { - fn types_mapping() -> Vec<(String, Option)> + 
fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { validate(&self.config).await } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { Ok(vec![TableIdentifier::from_table_name( ETH_TRACE_TABLE.to_string(), )]) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { for table in tables { if table.name != ETH_TRACE_TABLE || table.schema.is_some() { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into()); } } Ok(()) @@ -58,14 +63,15 @@ impl Connector for EthTraceConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let mut result = Vec::new(); for table in tables { if table.name != ETH_TRACE_TABLE || table.schema.is_some() { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into()); } let column_names = helper::get_trace_schema() .fields @@ -84,7 +90,7 @@ impl Connector for EthTraceConnector { async fn get_schemas( &self, _table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { warn!("TODO: respect table_infos"); Ok(vec![Ok(SourceSchema::new( helper::get_trace_schema(), @@ -96,18 +102,16 @@ impl Connector for EthTraceConnector { &self, ingestor: &Ingestor, _tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { let config = self.config.clone(); let conn_name = self.conn_name.clone(); run(ingestor, config, conn_name).await } } -pub async fn validate(config: &EthTraceConfig) -> Result<(), ConnectorError> { +pub async fn validate(config: &EthTraceConfig) -> Result<(), BoxedError> { // Check if transport can be initialized - let tuple = conn_helper::get_batch_http_client(&config.https_url) - .await - .map_err(ConnectorError::EthError)?; + let tuple = conn_helper::get_batch_http_client(&config.https_url).await?; // Check if debug API is available get_block_traces(tuple, (1000000, 1000005)).await?; @@ -119,10 +123,8 @@ pub async fn run( ingestor: &Ingestor, config: EthTraceConfig, conn_name: String, -) -> Result<(), ConnectorError> { - let client_tuple = conn_helper::get_batch_http_client(&config.https_url) - .await - .map_err(ConnectorError::EthError)?; +) -> Result<(), BoxedError> { + let client_tuple = conn_helper::get_batch_http_client(&config.https_url).await?; info!( "Starting Eth Trace connector: {} from block {}", @@ -134,7 +136,7 @@ pub async fn run( config.batch_size.unwrap_or_else(default_batch_size), ); - let mut errors: Vec = vec![]; + let mut errors: Vec = vec![]; for batch in batch_iter { for retry in 0..RETRIES { if retry >= RETRIES - 1 { @@ -149,14 +151,18 @@ pub async fn run( let ops = map_trace_to_ops(&result.result); for op in ops { - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index: 0, // We have only one table op, id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving end is closed, exit + return Ok(()); + } } } diff --git 
a/dozer-ingestion/src/connectors/ethereum/trace/helper.rs b/dozer-ingestion/ethereum/src/trace/helper.rs similarity index 85% rename from dozer-ingestion/src/connectors/ethereum/trace/helper.rs rename to dozer-ingestion/ethereum/src/trace/helper.rs index adbafcfa5a..1ae76db81d 100644 --- a/dozer-ingestion/src/connectors/ethereum/trace/helper.rs +++ b/dozer-ingestion/ethereum/src/trace/helper.rs @@ -1,24 +1,28 @@ -use dozer_types::log::{debug, error}; -use dozer_types::serde; -use dozer_types::serde_json::{self, json}; -use dozer_types::types::{Field, FieldType, Operation, Record, Schema, SourceDefinition}; -use serde::{Deserialize, Serialize}; - -use dozer_types::types::FieldDefinition; +use dozer_ingestion_connector::dozer_types::{ + errors::internal::BoxedError, + log::{debug, error}, + serde::{Deserialize, Serialize}, + serde_json::{self, json}, + types::{Field, FieldDefinition, FieldType, Operation, Record, Schema, SourceDefinition}, +}; use web3::transports::{Batch, Http}; use web3::types::{H160, U256}; use web3::{BatchTransport, Transport, Web3}; -use crate::errors::ConnectorError; - #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(crate = "self::serde", rename_all = "camelCase")] +#[serde( + crate = "dozer_ingestion_connector::dozer_types::serde", + rename_all = "camelCase" +)] pub struct TraceResult { pub result: Trace, } #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(crate = "self::serde", rename_all = "camelCase")] +#[serde( + crate = "dozer_ingestion_connector::dozer_types::serde", + rename_all = "camelCase" +)] pub struct Trace { #[serde(rename = "type")] pub type_field: String, @@ -35,7 +39,7 @@ pub struct Trace { pub async fn get_block_traces( tuple: (Web3>, Http), batch: (u64, u64), -) -> Result, ConnectorError> { +) -> Result, BoxedError> { debug_assert!(batch.0 < batch.1, "Batch start must be less than batch end"); let (client, transport) = tuple; let mut requests = vec![]; @@ -58,10 +62,7 @@ pub async fn get_block_traces( request_count += 1; } - let batch_results = transport - .send_batch(requests) - .await - .map_err(ConnectorError::EthError)?; + let batch_results = transport.send_batch(requests).await?; debug!( "Requests: {:?}, Results: {:?}", @@ -72,11 +73,10 @@ pub async fn get_block_traces( for (idx, res) in batch_results.iter().enumerate() { let res = res.clone().map_err(|e| { error!("Error getting trace: {:?}", e); - ConnectorError::EthError(e) + e })?; - let r: Vec = - serde_json::from_value(res).map_err(ConnectorError::map_serialization_error)?; + let r: Vec = serde_json::from_value(res)?; debug!("Idx: {} : Response: {:?}", idx, r); diff --git a/dozer-ingestion/src/connectors/ethereum/trace/mod.rs b/dozer-ingestion/ethereum/src/trace/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/ethereum/trace/mod.rs rename to dozer-ingestion/ethereum/src/trace/mod.rs diff --git a/dozer-ingestion/src/connectors/ethereum/trace/tests.rs b/dozer-ingestion/ethereum/src/trace/tests.rs similarity index 90% rename from dozer-ingestion/src/connectors/ethereum/trace/tests.rs rename to dozer-ingestion/ethereum/src/trace/tests.rs index 81f7952a39..0eff17ac9e 100644 --- a/dozer-ingestion/src/connectors/ethereum/trace/tests.rs +++ b/dozer-ingestion/ethereum/src/trace/tests.rs @@ -1,19 +1,17 @@ use std::{env, time::Duration}; -use dozer_types::{ - log::info, - models::ingestion_types::{EthTraceConfig, IngestionMessage}, - types::{Field, Operation}, -}; - -use crate::{ - connectors::{ - ethereum::{helper, 
trace::helper::get_block_traces, EthTraceConnector}, - Connector, +use dozer_ingestion_connector::{ + dozer_types::{ + log::info, + models::ingestion_types::{EthTraceConfig, IngestionMessage}, + types::{Field, Operation}, }, test_util::{create_test_runtime, spawn_connector}, + tokio, Connector, }; +use crate::{helper, trace::helper::get_block_traces, EthTraceConnector}; + use super::connector::BatchIterator; #[test] diff --git a/dozer-ingestion/grpc/Cargo.toml b/dozer-ingestion/grpc/Cargo.toml new file mode 100644 index 0000000000..84779cdf3f --- /dev/null +++ b/dozer-ingestion/grpc/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "dozer-ingestion-grpc" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +tower-http = { version = "0.4", features = ["full"] } +tonic-web = "0.10.2" +tonic-reflection = "0.10.0" diff --git a/dozer-ingestion/src/connectors/grpc/adapter/arrow.rs b/dozer-ingestion/grpc/src/adapter/arrow.rs similarity index 64% rename from dozer-ingestion/src/connectors/grpc/adapter/arrow.rs rename to dozer-ingestion/grpc/src/adapter/arrow.rs index 5fc2006c27..72ff83f150 100644 --- a/dozer-ingestion/src/connectors/grpc/adapter/arrow.rs +++ b/dozer-ingestion/grpc/src/adapter/arrow.rs @@ -1,28 +1,27 @@ use std::collections::HashMap; -use dozer_types::{ - arrow::datatypes::Schema as ArrowSchema, - arrow::{self, ipc::reader::StreamReader}, - arrow_types::{self, from_arrow::map_record_batch_to_dozer_records}, - bytes::{Buf, Bytes}, - grpc_types::ingest::IngestArrowRequest, - models::ingestion_types::IngestionMessage, - serde::{Deserialize, Serialize}, - serde_json, - tonic::async_trait, - types::{Operation, Record, Schema}, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + arrow::datatypes::Schema as ArrowSchema, + arrow::{self, ipc::reader::StreamReader}, + arrow_types::{self, from_arrow::map_record_batch_to_dozer_records}, + bytes::{Buf, Bytes}, + grpc_types::ingest::IngestArrowRequest, + models::ingestion_types::IngestionMessage, + serde::{Deserialize, Serialize}, + serde_json, + types::{Operation, Record, Schema}, + }, + CdcType, Ingestor, SourceSchema, }; -use crate::{ - connectors::{CdcType, SourceSchema}, - errors::{ConnectorError, ObjectStoreConnectorError}, - ingestion::Ingestor, -}; +use crate::Error; use super::{GrpcIngestMessage, IngestAdapter}; #[derive(Clone, Serialize, Deserialize, Debug)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct GrpcArrowSchema { pub name: String, pub schema: arrow::datatypes::Schema, @@ -43,16 +42,14 @@ impl ArrowAdapter { #[allow(clippy::type_complexity)] fn parse_schemas( schemas_str: &str, - ) -> Result<(Vec<(String, SourceSchema)>, HashMap), ConnectorError> { - let grpc_schemas: Vec = - serde_json::from_str(schemas_str).map_err(ConnectorError::map_serialization_error)?; + ) -> Result<(Vec<(String, SourceSchema)>, HashMap), Error> { + let grpc_schemas: Vec = serde_json::from_str(schemas_str)?; let mut schemas = vec![]; let mut arrow_schemas = HashMap::new(); for (id, grpc_schema) in grpc_schemas.into_iter().enumerate() { - let schema = arrow_types::from_arrow::map_schema_to_dozer(&grpc_schema.schema) - .map_err(|e| ConnectorError::InternalError(Box::new(e)))?; + let schema = arrow_types::from_arrow::map_schema_to_dozer(&grpc_schema.schema)?; arrow_schemas.insert(id as u32, grpc_schema.schema); @@ -67,7 
+64,7 @@ impl ArrowAdapter { #[async_trait] impl IngestAdapter for ArrowAdapter { - fn new(schemas_str: String) -> Result { + fn new(schemas_str: String) -> Result { let (schemas, arrow_schemas) = Self::parse_schemas(&schemas_str)?; let schema_map = schemas.into_iter().collect(); Ok(Self { @@ -88,11 +85,9 @@ impl IngestAdapter for ArrowAdapter { table_index: usize, msg: GrpcIngestMessage, ingestor: &'static Ingestor, - ) -> Result<(), ConnectorError> { + ) -> Result<(), Error> { match msg { - GrpcIngestMessage::Default(_) => Err(ConnectorError::InitializationError( - "Wrong message format!".to_string(), - )), + GrpcIngestMessage::Default(_) => Err(Error::CannotHandleDefaultMessage), GrpcIngestMessage::Arrow(msg) => { handle_message(table_index, msg, &self.schema_map, ingestor).await } @@ -105,12 +100,10 @@ pub async fn handle_message( req: IngestArrowRequest, schema_map: &HashMap, ingestor: &'static Ingestor, -) -> Result<(), ConnectorError> { +) -> Result<(), Error> { let schema = &schema_map .get(&req.schema_name) - .ok_or_else(|| { - ConnectorError::InitializationError(format!("schema not found: {}", req.schema_name)) - })? + .ok_or_else(|| Error::SchemaNotFound(req.schema_name.clone()))? .schema; let records = map_record_batch(req, schema)?; @@ -118,30 +111,30 @@ pub async fn handle_message( for r in records { let op = Operation::Insert { new: r }; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving end is closed, then we can just ignore the message + return Ok(()); + } } Ok(()) } -fn map_record_batch( - req: IngestArrowRequest, - schema: &Schema, -) -> Result, ConnectorError> { +fn map_record_batch(req: IngestArrowRequest, schema: &Schema) -> Result, Error> { let mut buf = Bytes::from(req.records).reader(); // read stream back let mut reader = StreamReader::try_new(&mut buf, None)?; let mut records = Vec::new(); while let Some(Ok(batch)) = reader.next() { - let b_recs = map_record_batch_to_dozer_records(batch, schema) - .map_err(ObjectStoreConnectorError::FromArrowError)?; + let b_recs = map_record_batch_to_dozer_records(batch, schema)?; records.extend(b_recs); } diff --git a/dozer-ingestion/src/connectors/grpc/adapter/default.rs b/dozer-ingestion/grpc/src/adapter/default.rs similarity index 76% rename from dozer-ingestion/src/connectors/grpc/adapter/default.rs rename to dozer-ingestion/grpc/src/adapter/default.rs index 60a757dfda..0367bb8a79 100644 --- a/dozer-ingestion/src/connectors/grpc/adapter/default.rs +++ b/dozer-ingestion/grpc/src/adapter/default.rs @@ -1,34 +1,32 @@ -use dozer_types::serde_json; -use dozer_types::tonic::async_trait; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + self, chrono, + grpc_types::{self, ingest::IngestRequest}, + json_types::prost_to_json_value, + models::ingestion_types::IngestionMessage, + ordered_float::OrderedFloat, + rust_decimal::Decimal, + serde_json, + types::{Field, Operation, Record, Schema}, + }, + Ingestor, SourceSchema, +}; -use crate::{connectors::SourceSchema, errors::ConnectorError}; +use crate::Error; use super::{GrpcIngestMessage, IngestAdapter}; -use dozer_types::{ - chrono, - models::ingestion_types::IngestionMessage, - ordered_float::OrderedFloat, - types::{Field, Operation, Record, Schema}, -}; use std::collections::HashMap; -use crate::ingestion::Ingestor; - -use dozer_types::grpc_types; -use dozer_types::grpc_types::ingest::IngestRequest; -use 
dozer_types::json_types::prost_to_json_value; -use dozer_types::rust_decimal::Decimal; - #[derive(Debug)] pub struct DefaultAdapter { schema_map: HashMap, } impl DefaultAdapter { - fn parse_schemas(schemas_str: &str) -> Result, ConnectorError> { - let schemas: HashMap = - serde_json::from_str(schemas_str).map_err(ConnectorError::map_serialization_error)?; + fn parse_schemas(schemas_str: &str) -> Result, Error> { + let schemas: HashMap = serde_json::from_str(schemas_str)?; Ok(schemas) } @@ -36,7 +34,7 @@ impl DefaultAdapter { #[async_trait] impl IngestAdapter for DefaultAdapter { - fn new(schemas_str: String) -> Result { + fn new(schemas_str: String) -> Result { let schema_map = Self::parse_schemas(&schemas_str)?; Ok(Self { schema_map }) } @@ -52,14 +50,12 @@ impl IngestAdapter for DefaultAdapter { table_index: usize, msg: GrpcIngestMessage, ingestor: &'static Ingestor, - ) -> Result<(), ConnectorError> { + ) -> Result<(), Error> { match msg { GrpcIngestMessage::Default(msg) => { handle_message(table_index, msg, &self.schema_map, ingestor).await } - GrpcIngestMessage::Arrow(_) => Err(ConnectorError::InitializationError( - "Wrong message format!".to_string(), - )), + GrpcIngestMessage::Arrow(_) => Err(Error::CannotHandleArrowMessage), } } } @@ -69,12 +65,10 @@ pub async fn handle_message( req: IngestRequest, schema_map: &HashMap, ingestor: &'static Ingestor, -) -> Result<(), ConnectorError> { +) -> Result<(), Error> { let schema = &schema_map .get(&req.schema_name) - .ok_or_else(|| { - ConnectorError::InitializationError(format!("schema not found: {}", req.schema_name)) - })? + .ok_or_else(|| Error::SchemaNotFound(req.schema_name.clone()))? .schema; let op = match req.typ() { @@ -89,24 +83,26 @@ pub async fn handle_message( new: map_record(req.new.unwrap(), schema)?, }, }; - ingestor + // If receiving end is closed, then we can just ignore the message + let _ = ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: None, }) - .await - .map_err(|_| ConnectorError::IngestorError) + .await; + Ok(()) } -fn map_record(rec: grpc_types::types::Record, schema: &Schema) -> Result { +fn map_record(rec: grpc_types::types::Record, schema: &Schema) -> Result { let mut values: Vec = vec![]; let values_count = rec.values.len(); let schema_fields_count = schema.fields.len(); if values_count != schema_fields_count { - return Err(ConnectorError::InitializationError( - format!("record is not properly formed. 
Length of values {values_count} does not match schema: {schema_fields_count} "), - )); + return Err(Error::NumFieldsMismatch { + values_count, + schema_fields_count, + }); } for (idx, v) in rec.values.into_iter().enumerate() { @@ -179,9 +175,11 @@ fn map_record(rec: grpc_types::types::Record, schema: &Schema) -> Result Ok(dozer_types::types::Field::Null), - (a, b) => Err(ConnectorError::InitializationError(format!( - "data is not valid at index: {idx}, Type: {a:?}, Expected Type: {b}" - ))), + (a, b) => Err(Error::FieldTypeMismatch { + index: idx, + value: a, + field_type: b, + }), }); values.push(val.unwrap_or(Ok(dozer_types::types::Field::Null))?); } diff --git a/dozer-ingestion/src/connectors/grpc/adapter/mod.rs b/dozer-ingestion/grpc/src/adapter/mod.rs similarity index 72% rename from dozer-ingestion/src/connectors/grpc/adapter/mod.rs rename to dozer-ingestion/grpc/src/adapter/mod.rs index 3e08229549..07f4262e69 100644 --- a/dozer-ingestion/src/connectors/grpc/adapter/mod.rs +++ b/dozer-ingestion/grpc/src/adapter/mod.rs @@ -1,30 +1,32 @@ use std::fmt::Debug; -use dozer_types::grpc_types::ingest::{IngestArrowRequest, IngestRequest}; -use dozer_types::tonic::async_trait; - -use crate::{connectors::SourceSchema, errors::ConnectorError, ingestion::Ingestor}; - mod default; mod arrow; pub use arrow::ArrowAdapter; pub use default::DefaultAdapter; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::grpc_types::ingest::{IngestArrowRequest, IngestRequest}, + Ingestor, SourceSchema, +}; + +use crate::Error; #[async_trait] pub trait IngestAdapter: Debug where Self: Send + Sync + 'static + Sized, { - fn new(schemas_str: String) -> Result; + fn new(schemas_str: String) -> Result; fn get_schemas(&self) -> Vec<(String, SourceSchema)>; async fn handle_message( &self, table_index: usize, msg: GrpcIngestMessage, ingestor: &'static Ingestor, - ) -> Result<(), ConnectorError>; + ) -> Result<(), Error>; } pub enum GrpcIngestMessage { @@ -41,7 +43,7 @@ impl GrpcIngestor where T: IngestAdapter, { - pub fn new(schemas_str: String) -> Result { + pub fn new(schemas_str: String) -> Result { let adapter = T::new(schemas_str)?; Ok(Self { adapter }) } @@ -51,7 +53,7 @@ impl GrpcIngestor where A: IngestAdapter, { - pub fn get_schemas(&self) -> Result, ConnectorError> { + pub fn get_schemas(&self) -> Result, Error> { Ok(self.adapter.get_schemas()) } @@ -60,7 +62,7 @@ where table_index: usize, msg: GrpcIngestMessage, ingestor: &'static Ingestor, - ) -> Result<(), ConnectorError> { + ) -> Result<(), Error> { self.adapter .handle_message(table_index, msg, ingestor) .await diff --git a/dozer-ingestion/src/connectors/grpc/connector.rs b/dozer-ingestion/grpc/src/connector.rs similarity index 71% rename from dozer-ingestion/src/connectors/grpc/connector.rs rename to dozer-ingestion/grpc/src/connector.rs index d2383bd920..5975a9ac13 100644 --- a/dozer-ingestion/src/connectors/grpc/connector.rs +++ b/dozer-ingestion/grpc/src/connector.rs @@ -1,20 +1,26 @@ use std::fmt::Debug; use std::path::Path; +use crate::Error; + use super::adapter::{GrpcIngestor, IngestAdapter}; use super::ingest::IngestorServiceImpl; -use crate::connectors::{ - table_name, Connector, SourceSchema, SourceSchemaResult, TableIdentifier, TableToIngest, -}; -use crate::{connectors::TableInfo, errors::ConnectorError, ingestion::Ingestor}; -use dozer_types::grpc_types::ingest::ingest_service_server::IngestServiceServer; -use dozer_types::log::{info, warn}; -use dozer_types::models::ingestion_types::{ - default_ingest_host, default_ingest_port, 
GrpcConfig, GrpcConfigSchemas, +use dozer_ingestion_connector::utils::TableNotFound; +use dozer_ingestion_connector::{ + async_trait, dozer_types, + dozer_types::{ + errors::internal::BoxedError, + grpc_types::ingest::ingest_service_server::IngestServiceServer, + log::{info, warn}, + models::ingestion_types::{ + default_ingest_host, default_ingest_port, GrpcConfig, GrpcConfigSchemas, + }, + tonic::transport::Server, + tracing::Level, + }, + Connector, Ingestor, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, + TableToIngest, }; -use dozer_types::tonic::async_trait; -use dozer_types::tonic::transport::Server; -use dozer_types::tracing::Level; use tower_http::trace::{self, TraceLayer}; #[derive(Debug)] @@ -31,15 +37,15 @@ impl GrpcConnector where T: IngestAdapter, { - pub fn new(name: String, config: GrpcConfig) -> Result { - Ok(Self { + pub fn new(name: String, config: GrpcConfig) -> Self { + Self { name, config, _phantom: std::marker::PhantomData, - }) + } } - pub fn parse_config(config: &GrpcConfig) -> Result + pub fn parse_config(config: &GrpcConfig) -> Result where T: IngestAdapter, { @@ -49,7 +55,7 @@ where GrpcConfigSchemas::Path(path) => { let path = Path::new(path); std::fs::read_to_string(path) - .map_err(|e| ConnectorError::InitializationError(e.to_string()))? + .map_err(|e| Error::CannotReadFile(path.to_path_buf(), e))? } }; @@ -60,13 +66,11 @@ where &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), Error> { let host = self.config.host.clone().unwrap_or_else(default_ingest_host); let port = self.config.port.unwrap_or_else(default_ingest_port); - let addr = format!("{host}:{port}").parse().map_err(|e| { - ConnectorError::InitializationError(format!("Failed to parse address: {e}")) - })?; + let addr = format!("{host}:{port}").parse()?; let schemas_str = Self::parse_config(&self.config)?; let adapter = GrpcIngestor::::new(schemas_str)?; @@ -96,12 +100,12 @@ where .add_service(reflection_service) .serve(addr) .await - .map_err(|e| ConnectorError::InitializationError(e.to_string())) + .map_err(Into::into) } } impl GrpcConnector { - fn get_all_schemas(&self) -> Result, ConnectorError> { + fn get_all_schemas(&self) -> Result, Error> { let schemas_str = Self::parse_config(&self.config)?; let adapter = GrpcIngestor::::new(schemas_str)?; adapter.get_schemas() @@ -120,11 +124,12 @@ where todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { - self.get_all_schemas().map(|_| ()) + async fn validate_connection(&self) -> Result<(), BoxedError> { + self.get_all_schemas()?; + Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { Ok(self .get_all_schemas()? 
.into_iter() @@ -132,17 +137,18 @@ where .collect()) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let schemas = self.get_all_schemas()?; for table in tables { if !schemas .iter() .any(|(name, _)| name == &table.name && table.schema.is_none()) { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into()); } } Ok(()) @@ -151,7 +157,7 @@ where async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let schemas = self.get_all_schemas()?; let mut result = vec![]; for table in tables { @@ -171,10 +177,11 @@ where column_names, }) } else { - return Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - ))); + return Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into()); } } Ok(result) @@ -183,7 +190,7 @@ where async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let schemas_str = Self::parse_config(&self.config)?; let adapter = GrpcIngestor::::new(schemas_str)?; @@ -198,10 +205,11 @@ where warn!("TODO: filter columns"); result.push(Ok(schema.clone())); } else { - result.push(Err(ConnectorError::TableNotFound(table_name( - table.schema.as_deref(), - &table.name, - )))); + result.push(Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + } + .into())); } } @@ -212,7 +220,7 @@ where &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { - self.serve(ingestor, tables).await + ) -> Result<(), BoxedError> { + self.serve(ingestor, tables).await.map_err(Into::into) } } diff --git a/dozer-ingestion/src/connectors/grpc/ingest.rs b/dozer-ingestion/grpc/src/ingest.rs similarity index 94% rename from dozer-ingestion/src/connectors/grpc/ingest.rs rename to dozer-ingestion/grpc/src/ingest.rs index 6222f1325e..91697d39c5 100644 --- a/dozer-ingestion/src/connectors/grpc/ingest.rs +++ b/dozer-ingestion/grpc/src/ingest.rs @@ -1,15 +1,17 @@ use std::sync::Arc; -use dozer_types::tonic::{self, Streaming}; -use dozer_types::{grpc_types::ingest::IngestArrowRequest, log::error}; -use futures::StreamExt; - -use dozer_types::grpc_types::ingest::{ - ingest_service_server::IngestService, IngestRequest, IngestResponse, +use dozer_ingestion_connector::{ + dozer_types::{ + grpc_types::ingest::{ + ingest_service_server::IngestService, IngestArrowRequest, IngestRequest, IngestResponse, + }, + log::error, + tonic::{self, Streaming}, + }, + futures::StreamExt, + tokio, Ingestor, TableToIngest, }; -use crate::{connectors::TableToIngest, ingestion::Ingestor}; - use super::adapter::{GrpcIngestMessage, GrpcIngestor, IngestAdapter}; pub struct IngestorServiceImpl diff --git a/dozer-ingestion/grpc/src/lib.rs b/dozer-ingestion/grpc/src/lib.rs new file mode 100644 index 0000000000..044fd3e32e --- /dev/null +++ b/dozer-ingestion/grpc/src/lib.rs @@ -0,0 +1,51 @@ +pub mod connector; +mod ingest; + +mod adapter; +use std::{net::AddrParseError, path::PathBuf}; + +pub use adapter::{ArrowAdapter, DefaultAdapter, GrpcIngestMessage, GrpcIngestor, IngestAdapter}; +use dozer_ingestion_connector::dozer_types::{ + arrow::error::ArrowError, + arrow_types::errors::FromArrowError, + grpc_types, serde_json, + thiserror::{self, Error}, + 
tonic::transport, + types::FieldType, +}; + +#[cfg(test)] +mod tests; + +#[derive(Debug, Error)] +pub enum Error { + #[error("cannot read file {0:?}: {1}")] + CannotReadFile(PathBuf, #[source] std::io::Error), + #[error("serde json error: {0}")] + SerdeJson(#[from] serde_json::Error), + #[error("from arrow error: {0}")] + FromArrow(#[from] FromArrowError), + #[error("arrow error: {0}")] + Arrow(#[from] ArrowError), + #[error("cannot parse address: {0}")] + AddrParse(#[from] AddrParseError), + #[error("tonic transport error: {0}")] + TonicTransport(#[from] transport::Error), + #[error("default adapter cannot handle arrow ingest message")] + CannotHandleArrowMessage, + #[error("arrow adapter cannot handle default ingest message")] + CannotHandleDefaultMessage, + #[error("schema not found: {0}")] + SchemaNotFound(String), + #[error("record is not properly formed. Length of values {values_count} does not match schema: {schema_fields_count}")] + NumFieldsMismatch { + values_count: usize, + schema_fields_count: usize, + }, + #[error("data is not valid at index: {index}, Type: {value:?}, Expected Type: {field_type}")] + FieldTypeMismatch { + index: usize, + value: grpc_types::types::value::Value, + field_type: FieldType, + }, +} diff --git a/dozer-ingestion/src/connectors/grpc/tests.rs b/dozer-ingestion/grpc/src/tests.rs similarity index 92% rename from dozer-ingestion/src/connectors/grpc/tests.rs rename to dozer-ingestion/grpc/src/tests.rs index 8c8bccca8d..a82771be05 100644 --- a/dozer-ingestion/src/connectors/grpc/tests.rs +++ b/dozer-ingestion/grpc/src/tests.rs @@ -1,35 +1,31 @@ use std::collections::HashMap; use std::{sync::Arc, thread}; -use crate::connectors::grpc::{ArrowAdapter, DefaultAdapter}; -use crate::ingestion::IngestionIterator; -use crate::test_util::{create_test_runtime, spawn_connector_all_tables}; -use dozer_types::arrow_types::to_arrow::DOZER_SCHEMA_KEY; -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::{ +use dozer_ingestion_connector::dozer_types::{ arrow::array::{Int32Array, StringArray}, + arrow::{datatypes as arrow_types, record_batch::RecordBatch}, + arrow_types::from_arrow::serialize_record_batch, + arrow_types::to_arrow::DOZER_SCHEMA_KEY, grpc_types::{ ingest::{ingest_service_client::IngestServiceClient, IngestArrowRequest, IngestRequest}, types, }, + json_types::JsonValue as dozer_JsonValue, + models::ingestion_types::IngestionMessage, + models::ingestion_types::{GrpcConfig, GrpcConfigSchemas}, + ordered_float::OrderedFloat, serde_json, + serde_json::json, serde_json::Value, + tonic::transport::Channel, types::Operation, + types::{FieldDefinition, FieldType, Schema as DozerSchema, SourceDefinition}, }; -use dozer_types::{ - arrow::{datatypes as arrow_types, record_batch::RecordBatch}, - arrow_types::from_arrow::serialize_record_batch, -}; +use dozer_ingestion_connector::test_util::{create_test_runtime, spawn_connector_all_tables}; +use dozer_ingestion_connector::tokio::runtime::Runtime; +use dozer_ingestion_connector::{dozer_types, tokio, IngestionIterator}; -use dozer_types::json_types::JsonValue as dozer_JsonValue; -use dozer_types::ordered_float::OrderedFloat; -use dozer_types::tonic::transport::Channel; -use dozer_types::types::{FieldDefinition, FieldType, Schema as DozerSchema, SourceDefinition}; -use dozer_types::{ - models::ingestion_types::{GrpcConfig, GrpcConfigSchemas}, - serde_json::json, -}; -use tokio::runtime::Runtime; +use crate::{ArrowAdapter, DefaultAdapter}; use super::connector::GrpcConnector; use super::IngestAdapter; 
@@ -48,8 +44,7 @@ fn ingest_grpc( port: Some(port), host: None, }, - ) - .unwrap(); + ); let (iterator, _) = spawn_connector_all_tables(runtime.clone(), grpc_connector); diff --git a/dozer-ingestion/kafka/Cargo.toml b/dozer-ingestion/kafka/Cargo.toml new file mode 100644 index 0000000000..75ca3937a5 --- /dev/null +++ b/dozer-ingestion/kafka/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "dozer-ingestion-kafka" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +rdkafka = "0.34.0" +schema_registry_converter = { version = "3.1.0", features = ["avro"] } +base64 = "0.21.0" diff --git a/dozer-ingestion/src/connectors/kafka/connector.rs b/dozer-ingestion/kafka/src/connector.rs similarity index 69% rename from dozer-ingestion/src/connectors/kafka/connector.rs rename to dozer-ingestion/kafka/src/connector.rs index a4f6495844..7be348671e 100644 --- a/dozer-ingestion/src/connectors/kafka/connector.rs +++ b/dozer-ingestion/kafka/src/connector.rs @@ -1,23 +1,24 @@ +use dozer_ingestion_connector::async_trait; +use dozer_ingestion_connector::dozer_types::errors::internal::BoxedError; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::KafkaConfig; +use dozer_ingestion_connector::dozer_types::types::FieldType; +use dozer_ingestion_connector::Connector; +use dozer_ingestion_connector::Ingestor; +use dozer_ingestion_connector::SourceSchema; +use dozer_ingestion_connector::SourceSchemaResult; +use dozer_ingestion_connector::TableIdentifier; +use dozer_ingestion_connector::TableInfo; +use dozer_ingestion_connector::TableToIngest; use rdkafka::consumer::BaseConsumer; -use rdkafka::ClientConfig; - -use crate::connectors::{ - Connector, SourceSchema, SourceSchemaResult, TableIdentifier, TableToIngest, -}; -use crate::ingestion::Ingestor; -use crate::{connectors::TableInfo, errors::ConnectorError}; -use dozer_types::models::ingestion_types::KafkaConfig; use rdkafka::consumer::Consumer; use rdkafka::util::Timeout; +use rdkafka::ClientConfig; -use dozer_types::tonic::async_trait; - -use crate::connectors::kafka::no_schema_registry_basic::NoSchemaRegistryBasic; - -use crate::connectors::kafka::schema_registry_basic::SchemaRegistryBasic; -use crate::connectors::kafka::stream_consumer::StreamConsumer; -use crate::connectors::kafka::stream_consumer_basic::StreamConsumerBasic; -use crate::errors::KafkaError::KafkaConnectionError; +use crate::no_schema_registry_basic::NoSchemaRegistryBasic; +use crate::schema_registry_basic::SchemaRegistryBasic; +use crate::stream_consumer::StreamConsumer; +use crate::stream_consumer_basic::StreamConsumerBasic; +use crate::KafkaError; #[derive(Debug)] pub struct KafkaConnector { @@ -32,7 +33,7 @@ impl KafkaConnector { async fn get_schemas_impl( &self, table_names: Option<&[String]>, - ) -> Result, ConnectorError> { + ) -> Result, KafkaError> { if let Some(schema_registry_url) = &self.config.schema_registry_url { SchemaRegistryBasic::get_schema(table_names, schema_registry_url.clone()).await } else { @@ -43,27 +44,25 @@ impl KafkaConnector { #[async_trait] impl Connector for KafkaConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + 
async fn list_tables(&self) -> Result, BoxedError> { let consumer = ClientConfig::new() .set("bootstrap.servers", &self.config.broker.clone()) .set("api.version.request", "true") - .create::() - .map_err(KafkaConnectionError)?; + .create::()?; - let metadata = consumer - .fetch_metadata(None, Timeout::After(std::time::Duration::new(60, 0))) - .map_err(KafkaConnectionError)?; + let metadata = + consumer.fetch_metadata(None, Timeout::After(std::time::Duration::new(60, 0)))?; let topics = metadata.topics(); let mut tables = vec![]; @@ -77,18 +76,19 @@ impl Connector for KafkaConnector { Ok(tables) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let table_names = tables .iter() .map(|table| table.name.clone()) .collect::>(); - self.get_schemas_impl(Some(&table_names)).await.map(|_| ()) + self.get_schemas_impl(Some(&table_names)).await?; + Ok(()) } async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let table_names = tables .iter() .map(|table| table.name.clone()) @@ -114,7 +114,7 @@ impl Connector for KafkaConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let table_names = table_infos .iter() .map(|table| table.name.clone()) @@ -131,9 +131,11 @@ impl Connector for KafkaConnector { &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { let broker = self.config.broker.to_owned(); - run(broker, tables, ingestor, &self.config.schema_registry_url).await + run(broker, tables, ingestor, &self.config.schema_registry_url) + .await + .map_err(Into::into) } } @@ -142,7 +144,7 @@ async fn run( tables: Vec, ingestor: &Ingestor, schema_registry_url: &Option, -) -> Result<(), ConnectorError> { +) -> Result<(), KafkaError> { let mut client_config = ClientConfig::new(); client_config .set("bootstrap.servers", broker) diff --git a/dozer-ingestion/src/connectors/kafka/debezium/mapper.rs b/dozer-ingestion/kafka/src/debezium/mapper.rs similarity index 93% rename from dozer-ingestion/src/connectors/kafka/debezium/mapper.rs rename to dozer-ingestion/kafka/src/debezium/mapper.rs index 1f49913037..6826d9460b 100644 --- a/dozer-ingestion/src/connectors/kafka/debezium/mapper.rs +++ b/dozer-ingestion/kafka/src/debezium/mapper.rs @@ -1,12 +1,13 @@ -use crate::errors::KafkaSchemaError; -use crate::errors::KafkaSchemaError::{BinaryDecodeError, FieldNotFound, TypeNotSupported}; use base64::{engine, Engine}; +use dozer_ingestion_connector::dozer_types::{ + serde_json::Value, + types::{Field, Schema}, +}; +use std::collections::HashMap; -use crate::connectors::kafka::debezium::stream_consumer::DebeziumSchemaStruct; +use crate::KafkaSchemaError; -use dozer_types::serde_json::Value; -use dozer_types::types::{Field, Schema}; -use std::collections::HashMap; +use super::stream_consumer::DebeziumSchemaStruct; // fn convert_decimal(value: &str, scale: u32) -> Result { // let decoded_value = engine::general_purpose::STANDARD @@ -41,7 +42,7 @@ fn convert_value(value: Value, schema: &DebeziumSchemaStruct) -> Result value @@ -50,9 +51,11 @@ fn convert_value(value: Value, schema: &DebeziumSchemaStruct) -> Result value .as_bool() .map_or(Ok(Field::Null), |s| Ok(Field::from(s))), - _ => Err(TypeNotSupported(typ)), + _ => Err(KafkaSchemaError::TypeNotSupported(typ)), }, - _ => Err(TypeNotSupported("Unexpected value 
type".to_string())), + _ => Err(KafkaSchemaError::TypeNotSupported( + "Unexpected value type".to_string(), + )), } // Some(name) => { // match name.as_str() { @@ -132,7 +135,7 @@ pub fn convert_value_to_schema( Some(field_value) => { let schema_struct = fields_map .get(&f.name) - .ok_or_else(|| FieldNotFound(f.name.clone()))?; + .ok_or_else(|| KafkaSchemaError::FieldNotFound(f.name.clone()))?; convert_value(field_value, schema_struct) } }) @@ -141,16 +144,13 @@ pub fn convert_value_to_schema( #[cfg(test)] mod tests { - use crate::connectors::kafka::debezium::mapper::{convert_value, convert_value_to_schema}; - - use crate::connectors::kafka::debezium::stream_consumer::DebeziumSchemaStruct; - use crate::errors::KafkaSchemaError::TypeNotSupported; - use base64::{engine, Engine}; - use dozer_types::chrono::NaiveDateTime; + use dozer_ingestion_connector::dozer_types::{ + chrono::NaiveDateTime, + serde_json::Map, + types::{FieldDefinition, FieldType, SourceDefinition}, + }; - use dozer_types::serde_json::{Map, Value}; - use dozer_types::types::{Field, FieldDefinition, FieldType, Schema, SourceDefinition}; - use std::collections::HashMap; + use super::*; #[macro_export] macro_rules! test_conversion_debezium { @@ -278,7 +278,7 @@ mod tests { "Unknown type value", "Unknown type", None, - TypeNotSupported("Unknown type".to_string()), + KafkaSchemaError::TypeNotSupported("Unknown type".to_string()), None ); // test_conversion_debezium_error!( diff --git a/dozer-ingestion/src/connectors/kafka/debezium/mod.rs b/dozer-ingestion/kafka/src/debezium/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/kafka/debezium/mod.rs rename to dozer-ingestion/kafka/src/debezium/mod.rs diff --git a/dozer-ingestion/src/connectors/kafka/debezium/no_schema_registry.rs b/dozer-ingestion/kafka/src/debezium/no_schema_registry.rs similarity index 63% rename from dozer-ingestion/src/connectors/kafka/debezium/no_schema_registry.rs rename to dozer-ingestion/kafka/src/debezium/no_schema_registry.rs index 3f8fdd0425..dc5e2b528b 100644 --- a/dozer-ingestion/src/connectors/kafka/debezium/no_schema_registry.rs +++ b/dozer-ingestion/kafka/src/debezium/no_schema_registry.rs @@ -1,15 +1,14 @@ -use crate::connectors::kafka::debezium::schema::map_schema; -use crate::connectors::kafka::debezium::stream_consumer::DebeziumMessage; -use crate::connectors::SourceSchema; -use crate::errors::KafkaError::{BytesConvertError, JsonDecodeError, KafkaConnectionError}; -use crate::errors::{ConnectorError, KafkaError, KafkaStreamError}; -use dozer_types::serde_json; +use dozer_ingestion_connector::dozer_types::serde_json; +use dozer_ingestion_connector::{CdcType, SourceSchema}; use rdkafka::config::RDKafkaLogLevel; +use rdkafka::consumer::stream_consumer::StreamConsumer as RdkafkaStreamConsumer; use rdkafka::consumer::{Consumer, DefaultConsumerContext}; use rdkafka::{ClientConfig, Message}; -use crate::connectors::CdcType::FullChanges; -use rdkafka::consumer::stream_consumer::StreamConsumer as RdkafkaStreamConsumer; +use crate::{KafkaError, KafkaStreamError}; + +use super::schema::map_schema; +use super::stream_consumer::DebeziumMessage; pub struct NoSchemaRegistry {} @@ -17,7 +16,7 @@ impl NoSchemaRegistry { pub async fn get_schema( table_names: Option<&[String]>, broker: String, - ) -> Result, ConnectorError> { + ) -> Result, KafkaError> { let mut schemas = vec![]; match table_names { None => {} @@ -31,10 +30,9 @@ impl NoSchemaRegistry { .set("session.timeout.ms", "6000") .set("enable.auto.commit", "true") 
.set_log_level(RDKafkaLogLevel::Debug) - .create_with_context(context) - .map_err(KafkaConnectionError)?; + .create_with_context(context)?; - con.subscribe(&[table]).map_err(KafkaConnectionError)?; + con.subscribe(&[table])?; let m = con.recv().await.map_err(|e| { KafkaError::KafkaStreamError(KafkaStreamError::PollingError(e)) @@ -42,20 +40,19 @@ impl NoSchemaRegistry { if let (Some(message), Some(key)) = (m.payload(), m.key()) { let value_struct: DebeziumMessage = serde_json::from_str( - std::str::from_utf8(message).map_err(BytesConvertError)?, + std::str::from_utf8(message).map_err(KafkaError::BytesConvertError)?, ) - .map_err(JsonDecodeError)?; + .map_err(KafkaError::JsonDecodeError)?; let key_struct: DebeziumMessage = serde_json::from_str( - std::str::from_utf8(key).map_err(BytesConvertError)?, + std::str::from_utf8(key).map_err(KafkaError::BytesConvertError)?, ) - .map_err(JsonDecodeError)?; + .map_err(KafkaError::JsonDecodeError)?; let (mapped_schema, _fields_map) = - map_schema(&value_struct.schema, &key_struct.schema).map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })?; + map_schema(&value_struct.schema, &key_struct.schema) + .map_err(KafkaError::KafkaSchemaError)?; - schemas.push(SourceSchema::new(mapped_schema, FullChanges)); + schemas.push(SourceSchema::new(mapped_schema, CdcType::FullChanges)); } } } diff --git a/dozer-ingestion/src/connectors/kafka/debezium/schema.rs b/dozer-ingestion/kafka/src/debezium/schema.rs similarity index 89% rename from dozer-ingestion/src/connectors/kafka/debezium/schema.rs rename to dozer-ingestion/kafka/src/debezium/schema.rs index e7dc27d6d0..7a801921ab 100644 --- a/dozer-ingestion/src/connectors/kafka/debezium/schema.rs +++ b/dozer-ingestion/kafka/src/debezium/schema.rs @@ -1,11 +1,13 @@ -use dozer_types::serde_json::Value; use std::collections::HashMap; -use crate::connectors::kafka::debezium::stream_consumer::DebeziumSchemaStruct; +use dozer_ingestion_connector::dozer_types::{ + serde_json::Value, + types::{FieldDefinition, FieldType, Schema, SourceDefinition}, +}; -use crate::errors::KafkaSchemaError; -use crate::errors::KafkaSchemaError::{SchemaDefinitionNotFound, TypeNotSupported}; -use dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}; +use crate::KafkaSchemaError; + +use super::stream_consumer::DebeziumSchemaStruct; // Reference: https://debezium.io/documentation/reference/0.9/connectors/postgresql.html pub fn map_type(schema: &DebeziumSchemaStruct) -> Result { @@ -17,9 +19,11 @@ pub fn map_type(schema: &DebeziumSchemaStruct) -> Result Ok(FieldType::Binary), "float" | "float32" | "float64" | "double" => Ok(FieldType::Float), "boolean" => Ok(FieldType::Boolean), - _ => Err(TypeNotSupported(typ)), + _ => Err(KafkaSchemaError::TypeNotSupported(typ)), }, - _ => Err(TypeNotSupported("Unexpected value type".to_string())), + _ => Err(KafkaSchemaError::TypeNotSupported( + "Unexpected value type".to_string(), + )), }, Some(name) => match name.as_str() { "io.debezium.time.MicroTime" @@ -32,7 +36,7 @@ pub fn map_type(schema: &DebeziumSchemaStruct) -> Result Ok(FieldType::Json), - _ => Err(TypeNotSupported(name)), + _ => Err(KafkaSchemaError::TypeNotSupported(name)), }, } } @@ -47,7 +51,7 @@ pub fn map_schema( }; match &schema.fields { - None => Err(SchemaDefinitionNotFound), + None => Err(KafkaSchemaError::SchemaDefinitionNotFound), Some(fields) => { let new_schema_struct = fields.iter().find(|f| { if let Some(val) = f.field.clone() { @@ -91,7 +95,7 @@ pub fn map_schema( fields_schema_map, )) } 
else { - Err(SchemaDefinitionNotFound) + Err(KafkaSchemaError::SchemaDefinitionNotFound) } } } @@ -99,12 +103,7 @@ pub fn map_schema( #[cfg(test)] mod tests { - use crate::connectors::kafka::debezium::schema::{map_schema, map_type}; - use crate::connectors::kafka::debezium::stream_consumer::DebeziumSchemaStruct; - use crate::errors::KafkaSchemaError::SchemaDefinitionNotFound; - use crate::errors::KafkaSchemaError::TypeNotSupported; - use dozer_types::serde_json::Value; - use dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}; + use super::*; #[test] fn test_it_fails_when_schema_empty() { @@ -129,7 +128,7 @@ mod tests { }; let actual_error = map_schema(&schema, &key_schema).unwrap_err(); - assert_eq!(actual_error, SchemaDefinitionNotFound); + assert_eq!(actual_error, KafkaSchemaError::SchemaDefinitionNotFound); } #[test] @@ -275,7 +274,7 @@ mod tests { test_map_type!( "not found", None, - Err(TypeNotSupported("not found".to_string())) + Err(KafkaSchemaError::TypeNotSupported("not found".to_string())) ); test_map_type!( "int8", @@ -300,7 +299,9 @@ mod tests { test_map_type!( "string", Some("not existing".to_string()), - Err(TypeNotSupported("not existing".to_string())) + Err(KafkaSchemaError::TypeNotSupported( + "not existing".to_string() + )) ); } } diff --git a/dozer-ingestion/src/connectors/kafka/debezium/schema_registry.rs b/dozer-ingestion/kafka/src/debezium/schema_registry.rs similarity index 81% rename from dozer-ingestion/src/connectors/kafka/debezium/schema_registry.rs rename to dozer-ingestion/kafka/src/debezium/schema_registry.rs index fc2c6f89f3..ab5bfbe362 100644 --- a/dozer-ingestion/src/connectors/kafka/debezium/schema_registry.rs +++ b/dozer-ingestion/kafka/src/debezium/schema_registry.rs @@ -1,18 +1,18 @@ -#![allow(clippy::type_complexity)] +use std::collections::HashMap; -use crate::connectors::kafka::debezium::schema::map_type; -use crate::connectors::kafka::debezium::stream_consumer::DebeziumSchemaStruct; -use crate::connectors::{CdcType, SourceSchema}; -use crate::errors::KafkaError::{JsonDecodeError, SchemaRegistryFetchError}; -use crate::errors::KafkaSchemaError::TypeNotSupported; -use crate::errors::{ConnectorError, KafkaError, KafkaSchemaError}; -use dozer_types::log::error; -use dozer_types::serde_json; -use dozer_types::serde_json::Value; -use dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}; +use dozer_ingestion_connector::dozer_types::log::error; +use dozer_ingestion_connector::dozer_types::serde_json::{self, Value}; +use dozer_ingestion_connector::dozer_types::types::{ + FieldDefinition, FieldType, Schema, SourceDefinition, +}; +use dozer_ingestion_connector::{tokio, CdcType, SourceSchema}; use schema_registry_converter::async_impl::schema_registry::SrSettings; use schema_registry_converter::schema_registry_common::SubjectNameStrategy; -use std::collections::HashMap; + +use crate::{KafkaError, KafkaSchemaError}; + +use super::schema::map_type; +use super::stream_consumer::DebeziumSchemaStruct; pub struct SchemaRegistry {} @@ -46,7 +46,7 @@ impl SchemaRegistry { } } - Err(TypeNotSupported("Array".to_string())) + Err(KafkaSchemaError::TypeNotSupported("Array".to_string())) } Value::Object(obj) => SchemaRegistry::map_typ(&DebeziumSchemaStruct { r#type: obj.get("type").unwrap().clone(), @@ -57,7 +57,9 @@ impl SchemaRegistry { version: None, parameters: None, }), - _ => Err(TypeNotSupported("Unexpected value".to_string())), + _ => Err(KafkaSchemaError::TypeNotSupported( + "Unexpected value".to_string(), + )), } } @@ 
-80,17 +82,18 @@ impl SchemaRegistry { tokio::time::sleep(RETRY_INTERVAL).await; continue; } - Err(err) => return Err(SchemaRegistryFetchError(err)), + Err(err) => return Err(KafkaError::SchemaRegistryFetchError(err)), } }; - serde_json::from_str::(&schema_result.schema).map_err(JsonDecodeError) + serde_json::from_str::(&schema_result.schema) + .map_err(KafkaError::JsonDecodeError) } pub async fn get_schema( table_names: Option<&[String]>, schema_registry_url: String, - ) -> Result, ConnectorError> { + ) -> Result, KafkaError> { let sr_settings = SrSettings::new(schema_registry_url); match table_names { None => Ok(vec![]), @@ -114,13 +117,12 @@ impl SchemaRegistry { let mut fields_schema_map: HashMap = HashMap::new(); - let defined_fields: Result, ConnectorError> = fields + let defined_fields: Result, KafkaError> = fields .iter() .enumerate() .map(|(idx, f)| { - let (typ, nullable) = Self::map_typ(f).map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })?; + let (typ, nullable) = + Self::map_typ(f).map_err(KafkaError::KafkaSchemaError)?; let name = f.name.clone().unwrap(); if pk_fields.contains(&name) { pk_keys_indexes.push(idx); diff --git a/dozer-ingestion/src/connectors/kafka/debezium/stream_consumer.rs b/dozer-ingestion/kafka/src/debezium/stream_consumer.rs similarity index 68% rename from dozer-ingestion/src/connectors/kafka/debezium/stream_consumer.rs rename to dozer-ingestion/kafka/src/debezium/stream_consumer.rs index a8ec21188d..ba907de845 100644 --- a/dozer-ingestion/src/connectors/kafka/debezium/stream_consumer.rs +++ b/dozer-ingestion/kafka/src/debezium/stream_consumer.rs @@ -1,25 +1,24 @@ -use crate::connectors::kafka::debezium::mapper::convert_value_to_schema; -use crate::connectors::kafka::debezium::schema::map_schema; -use crate::connectors::kafka::stream_consumer::StreamConsumer; -use crate::connectors::kafka::stream_consumer_helper::{ - is_network_failure, OffsetsMap, StreamConsumerHelper, +use crate::debezium::mapper::convert_value_to_schema; +use crate::debezium::schema::map_schema; +use crate::stream_consumer::StreamConsumer; +use crate::stream_consumer_helper::{is_network_failure, OffsetsMap, StreamConsumerHelper}; +use crate::{KafkaError, KafkaStreamError}; + +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + models::ingestion_types::IngestionMessage, + serde::{Deserialize, Serialize}, + serde_json, + serde_json::Value, + types::{Operation, Record}, + }, + Ingestor, TableToIngest, }; -use crate::errors::KafkaError::{BytesConvertError, JsonDecodeError}; -use crate::errors::{ConnectorError, KafkaError, KafkaStreamError}; -use crate::ingestion::Ingestor; -use dozer_types::models::ingestion_types::IngestionMessage; - -use dozer_types::serde::{Deserialize, Serialize}; -use dozer_types::serde_json; -use dozer_types::serde_json::Value; -use dozer_types::types::{Operation, Record}; - -use crate::connectors::TableToIngest; -use dozer_types::tonic::async_trait; use rdkafka::{ClientConfig, Message}; #[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] #[serde(untagged)] pub enum DebeziumFieldType { I8(i8), @@ -35,7 +34,7 @@ pub enum DebeziumFieldType { } #[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct DebeziumField { pub r#type: String, pub optional: bool, @@ -44,7 +43,7 @@ pub struct DebeziumField { } #[derive(Debug, 
Serialize, Deserialize, PartialEq, Eq, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct DebeziumSchemaParameters { pub scale: Option, #[serde(rename(deserialize = "connect.decimal.precision"))] @@ -52,7 +51,7 @@ pub struct DebeziumSchemaParameters { } #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct DebeziumSchemaStruct { pub r#type: Value, pub fields: Option>, @@ -64,7 +63,7 @@ pub struct DebeziumSchemaStruct { } #[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct DebeziumPayload { pub before: Option, pub after: Option, @@ -72,7 +71,7 @@ pub struct DebeziumPayload { } #[derive(Debug, Serialize, Deserialize)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct DebeziumMessage { pub schema: DebeziumSchemaStruct, pub payload: DebeziumPayload, @@ -91,7 +90,7 @@ impl StreamConsumer for DebeziumStreamConsumer { ingestor: &Ingestor, tables: Vec, _schema_registry_url: &Option, - ) -> Result<(), ConnectorError> { + ) -> Result<(), KafkaError> { let topics: Vec<&str> = tables .iter() .map(|t| { @@ -115,15 +114,17 @@ impl StreamConsumer for DebeziumStreamConsumer { StreamConsumerHelper::update_offsets(&mut offsets, &m); if let (Some(message), Some(key)) = (m.payload(), m.key()) { - let mut value_struct: DebeziumMessage = - serde_json::from_str(std::str::from_utf8(message).map_err(BytesConvertError)?) - .map_err(JsonDecodeError)?; - let key_struct: DebeziumMessage = - serde_json::from_str(std::str::from_utf8(key).map_err(BytesConvertError)?) - .map_err(JsonDecodeError)?; + let mut value_struct: DebeziumMessage = serde_json::from_str( + std::str::from_utf8(message).map_err(KafkaError::BytesConvertError)?, + ) + .map_err(KafkaError::JsonDecodeError)?; + let key_struct: DebeziumMessage = serde_json::from_str( + std::str::from_utf8(key).map_err(KafkaError::BytesConvertError)?, + ) + .map_err(KafkaError::JsonDecodeError)?; let (schema, fields_map) = map_schema(&value_struct.schema, &key_struct.schema) - .map_err(|e| ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)))?; + .map_err(KafkaError::KafkaSchemaError)?; // When update happens before is null. 
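The comment here and the one that follows describe how Debezium's before/after row images determine the emitted operation; as a minimal standalone illustration of that mapping (with `Op` and `map_event` as hypothetical, simplified stand-ins rather than types taken from this patch):

#[derive(Debug)]
enum Op {
    Insert { new: String },
    Update { old: String, new: String },
    Delete { old: String },
}

// Debezium ships `before`/`after` row images; which images are present decides the operation.
// A primary-key change arrives as two events: a delete of the old row, then an insert of the new one.
fn map_event(before: Option<String>, after: Option<String>) -> Option<Op> {
    match (after, before) {
        (Some(new), Some(old)) => Some(Op::Update { old, new }),
        (Some(new), None) => Some(Op::Insert { new }),
        (None, Some(old)) => Some(Op::Delete { old }),
        (None, None) => None,
    }
}

fn main() {
    // Only an `after` image: treated as an insert.
    assert!(matches!(map_event(None, Some("row".into())), Some(Op::Insert { .. })));
    // Only a `before` image: treated as a delete.
    assert!(matches!(map_event(Some("row".into()), None), Some(Op::Delete { .. })));
}

The match a few lines further down in this hunk has the same shape, turning each before/after pair into an Update, Delete, or Insert operation handed to the ingestor.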
// If PK value changes, then debezium creates two events - delete and insert @@ -136,15 +137,11 @@ impl StreamConsumer for DebeziumStreamConsumer { match (value_struct.payload.after, value_struct.payload.before) { (Some(new_payload), Some(old_payload)) => { let new = convert_value_to_schema(new_payload, &schema, &fields_map) - .map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })?; + .map_err(KafkaError::KafkaSchemaError)?; let old = convert_value_to_schema(old_payload, &schema, &fields_map) - .map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })?; + .map_err(KafkaError::KafkaSchemaError)?; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index: 0, op: Operation::Update { @@ -160,15 +157,17 @@ impl StreamConsumer for DebeziumStreamConsumer { id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving side is closed, we should stop the stream + return Ok(()); + } } (None, Some(old_payload)) => { let old = convert_value_to_schema(old_payload, &schema, &fields_map) - .map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })?; + .map_err(KafkaError::KafkaSchemaError)?; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index: 0, op: Operation::Delete { @@ -180,15 +179,17 @@ impl StreamConsumer for DebeziumStreamConsumer { id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving side is closed, we should stop the stream + return Ok(()); + } } (Some(new_payload), None) => { let new = convert_value_to_schema(new_payload, &schema, &fields_map) - .map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })?; + .map_err(KafkaError::KafkaSchemaError)?; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index: 0, op: Operation::Insert { @@ -200,7 +201,11 @@ impl StreamConsumer for DebeziumStreamConsumer { id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving side is closed, we should stop the stream + return Ok(()); + } } (None, None) => {} } diff --git a/dozer-ingestion/kafka/src/lib.rs b/dozer-ingestion/kafka/src/lib.rs new file mode 100644 index 0000000000..d7e9212ab8 --- /dev/null +++ b/dozer-ingestion/kafka/src/lib.rs @@ -0,0 +1,92 @@ +use std::str::Utf8Error; + +use base64::DecodeError; +use dozer_ingestion_connector::dozer_types::{ + rust_decimal, serde_json, + thiserror::{self, Error}, +}; +use schema_registry_converter::error::SRCError; + +pub mod connector; +pub mod debezium; +pub mod no_schema_registry_basic; +pub mod schema_registry_basic; +pub mod stream_consumer; +pub mod stream_consumer_basic; +mod stream_consumer_helper; +#[cfg(any(test, feature = "debezium_bench"))] +pub mod test_utils; + +#[derive(Error, Debug)] +pub enum KafkaError { + #[error(transparent)] + KafkaSchemaError(#[from] KafkaSchemaError), + + #[error("Connection error. Error: {0}")] + KafkaConnectionError(#[from] rdkafka::error::KafkaError), + + #[error("JSON decode error. Error: {0}")] + JsonDecodeError(#[source] serde_json::Error), + + #[error("Bytes convert error")] + BytesConvertError(#[source] Utf8Error), + + #[error(transparent)] + KafkaStreamError(#[from] KafkaStreamError), + + #[error("Schema registry fetch failed. 
Error: {0}")] + SchemaRegistryFetchError(#[source] SRCError), + + #[error("Topic not defined")] + TopicNotDefined, +} + +#[derive(Error, Debug)] +pub enum KafkaStreamError { + #[error("Consume commit error")] + ConsumeCommitError(#[source] rdkafka::error::KafkaError), + + #[error("Message consume error")] + MessageConsumeError(#[source] rdkafka::error::KafkaError), + + #[error("Polling error")] + PollingError(#[source] rdkafka::error::KafkaError), +} + +#[derive(Error, Debug, PartialEq)] +pub enum KafkaSchemaError { + #[error("Schema definition not found")] + SchemaDefinitionNotFound, + + #[error("Unsupported \"{0}\" type")] + TypeNotSupported(String), + + #[error("Field \"{0}\" not found")] + FieldNotFound(String), + + #[error("Binary decode error")] + BinaryDecodeError(#[source] DecodeError), + + #[error("Scale not found")] + ScaleNotFound, + + #[error("Scale is invalid")] + ScaleIsInvalid, + + #[error("Decimal convert error")] + DecimalConvertError(#[source] rust_decimal::Error), + + #[error("Invalid date")] + InvalidDateError, + + #[error("Invalid json: {0}")] + InvalidJsonError(String), + + // #[error("Invalid time")] + // InvalidTimeError, + #[error("Invalid timestamp")] + InvalidTimestampError, +} + +#[cfg(test)] +mod tests; diff --git a/dozer-ingestion/src/connectors/kafka/no_schema_registry_basic.rs b/dozer-ingestion/kafka/src/no_schema_registry_basic.rs similarity index 82% rename from dozer-ingestion/src/connectors/kafka/no_schema_registry_basic.rs rename to dozer-ingestion/kafka/src/no_schema_registry_basic.rs index 56836ca3cd..890020a4f2 100644 --- a/dozer-ingestion/src/connectors/kafka/no_schema_registry_basic.rs +++ b/dozer-ingestion/kafka/src/no_schema_registry_basic.rs @@ -1,10 +1,9 @@ -#![allow(clippy::type_complexity)] +use dozer_ingestion_connector::{ + dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}, + CdcType, SourceSchema, +}; -use crate::connectors::{CdcType, SourceSchema}; - -use crate::errors::ConnectorError; - -use dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}; +use crate::KafkaError; pub struct NoSchemaRegistryBasic {} @@ -31,7 +30,7 @@ impl NoSchemaRegistryBasic { SourceSchema::new(schema, CdcType::FullChanges) } - pub fn get_schema(table_names: Option<&[String]>) -> Result, ConnectorError> { + pub fn get_schema(table_names: Option<&[String]>) -> Result, KafkaError> { let mut schemas = vec![]; if let Some(tables) = table_names { for _ in 0..tables.len() { diff --git a/dozer-ingestion/src/connectors/kafka/schema_registry_basic.rs b/dozer-ingestion/kafka/src/schema_registry_basic.rs similarity index 78% rename from dozer-ingestion/src/connectors/kafka/schema_registry_basic.rs rename to dozer-ingestion/kafka/src/schema_registry_basic.rs index 153be94885..d07243c549 100644 --- a/dozer-ingestion/src/connectors/kafka/schema_registry_basic.rs +++ b/dozer-ingestion/kafka/src/schema_registry_basic.rs @@ -1,23 +1,24 @@ #![allow(clippy::type_complexity)] -use crate::connectors::kafka::debezium::stream_consumer::DebeziumSchemaStruct; -use crate::connectors::{CdcType, SourceSchema}; - -use crate::errors::{ConnectorError, KafkaError}; - -use dozer_types::types::{FieldDefinition, Schema, SourceDefinition}; - -use crate::connectors::kafka::debezium::schema_registry::SchemaRegistry; +use dozer_ingestion_connector::{ + dozer_types::types::{FieldDefinition, Schema, SourceDefinition}, + CdcType, SourceSchema, +}; use schema_registry_converter::async_impl::schema_registry::SrSettings; use std::collections::HashMap; +use 
crate::{ + debezium::{schema_registry::SchemaRegistry, stream_consumer::DebeziumSchemaStruct}, + KafkaError, +}; + pub struct SchemaRegistryBasic {} impl SchemaRegistryBasic { pub async fn get_single_schema( table_name: &str, schema_registry_url: &str, - ) -> Result<(SourceSchema, HashMap), ConnectorError> { + ) -> Result<(SourceSchema, HashMap), KafkaError> { let sr_settings = SrSettings::new(schema_registry_url.to_string()); let key_result = SchemaRegistry::fetch_struct(&sr_settings, table_name, true).await?; let schema_result = SchemaRegistry::fetch_struct(&sr_settings, table_name, false).await?; @@ -33,13 +34,13 @@ impl SchemaRegistryBasic { let mut pk_keys_indexes = vec![]; let mut fields_schema_map: HashMap = HashMap::new(); - let defined_fields: Result, ConnectorError> = fields + let defined_fields: Result, KafkaError> = fields .iter() .to_owned() .enumerate() .map(|(idx, f)| { - let (typ, nullable) = SchemaRegistry::map_typ(f) - .map_err(|e| ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)))?; + let (typ, nullable) = + SchemaRegistry::map_typ(f).map_err(KafkaError::KafkaSchemaError)?; let name = f.name.clone().unwrap(); if pk_fields.contains(&name) { pk_keys_indexes.push(idx); @@ -68,7 +69,7 @@ impl SchemaRegistryBasic { pub async fn get_schema( table_names: Option<&[String]>, schema_registry_url: String, - ) -> Result, ConnectorError> { + ) -> Result, KafkaError> { let mut schemas = vec![]; if let Some(tables) = table_names { for table_name in tables.iter() { diff --git a/dozer-ingestion/src/connectors/kafka/stream_consumer.rs b/dozer-ingestion/kafka/src/stream_consumer.rs similarity index 58% rename from dozer-ingestion/src/connectors/kafka/stream_consumer.rs rename to dozer-ingestion/kafka/src/stream_consumer.rs index 9c6d7deb59..d7573710a8 100644 --- a/dozer-ingestion/src/connectors/kafka/stream_consumer.rs +++ b/dozer-ingestion/kafka/src/stream_consumer.rs @@ -1,8 +1,6 @@ -use crate::errors::ConnectorError; -use crate::ingestion::Ingestor; +use crate::KafkaError; -use crate::connectors::TableToIngest; -use dozer_types::tonic::async_trait; +use dozer_ingestion_connector::{async_trait, Ingestor, TableToIngest}; use rdkafka::ClientConfig; #[async_trait] @@ -13,5 +11,5 @@ pub trait StreamConsumer { ingestor: &Ingestor, tables: Vec, schema_registry_url: &Option, - ) -> Result<(), ConnectorError>; + ) -> Result<(), KafkaError>; } diff --git a/dozer-ingestion/src/connectors/kafka/stream_consumer_basic.rs b/dozer-ingestion/kafka/src/stream_consumer_basic.rs similarity index 69% rename from dozer-ingestion/src/connectors/kafka/stream_consumer_basic.rs rename to dozer-ingestion/kafka/src/stream_consumer_basic.rs index f82c4a65a8..34c28a5ddf 100644 --- a/dozer-ingestion/src/connectors/kafka/stream_consumer_basic.rs +++ b/dozer-ingestion/kafka/src/stream_consumer_basic.rs @@ -1,31 +1,26 @@ -use crate::connectors::kafka::debezium::mapper::convert_value_to_schema; use std::collections::HashMap; -use crate::connectors::kafka::stream_consumer::StreamConsumer; -use crate::errors::KafkaError::{ - BytesConvertError, JsonDecodeError, KafkaStreamError, TopicNotDefined, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + models::ingestion_types::IngestionMessage, + serde::{Deserialize, Serialize}, + serde_json::{self, Value}, + types::{Field, Operation, Record}, + }, + Ingestor, TableToIngest, }; -use crate::errors::{ConnectorError, KafkaError}; -use crate::ingestion::Ingestor; -use dozer_types::models::ingestion_types::IngestionMessage; - -use 
dozer_types::serde::{Deserialize, Serialize}; -use dozer_types::serde_json; -use dozer_types::serde_json::Value; -use dozer_types::types::{Field, Operation, Record}; - -use crate::connectors::kafka::no_schema_registry_basic::NoSchemaRegistryBasic; -use crate::connectors::kafka::schema_registry_basic::SchemaRegistryBasic; -use dozer_types::tonic::async_trait; - -use crate::connectors::TableToIngest; -use crate::errors::KafkaStreamError::PollingError; use rdkafka::{ClientConfig, Message}; +use crate::schema_registry_basic::SchemaRegistryBasic; +use crate::stream_consumer::StreamConsumer; +use crate::{debezium::mapper::convert_value_to_schema, KafkaError}; +use crate::{no_schema_registry_basic::NoSchemaRegistryBasic, KafkaStreamError}; + use super::stream_consumer_helper::{is_network_failure, OffsetsMap, StreamConsumerHelper}; #[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] #[serde(untagged)] pub enum FieldType { I8(i8), @@ -41,7 +36,7 @@ pub enum FieldType { } #[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct KafkaField { pub r#type: String, pub optional: bool, @@ -50,7 +45,7 @@ pub struct KafkaField { } #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct SchemaParameters { pub scale: Option, #[serde(rename(deserialize = "connect.decimal.precision"))] @@ -58,7 +53,7 @@ pub struct SchemaParameters { } #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct SchemaStruct { pub r#type: Value, pub fields: Option>, @@ -70,7 +65,7 @@ pub struct SchemaStruct { } #[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(crate = "dozer_types::serde")] +#[serde(crate = "dozer_ingestion_connector::dozer_types::serde")] pub struct Payload { pub before: Option, pub after: Option, @@ -88,7 +83,7 @@ impl StreamConsumer for StreamConsumerBasic { ingestor: &Ingestor, tables: Vec, schema_registry_url: &Option, - ) -> Result<(), ConnectorError> { + ) -> Result<(), KafkaError> { let topics: Vec = tables.iter().map(|t| t.name.clone()).collect(); let mut schemas = HashMap::new(); @@ -114,18 +109,19 @@ impl StreamConsumer for StreamConsumerBasic { con = StreamConsumerHelper::resume(&client_config, &topics, &offsets).await?; continue; } - let m = result.map_err(|e| KafkaStreamError(PollingError(e)))?; + let m = result + .map_err(|e| KafkaError::KafkaStreamError(KafkaStreamError::PollingError(e)))?; StreamConsumerHelper::update_offsets(&mut offsets, &m); match schemas.get(m.topic()) { - None => return Err(ConnectorError::KafkaError(TopicNotDefined)), + None => return Err(KafkaError::TopicNotDefined), Some((table_index, (schema, fields_map))) => { if let (Some(message), Some(key)) = (m.payload(), m.key()) { let new = match schema_registry_url { None => { - let value = - std::str::from_utf8(message).map_err(BytesConvertError)?; - let key = - std::str::from_utf8(key).map_err(BytesConvertError)?; + let value = std::str::from_utf8(message) + .map_err(KafkaError::BytesConvertError)?; + let key = std::str::from_utf8(key) + .map_err(KafkaError::BytesConvertError)?; vec![ Field::String(key.to_string()), @@ -134,26 +130,26 @@ impl StreamConsumer for 
StreamConsumerBasic { } Some(_) => { let value_struct: Value = serde_json::from_str( - std::str::from_utf8(message).map_err(BytesConvertError)?, + std::str::from_utf8(message) + .map_err(KafkaError::BytesConvertError)?, ) - .map_err(JsonDecodeError)?; + .map_err(KafkaError::JsonDecodeError)?; let _key_struct: Value = serde_json::from_str( - std::str::from_utf8(key).map_err(BytesConvertError)?, + std::str::from_utf8(key) + .map_err(KafkaError::BytesConvertError)?, ) - .map_err(JsonDecodeError)?; + .map_err(KafkaError::JsonDecodeError)?; convert_value_to_schema( value_struct, &schema.schema, fields_map, ) - .map_err(|e| { - ConnectorError::KafkaError(KafkaError::KafkaSchemaError(e)) - })? + .map_err(KafkaError::KafkaSchemaError)? } }; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index: *table_index, op: Operation::Insert { @@ -165,7 +161,11 @@ impl StreamConsumer for StreamConsumerBasic { id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving side is closed, we should stop the stream + return Ok(()); + } } } } diff --git a/dozer-ingestion/src/connectors/kafka/stream_consumer_helper.rs b/dozer-ingestion/kafka/src/stream_consumer_helper.rs similarity index 92% rename from dozer-ingestion/src/connectors/kafka/stream_consumer_helper.rs rename to dozer-ingestion/kafka/src/stream_consumer_helper.rs index 4eb6798ede..c7eb31c345 100644 --- a/dozer-ingestion/src/connectors/kafka/stream_consumer_helper.rs +++ b/dozer-ingestion/kafka/src/stream_consumer_helper.rs @@ -1,5 +1,4 @@ -use crate::errors::ConnectorError; -use crate::errors::KafkaError::KafkaConnectionError; +use dozer_ingestion_connector::{dozer_types, tokio}; use rdkafka::{ consumer::{BaseConsumer, Consumer}, message::BorrowedMessage, @@ -8,6 +7,8 @@ use rdkafka::{ }; use std::collections::HashMap; +use crate::KafkaError; + pub struct StreamConsumerHelper; pub type OffsetsMap = HashMap; // key: topic, value: (partition, offset) @@ -16,7 +17,7 @@ impl StreamConsumerHelper { pub async fn start( client_config: &ClientConfig, topics: &[&str], - ) -> Result { + ) -> Result { Self::resume_impl(client_config, topics, None).await } @@ -24,7 +25,7 @@ impl StreamConsumerHelper { client_config: &ClientConfig, topics: &[&str], offsets: &OffsetsMap, - ) -> Result { + ) -> Result { Self::resume_impl(client_config, topics, Some(offsets)).await } @@ -39,7 +40,7 @@ impl StreamConsumerHelper { client_config: &ClientConfig, topics: &[&str], offsets: Option<&OffsetsMap>, - ) -> Result { + ) -> Result { loop { match Self::try_resume(client_config, topics, offsets).await { Ok(con) => return Ok(con), @@ -51,7 +52,7 @@ impl StreamConsumerHelper { tokio::time::sleep(RETRY_INTERVAL).await; continue; } - Err(err) => Err(KafkaConnectionError(err))?, + Err(err) => Err(KafkaError::KafkaConnectionError(err))?, } } } diff --git a/dozer-ingestion/src/connectors/kafka/test_utils.rs b/dozer-ingestion/kafka/src/test_utils.rs similarity index 100% rename from dozer-ingestion/src/connectors/kafka/test_utils.rs rename to dozer-ingestion/kafka/src/test_utils.rs diff --git a/dozer-ingestion/src/connectors/kafka/tests.rs b/dozer-ingestion/kafka/src/tests.rs similarity index 100% rename from dozer-ingestion/src/connectors/kafka/tests.rs rename to dozer-ingestion/kafka/src/tests.rs diff --git a/dozer-ingestion/mongodb/Cargo.toml b/dozer-ingestion/mongodb/Cargo.toml new file mode 100644 index 0000000000..709b38457b --- /dev/null +++ b/dozer-ingestion/mongodb/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name 
= "dozer-ingestion-mongodb" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +mongodb = "2.6.1" +bson = "2.7.0" diff --git a/dozer-ingestion/src/connectors/mongodb/mod.rs b/dozer-ingestion/mongodb/src/lib.rs similarity index 85% rename from dozer-ingestion/src/connectors/mongodb/mod.rs rename to dozer-ingestion/mongodb/src/lib.rs index 0e7d879f5f..f734f0eb85 100644 --- a/dozer-ingestion/src/connectors/mongodb/mod.rs +++ b/dozer-ingestion/mongodb/src/lib.rs @@ -1,30 +1,38 @@ use std::collections::HashMap; use bson::{doc, Bson, Document, Timestamp}; -use dozer_types::tonic::async_trait; -use futures::{stream::FuturesUnordered, StreamExt, TryFutureExt, TryStreamExt}; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + self, + errors::{internal::BoxedError, types::DeserializationError}, + json_types::{serde_json_to_json_value, JsonValue}, + models::ingestion_types::IngestionMessage, + thiserror::{self, Error}, + types::{Field, FieldDefinition, FieldType, Operation, Record, SourceDefinition}, + }, + futures::{stream::FuturesUnordered, StreamExt, TryFutureExt, TryStreamExt}, + tokio::{ + self, + sync::mpsc::{channel, Sender}, + }, + CdcType, Connector, Ingestor, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, + TableToIngest, +}; use mongodb::{ change_stream::event::ChangeStreamEvent, error::{CommandError, ErrorKind}, options::{ChangeStreamOptions, ClientOptions, ConnectionString}, }; -use tokio::sync::mpsc::{channel, Sender}; - -use crate::{errors::ConnectorError, ingestion::Ingestor}; -use dozer_types::{ - errors::types::DeserializationError, - json_types::{serde_json_to_json_value, JsonValue}, - models::ingestion_types::IngestionMessage, - thiserror::{self, Error}, - types::{Field, FieldDefinition, FieldType, Operation, Record, SourceDefinition}, -}; -use super::{ - Connector, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, -}; +pub use bson; +pub use mongodb; #[derive(Error, Debug)] pub enum MongodbConnectorError { + #[error("Failed to parse connection string. {0}")] + ParseConnectionString(#[source] mongodb::error::Error), + #[error("Server is not part of a replica set")] NotAReplicaSet, @@ -93,7 +101,9 @@ impl std::ops::BitOr for Privs { } } -async fn start_session(client: &mongodb::Client) -> Result { +async fn start_session( + client: &mongodb::Client, +) -> Result { let session_options = mongodb::options::SessionOptions::builder() .snapshot(true) .build(); @@ -103,14 +113,11 @@ async fn start_session(client: &mongodb::Client) -> Result { - MongodbConnectorError::NotAReplicaSet - } - _ => MongodbConnectorError::ConnectionFailure(e), + .map_err(|e| match *e.kind { + ErrorKind::Command(CommandError { code: 123, .. 
}) => { + MongodbConnectorError::NotAReplicaSet } - .into() + _ => MongodbConnectorError::ConnectionFailure(e), }) } @@ -119,8 +126,8 @@ async fn snapshot_collection( db: &mongodb::Database, collection: &str, table_idx: usize, - tx: Sender>, -) -> Result { + tx: Sender>, +) -> Result { let mut session = start_session(client).await?; let collection: mongodb::Collection = db.collection(collection); let mut documents = collection @@ -196,8 +203,8 @@ async fn replicate_collection( collection: &str, start_at: Timestamp, table_idx: usize, - tx: Sender>, -) -> Result<(), ConnectorError> { + tx: Sender>, +) -> Result<(), MongodbConnectorError> { let collection: mongodb::Collection = db.collection(collection); let options = ChangeStreamOptions::builder() .start_at_operation_time(Some(start_at)) @@ -244,14 +251,7 @@ async fn replicate_collection( _ => todo!(), } }) - .for_each(|op| async { - tx.send( - op.map_err(ConnectorError::MongodbError) - .map(|op| (table_idx, op)), - ) - .await - .unwrap() - }) + .for_each(|op| async { tx.send(op.map(|op| (table_idx, op))).await.unwrap() }) .await; Ok(()) } @@ -263,24 +263,25 @@ struct ServerInfo { } impl MongodbConnector { - pub fn new(connection_string: String) -> Result { - let _ = ConnectionString::parse(&connection_string).map_err(|e| { - ConnectorError::WrongConnectionConfiguration(DeserializationError::Custom(Box::new(e))) - })?; + pub fn new(connection_string: String) -> Result { + let _ = ConnectionString::parse(&connection_string) + .map_err(MongodbConnectorError::ParseConnectionString); Ok(Self { conn_string: connection_string, }) } - async fn client_options(&self) -> Result { - let mut options = ClientOptions::parse(&self.conn_string).await.map_err(|e| { - ConnectorError::WrongConnectionConfiguration(DeserializationError::Custom(Box::new(e))) - })?; + async fn client_options( + &self, + ) -> Result { + let mut options = ClientOptions::parse(&self.conn_string) + .await + .map_err(MongodbConnectorError::ParseConnectionString)?; options.write_concern = None; Ok(options) } - async fn client(&self) -> Result { + async fn client(&self) -> Result { let options = self.client_options().await?; self.client_with_options(options).await } @@ -288,10 +289,10 @@ impl MongodbConnector { async fn client_with_options( &self, options: mongodb::options::ClientOptions, - ) -> Result { + ) -> Result { let client = mongodb::Client::with_options(options).unwrap(); if client.default_database().is_none() { - return Err(NoDatabaseError.into()); + return Err(NoDatabaseError); } Ok(client) } @@ -305,7 +306,7 @@ impl MongodbConnector { async fn identify_server( &self, client: &mongodb::Client, - ) -> Result { + ) -> Result { let db = self.database(client); let hello = doc! 
{ "hello": 1, @@ -335,7 +336,7 @@ impl MongodbConnector { database: &mongodb::Database, username: &str, tables: &[TableIdentifier], - ) -> Result<(), ConnectorError> { + ) -> Result<(), MongodbConnectorError> { // Users can always view their own privileges, so failure here is a connection // error let user_info = database @@ -447,14 +448,14 @@ impl MongodbConnector { if missing_privs.is_empty() { Ok(()) } else { - Err(MissingPermissions(missing_privs).into()) + Err(MissingPermissions(missing_privs)) } } } #[async_trait] impl Connector for MongodbConnector { - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { let client = self.client().await?; let server_info = self.identify_server(&client).await?; if !server_info.replset { @@ -473,7 +474,7 @@ impl Connector for MongodbConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { Ok(tables .into_iter() .map(|table| TableInfo { @@ -487,7 +488,7 @@ impl Connector for MongodbConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let _ = self.client().await?; Ok(table_infos .iter() @@ -510,13 +511,13 @@ impl Connector for MongodbConnector { ], primary_index: vec![0], }, - cdc_type: super::CdcType::OnlyPK, + cdc_type: CdcType::OnlyPK, }) }) .collect()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { let client = self.client().await?; let database = self.database(&client); let collections = database @@ -535,7 +536,7 @@ impl Connector for MongodbConnector { .collect()) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let options = self.client_options().await?; let client = self.client_with_options(options.clone()).await?; let database = self.database(&client); @@ -592,14 +593,14 @@ impl Connector for MongodbConnector { &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { // Snapshot: find // // Replicate: changeStream let client = self.client().await?; let database = self.database(&client); - let (tx, mut rx) = channel::>(100); + let (tx, mut rx) = channel::>(100); let snapshots = FuturesUnordered::new(); for (idx, table) in tables.iter().enumerate() { @@ -611,33 +612,45 @@ impl Connector for MongodbConnector { let snapshot_ingestor = ingestor.clone(); let snapshot_task = tokio::spawn(async move { - snapshot_ingestor + if snapshot_ingestor .handle_message(IngestionMessage::SnapshottingStarted) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If the ingestor is already closed, we don't need to do anything + return Ok::<_, MongodbConnectorError>(()); + } while let Some(result) = rx.recv().await { let (table_index, op) = result?; - snapshot_ingestor + if snapshot_ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If the ingestor is already closed, we don't need to do anything + return Ok(()); + } } - snapshot_ingestor + if snapshot_ingestor .handle_message(IngestionMessage::SnapshottingDone) .await - .map_err(|_| ConnectorError::IngestorError)?; - Ok::<_, ConnectorError>(()) + .is_err() + { + // If the ingestor is already closed, we don't need 
to do anything + return Ok(()); + }; + Ok(()) }); let timestamps: Vec<(usize, Timestamp)> = snapshots.try_collect().await?; snapshot_task.await.unwrap()?; - let (tx, mut rx) = channel::>(100); + let (tx, mut rx) = channel::>(100); let replicators = FuturesUnordered::new(); for (table_idx, timestamp) in timestamps { @@ -656,16 +669,20 @@ impl Connector for MongodbConnector { let replication_task = tokio::spawn(async move { while let Some(result) = rx.recv().await { let (table_index, op) = result?; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If the ingestor is already closed, we don't need to do anything + return Ok::<_, MongodbConnectorError>(()); + } } - Ok::<_, ConnectorError>(()) + Ok(()) }); let _: () = replicators.try_collect().await?; diff --git a/dozer-ingestion/mysql/Cargo.toml b/dozer-ingestion/mysql/Cargo.toml new file mode 100644 index 0000000000..b4507bb641 --- /dev/null +++ b/dozer-ingestion/mysql/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "dozer-ingestion-mysql" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +mysql_async = { version = "0.32.2", default-features = false, features = [ + "default-rustls", +] } +mysql_common = { version = "0.30", default-features = false, features = [ + "chrono", + "rust_decimal", +] } +geozero = { version = "0.11.0", default-features = false, features = [ + "with-wkb", +] } +rand = "0.8.5" + +[dev-dependencies] +serial_test = "1.0.0" +hex = "0.4.3" diff --git a/dozer-ingestion/src/connectors/mysql/binlog.rs b/dozer-ingestion/mysql/src/binlog.rs similarity index 95% rename from dozer-ingestion/src/connectors/mysql/binlog.rs rename to dozer-ingestion/mysql/src/binlog.rs index b05d807704..949565f822 100644 --- a/dozer-ingestion/src/connectors/mysql/binlog.rs +++ b/dozer-ingestion/mysql/src/binlog.rs @@ -1,20 +1,21 @@ +use crate::{connection::is_network_failure, MySQLConnectorError}; + use super::{ connection::Conn, conversion::{IntoField, IntoFields, IntoJsonValue}, schema::{ColumnDefinition, TableDefinition}, }; -use crate::{ - connectors::mysql::connection::is_network_failure, - errors::{ConnectorError, MySQLConnectorError}, - ingestion::Ingestor, -}; -use dozer_types::{json_types::JsonValue, types::Field}; -use dozer_types::{ - log::trace, - models::ingestion_types::IngestionMessage, - types::{FieldType, Operation, Record}, +use dozer_ingestion_connector::{ + dozer_types::{ + json_types::JsonValue, + log::trace, + models::ingestion_types::IngestionMessage, + types::Field, + types::{FieldType, Operation, Record}, + }, + futures::StreamExt, + Ingestor, }; -use futures::StreamExt; use mysql_async::{binlog::EventFlags, BinlogStream, Pool}; use mysql_common::{ binlog::{ @@ -128,7 +129,7 @@ impl BinlogIngestor<'_, '_, '_, '_> { Ok(()) } - pub async fn ingest(&mut self) -> Result<(), ConnectorError> { + pub async fn ingest(&mut self) -> Result<(), MySQLConnectorError> { if self.binlog_stream.is_none() { self.open_binlog().await?; } @@ -201,19 +202,28 @@ impl BinlogIngestor<'_, '_, '_, '_> { _ => unreachable!(), }; - if query_event.query_raw() == b"BEGIN" { - self.ingestor + if query_event.query_raw() == b"BEGIN" + && self + .ingestor .handle_message(IngestionMessage::SnapshottingStarted) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() 
+ { + // If receiving side is closed, we can stop ingesting. + return Ok(()); } } XID_EVENT => { - self.ingestor + if self + .ingestor .handle_message(IngestionMessage::SnapshottingDone) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving side is closed, we can stop ingesting. + return Ok(()); + } } WRITE_ROWS_EVENT | UPDATE_ROWS_EVENT | DELETE_ROWS_EVENT | WRITE_ROWS_EVENT_V1 @@ -249,16 +259,21 @@ impl BinlogIngestor<'_, '_, '_, '_> { rows_event: &BinlogRowsEvent<'_>, table: &TableDefinition, tme: &TableMapEvent<'a>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), MySQLConnectorError> { for op in self.make_rows_operations(rows_event, table, tme) { - self.ingestor + if self + .ingestor .handle_message(IngestionMessage::OperationEvent { table_index: table.table_index, op: op?, id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving side is closed, we can stop ingesting. + return Ok(()); + } } Ok(()) @@ -694,7 +709,7 @@ impl<'a> BinlogRowsEvent<'a> { mod tests { use std::collections::BTreeMap; - use dozer_types::{ + use dozer_ingestion_connector::dozer_types::{ json_types::JsonValue, types::{Field, FieldType}, }; @@ -709,7 +724,7 @@ mod tests { Value, }; - use crate::connectors::mysql::conversion::IntoField; + use crate::conversion::IntoField; #[test] fn test_field_conversion() { diff --git a/dozer-ingestion/src/connectors/mysql/connection.rs b/dozer-ingestion/mysql/src/connection.rs similarity index 97% rename from dozer-ingestion/src/connectors/mysql/connection.rs rename to dozer-ingestion/mysql/src/connection.rs index 9cc2b9c1ce..ac6775ad18 100644 --- a/dozer-ingestion/src/connectors/mysql/connection.rs +++ b/dozer-ingestion/mysql/src/connection.rs @@ -1,7 +1,9 @@ -use crate::retry_on_network_failure; +use dozer_ingestion_connector::{ + dozer_types, retry_on_network_failure, + tokio::{self, sync::mpsc::Receiver}, +}; use mysql_async::{prelude::Queryable, BinlogRequest, BinlogStream, Params, Pool}; use mysql_common::{prelude::FromRow, Row}; -use tokio::sync::mpsc::Receiver; #[derive(Debug)] pub struct Conn { diff --git a/dozer-ingestion/src/connectors/mysql/connector.rs b/dozer-ingestion/mysql/src/connector.rs similarity index 92% rename from dozer-ingestion/src/connectors/mysql/connector.rs rename to dozer-ingestion/mysql/src/connector.rs index b2ed6fac0f..f13dc7a6de 100644 --- a/dozer-ingestion/src/connectors/mysql/connector.rs +++ b/dozer-ingestion/mysql/src/connector.rs @@ -1,3 +1,5 @@ +use crate::MySQLConnectorError; + use super::{ binlog::{get_binlog_format, get_master_binlog_position, BinlogIngestor, BinlogPosition}, connection::Conn, @@ -5,18 +7,16 @@ use super::{ helpers::{escape_identifier, qualify_table_name}, schema::{ColumnDefinition, SchemaHelper, TableDefinition}, }; -use crate::{ - connectors::{ - CdcType, Connector, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, - TableToIngest, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + errors::internal::BoxedError, + models::ingestion_types::IngestionMessage, + types::{FieldDefinition, FieldType, Operation, Record, Schema, SourceDefinition}, }, - errors::MySQLConnectorError, -}; -use crate::{errors::ConnectorError, ingestion::Ingestor}; -use dozer_types::tonic::async_trait; -use dozer_types::{ - models::ingestion_types::IngestionMessage, - types::{FieldDefinition, FieldType, Operation, Record, Schema, SourceDefinition}, + utils::TableNotFound, + CdcType, Connector, Ingestor, SourceSchema, SourceSchemaResult, 
TableIdentifier, TableInfo, + TableToIngest, }; use mysql_async::{Opts, Pool}; use mysql_common::Row; @@ -45,7 +45,7 @@ impl MySQLConnector { #[async_trait] impl Connector for MySQLConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { @@ -96,25 +96,25 @@ impl Connector for MySQLConnector { ] } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { let _ = self.connect().await?; Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { let tables = self.schema_helper().list_tables().await?; Ok(tables) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let existing_tables = self.list_tables().await?; for table in tables { if !existing_tables.contains(table) { - Err(ConnectorError::TableNotFound(qualify_table_name( - table.schema.as_deref(), - &table.name, - )))?; + Err(TableNotFound { + schema: table.schema.clone(), + name: table.name.clone(), + })?; } } @@ -124,7 +124,7 @@ impl Connector for MySQLConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let tables_infos = self.schema_helper().list_columns(tables).await?; Ok(tables_infos) } @@ -132,7 +132,7 @@ impl Connector for MySQLConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { if table_infos.is_empty() { return Ok(Vec::new()); } @@ -192,7 +192,7 @@ impl Connector for MySQLConnector { &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { let table_infos = tables .into_iter() .map(|table| { @@ -225,7 +225,7 @@ impl MySQLConnector { &self, ingestor: &Ingestor, table_infos: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), MySQLConnectorError> { let table_definitions = self .schema_helper() .get_table_definitions(&table_infos) @@ -244,7 +244,7 @@ impl MySQLConnector { &self, ingestor: &Ingestor, table_definitions: &[TableDefinition], - ) -> Result, ConnectorError> { + ) -> Result, MySQLConnectorError> { let mut binlog_position_per_table = Vec::new(); let mut conn = self.connect().await?; @@ -280,10 +280,14 @@ impl MySQLConnector { continue; } - ingestor + if ingestor .handle_message(IngestionMessage::SnapshottingStarted) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving end is closed, we should stop the replication + break; + } let mut rows = conn.exec_iter( format!( @@ -310,14 +314,18 @@ impl MySQLConnector { new: Record::new(row.into_fields(&field_types)?), }; - ingestor + if ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: None, }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiving end is closed, we should stop the replication + break; + } } let binlog_position = get_master_binlog_position(&mut conn).await?; @@ -326,10 +334,13 @@ impl MySQLConnector { .await .map_err(MySQLConnectorError::QueryExecutionError)?; - ingestor + if ingestor .handle_message(IngestionMessage::SnapshottingDone) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + break; + } binlog_position_per_table.push((td.clone(), binlog_position)); } @@ -341,7 +352,7 @@ impl MySQLConnector { &self, 
ingestor: &Ingestor, binlog_positions: Vec<(TableDefinition, BinlogPosition)>, - ) -> Result { + ) -> Result { assert!(!binlog_positions.is_empty()); let position = { @@ -378,7 +389,7 @@ impl MySQLConnector { tables: &[TableDefinition], start_position: BinlogPosition, stop_position: Option, - ) -> Result<(), ConnectorError> { + ) -> Result<(), MySQLConnectorError> { let server_id = self.server_id.unwrap_or_else(|| rand::thread_rng().gen()); let mut binlog_ingestor = BinlogIngestor::new( @@ -396,23 +407,21 @@ impl MySQLConnector { #[cfg(test)] mod tests { - use super::MySQLConnector; use crate::{ - connectors::{ - mysql::{ - connection::Conn, - tests::{create_test_table, mariadb_test_config, mysql_test_config, TestConfig}, - }, - CdcType, Connector, SourceSchema, TableIdentifier, - }, - ingestion::{IngestionIterator, Ingestor}, + connection::Conn, + tests::{create_test_table, mariadb_test_config, mysql_test_config, TestConfig}, }; - use dozer_types::{ - json_types::JsonValue, - models::ingestion_types::IngestionMessage, - types::{ - Field, FieldDefinition, FieldType, Operation::*, Record, Schema, SourceDefinition, + + use super::MySQLConnector; + use dozer_ingestion_connector::{ + dozer_types::{ + json_types::JsonValue, + models::ingestion_types::IngestionMessage, + types::{ + Field, FieldDefinition, FieldType, Operation::*, Record, Schema, SourceDefinition, + }, }, + tokio, CdcType, Connector, IngestionIterator, Ingestor, SourceSchema, TableIdentifier, }; use serial_test::serial; use std::time::Duration; diff --git a/dozer-ingestion/src/connectors/mysql/conversion.rs b/dozer-ingestion/mysql/src/conversion.rs similarity index 96% rename from dozer-ingestion/src/connectors/mysql/conversion.rs rename to dozer-ingestion/mysql/src/conversion.rs index 0c8b5a6823..5497270c14 100644 --- a/dozer-ingestion/src/connectors/mysql/conversion.rs +++ b/dozer-ingestion/mysql/src/conversion.rs @@ -1,6 +1,5 @@ -use crate::errors::MySQLConnectorError; -use chrono::{DateTime, NaiveDate, NaiveDateTime, Offset, Utc}; -use dozer_types::{ +use dozer_ingestion_connector::dozer_types::{ + chrono::{DateTime, NaiveDate, NaiveDateTime, Offset, Utc}, json_types::{serde_json_to_json_value, JsonValue}, rust_decimal::Decimal, serde_json, @@ -10,6 +9,8 @@ use geozero::{wkb, GeomProcessor}; use mysql_common::{Row, Value}; use std::time::Duration; +use crate::MySQLConnectorError; + pub fn get_field_type_for_mysql_column_type( column_type: &str, ) -> Result { @@ -190,14 +191,8 @@ impl GeomProcessor for PointProcessor { #[cfg(test)] mod tests { - use super::get_field_type_for_mysql_column_type; - use crate::connectors::mysql::conversion::IntoField; - use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Offset, Utc}; - use dozer_types::{ - json_types::JsonValue, - types::{DozerDuration, Field, FieldType, TimeUnit}, - }; - use mysql_common::Value; + use super::*; + use mysql_common::{chrono::NaiveTime, Value}; use std::time::Duration; #[test] diff --git a/dozer-ingestion/src/connectors/mysql/helpers.rs b/dozer-ingestion/mysql/src/helpers.rs similarity index 89% rename from dozer-ingestion/src/connectors/mysql/helpers.rs rename to dozer-ingestion/mysql/src/helpers.rs index 574bba014d..df3b2fd415 100644 --- a/dozer-ingestion/src/connectors/mysql/helpers.rs +++ b/dozer-ingestion/mysql/src/helpers.rs @@ -12,7 +12,7 @@ pub fn qualify_table_name(schema: Option<&str>, name: &str) -> String { #[cfg(test)] mod tests { - use crate::connectors::mysql::helpers::{escape_identifier, qualify_table_name}; + use 
crate::helpers::{escape_identifier, qualify_table_name}; #[test] fn test_identifiers() { diff --git a/dozer-ingestion/mysql/src/lib.rs b/dozer-ingestion/mysql/src/lib.rs new file mode 100644 index 0000000000..798c36f2e0 --- /dev/null +++ b/dozer-ingestion/mysql/src/lib.rs @@ -0,0 +1,50 @@ +use dozer_ingestion_connector::dozer_types::{ + errors::types::DeserializationError, + thiserror::{self, Error}, +}; +use geozero::error::GeozeroError; + +mod binlog; +mod connection; +pub mod connector; +mod conversion; +pub(crate) mod helpers; +mod schema; +#[cfg(test)] +mod tests; + +#[derive(Error, Debug)] +pub enum MySQLConnectorError { + #[error("Invalid connection URL: {0:?}")] + InvalidConnectionURLError(#[source] mysql_async::UrlError), + + #[error("Failed to connect to mysql with the specified url {0}. {1}")] + ConnectionFailure(String, #[source] mysql_async::Error), + + #[error("Unsupported field type: {0}")] + UnsupportedFieldType(String), + + #[error("Invalid field value. {0}")] + InvalidFieldValue(#[from] mysql_common::FromValueError), + + #[error("Invalid json value. {0}")] + JsonDeserializationError(#[from] DeserializationError), + + #[error("Invalid geometric value. {0}")] + InvalidGeometricValue(#[from] GeozeroError), + + #[error("Failed to open binlog. {0}")] + BinlogOpenError(#[source] mysql_async::Error), + + #[error("Failed to read binlog. {0}")] + BinlogReadError(#[source] mysql_async::Error), + + #[error("Binlog error: {0}")] + BinlogError(String), + + #[error("Query failed. {0}")] + QueryExecutionError(#[source] mysql_async::Error), + + #[error("Failed to fetch query result. {0}")] + QueryResultError(#[source] mysql_async::Error), +} diff --git a/dozer-ingestion/src/connectors/mysql/schema.rs b/dozer-ingestion/mysql/src/schema.rs similarity index 97% rename from dozer-ingestion/src/connectors/mysql/schema.rs rename to dozer-ingestion/mysql/src/schema.rs index f8ad832b2a..692d333f52 100644 --- a/dozer-ingestion/src/connectors/mysql/schema.rs +++ b/dozer-ingestion/mysql/src/schema.rs @@ -1,12 +1,10 @@ +use crate::{helpers::escape_identifier, MySQLConnectorError}; + use super::{ connection::{Conn, QueryResult}, conversion::get_field_type_for_mysql_column_type, }; -use crate::{ - connectors::{mysql::helpers::escape_identifier, TableIdentifier, TableInfo}, - errors::MySQLConnectorError, -}; -use dozer_types::types::FieldType; +use dozer_ingestion_connector::{dozer_types::types::FieldType, TableIdentifier, TableInfo}; use mysql_async::{from_row, Pool}; use mysql_common::Value; @@ -340,11 +338,10 @@ impl<'a> From<&'a TableInfo> for TableInfoRef<'a> { #[cfg(test)] mod tests { use super::{ColumnDefinition, SchemaHelper, TableDefinition}; - use crate::connectors::{ - mysql::tests::{create_test_table, mariadb_test_config, mysql_test_config, TestConfig}, - TableIdentifier, TableInfo, + use crate::tests::{create_test_table, mariadb_test_config, mysql_test_config, TestConfig}; + use dozer_ingestion_connector::{ + dozer_types::types::FieldType, tokio, TableIdentifier, TableInfo, }; - use dozer_types::types::FieldType; use serial_test::serial; async fn test_connector_schemas(config: TestConfig) { diff --git a/dozer-ingestion/src/connectors/mysql/tests.rs b/dozer-ingestion/mysql/src/tests.rs similarity index 98% rename from dozer-ingestion/src/connectors/mysql/tests.rs rename to dozer-ingestion/mysql/src/tests.rs index d3db16774b..8b5155167d 100644 --- a/dozer-ingestion/src/connectors/mysql/tests.rs +++ b/dozer-ingestion/mysql/src/tests.rs @@ -1,4 +1,4 @@ -use crate::connectors::TableInfo; +use 
dozer_ingestion_connector::TableInfo; use mysql_async::{prelude::Queryable, Opts, Pool}; pub struct TestConfig { diff --git a/dozer-ingestion/object-store/Cargo.toml b/dozer-ingestion/object-store/Cargo.toml new file mode 100644 index 0000000000..90d0de4f96 --- /dev/null +++ b/dozer-ingestion/object-store/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "dozer-ingestion-object-store" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +deltalake = { version = "0.15.0", default-features = false, features = [ + "s3", + "datafusion", +] } +object_store = { version = "0.6.1", features = ["aws"] } +url = "2.4.1" diff --git a/dozer-ingestion/src/connectors/object_store/adapters.rs b/dozer-ingestion/object-store/src/adapters.rs similarity index 82% rename from dozer-ingestion/src/connectors/object_store/adapters.rs rename to dozer-ingestion/object-store/src/adapters.rs index 0276908fa6..8628a65216 100644 --- a/dozer-ingestion/src/connectors/object_store/adapters.rs +++ b/dozer-ingestion/object-store/src/adapters.rs @@ -1,25 +1,27 @@ -use crate::errors::ObjectStoreObjectError::TableDefinitionNotFound; -use crate::errors::{ConnectorError, ObjectStoreConnectorError}; -use dozer_types::models::ingestion_types::{LocalStorage, S3Storage, Table, TableConfig}; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::{ + LocalStorage, S3Storage, Table, TableConfig, +}; use object_store::aws::{AmazonS3, AmazonS3Builder}; use object_store::local::LocalFileSystem; use object_store::{BackoffConfig, ObjectStore, RetryConfig}; use std::fmt::Debug; use url::Url; +use crate::{ObjectStoreConnectorError, ObjectStoreObjectError}; + pub trait DozerObjectStore: Clone + Send + Sync + Debug + 'static { type ObjectStore: ObjectStore; fn table_params( &self, table_name: &str, - ) -> Result, ConnectorError> { + ) -> Result, ObjectStoreConnectorError> { let table = self .tables() .iter() .find(|table| table.name == table_name) .ok_or(ObjectStoreConnectorError::DataFusionStorageObjectError( - TableDefinitionNotFound, + ObjectStoreObjectError::TableDefinitionNotFound, ))?; self.store_params(table) @@ -28,7 +30,7 @@ pub trait DozerObjectStore: Clone + Send + Sync + Debug + 'static { fn store_params( &self, table: &Table, - ) -> Result, ConnectorError>; + ) -> Result, ObjectStoreConnectorError>; fn tables(&self) -> &[Table]; } @@ -53,7 +55,7 @@ impl DozerObjectStore for S3Storage { fn store_params( &self, table: &Table, - ) -> Result, ConnectorError> { + ) -> Result, ObjectStoreConnectorError> { let details = &self.details; let retry_config = RetryConfig { @@ -68,8 +70,7 @@ impl DozerObjectStore for S3Storage { .with_access_key_id(&details.access_key_id) .with_secret_access_key(&details.secret_access_key) .with_retry(retry_config) - .build() - .map_err(|e| ConnectorError::InitializationError(e.to_string()))?; + .build()?; let folder = match &table.config { TableConfig::CSV(csv_config) => csv_config.path.clone(), @@ -101,11 +102,10 @@ impl DozerObjectStore for LocalStorage { fn store_params( &self, table: &Table, - ) -> Result, ConnectorError> { + ) -> Result, ObjectStoreConnectorError> { let path = &self.details.path.as_str(); - let object_store = LocalFileSystem::new_with_prefix(path) - .map_err(|e| ConnectorError::InitializationError(e.to_string()))?; + let object_store = LocalFileSystem::new_with_prefix(path)?; let folder = match &table.config { 
TableConfig::CSV(csv_config) => csv_config.path.clone(), diff --git a/dozer-ingestion/src/connectors/object_store/connection.rs b/dozer-ingestion/object-store/src/connection.rs similarity index 100% rename from dozer-ingestion/src/connectors/object_store/connection.rs rename to dozer-ingestion/object-store/src/connection.rs diff --git a/dozer-ingestion/src/connectors/object_store/connection/validator.rs b/dozer-ingestion/object-store/src/connection/validator.rs similarity index 59% rename from dozer-ingestion/src/connectors/object_store/connection/validator.rs rename to dozer-ingestion/object-store/src/connection/validator.rs index 07d8327fcc..4193ea91e0 100644 --- a/dozer-ingestion/src/connectors/object_store/connection/validator.rs +++ b/dozer-ingestion/object-store/src/connection/validator.rs @@ -1,9 +1,10 @@ -use crate::connectors::object_store::adapters::DozerObjectStore; -use crate::connectors::TableIdentifier; -use crate::errors::{ConnectorError, ObjectStoreConnectorError, ObjectStoreTableReaderError}; use deltalake::datafusion::datasource::listing::ListingTableUrl; +use dozer_ingestion_connector::{ + dozer_types::indicatif::{ProgressBar, ProgressStyle}, + TableIdentifier, +}; -use dozer_types::indicatif::ProgressStyle; +use crate::{adapters::DozerObjectStore, ObjectStoreConnectorError}; pub enum Validations { Permissions, @@ -13,9 +14,9 @@ pub fn validate_connection( name: &str, tables: Option<&[TableIdentifier]>, config: T, -) -> Result<(), ConnectorError> { +) -> Result<(), ObjectStoreConnectorError> { let validations_order: Vec = vec![Validations::Permissions]; - let pb = dozer_types::indicatif::ProgressBar::new(validations_order.len() as u64); + let pb = ProgressBar::new(validations_order.len() as u64); pb.set_style( ProgressStyle::with_template(&format!( "[{}] {}", @@ -41,17 +42,12 @@ pub fn validate_connection( fn validate_permissions( tables: Option<&[TableIdentifier]>, config: T, -) -> Result<(), ConnectorError> { +) -> Result<(), ObjectStoreConnectorError> { if let Some(tables) = tables { for table in tables.iter() { let params = config.table_params(&table.name)?; - ListingTableUrl::parse(¶ms.table_path).map_err(|e| { - ConnectorError::ObjectStoreConnectorError( - ObjectStoreConnectorError::TableReaderError( - ObjectStoreTableReaderError::TableReadFailed(e), - ), - ) - })?; + ListingTableUrl::parse(¶ms.table_path) + .map_err(ObjectStoreConnectorError::InternalDataFusionError)?; } } diff --git a/dozer-ingestion/src/connectors/object_store/connector.rs b/dozer-ingestion/object-store/src/connector.rs similarity index 82% rename from dozer-ingestion/src/connectors/object_store/connector.rs rename to dozer-ingestion/object-store/src/connector.rs index 488845e0a0..7b2ec7d54d 100644 --- a/dozer-ingestion/src/connectors/object_store/connector.rs +++ b/dozer-ingestion/object-store/src/connector.rs @@ -1,17 +1,21 @@ -use dozer_types::models::ingestion_types::{IngestionMessage, TableConfig}; -use dozer_types::tonic::async_trait; -use futures::future::join_all; use std::collections::HashMap; -use tokio::sync::mpsc::channel; -use tokio::task::JoinSet; -use crate::connectors::object_store::adapters::DozerObjectStore; -use crate::connectors::object_store::schema_mapper; -use crate::connectors::{ - Connector, ListOrFilterColumns, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, +use dozer_ingestion_connector::dozer_types::errors::internal::BoxedError; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::{ + IngestionMessage, TableConfig, }; -use 
crate::errors::{ConnectorError, ObjectStoreConnectorError}; -use crate::ingestion::Ingestor; +use dozer_ingestion_connector::dozer_types::types::FieldType; +use dozer_ingestion_connector::futures::future::join_all; +use dozer_ingestion_connector::tokio::sync::mpsc::channel; +use dozer_ingestion_connector::tokio::task::JoinSet; +use dozer_ingestion_connector::utils::ListOrFilterColumns; +use dozer_ingestion_connector::{ + async_trait, tokio, Connector, Ingestor, SourceSchemaResult, TableIdentifier, TableInfo, + TableToIngest, +}; + +use crate::adapters::DozerObjectStore; +use crate::{schema_mapper, ObjectStoreConnectorError}; use super::connection::validator::validate_connection; use super::csv::csv_table::CsvTable; @@ -19,10 +23,6 @@ use super::delta::delta_table::DeltaTable; use super::parquet::parquet_table::ParquetTable; use super::table_watcher::TableWatcher; -use crate::errors::ObjectStoreConnectorError::RecvError; - -type ConnectorResult = Result; - #[derive(Debug)] pub struct ObjectStoreConnector { config: T, @@ -36,18 +36,18 @@ impl ObjectStoreConnector { #[async_trait] impl Connector for ObjectStoreConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { - validate_connection("object_store", None, self.config.clone()) + async fn validate_connection(&self) -> Result<(), BoxedError> { + validate_connection("object_store", None, self.config.clone()).map_err(Into::into) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { Ok(self .config .tables() @@ -56,15 +56,15 @@ impl Connector for ObjectStoreConnector { .collect()) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { - validate_connection("object_store", Some(tables), self.config.clone()) + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { + validate_connection("object_store", Some(tables), self.config.clone()).map_err(Into::into) } async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { - let schemas = get_schema_from_tables(&self.config, &tables).await?; + ) -> Result, BoxedError> { + let schemas = get_schema_from_tables(&self.config, &tables).await; let mut result = vec![]; for (table, schema) in tables.into_iter().zip(schemas) { let schema = schema?; @@ -87,7 +87,7 @@ impl Connector for ObjectStoreConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> ConnectorResult> { + ) -> Result, BoxedError> { let list_or_filter_columns = table_infos .iter() .map(|table_info| ListOrFilterColumns { @@ -96,10 +96,14 @@ impl Connector for ObjectStoreConnector { columns: Some(table_info.column_names.clone()), }) .collect::>(); - schema_mapper::get_schema(&self.config, &list_or_filter_columns).await + Ok(schema_mapper::get_schema(&self.config, &list_or_filter_columns).await) } - async fn start(&self, ingestor: &Ingestor, tables: Vec) -> ConnectorResult<()> { + async fn start( + &self, + ingestor: &Ingestor, + tables: Vec, + ) -> Result<(), BoxedError> { let (sender, mut receiver) = channel::, ObjectStoreConnectorError>>(100); // todo: increase buffer siz let ingestor_clone = ingestor.clone(); @@ -110,20 +114,15 @@ impl Connector for ObjectStoreConnector { let message = receiver .recv() .await - .ok_or(ConnectorError::ObjectStoreConnectorError(RecvError))??; + .ok_or(ObjectStoreConnectorError::RecvError)??; match 
message { None => { break; } - Some(evt) => { - ingestor_clone - .handle_message(evt) - .await - .map_err(|_| ConnectorError::IngestorError)?; - } + Some(evt) => ingestor_clone.handle_message(evt).await?, } } - Ok::<_, ConnectorError>(()) + Ok::<_, BoxedError>(()) }); // sender sending out message for pipeline @@ -208,7 +207,7 @@ impl Connector for ObjectStoreConnector { let mut csv_table = CsvTable::new(csv_config, config); csv_table.update_state = state; csv_table.watch(table_index, &table_info, sender).await?; - Ok::<_, ConnectorError>(()) + Ok::<_, ObjectStoreConnectorError>(()) }); } TableConfig::Delta(config) => { @@ -217,7 +216,7 @@ impl Connector for ObjectStoreConnector { table .watch(table_index, &table_info, sender.clone()) .await?; - Ok::<_, ConnectorError>(()) + Ok::<_, ObjectStoreConnectorError>(()) }); } TableConfig::Parquet(parquet_config) => { @@ -228,7 +227,7 @@ impl Connector for ObjectStoreConnector { table .watch(table_index, &table_info, sender.clone()) .await?; - Ok::<_, ConnectorError>(()) + Ok::<_, ObjectStoreConnectorError>(()) }); } } @@ -248,7 +247,7 @@ impl Connector for ObjectStoreConnector { async fn get_schema_from_tables( config: &impl DozerObjectStore, tables: &[TableIdentifier], -) -> Result, ConnectorError> { +) -> Vec { let table_infos = tables .iter() .map(|table| ListOrFilterColumns { diff --git a/dozer-ingestion/src/connectors/object_store/csv/csv_table.rs b/dozer-ingestion/object-store/src/csv/csv_table.rs similarity index 92% rename from dozer-ingestion/src/connectors/object_store/csv/csv_table.rs rename to dozer-ingestion/object-store/src/csv/csv_table.rs index b36a7cda25..9b207ff4fb 100644 --- a/dozer-ingestion/src/connectors/object_store/csv/csv_table.rs +++ b/dozer-ingestion/object-store/src/csv/csv_table.rs @@ -1,36 +1,28 @@ -use dozer_types::{ - chrono::{DateTime, Utc}, - models::ingestion_types::{CsvConfig, IngestionMessage}, - tracing::info, -}; use std::{collections::HashMap, path::Path, sync::Arc, time::Duration}; -use dozer_types::tonic::async_trait; -use futures::StreamExt; -use object_store::ObjectStore; -use tokio::sync::mpsc::Sender; - -use crate::{ - connectors::{ - object_store::{ - adapters::DozerObjectStore, - table_watcher::{FileInfo, TableWatcher}, - }, - TableInfo, - }, - errors::{ConnectorError, ObjectStoreConnectorError}, -}; - use deltalake::{ datafusion::{datasource::listing::ListingTableUrl, prelude::SessionContext}, Path as DeltaPath, }; -use tokio::task::JoinHandle; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + chrono::{DateTime, Utc}, + models::ingestion_types::{CsvConfig, IngestionMessage}, + tracing::info, + }, + futures::StreamExt, + tokio::{self, sync::mpsc::Sender, task::JoinHandle}, + TableInfo, +}; +use object_store::ObjectStore; -use crate::connectors::object_store::helper::is_marker_file_exist; use crate::{ - connectors::{self, object_store::helper::map_listing_options}, - errors::ObjectStoreObjectError, + adapters::DozerObjectStore, + helper::{is_marker_file_exist, map_listing_options}, + table_reader, + table_watcher::{FileInfo, TableWatcher}, + ObjectStoreConnectorError, ObjectStoreObjectError, }; const _WATCHER_INTERVAL: Duration = Duration::from_secs(1); @@ -55,7 +47,7 @@ impl CsvTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { let params = self.store_config.table_params(&table.name)?; let store = Arc::new(params.object_store); @@ -175,7 +167,7 @@ impl 
CsvTable { }) .unwrap(); - let result = connectors::object_store::table_reader::TableReader::::read( + let result = table_reader::read( table_index, ctx.clone(), file_path, @@ -203,8 +195,10 @@ impl TableWatcher for CsvTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result>)>, ConnectorError> - { + ) -> Result< + JoinHandle<(usize, HashMap>)>, + ObjectStoreConnectorError, + > { let params = self.store_config.table_params(&table.name)?; let store = Arc::new(params.object_store); @@ -324,7 +318,7 @@ impl TableWatcher for CsvTable { }) .unwrap(); - let result = connectors::object_store::table_reader::TableReader::::read( + let result = table_reader::read( table_index, ctx.clone(), file_path, @@ -349,7 +343,7 @@ impl TableWatcher for CsvTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { self.read(table_index, table, sender).await?; Ok(()) } diff --git a/dozer-ingestion/src/connectors/object_store/csv/mod.rs b/dozer-ingestion/object-store/src/csv/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/object_store/csv/mod.rs rename to dozer-ingestion/object-store/src/csv/mod.rs diff --git a/dozer-ingestion/src/connectors/object_store/delta/delta_table.rs b/dozer-ingestion/object-store/src/delta/delta_table.rs similarity index 90% rename from dozer-ingestion/src/connectors/object_store/delta/delta_table.rs rename to dozer-ingestion/object-store/src/delta/delta_table.rs index bfda973f75..3c8a2834df 100644 --- a/dozer-ingestion/src/connectors/object_store/delta/delta_table.rs +++ b/dozer-ingestion/object-store/src/delta/delta_table.rs @@ -1,27 +1,22 @@ use std::{collections::HashMap, sync::Arc}; use deltalake::{datafusion::prelude::SessionContext, s3_storage_options}; -use dozer_types::chrono::{DateTime, Utc}; -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::tonic::async_trait; -use dozer_types::{ - arrow_types::from_arrow::{map_schema_to_dozer, map_value_to_dozer_field}, - models::ingestion_types::DeltaConfig, - tracing::error, - types::{Operation, Record}, -}; -use futures::StreamExt; -use tokio::sync::mpsc::Sender; -use tokio::task::JoinHandle; - -use crate::{ - connectors::{ - object_store::{adapters::DozerObjectStore, table_watcher::TableWatcher}, - TableInfo, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + arrow_types::from_arrow::{map_schema_to_dozer, map_value_to_dozer_field}, + chrono::{DateTime, Utc}, + log::error, + models::ingestion_types::{DeltaConfig, IngestionMessage}, + types::{Operation, Record}, }, - errors::{ConnectorError, ObjectStoreConnectorError}, + futures::StreamExt, + tokio::{self, sync::mpsc::Sender, task::JoinHandle}, + TableInfo, }; +use crate::{adapters::DozerObjectStore, table_watcher::TableWatcher, ObjectStoreConnectorError}; + pub struct DeltaTable { _table_config: DeltaConfig, store_config: T, @@ -126,8 +121,10 @@ impl TableWatcher for DeltaTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result>)>, ConnectorError> - { + ) -> Result< + JoinHandle<(usize, HashMap>)>, + ObjectStoreConnectorError, + > { let params = self.store_config.table_params(&table.name)?; let ctx = SessionContext::new(); @@ -173,9 +170,7 @@ impl TableWatcher for DeltaTable { // .map_err(ConnectorError::IngestorError)?; while let Some(Ok(batch)) = data.next().await { - let dozer_schema = 
map_schema_to_dozer(&batch.schema()) - .map_err(|e| ConnectorError::InternalError(Box::new(e))) - .unwrap(); + let dozer_schema = map_schema_to_dozer(&batch.schema()).unwrap(); for row in 0..batch.num_rows() { let fields = batch .columns() @@ -243,7 +238,7 @@ impl TableWatcher for DeltaTable { _table_index: usize, _table: &TableInfo, _sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { Ok(()) } } diff --git a/dozer-ingestion/src/connectors/object_store/delta/mod.rs b/dozer-ingestion/object-store/src/delta/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/object_store/delta/mod.rs rename to dozer-ingestion/object-store/src/delta/mod.rs diff --git a/dozer-ingestion/src/connectors/object_store/helper.rs b/dozer-ingestion/object-store/src/helper.rs similarity index 89% rename from dozer-ingestion/src/connectors/object_store/helper.rs rename to dozer-ingestion/object-store/src/helper.rs index d0c0a5f7d8..b11b985197 100644 --- a/dozer-ingestion/src/connectors/object_store/helper.rs +++ b/dozer-ingestion/object-store/src/helper.rs @@ -1,11 +1,11 @@ -use crate::connectors::object_store::table_watcher::FileInfo; -use crate::errors::ObjectStoreObjectError; use deltalake::datafusion::datasource::file_format::csv::CsvFormat; use deltalake::datafusion::datasource::file_format::parquet::ParquetFormat; use deltalake::datafusion::datasource::listing::ListingOptions; -use dozer_types::models::ingestion_types::{Table, TableConfig}; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::{Table, TableConfig}; use std::sync::Arc; +use crate::{table_watcher::FileInfo, ObjectStoreObjectError}; + pub fn map_listing_options( data_fusion_table: &Table, ) -> Result { diff --git a/dozer-ingestion/object-store/src/lib.rs b/dozer-ingestion/object-store/src/lib.rs new file mode 100644 index 0000000000..af60636e92 --- /dev/null +++ b/dozer-ingestion/object-store/src/lib.rs @@ -0,0 +1,95 @@ +use deltalake::datafusion::error::DataFusionError; +use dozer_ingestion_connector::dozer_types::{ + arrow_types::errors::FromArrowError, + thiserror::{self, Error}, +}; + +mod adapters; +mod connection; +pub mod connector; +mod csv; +mod delta; +mod helper; +mod parquet; +mod schema_helper; +pub mod schema_mapper; +mod table_reader; +pub(crate) mod table_watcher; +#[cfg(test)] +mod tests; +mod watcher; + +#[derive(Error, Debug)] +pub enum ObjectStoreConnectorError { + #[error("object store error: {0}")] + ObjectStore(#[from] object_store::Error), + + #[error(transparent)] + DataFusionSchemaError(#[from] ObjectStoreSchemaError), + + #[error(transparent)] + DataFusionStorageObjectError(#[from] ObjectStoreObjectError), + + #[error("Internal data fusion error")] + InternalDataFusionError(#[source] DataFusionError), + + #[error(transparent)] + TableReaderError(#[from] ObjectStoreTableReaderError), + + #[error(transparent)] + FromArrowError(#[from] FromArrowError), + + #[error("Failed to send message on data read channel")] + SendError, + + #[error("Failed to receive message on data read channel")] + RecvError, +} + +#[derive(Error, Debug, PartialEq)] +pub enum ObjectStoreSchemaError { + #[error("Unsupported type of \"{0}\" field")] + FieldTypeNotSupported(String), + + #[error("Date time conversion failed")] + DateTimeConversionError, + + #[error("Date conversion failed")] + DateConversionError, + + #[error("Time conversion failed")] + TimeConversionError, + + #[error("Duration conversion failed")] + DurationConversionError, +} + 
+#[derive(Error, Debug)] +pub enum ObjectStoreObjectError { + #[error("Missing storage details")] + MissingStorageDetails, + + #[error("Table definition not found")] + TableDefinitionNotFound, + + #[error("Listing path {0} parsing error: {1}")] + ListingPathParsingError(String, #[source] DataFusionError), + + #[error("File format unsupported: {0}")] + FileFormatUnsupportedError(String), + + #[error("Listing path {0} error: {1}")] + ListingPathError(String, #[source] DataFusionError), +} + +#[derive(Error, Debug)] +pub enum ObjectStoreTableReaderError { + #[error("Table read failed: {0}")] + TableReadFailed(DataFusionError), + + #[error("Columns select failed: {0}")] + ColumnsSelectFailed(DataFusionError), + + #[error("Stream execution failed: {0}")] + StreamExecutionError(DataFusionError), +} diff --git a/dozer-ingestion/src/connectors/object_store/parquet/mod.rs b/dozer-ingestion/object-store/src/parquet/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/object_store/parquet/mod.rs rename to dozer-ingestion/object-store/src/parquet/mod.rs diff --git a/dozer-ingestion/src/connectors/object_store/parquet/parquet_table.rs b/dozer-ingestion/object-store/src/parquet/parquet_table.rs similarity index 92% rename from dozer-ingestion/src/connectors/object_store/parquet/parquet_table.rs rename to dozer-ingestion/object-store/src/parquet/parquet_table.rs index b9b7f7b42e..d72bdc9d25 100644 --- a/dozer-ingestion/src/connectors/object_store/parquet/parquet_table.rs +++ b/dozer-ingestion/object-store/src/parquet/parquet_table.rs @@ -1,35 +1,28 @@ -use crate::{ - connectors::{ - self, - object_store::{helper::map_listing_options, table_watcher::FileInfo}, - }, - errors::ObjectStoreObjectError, -}; +use std::{collections::HashMap, path::Path, sync::Arc, time::Duration}; + use deltalake::{ datafusion::{datasource::listing::ListingTableUrl, prelude::SessionContext}, Path as DeltaPath, }; - -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::tonic::async_trait; -use dozer_types::{ - chrono::{DateTime, Utc}, - models::ingestion_types::ParquetConfig, - tracing::info, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + chrono::{DateTime, Utc}, + models::ingestion_types::{IngestionMessage, ParquetConfig}, + tracing::info, + }, + futures::StreamExt, + tokio::{self, sync::mpsc::Sender, task::JoinHandle}, + TableInfo, }; -use futures::StreamExt; use object_store::ObjectStore; -use std::{collections::HashMap, path::Path, sync::Arc, time::Duration}; -use tokio::sync::mpsc::Sender; -use tokio::task::JoinHandle; -use crate::connectors::object_store::helper::is_marker_file_exist; use crate::{ - connectors::{ - object_store::{adapters::DozerObjectStore, table_watcher::TableWatcher}, - TableInfo, - }, - errors::{ConnectorError, ObjectStoreConnectorError}, + adapters::DozerObjectStore, + helper::{is_marker_file_exist, map_listing_options}, + table_reader, + table_watcher::{FileInfo, TableWatcher}, + ObjectStoreConnectorError, ObjectStoreObjectError, }; const _WATCHER_INTERVAL: Duration = Duration::from_secs(1); @@ -54,7 +47,7 @@ impl ParquetTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { let params = self.store_config.table_params(&table.name)?; let store = Arc::new(params.object_store); @@ -180,7 +173,7 @@ impl ParquetTable { }) .unwrap(); - let result = connectors::object_store::table_reader::TableReader::::read( + let result = 
table_reader::read( table_index, ctx.clone(), file_path, @@ -208,8 +201,10 @@ impl TableWatcher for ParquetTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result>)>, ConnectorError> - { + ) -> Result< + JoinHandle<(usize, HashMap>)>, + ObjectStoreConnectorError, + > { let params = self.store_config.table_params(&table.name)?; let store = Arc::new(params.object_store); @@ -335,7 +330,7 @@ impl TableWatcher for ParquetTable { }) .unwrap(); - let result = connectors::object_store::table_reader::TableReader::::read( + let result = table_reader::read( table_index, ctx.clone(), file_path, @@ -360,7 +355,7 @@ impl TableWatcher for ParquetTable { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { self.read(table_index, table, sender).await?; Ok(()) } diff --git a/dozer-ingestion/src/connectors/object_store/readme.md b/dozer-ingestion/object-store/src/readme.md similarity index 100% rename from dozer-ingestion/src/connectors/object_store/readme.md rename to dozer-ingestion/object-store/src/readme.md diff --git a/dozer-ingestion/src/connectors/object_store/schema_helper.rs b/dozer-ingestion/object-store/src/schema_helper.rs similarity index 86% rename from dozer-ingestion/src/connectors/object_store/schema_helper.rs rename to dozer-ingestion/object-store/src/schema_helper.rs index b15c06f19f..7f9785105b 100644 --- a/dozer-ingestion/src/connectors/object_store/schema_helper.rs +++ b/dozer-ingestion/object-store/src/schema_helper.rs @@ -1,10 +1,9 @@ use std::sync::Arc; -use crate::errors::ObjectStoreSchemaError; -use crate::errors::ObjectStoreSchemaError::FieldTypeNotSupported; use deltalake::arrow::datatypes::{DataType, Field}; +use dozer_ingestion_connector::dozer_types::types::{FieldDefinition, FieldType, SourceDefinition}; -use dozer_types::types::{FieldDefinition, FieldType, SourceDefinition}; +use crate::ObjectStoreSchemaError; pub fn map_schema_to_dozer<'a, I: Iterator>>( fields_list: I, @@ -41,7 +40,11 @@ pub fn map_schema_to_dozer<'a, I: Iterator>>( // DataType::Decimal128(_, _) => {} // DataType::Decimal256(_, _) => {} // DataType::Map(_, _) => {} - _ => return Err(FieldTypeNotSupported(field.name().clone())), + _ => { + return Err(ObjectStoreSchemaError::FieldTypeNotSupported( + field.name().clone(), + )) + } }; Ok(FieldDefinition { diff --git a/dozer-ingestion/src/connectors/object_store/schema_mapper.rs b/dozer-ingestion/object-store/src/schema_mapper.rs similarity index 84% rename from dozer-ingestion/src/connectors/object_store/schema_mapper.rs rename to dozer-ingestion/object-store/src/schema_mapper.rs index 28974c8e76..e17a12ced5 100644 --- a/dozer-ingestion/src/connectors/object_store/schema_mapper.rs +++ b/dozer-ingestion/object-store/src/schema_mapper.rs @@ -1,24 +1,25 @@ -use crate::connectors::object_store::adapters::DozerObjectStore; -use crate::connectors::object_store::schema_helper::map_schema_to_dozer; -use crate::connectors::{CdcType, ListOrFilterColumns, SourceSchema, SourceSchemaResult}; -use crate::errors::ObjectStoreObjectError::ListingPathParsingError; -use crate::errors::{ConnectorError, ObjectStoreConnectorError}; use deltalake::arrow::datatypes::SchemaRef; use deltalake::datafusion::datasource::file_format::csv::CsvFormat; use deltalake::datafusion::datasource::file_format::parquet::ParquetFormat; use deltalake::datafusion::datasource::listing::{ListingOptions, ListingTableUrl}; use 
deltalake::datafusion::prelude::SessionContext; use deltalake::s3_storage_options; -use dozer_types::log::error; -use dozer_types::models::ingestion_types::TableConfig; -use dozer_types::types::Schema; +use dozer_ingestion_connector::dozer_types::log::error; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::TableConfig; +use dozer_ingestion_connector::dozer_types::types::Schema; +use dozer_ingestion_connector::utils::ListOrFilterColumns; +use dozer_ingestion_connector::{CdcType, SourceSchema, SourceSchemaResult}; use std::collections::HashMap; use std::sync::Arc; +use crate::adapters::DozerObjectStore; +use crate::schema_helper::map_schema_to_dozer; +use crate::{ObjectStoreConnectorError, ObjectStoreObjectError, ObjectStoreSchemaError}; + pub fn map_schema( resolved_schema: SchemaRef, table: &ListOrFilterColumns, -) -> Result { +) -> Result { let fields_list = resolved_schema.fields().iter(); let fields = match &table.columns { @@ -31,7 +32,7 @@ pub fn map_schema( }; Ok(Schema { - fields: fields.map_err(ObjectStoreConnectorError::DataFusionSchemaError)?, + fields: fields?, primary_index: vec![], }) } @@ -39,12 +40,12 @@ pub fn map_schema( pub async fn get_schema( config: &impl DozerObjectStore, tables: &[ListOrFilterColumns], -) -> Result, ConnectorError> { +) -> Vec { let mut result = vec![]; for table in tables.iter() { result.push(get_table_schema(config, table).await); } - Ok(result) + result } async fn get_table_schema( @@ -79,10 +80,9 @@ async fn get_object_schema( let params = store_config.table_params(&table.name)?; let table_path = ListingTableUrl::parse(¶ms.table_path).map_err(|e| { - ObjectStoreConnectorError::DataFusionStorageObjectError(ListingPathParsingError( - params.table_path.clone(), - e, - )) + ObjectStoreConnectorError::DataFusionStorageObjectError( + ObjectStoreObjectError::ListingPathParsingError(params.table_path.clone(), e), + ) })?; let ctx = SessionContext::new(); @@ -95,7 +95,7 @@ async fn get_object_schema( .await .map_err(|e| { error!("{:?}", e); - ConnectorError::UnableToInferSchema(e) + ObjectStoreConnectorError::InternalDataFusionError(e) })?; let schema = map_schema(resolved_schema, table)?; diff --git a/dozer-ingestion/object-store/src/table_reader.rs b/dozer-ingestion/object-store/src/table_reader.rs new file mode 100644 index 0000000000..c7ba11f656 --- /dev/null +++ b/dozer-ingestion/object-store/src/table_reader.rs @@ -0,0 +1,125 @@ +use deltalake::datafusion::datasource::listing::{ + ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, +}; +use deltalake::datafusion::prelude::SessionContext; +use dozer_ingestion_connector::dozer_types::arrow_types::from_arrow::{ + map_schema_to_dozer, map_value_to_dozer_field, +}; +use dozer_ingestion_connector::dozer_types::log::error; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::IngestionMessage; +use dozer_ingestion_connector::dozer_types::types::{Operation, Record}; +use dozer_ingestion_connector::futures::StreamExt; +use dozer_ingestion_connector::tokio::sync::mpsc::Sender; +use dozer_ingestion_connector::{tokio, TableInfo}; +use std::sync::Arc; + +use crate::{ObjectStoreConnectorError, ObjectStoreTableReaderError}; + +pub struct TableReader { + pub(crate) config: T, +} + +pub async fn read( + table_index: usize, + ctx: SessionContext, + table_path: ListingTableUrl, + listing_options: ListingOptions, + table: &TableInfo, + sender: Sender, ObjectStoreConnectorError>>, +) -> Result<(), ObjectStoreConnectorError> { + let resolved_schema = listing_options + 
.infer_schema(&ctx.state(), &table_path) + .await + .map_err(ObjectStoreConnectorError::InternalDataFusionError)?; + + let fields = resolved_schema.all_fields(); + + let config = ListingTableConfig::new(table_path.clone()) + .with_listing_options(listing_options) + .with_schema(resolved_schema.clone()); + + let provider = Arc::new( + ListingTable::try_new(config) + .map_err(ObjectStoreConnectorError::InternalDataFusionError)?, + ); + + let cols: Vec<&str> = if table.column_names.is_empty() { + fields.iter().map(|f| f.name().as_str()).collect() + } else { + table.column_names.iter().map(|c| c.as_str()).collect() + }; + let data = ctx + .read_table(provider.clone()) + .map_err(|e| { + ObjectStoreConnectorError::TableReaderError( + ObjectStoreTableReaderError::TableReadFailed(e), + ) + })? + .select_columns(&cols) + .map_err(|e| { + ObjectStoreConnectorError::TableReaderError( + ObjectStoreTableReaderError::ColumnsSelectFailed(e), + ) + })? + .execute_stream() + .await + .map_err(|e| { + ObjectStoreConnectorError::TableReaderError( + ObjectStoreTableReaderError::StreamExecutionError(e), + ) + })?; + + tokio::pin!(data); + + while let Some(batch) = data.next().await { + let batch = match batch { + Ok(batch) => batch, + Err(e) => { + error!("Error reading record batch from {table_path:?}: {e}"); + continue; + } + }; + + let batch_schema = batch.schema(); + let dozer_schema = map_schema_to_dozer(&batch_schema)?; + + for row in 0..batch.num_rows() { + let fields = batch + .columns() + .iter() + .enumerate() + .map(|(col, column)| { + map_value_to_dozer_field( + column, + row, + resolved_schema.field(col).name(), + &dozer_schema, + ) + }) + .collect::, _>>()?; + + let evt = Operation::Insert { + new: Record { + values: fields, + lifetime: None, + }, + }; + + if sender + .send(Ok(Some(IngestionMessage::OperationEvent { + table_index, + op: evt, + id: None, + }))) + .await + .is_err() + { + break; + } + } + } + + // sender.send(Ok(None)).await.unwrap(); + + Ok(()) +} diff --git a/dozer-ingestion/src/connectors/object_store/table_watcher.rs b/dozer-ingestion/object-store/src/table_watcher.rs similarity index 70% rename from dozer-ingestion/src/connectors/object_store/table_watcher.rs rename to dozer-ingestion/object-store/src/table_watcher.rs index 5c1e1946d1..ffa53b8113 100644 --- a/dozer-ingestion/src/connectors/object_store/table_watcher.rs +++ b/dozer-ingestion/object-store/src/table_watcher.rs @@ -1,14 +1,16 @@ -use crate::{ - connectors::TableInfo, - errors::{ConnectorError, ObjectStoreConnectorError}, -}; use std::collections::HashMap; -use dozer_types::chrono::{DateTime, Utc}; -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::tonic::async_trait; -use tokio::sync::mpsc::Sender; -use tokio::task::JoinHandle; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + chrono::{DateTime, Utc}, + models::ingestion_types::IngestionMessage, + }, + tokio::{sync::mpsc::Sender, task::JoinHandle}, + TableInfo, +}; + +use crate::ObjectStoreConnectorError; #[derive(Debug, Eq, Clone)] pub struct FileInfo { @@ -41,7 +43,7 @@ pub trait TableWatcher { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { self.ingest(table_index, table, sender.clone()).await?; Ok(()) } @@ -51,12 +53,15 @@ pub trait TableWatcher { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result>)>, ConnectorError>; + ) -> Result< + 
JoinHandle<(usize, HashMap>)>, + ObjectStoreConnectorError, + >; async fn ingest( &self, table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError>; + ) -> Result<(), ObjectStoreConnectorError>; } diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/all_types_csv/new_sample.csv b/dozer-ingestion/object-store/src/tests/files/all_types_csv/new_sample.csv similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/all_types_csv/new_sample.csv rename to dozer-ingestion/object-store/src/tests/files/all_types_csv/new_sample.csv diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/all_types_csv/sample.csv b/dozer-ingestion/object-store/src/tests/files/all_types_csv/sample.csv similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/all_types_csv/sample.csv rename to dozer-ingestion/object-store/src/tests/files/all_types_csv/sample.csv diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/all_types_parquet/alltypes_plain.parquet b/dozer-ingestion/object-store/src/tests/files/all_types_parquet/alltypes_plain.parquet similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/all_types_parquet/alltypes_plain.parquet rename to dozer-ingestion/object-store/src/tests/files/all_types_parquet/alltypes_plain.parquet diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_csv/marker_sample.csv b/dozer-ingestion/object-store/src/tests/files/marker_csv/marker_sample.csv similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_csv/marker_sample.csv rename to dozer-ingestion/object-store/src/tests/files/marker_csv/marker_sample.csv diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_csv/marker_sample.marker b/dozer-ingestion/object-store/src/tests/files/marker_csv/marker_sample.marker similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_csv/marker_sample.marker rename to dozer-ingestion/object-store/src/tests/files/marker_csv/marker_sample.marker diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_only_one_csv/maker_new_sample.csv b/dozer-ingestion/object-store/src/tests/files/marker_only_one_csv/maker_new_sample.csv similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_only_one_csv/maker_new_sample.csv rename to dozer-ingestion/object-store/src/tests/files/marker_only_one_csv/maker_new_sample.csv diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_only_one_csv/marker_sample.csv b/dozer-ingestion/object-store/src/tests/files/marker_only_one_csv/marker_sample.csv similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_only_one_csv/marker_sample.csv rename to dozer-ingestion/object-store/src/tests/files/marker_only_one_csv/marker_sample.csv diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_only_one_csv/marker_sample.marker b/dozer-ingestion/object-store/src/tests/files/marker_only_one_csv/marker_sample.marker similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_only_one_csv/marker_sample.marker rename to dozer-ingestion/object-store/src/tests/files/marker_only_one_csv/marker_sample.marker diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_parquet/marker_plain.marker 
b/dozer-ingestion/object-store/src/tests/files/marker_parquet/marker_plain.marker similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_parquet/marker_plain.marker rename to dozer-ingestion/object-store/src/tests/files/marker_parquet/marker_plain.marker diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/marker_parquet/marker_plain.parquet b/dozer-ingestion/object-store/src/tests/files/marker_parquet/marker_plain.parquet similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/marker_parquet/marker_plain.parquet rename to dozer-ingestion/object-store/src/tests/files/marker_parquet/marker_plain.parquet diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/no_marker_csv/marker_sample.csv b/dozer-ingestion/object-store/src/tests/files/no_marker_csv/marker_sample.csv similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/no_marker_csv/marker_sample.csv rename to dozer-ingestion/object-store/src/tests/files/no_marker_csv/marker_sample.csv diff --git a/dozer-ingestion/src/connectors/object_store/tests/files/no_marker_parquet/marker_plain.parquet b/dozer-ingestion/object-store/src/tests/files/no_marker_parquet/marker_plain.parquet similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/files/no_marker_parquet/marker_plain.parquet rename to dozer-ingestion/object-store/src/tests/files/no_marker_parquet/marker_plain.parquet diff --git a/dozer-ingestion/src/connectors/object_store/tests/local_storage_tests.rs b/dozer-ingestion/object-store/src/tests/local_storage_tests.rs similarity index 96% rename from dozer-ingestion/src/connectors/object_store/tests/local_storage_tests.rs rename to dozer-ingestion/object-store/src/tests/local_storage_tests.rs index 5aa4a6c347..ca749f9f64 100644 --- a/dozer-ingestion/src/connectors/object_store/tests/local_storage_tests.rs +++ b/dozer-ingestion/object-store/src/tests/local_storage_tests.rs @@ -1,10 +1,13 @@ -use crate::connectors::object_store::connector::ObjectStoreConnector; -use crate::connectors::Connector; -use crate::test_util::create_runtime_and_spawn_connector_all_tables; -use dozer_types::models::ingestion_types::IngestionMessage; - -use crate::connectors::object_store::tests::test_utils::get_local_storage_config; -use dozer_types::types::{Field, FieldType, Operation}; +use dozer_ingestion_connector::{ + dozer_types::{ + models::ingestion_types::IngestionMessage, + types::{Field, FieldType, Operation}, + }, + test_util::create_runtime_and_spawn_connector_all_tables, + tokio, Connector, +}; + +use crate::{connector::ObjectStoreConnector, tests::test_utils::get_local_storage_config}; #[macro_export] macro_rules! 
test_type_conversion { diff --git a/dozer-ingestion/src/connectors/object_store/tests/mod.rs b/dozer-ingestion/object-store/src/tests/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/object_store/tests/mod.rs rename to dozer-ingestion/object-store/src/tests/mod.rs diff --git a/dozer-ingestion/src/connectors/object_store/tests/test_utils.rs b/dozer-ingestion/object-store/src/tests/test_utils.rs similarity index 95% rename from dozer-ingestion/src/connectors/object_store/tests/test_utils.rs rename to dozer-ingestion/object-store/src/tests/test_utils.rs index fa4cd286fb..ec680b44c7 100644 --- a/dozer-ingestion/src/connectors/object_store/tests/test_utils.rs +++ b/dozer-ingestion/object-store/src/tests/test_utils.rs @@ -1,10 +1,10 @@ -use dozer_types::models::ingestion_types::{ +use dozer_ingestion_connector::dozer_types::models::ingestion_types::{ CsvConfig, LocalDetails, LocalStorage, ParquetConfig, Table, TableConfig, }; use std::path::PathBuf; pub fn get_local_storage_config(typ: &str, prefix: &str) -> LocalStorage { - let p = PathBuf::from("src/connectors/object_store/tests/files".to_string()); + let p = PathBuf::from("src/tests/files".to_string()); match typ { "parquet" => match prefix { "" => LocalStorage { diff --git a/dozer-ingestion/src/connectors/object_store/watcher.rs b/dozer-ingestion/object-store/src/watcher.rs similarity index 90% rename from dozer-ingestion/src/connectors/object_store/watcher.rs rename to dozer-ingestion/object-store/src/watcher.rs index fb3b086fb1..164508e324 100644 --- a/dozer-ingestion/src/connectors/object_store/watcher.rs +++ b/dozer-ingestion/object-store/src/watcher.rs @@ -1,22 +1,19 @@ +use std::path::Path; use std::{collections::HashMap, sync::Arc, time::Duration}; use deltalake::{ datafusion::{datasource::listing::ListingTableUrl, prelude::SessionContext}, Path as DeltaPath, }; -use dozer_types::tonic::async_trait; -use dozer_types::tracing::info; -use futures::StreamExt; +use dozer_ingestion_connector::dozer_types::log::info; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::IngestionMessage; +use dozer_ingestion_connector::futures::StreamExt; +use dozer_ingestion_connector::tokio::sync::mpsc::Sender; +use dozer_ingestion_connector::{async_trait, tokio, TableInfo}; use object_store::ObjectStore; -use tokio::sync::mpsc::Sender; - -use crate::{ - connectors::{object_store::helper::map_listing_options, TableInfo}, - errors::{ConnectorError, ObjectStoreConnectorError, ObjectStoreObjectError}, -}; -use dozer_types::models::ingestion_types::IngestionMessage; -use std::path::Path; +use crate::helper::map_listing_options; +use crate::{table_reader, ObjectStoreConnectorError, ObjectStoreObjectError}; use super::{adapters::DozerObjectStore, table_reader::TableReader}; @@ -53,7 +50,7 @@ pub trait Watcher { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError>; + ) -> Result<(), ObjectStoreConnectorError>; } #[async_trait] @@ -63,7 +60,7 @@ impl Watcher for TableReader { table_index: usize, table: &TableInfo, sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), ObjectStoreConnectorError> { let params = self.config.table_params(&table.name)?; let store = Arc::new(params.object_store); @@ -147,7 +144,7 @@ impl Watcher for TableReader { }) .unwrap(); - let result = Self::read( + let result = table_reader::read( table_index, ctx.clone(), file_path, diff --git a/dozer-ingestion/postgres/Cargo.toml 
b/dozer-ingestion/postgres/Cargo.toml new file mode 100644 index 0000000000..55bd95d270 --- /dev/null +++ b/dozer-ingestion/postgres/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "dozer-ingestion-postgres" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +postgres-protocol = "0.6.4" +postgres-types = { version = "0.2.4", features = [ + "with-serde_json-1", + "with-uuid-1", +] } +tokio-postgres = { version = "0.7.7", features = [ + "with-chrono-0_4", + "with-geo-types-0_7", + "with-uuid-1", +] } +uuid = { version = "1.3.1", features = ["serde", "v4"] } +rustls = { version = "0.21.7", features = ["dangerous_configuration"] } +tokio-postgres-rustls = "0.10.0" +rustls-native-certs = "0.6.3" +regex = "1" +rand = "0.8.5" + +[dev-dependencies] +serial_test = "1.0.0" diff --git a/dozer-ingestion/src/connectors/postgres/connection.rs b/dozer-ingestion/postgres/src/connection.rs similarity index 100% rename from dozer-ingestion/src/connectors/postgres/connection.rs rename to dozer-ingestion/postgres/src/connection.rs diff --git a/dozer-ingestion/src/connectors/postgres/connection/client.rs b/dozer-ingestion/postgres/src/connection/client.rs similarity index 96% rename from dozer-ingestion/src/connectors/postgres/connection/client.rs rename to dozer-ingestion/postgres/src/connection/client.rs index 4d0869c95f..ec59b0952b 100644 --- a/dozer-ingestion/src/connectors/postgres/connection/client.rs +++ b/dozer-ingestion/postgres/src/connection/client.rs @@ -2,17 +2,21 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; -use futures::future::BoxFuture; -use futures::lock::Mutex; -use futures::stream::BoxStream; -use futures::Stream; +use dozer_ingestion_connector::{ + dozer_types::{self, bytes}, + futures::future::BoxFuture, + futures::stream::BoxStream, + futures::Stream, + retry_on_network_failure, + tokio::{self, sync::Mutex}, +}; use tokio_postgres::types::ToSql; use tokio_postgres::{Config, CopyBothDuplex, Row, SimpleQueryMessage, Statement, ToStatement}; +use crate::connection::helper::is_network_failure; +use crate::PostgresConnectorError; + use super::helper; -use crate::connectors::postgres::connection::helper::is_network_failure; -use crate::errors::PostgresConnectorError; -use crate::retry_on_network_failure; #[derive(Debug)] pub struct Client { diff --git a/dozer-ingestion/src/connectors/postgres/connection/helper.rs b/dozer-ingestion/postgres/src/connection/helper.rs similarity index 88% rename from dozer-ingestion/src/connectors/postgres/connection/helper.rs rename to dozer-ingestion/postgres/src/connection/helper.rs index 8cce2d7b85..c51c4ffd82 100644 --- a/dozer-ingestion/src/connectors/postgres/connection/helper.rs +++ b/dozer-ingestion/postgres/src/connection/helper.rs @@ -1,10 +1,14 @@ +use crate::PostgresConnectorError; + use super::client::Client; -use crate::errors::ConnectorError::WrongConnectionConfiguration; -use crate::errors::PostgresConnectorError::InvalidSslError; -use crate::errors::{ConnectorError, PostgresConnectorError}; -use crate::retry_on_network_failure; -use dozer_types::log::{debug, error}; -use dozer_types::models::connection::ConnectionConfig; +use dozer_ingestion_connector::{ + dozer_types::{ + self, + log::{debug, error}, + models::connection::ConnectionConfig, + }, + retry_on_network_failure, tokio, +}; use rustls::client::{ServerCertVerified, ServerCertVerifier}; use 
rustls::{Certificate, Error, ServerName}; use std::sync::Arc; @@ -14,11 +18,11 @@ use tokio_postgres::{Connection, NoTls, Socket}; pub fn map_connection_config( auth_details: &ConnectionConfig, -) -> Result { +) -> Result { if let ConnectionConfig::Postgres(postgres) = auth_details { let config_replenished = match postgres.replenish() { Ok(conf) => conf, - Err(e) => return Err(WrongConnectionConfiguration(e)), + Err(e) => return Err(PostgresConnectorError::WrongConnectionConfiguration(e)), }; let mut config = tokio_postgres::Config::new(); config @@ -30,9 +34,7 @@ pub fn map_connection_config( .ssl_mode(config_replenished.sslmode); Ok(config) } else { - Err(ConnectorError::UnavailableConnectionConfiguration( - "Unable to map connection config".to_string(), - )) + panic!("Postgres config was expected") } } @@ -112,7 +114,7 @@ pub async fn connect(config: tokio_postgres::Config) -> Result Err(InvalidSslError(ssl_mode)), + ssl_mode => Err(PostgresConnectorError::InvalidSslError(ssl_mode)), } } diff --git a/dozer-ingestion/src/connectors/postgres/connection/tables_validator.rs b/dozer-ingestion/postgres/src/connection/tables_validator.rs similarity index 83% rename from dozer-ingestion/src/connectors/postgres/connection/tables_validator.rs rename to dozer-ingestion/postgres/src/connection/tables_validator.rs index 0394ca8470..562a2c77ac 100644 --- a/dozer-ingestion/src/connectors/postgres/connection/tables_validator.rs +++ b/dozer-ingestion/postgres/src/connection/tables_validator.rs @@ -1,12 +1,10 @@ -use crate::connectors::postgres::schema::helper::DEFAULT_SCHEMA_NAME; -use crate::connectors::ListOrFilterColumns; -use crate::errors::PostgresConnectorError::{ColumnsNotFound, InvalidQueryError, TablesNotFound}; -use crate::errors::PostgresSchemaError; use std::collections::hash_map::Entry; use std::collections::HashMap; -use crate::errors::PostgresConnectorError; -use crate::errors::PostgresSchemaError::TableTypeNotFound; +use dozer_ingestion_connector::utils::ListOrFilterColumns; + +use crate::schema::helper::DEFAULT_SCHEMA_NAME; +use crate::{PostgresConnectorError, PostgresSchemaError}; use super::client::Client; @@ -50,13 +48,19 @@ impl<'a> TablesValidator<'a> { &[&self.tables_identifiers], ) .await - .map_err(InvalidQueryError)?; + .map_err(PostgresConnectorError::InvalidQueryError)?; let mut tables = HashMap::new(); for r in result.iter() { - let schema_name: String = r.try_get(0).map_err(InvalidQueryError)?; - let table_name: String = r.try_get(1).map_err(InvalidQueryError)?; - let table_type: Option = r.try_get(2).map_err(InvalidQueryError)?; + let schema_name: String = r + .try_get(0) + .map_err(PostgresConnectorError::InvalidQueryError)?; + let table_name: String = r + .try_get(1) + .map_err(PostgresConnectorError::InvalidQueryError)?; + let table_type: Option = r + .try_get(2) + .map_err(PostgresConnectorError::InvalidQueryError)?; tables.insert((schema_name, table_name), table_type); } @@ -76,7 +80,7 @@ impl<'a> TablesValidator<'a> { &[&self.tables_identifiers], ) .await - .map_err(InvalidQueryError)?; + .map_err(PostgresConnectorError::InvalidQueryError)?; let mut table_columns_map: HashMap> = HashMap::new(); tables_columns.iter().for_each(|r| { @@ -120,12 +124,14 @@ impl<'a> TablesValidator<'a> { }) .collect(); - return Err(ColumnsNotFound(error_columns.join(", "))); + return Err(PostgresConnectorError::ColumnsNotFound( + error_columns.join(", "), + )); } let missing_tables = self.find_missing_tables(tables)?; if !missing_tables.is_empty() { - return 
Err(TablesNotFound(missing_tables)); + return Err(PostgresConnectorError::TablesNotFound(missing_tables)); } Ok(()) @@ -140,7 +146,7 @@ impl<'a> TablesValidator<'a> { let table_info = self .tables .get(key) - .ok_or(TablesNotFound(vec![key.clone()]))?; + .ok_or(PostgresConnectorError::TablesNotFound(vec![key.clone()]))?; if let Some(column_names) = table_info.columns.clone() { for c in column_names { @@ -169,7 +175,7 @@ impl<'a> TablesValidator<'a> { |table_type| { table_type .as_ref() - .map_or(Err(TableTypeNotFound), |typ| { + .map_or(Err(PostgresSchemaError::TableTypeNotFound), |typ| { if typ.clone() != *"BASE TABLE" { Err(PostgresSchemaError::UnsupportedTableType( typ.clone(), diff --git a/dozer-ingestion/src/connectors/postgres/connection/validator.rs b/dozer-ingestion/postgres/src/connection/validator.rs similarity index 52% rename from dozer-ingestion/src/connectors/postgres/connection/validator.rs rename to dozer-ingestion/postgres/src/connection/validator.rs index 5b27d1b3be..af1416e828 100644 --- a/dozer-ingestion/src/connectors/postgres/connection/validator.rs +++ b/dozer-ingestion/postgres/src/connection/validator.rs @@ -1,17 +1,10 @@ -use crate::connectors::postgres::connector::ReplicationSlotInfo; -use crate::connectors::ListOrFilterColumns; - -use crate::errors::PostgresConnectorError::{ - ColumnNameNotValid, ConnectionFailure, InvalidQueryError, MissingTableInReplicationSlot, - NoAvailableSlotsError, ReplicationIsNotAvailableForUserError, SlotIsInUseError, - SlotNotExistError, StartLsnIsBeforeLastFlushedLsnError, TableNameNotValid, - WALLevelIsNotCorrect, -}; +use crate::{connector::ReplicationSlotInfo, PostgresConnectorError}; -use super::client::Client; -use crate::connectors::postgres::connection::tables_validator::TablesValidator; -use crate::errors::PostgresConnectorError; -use dozer_types::indicatif::ProgressStyle; +use super::{client::Client, tables_validator::TablesValidator}; +use dozer_ingestion_connector::{ + dozer_types::indicatif::{ProgressBar, ProgressStyle}, + utils::ListOrFilterColumns, +}; use postgres_types::PgLsn; use regex::Regex; @@ -36,7 +29,7 @@ pub async fn validate_connection( Validations::WALLevel, Validations::Slot, ]; - let pb = dozer_types::indicatif::ProgressBar::new(validations_order.len() as u64); + let pb = ProgressBar::new(validations_order.len() as u64); pb.set_style( ProgressStyle::with_template(&format!( "[{}] {}", @@ -79,7 +72,7 @@ async fn validate_details(client: &mut Client) -> Result<(), PostgresConnectorEr client .simple_query("SELECT version()") .await - .map_err(ConnectionFailure)?; + .map_err(PostgresConnectorError::ConnectionFailure)?; Ok(()) } @@ -101,7 +94,7 @@ async fn validate_user(client: &mut Client) -> Result<(), PostgresConnectorError &[], ) .await - .map_or(Err(ReplicationIsNotAvailableForUserError), |row| { + .map_or(Err(PostgresConnectorError::ReplicationIsNotAvailableForUserError), |row| { let can_login: bool = row.get("can_login"); let is_replication_role: bool = row.get("is_replication_role"); let is_aws_replication_role: bool = row.get("is_aws_replication_role"); @@ -109,7 +102,7 @@ async fn validate_user(client: &mut Client) -> Result<(), PostgresConnectorError if can_login && (is_replication_role || is_aws_replication_role) { Ok(()) } else { - Err(ReplicationIsNotAvailableForUserError) + Err(PostgresConnectorError::ReplicationIsNotAvailableForUserError) } }) } @@ -118,16 +111,16 @@ async fn validate_wal_level(client: &mut Client) -> Result<(), PostgresConnector let result = client .query_one("SHOW wal_level", 
&[]) .await - .map_err(|_e| WALLevelIsNotCorrect())?; + .map_err(|_e| PostgresConnectorError::WALLevelIsNotCorrect())?; let wal_level: Result = result.try_get(0); wal_level.map_or_else( - |e| Err(InvalidQueryError(e)), + |e| Err(PostgresConnectorError::InvalidQueryError(e)), |level| { if level == "logical" { Ok(()) } else { - Err(WALLevelIsNotCorrect()) + Err(PostgresConnectorError::WALLevelIsNotCorrect()) } }, ) @@ -139,7 +132,7 @@ fn validate_tables_names( let table_regex = Regex::new(r"^([[:lower:]_][[:alnum:]_]*)$").unwrap(); for t in table_info { if !table_regex.is_match(&t.name) { - return Err(TableNameNotValid(t.name.clone())); + return Err(PostgresConnectorError::TableNameNotValid(t.name.clone())); } } @@ -154,7 +147,7 @@ fn validate_columns_names( if let Some(columns) = &t.columns { for column in columns { if !column_name_regex.is_match(column) { - return Err(ColumnNameNotValid(column.clone())); + return Err(PostgresConnectorError::ColumnNameNotValid(column.clone())); } } } @@ -187,17 +180,23 @@ pub async fn validate_slot( &[&replication_info.name], ) .await - .map_err(InvalidQueryError)?; + .map_err(PostgresConnectorError::InvalidQueryError)?; - let is_already_running: bool = result.try_get(0).map_err(InvalidQueryError)?; + let is_already_running: bool = result + .try_get(0) + .map_err(PostgresConnectorError::InvalidQueryError)?; if is_already_running { - return Err(SlotIsInUseError(replication_info.name.clone())); + return Err(PostgresConnectorError::SlotIsInUseError( + replication_info.name.clone(), + )); } - let flush_lsn: PgLsn = result.try_get(1).map_err(InvalidQueryError)?; + let flush_lsn: PgLsn = result + .try_get(1) + .map_err(PostgresConnectorError::InvalidQueryError)?; if flush_lsn.gt(&replication_info.start_lsn) { - return Err(StartLsnIsBeforeLastFlushedLsnError( + return Err(PostgresConnectorError::StartLsnIsBeforeLastFlushedLsnError( flush_lsn.to_string(), replication_info.start_lsn.to_string(), )); @@ -213,7 +212,9 @@ pub async fn validate_slot( &[&replication_info.name], ) .await - .map_err(|_e| SlotNotExistError(replication_info.name.clone()))?; + .map_err(|_e| { + PostgresConnectorError::SlotNotExistError(replication_info.name.clone()) + })?; let mut publication_tables: Vec = vec![]; for row in result { @@ -222,7 +223,9 @@ pub async fn validate_slot( for t in tables_list { if !publication_tables.contains(&t.name) { - return Err(MissingTableInReplicationSlot(t.name.clone())); + return Err(PostgresConnectorError::MissingTableInReplicationSlot( + t.name.clone(), + )); } } } @@ -234,20 +237,24 @@ async fn validate_limit_of_replications(client: &mut Client) -> Result<(), Postg let slots_limit_result = client .query_one("SHOW max_replication_slots", &[]) .await - .map_err(ConnectionFailure)?; + .map_err(PostgresConnectorError::ConnectionFailure)?; - let slots_limit_str: String = slots_limit_result.try_get(0).map_err(InvalidQueryError)?; + let slots_limit_str: String = slots_limit_result + .try_get(0) + .map_err(PostgresConnectorError::InvalidQueryError)?; let slots_limit: i64 = slots_limit_str.parse().unwrap(); let used_slots_result = client .query_one("SELECT COUNT(*) FROM pg_replication_slots;", &[]) .await - .map_err(ConnectionFailure)?; + .map_err(PostgresConnectorError::ConnectionFailure)?; - let used_slots: i64 = used_slots_result.try_get(0).map_err(InvalidQueryError)?; + let used_slots: i64 = used_slots_result + .try_get(0) + .map_err(PostgresConnectorError::InvalidQueryError)?; if used_slots == slots_limit { - Err(NoAvailableSlotsError) + 
Err(PostgresConnectorError::NoAvailableSlotsError) } else { Ok(()) } @@ -255,14 +262,16 @@ async fn validate_limit_of_replications(client: &mut Client) -> Result<(), Postg #[cfg(test)] mod tests { + use crate::{ + connection::helper::{connect, map_connection_config}, + test_utils::load_test_connection_config, + tests::client::TestPostgresClient, + PostgresSchemaError, + }; + use super::*; - use crate::connectors::postgres::connection::helper::connect; - use crate::connectors::postgres::connector::ReplicationSlotInfo; - use crate::connectors::postgres::test_utils::{get_client, get_config}; - use crate::errors::PostgresConnectorError; - use crate::errors::PostgresSchemaError::UnsupportedTableType; - use crate::test_util::run_connector_test; + use dozer_ingestion_connector::tokio; use postgres_types::PgLsn; use rand::Rng; use serial_test::serial; @@ -272,31 +281,29 @@ mod tests { #[ignore] #[serial] async fn test_connector_validation_connection_fail_to_connect() { - run_connector_test("postgres", |app_config| async move { - let mut config = get_config(app_config); - config.dbname("not_existing"); - - let result = validate_connection("pg_test_conn", config, None, None).await; - - assert!(result.is_err()); - - match result { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert!(matches!(e, PostgresConnectorError::ConnectionFailure(_))); - - if let PostgresConnectorError::ConnectionFailure(msg) = e { - assert_eq!( - msg.to_string(), - "db error: FATAL: database \"not_existing\" does not exist" - ); - } else { - panic!("Unexpected error occurred"); - } + let config = load_test_connection_config().await; + let mut config = map_connection_config(&config).unwrap(); + config.dbname("not_existing"); + + let result = validate_connection("pg_test_conn", config, None, None).await; + + assert!(result.is_err()); + + match result { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + assert!(matches!(e, PostgresConnectorError::ConnectionFailure(_))); + + if let PostgresConnectorError::ConnectionFailure(msg) = e { + assert_eq!( + msg.to_string(), + "db error: FATAL: database \"not_existing\" does not exist" + ); + } else { + panic!("Unexpected error occurred"); } } - }) - .await + } } // #[test] @@ -343,112 +350,107 @@ mod tests { #[ignore] #[serial] async fn test_connector_validation_connection_requested_tables_not_exist() { - run_connector_test("postgres", |app_config| async move { - let config = get_config(app_config); - let mut client = connect(config.clone()).await.unwrap(); - - client - .simple_query("DROP TABLE IF EXISTS not_existing") - .await - .expect("User creation failed"); + let config = load_test_connection_config().await; + let config = map_connection_config(&config).unwrap(); + let mut client = connect(config.clone()).await.unwrap(); - let tables = vec![ListOrFilterColumns { - name: "not_existing".to_string(), - schema: Some("public".to_string()), - columns: None, - }]; - let result = validate_connection("pg_test_conn", config, Some(&tables), None).await; - - assert!(result.is_err()); - - match result { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert!(matches!(e, PostgresConnectorError::TablesNotFound(_))); - - if let PostgresConnectorError::TablesNotFound(msg) = e { - assert_eq!( - msg, - vec![("public".to_string(), "not_existing".to_string())] - ); - } else { - panic!("Unexpected error occurred"); - } + client + .simple_query("DROP TABLE IF EXISTS not_existing") + .await + .expect("User creation failed"); + + let tables = vec![ListOrFilterColumns { 
+ name: "not_existing".to_string(), + schema: Some("public".to_string()), + columns: None, + }]; + let result = validate_connection("pg_test_conn", config, Some(&tables), None).await; + + assert!(result.is_err()); + + match result { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + assert!(matches!(e, PostgresConnectorError::TablesNotFound(_))); + + if let PostgresConnectorError::TablesNotFound(msg) = e { + assert_eq!( + msg, + vec![("public".to_string(), "not_existing".to_string())] + ); + } else { + panic!("Unexpected error occurred"); } } - }) - .await + } } #[tokio::test] #[ignore] #[serial] async fn test_connector_validation_connection_requested_columns_not_exist() { - run_connector_test("postgres", |app_config| async move { - let config = get_config(app_config); - let mut client = connect(config.clone()).await.unwrap(); + let config = load_test_connection_config().await; + let config = map_connection_config(&config).unwrap(); + let mut client = connect(config.clone()).await.unwrap(); - client + client .simple_query("CREATE TABLE IF NOT EXISTS existing(column_1 serial PRIMARY KEY, column_2 serial);") .await .expect("User creation failed"); - let columns = vec![ - String::from("column_not_existing_1"), - String::from("column_not_existing_2"), - ]; + let columns = vec![ + String::from("column_not_existing_1"), + String::from("column_not_existing_2"), + ]; - let tables = vec![ListOrFilterColumns { - name: "existing".to_string(), - schema: Some("public".to_string()), - columns: Some(columns), - }]; + let tables = vec![ListOrFilterColumns { + name: "existing".to_string(), + schema: Some("public".to_string()), + columns: Some(columns), + }]; - let result = validate_connection("pg_test_conn", config, Some(&tables), None).await; + let result = validate_connection("pg_test_conn", config, Some(&tables), None).await; - assert!(result.is_err()); + assert!(result.is_err()); - match result { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert!(matches!(e, PostgresConnectorError::ColumnsNotFound(_))); + match result { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + assert!(matches!(e, PostgresConnectorError::ColumnsNotFound(_))); - if let PostgresConnectorError::ColumnsNotFound(msg) = e { - assert_eq!(msg, "column_not_existing_1 in public.existing table, column_not_existing_2 in public.existing table"); - } else { - panic!("Unexpected error occurred"); - } + if let PostgresConnectorError::ColumnsNotFound(msg) = e { + assert_eq!(msg, "column_not_existing_1 in public.existing table, column_not_existing_2 in public.existing table"); + } else { + panic!("Unexpected error occurred"); } } - }).await + } } #[tokio::test] #[ignore] #[serial] async fn test_connector_validation_connection_replication_slot_not_exist() { - run_connector_test("postgres", |app_config| async move { - let config = get_config(app_config); + let config = load_test_connection_config().await; + let config = map_connection_config(&config).unwrap(); - let new_slot = "not_existing_slot"; - let replication_info = ReplicationSlotInfo { - name: new_slot.to_string(), - start_lsn: PgLsn::from(0), - }; + let new_slot = "not_existing_slot"; + let replication_info = ReplicationSlotInfo { + name: new_slot.to_string(), + start_lsn: PgLsn::from(0), + }; - let result = - validate_connection("pg_test_conn", config, None, Some(replication_info)).await; + let result = + validate_connection("pg_test_conn", config, None, Some(replication_info)).await; - assert!(result.is_err()); + assert!(result.is_err()); - match 
result { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert!(matches!(e, PostgresConnectorError::InvalidQueryError(_))); - } + match result { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + assert!(matches!(e, PostgresConnectorError::InvalidQueryError(_))); } - }) - .await + } } #[test] @@ -490,91 +492,87 @@ mod tests { #[ignore] #[serial] async fn test_connector_validation_connection_valid_number_of_replication_slots() { - run_connector_test("postgres", |app_config| async move { - let config = get_config(app_config); - let mut client = connect(config.clone()).await.unwrap(); + let config = load_test_connection_config().await; + let config = map_connection_config(&config).unwrap(); + let mut client = connect(config.clone()).await.unwrap(); - let slots_limit_result = client - .query_one("SHOW max_replication_slots", &[]) - .await - .unwrap(); + let slots_limit_result = client + .query_one("SHOW max_replication_slots", &[]) + .await + .unwrap(); + + let slots_limit_str: String = slots_limit_result.try_get(0).unwrap(); + let slots_limit: i64 = slots_limit_str.parse().unwrap(); - let slots_limit_str: String = slots_limit_result.try_get(0).unwrap(); - let slots_limit: i64 = slots_limit_str.parse().unwrap(); + let used_slots_result = client + .query_one("SELECT COUNT(*) FROM pg_replication_slots;", &[]) + .await + .unwrap(); - let used_slots_result = client - .query_one("SELECT COUNT(*) FROM pg_replication_slots;", &[]) + let used_slots: i64 = used_slots_result.try_get(0).unwrap(); + + let range = used_slots..slots_limit - 1; + for n in range { + let slot_name = format!("slot_{n}"); + client + .query( + r#"SELECT pg_create_logical_replication_slot($1, 'pgoutput');"#, + &[&slot_name], + ) .await .unwrap(); + } - let used_slots: i64 = used_slots_result.try_get(0).unwrap(); - - let range = used_slots..slots_limit - 1; - for n in range { - let slot_name = format!("slot_{n}"); - client - .query( - r#"SELECT pg_create_logical_replication_slot($1, 'pgoutput');"#, - &[&slot_name], - ) - .await - .unwrap(); - } - - // One replication slot is available - let result = validate_connection("pg_test_conn", config, None, None).await; - assert!(result.is_ok()); - }) - .await + // One replication slot is available + let result = validate_connection("pg_test_conn", config, None, None).await; + assert!(result.is_ok()); } #[tokio::test] #[ignore] #[serial] async fn test_connector_validation_connection_not_any_replication_slot_availble() { - run_connector_test("postgres", |app_config| async move { - let config = get_config(app_config); - let mut client = connect(config.clone()).await.unwrap(); + let config = load_test_connection_config().await; + let config = map_connection_config(&config).unwrap(); + let mut client = connect(config.clone()).await.unwrap(); - let slots_limit_result = client - .query_one("SHOW max_replication_slots", &[]) - .await - .unwrap(); + let slots_limit_result = client + .query_one("SHOW max_replication_slots", &[]) + .await + .unwrap(); - let slots_limit_str: String = slots_limit_result.try_get(0).unwrap(); - let slots_limit: i64 = slots_limit_str.parse().unwrap(); + let slots_limit_str: String = slots_limit_result.try_get(0).unwrap(); + let slots_limit: i64 = slots_limit_str.parse().unwrap(); - let used_slots_result = client - .query_one("SELECT COUNT(*) FROM pg_replication_slots;", &[]) + let used_slots_result = client + .query_one("SELECT COUNT(*) FROM pg_replication_slots;", &[]) + .await + .unwrap(); + + let used_slots: i64 = 
used_slots_result.try_get(0).unwrap(); + + let range = used_slots..slots_limit; + for n in range { + let slot_name = format!("slot_{n}"); + client + .query( + r#"SELECT pg_create_logical_replication_slot($1, 'pgoutput');"#, + &[&slot_name], + ) .await .unwrap(); + } - let used_slots: i64 = used_slots_result.try_get(0).unwrap(); - - let range = used_slots..slots_limit; - for n in range { - let slot_name = format!("slot_{n}"); - client - .query( - r#"SELECT pg_create_logical_replication_slot($1, 'pgoutput');"#, - &[&slot_name], - ) - .await - .unwrap(); - } - - let result = validate_connection("pg_test_conn", config, None, None).await; + let result = validate_connection("pg_test_conn", config, None, None).await; - assert!(result.is_err()); + assert!(result.is_err()); - match result { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert!(matches!(e, PostgresConnectorError::NoAvailableSlotsError)); - } + match result { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + assert!(matches!(e, PostgresConnectorError::NoAvailableSlotsError)); } - }) - .await + } } #[test] @@ -621,52 +619,50 @@ mod tests { #[ignore] #[serial] async fn test_connector_return_error_on_view_in_table_validation() { - run_connector_test("postgres", |app_config| async move { - let mut client = get_client(app_config.clone()).await; + let config = load_test_connection_config().await; + let mut client = TestPostgresClient::new(&config).await; - let mut rng = rand::thread_rng(); + let mut rng = rand::thread_rng(); - let schema = format!("schema_helper_test_{}", rng.gen::()); - let table_name = format!("products_test_{}", rng.gen::()); - let view_name = format!("products_view_test_{}", rng.gen::()); + let schema = format!("schema_helper_test_{}", rng.gen::()); + let table_name = format!("products_test_{}", rng.gen::()); + let view_name = format!("products_view_test_{}", rng.gen::()); - client.create_schema(&schema).await; - client.create_simple_table(&schema, &table_name).await; - client.create_view(&schema, &table_name, &view_name).await; + client.create_schema(&schema).await; + client.create_simple_table(&schema, &table_name).await; + client.create_view(&schema, &table_name, &view_name).await; - let config = get_config(app_config); - let mut pg_client = connect(config).await.unwrap(); + let config = map_connection_config(&config).unwrap(); + let mut pg_client = connect(config).await.unwrap(); - let result = validate_tables( - &mut pg_client, - &vec![ListOrFilterColumns { - name: table_name, - schema: Some(schema.clone()), - columns: None, - }], - ) - .await; + let result = validate_tables( + &mut pg_client, + &vec![ListOrFilterColumns { + name: table_name, + schema: Some(schema.clone()), + columns: None, + }], + ) + .await; - assert!(result.is_ok()); + assert!(result.is_ok()); - let result = validate_tables( - &mut pg_client, - &vec![ListOrFilterColumns { - name: view_name, - schema: Some(schema), - columns: None, - }], - ) - .await; - - assert!(result.is_err()); - assert!(matches!( - result, - Err(PostgresConnectorError::PostgresSchemaError( - UnsupportedTableType(_, _) - )) - )); - }) - .await + let result = validate_tables( + &mut pg_client, + &vec![ListOrFilterColumns { + name: view_name, + schema: Some(schema), + columns: None, + }], + ) + .await; + + assert!(result.is_err()); + assert!(matches!( + result, + Err(PostgresConnectorError::PostgresSchemaError( + PostgresSchemaError::UnsupportedTableType(_, _) + )) + )); } } diff --git a/dozer-ingestion/src/connectors/postgres/connector.rs 
b/dozer-ingestion/postgres/src/connector.rs similarity index 84% rename from dozer-ingestion/src/connectors/postgres/connector.rs rename to dozer-ingestion/postgres/src/connector.rs index ee6f47035d..a543d474c3 100644 --- a/dozer-ingestion/src/connectors/postgres/connector.rs +++ b/dozer-ingestion/postgres/src/connector.rs @@ -1,22 +1,22 @@ -use crate::connectors::postgres::connection::validator::validate_connection; -use crate::connectors::postgres::iterator::PostgresIterator; -use crate::connectors::{ - Connector, ListOrFilterColumns, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{errors::internal::BoxedError, log::info, types::FieldType}, + utils::ListOrFilterColumns, + Connector, Ingestor, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, }; -use crate::errors::ConnectorError; -use crate::ingestion::Ingestor; -use dozer_types::tonic::async_trait; -use dozer_types::tracing::info; use postgres_types::PgLsn; use rand::distributions::Alphanumeric; use rand::Rng; - -use crate::connectors::postgres::schema::helper::{SchemaHelper, DEFAULT_SCHEMA_NAME}; -use crate::errors::ConnectorError::PostgresConnectorError; -use crate::errors::PostgresConnectorError::{CreatePublicationError, DropPublicationError}; use tokio_postgres::config::ReplicationMode; use tokio_postgres::Config; +use crate::{ + connection::validator::validate_connection, + iterator::PostgresIterator, + schema::helper::{SchemaHelper, DEFAULT_SCHEMA_NAME}, + PostgresConnectorError, +}; + use super::connection::client::Client; use super::connection::helper; @@ -85,20 +85,20 @@ impl PostgresConnector { #[async_trait] impl Connector for PostgresConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { + async fn validate_connection(&self) -> Result<(), BoxedError> { validate_connection(&self.name, self.conn_config.clone(), None, None) .await .map_err(Into::into) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { Ok(self .schema_helper .get_tables(None) @@ -108,7 +108,7 @@ impl Connector for PostgresConnector { .collect()) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let tables = tables .iter() .map(|table| ListOrFilterColumns { @@ -125,7 +125,7 @@ impl Connector for PostgresConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let table_infos = tables .iter() .map(|table| ListOrFilterColumns { @@ -150,7 +150,7 @@ impl Connector for PostgresConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let table_infos = table_infos .iter() .map(|table| ListOrFilterColumns { @@ -159,20 +159,21 @@ impl Connector for PostgresConnector { columns: Some(table.column_names.clone()), }) .collect::>(); - self.schema_helper + Ok(self + .schema_helper .get_schemas(&table_infos) - .await - .map_err(Into::into) + .await? 
+ .into_iter() + .map(|schema_result| schema_result.map_err(Into::into)) + .collect()) } async fn start( &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { - let client = helper::connect(self.replication_conn_config.clone()) - .await - .map_err(PostgresConnectorError)?; + ) -> Result<(), BoxedError> { + let client = helper::connect(self.replication_conn_config.clone()).await?; let table_identifiers = tables .iter() .map(|table| TableIdentifier::new(table.schema.clone(), table.name.clone())) @@ -203,7 +204,7 @@ impl Connector for PostgresConnector { self.conn_config.clone(), self.schema.clone(), ); - iterator.start(lsn).await + iterator.start(lsn).await.map_err(Into::into) } } @@ -230,7 +231,7 @@ impl PostgresConnector { &self, mut client: Client, table_identifiers: Option<&[TableIdentifier]>, - ) -> Result<(), ConnectorError> { + ) -> Result<(), PostgresConnectorError> { let publication_name = self.get_publication_name(); let table_str: String = match table_identifiers { None => "ALL TABLES".to_string(), @@ -255,12 +256,12 @@ impl PostgresConnector { client .simple_query(format!("DROP PUBLICATION IF EXISTS {publication_name}").as_str()) .await - .map_err(DropPublicationError)?; + .map_err(PostgresConnectorError::DropPublicationError)?; client .simple_query(format!("CREATE PUBLICATION {publication_name} FOR {table_str}").as_str()) .await - .map_err(CreatePublicationError)?; + .map_err(PostgresConnectorError::CreatePublicationError)?; Ok(()) } diff --git a/dozer-ingestion/src/connectors/postgres/helper.rs b/dozer-ingestion/postgres/src/helper.rs similarity index 91% rename from dozer-ingestion/src/connectors/postgres/helper.rs rename to dozer-ingestion/postgres/src/helper.rs index d3e4e32881..8b242518f3 100644 --- a/dozer-ingestion/src/connectors/postgres/helper.rs +++ b/dozer-ingestion/postgres/src/helper.rs @@ -1,16 +1,13 @@ -use crate::connectors::postgres::xlog_mapper::TableColumn; -use crate::errors::PostgresSchemaError::{ - ColumnTypeNotSupported, CustomTypeNotSupported, PointParseError, StringParseError, - ValueConversionError, +use dozer_ingestion_connector::dozer_types::{ + bytes::Bytes, + chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime, Offset, Utc}, + errors::types::TypeError, + geo::Point as GeoPoint, + json_types::{serde_json_to_json_value, JsonValue}, + ordered_float::OrderedFloat, + rust_decimal, serde_json, + types::*, }; -use crate::errors::{ConnectorError, PostgresSchemaError}; -use dozer_types::bytes::Bytes; -use dozer_types::chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime, Offset, Utc}; -use dozer_types::errors::types::TypeError; -use dozer_types::geo::Point as GeoPoint; -use dozer_types::json_types::{serde_json_to_json_value, JsonValue}; -use dozer_types::ordered_float::OrderedFloat; -use dozer_types::{rust_decimal, serde_json, types::*}; use postgres_types::{Type, WasNull}; use rust_decimal::prelude::FromPrimitive; use rust_decimal::Decimal; @@ -19,6 +16,8 @@ use std::vec; use tokio_postgres::{Column, Row}; use uuid::Uuid; +use crate::{xlog_mapper::TableColumn, PostgresConnectorError, PostgresSchemaError}; + pub fn postgres_type_to_field( value: Option<&Bytes>, column: &TableColumn, @@ -103,11 +102,13 @@ pub fn postgres_type_to_field( Type::BOOL => Ok(Field::Boolean(v.slice(0..1) == "t")), Type::POINT => Ok(Field::Point( String::from_utf8(v.to_vec()) - .map_err(StringParseError)? + .map_err(PostgresSchemaError::StringParseError)? 
.parse::() - .map_err(|_| PointParseError)?, + .map_err(|_| PostgresSchemaError::PointParseError)?, + )), + _ => Err(PostgresSchemaError::ColumnTypeNotSupported( + column_type.name().to_string(), )), - _ => Err(ColumnTypeNotSupported(column_type.name().to_string())), }) } @@ -132,7 +133,9 @@ pub fn postgres_type_to_dozer_type(column_type: Type) -> Result Ok(FieldType::Json), Type::DATE => Ok(FieldType::Date), Type::POINT => Ok(FieldType::Point), - _ => Err(ColumnTypeNotSupported(column_type.name().to_string())), + _ => Err(PostgresSchemaError::ColumnTypeNotSupported( + column_type.name().to_string(), + )), } } @@ -141,10 +144,10 @@ fn handle_error(e: tokio_postgres::error::Error) -> Result() { Ok(Field::Null) } else { - Err(ValueConversionError(e.to_string())) + Err(PostgresSchemaError::ValueConversionError(e.to_string())) } } else { - Err(ValueConversionError(e.to_string())) + Err(PostgresSchemaError::ValueConversionError(e.to_string())) } } @@ -216,9 +219,13 @@ pub fn value_to_field( } _ => { if col_type.schema() == "pg_catalog" { - Err(ColumnTypeNotSupported(col_type.name().to_string())) + Err(PostgresSchemaError::ColumnTypeNotSupported( + col_type.name().to_string(), + )) } else { - Err(CustomTypeNotSupported(col_type.name().to_string())) + Err(PostgresSchemaError::CustomTypeNotSupported( + col_type.name().to_string(), + )) } } } @@ -248,7 +255,7 @@ pub fn map_row_to_operation_event( } } -pub fn map_schema(columns: &[Column]) -> Result { +pub fn map_schema(columns: &[Column]) -> Result { let field_defs: Result, _> = columns.iter().map(convert_column_to_field).collect(); @@ -270,7 +277,7 @@ pub fn convert_column_to_field(column: &Column) -> Result PostgresIterator<'a> { } impl<'a> PostgresIterator<'a> { - pub async fn start(self, lsn: Option<(PgLsn, u64)>) -> Result<(), ConnectorError> { + pub async fn start(self, lsn: Option<(PgLsn, u64)>) -> Result<(), PostgresConnectorError> { let state = ReplicationState::Pending; let details = self.details.clone(); @@ -103,12 +99,10 @@ impl<'a> PostgresIteratorHandler<'a> { 3) Replicating - Replicate CDC events using lsn */ - pub async fn start(&mut self) -> Result<(), ConnectorError> { + pub async fn start(&mut self) -> Result<(), PostgresConnectorError> { let details = Arc::clone(&self.details); let replication_conn_config = details.replication_conn_config.to_owned(); - let mut client = helper::connect(replication_conn_config) - .await - .map_err(ConnectorError::PostgresConnectorError)?; + let mut client = helper::connect(replication_conn_config).await?; // TODO: Handle cases: // - When snapshot replication is not completed @@ -116,22 +110,19 @@ impl<'a> PostgresIteratorHandler<'a> { // - When publication tables changes // We clear inactive replication slots before starting replication - ReplicationSlotHelper::clear_inactive_slots(&mut client, REPLICATION_SLOT_PREFIX) - .await - .map_err(ConnectorError::PostgresConnectorError)?; + ReplicationSlotHelper::clear_inactive_slots(&mut client, REPLICATION_SLOT_PREFIX).await?; if self.lsn.is_none() { debug!("\nCreating Slot...."); let slot_exist = ReplicationSlotHelper::replication_slot_exists(&mut client, &details.slot_name) - .await - .map_err(ConnectorError::PostgresConnectorError)?; + .await?; if slot_exist { // We dont have lsn, so we need to drop replication slot and start from scratch ReplicationSlotHelper::drop_replication_slot(&mut client, &details.slot_name) .await - .map_err(InvalidQueryError)?; + .map_err(PostgresConnectorError::InvalidQueryError)?; } client @@ -146,13 +137,11 @@ impl<'a> 
PostgresIteratorHandler<'a> { ReplicationSlotHelper::create_replication_slot(&mut client, &details.slot_name) .await?; if let Some(lsn) = replication_slot_lsn { - let parsed_lsn = - PgLsn::from_str(&lsn).map_err(|_| LsnParseError(lsn.to_string()))?; + let parsed_lsn = PgLsn::from_str(&lsn) + .map_err(|_| PostgresConnectorError::LsnParseError(lsn.to_string()))?; self.lsn = Some((parsed_lsn, 0)); } else { - return Err(ConnectorError::PostgresConnectorError( - LsnNotReturnedFromReplicationSlot, - )); + return Err(PostgresConnectorError::LsnNotReturnedFromReplicationSlot); } self.state = ReplicationState::SnapshotInProgress; @@ -176,16 +165,21 @@ impl<'a> PostgresIteratorHandler<'a> { .collect::>(); snapshotter.sync_tables(&tables).await?; - self.ingestor + if self + .ingestor .handle_message(IngestionMessage::SnapshottingDone) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiver is dropped, we can return + return Ok(()); + } debug!("\nInitialized with tables: {:?}", details.tables); client.simple_query("COMMIT;").await.map_err(|_e| { debug!("failed to commit txn for replication"); - ConnectorError::PostgresConnectorError(PostgresConnectorError::CommitReplication) + PostgresConnectorError::CommitReplication })?; } @@ -195,11 +189,11 @@ impl<'a> PostgresIteratorHandler<'a> { self.replicate().await } - async fn replicate(&self) -> Result<(), ConnectorError> { - let (lsn, offset) = self - .lsn - .as_ref() - .map_or(Err(LSNNotStoredError), |(x, offset)| Ok((x, offset)))?; + async fn replicate(&self) -> Result<(), PostgresConnectorError> { + let (lsn, offset) = self.lsn.as_ref().map_or( + Err(PostgresConnectorError::LSNNotStoredError), + |(x, offset)| Ok((x, offset)), + )?; let publication_name = self.details.publication_name.clone(); let slot_name = self.details.slot_name.clone(); diff --git a/dozer-ingestion/postgres/src/lib.rs b/dozer-ingestion/postgres/src/lib.rs new file mode 100644 index 0000000000..2c0985e9af --- /dev/null +++ b/dozer-ingestion/postgres/src/lib.rs @@ -0,0 +1,199 @@ +use std::string::FromUtf8Error; + +use dozer_ingestion_connector::dozer_types::{ + chrono, + errors::types::{DeserializationError, TypeError}, + thiserror::{self, Error}, +}; +use tokio_postgres::config::SslMode; + +pub mod connection; +pub mod connector; +pub mod helper; +pub mod iterator; +mod replication_slot_helper; +pub mod replicator; +mod schema; +pub mod snapshotter; +#[cfg(test)] +pub mod test_utils; +#[cfg(test)] +pub mod tests; +pub mod xlog_mapper; + +pub use tokio_postgres; + +#[derive(Error, Debug)] +pub enum PostgresConnectorError { + #[error("Failed to map configuration: {0}")] + WrongConnectionConfiguration(DeserializationError), + + #[error("Invalid SslMode: {0:?}")] + InvalidSslError(SslMode), + + #[error("Query failed in connector: {0}")] + InvalidQueryError(#[source] tokio_postgres::Error), + + #[error("Failed to connect to postgres with the specified configuration. {0}")] + ConnectionFailure(#[source] tokio_postgres::Error), + + #[error("Replication is not available for user")] + ReplicationIsNotAvailableForUserError, + + #[error("WAL level should be 'logical'")] + WALLevelIsNotCorrect(), + + #[error("Cannot find tables {0:?}")] + TablesNotFound(Vec<(String, String)>), + + #[error("Cannot find column {0} in {1}")] + ColumnNotFound(String, String), + + #[error("Cannot find columns {0}")] + ColumnsNotFound(String), + + #[error("Failed to create a replication slot \"{0}\". 
Error: {1}")] + CreateSlotError(String, #[source] tokio_postgres::Error), + + #[error("Failed to create publication: {0}")] + CreatePublicationError(#[source] tokio_postgres::Error), + + #[error("Failed to drop publication: {0}")] + DropPublicationError(#[source] tokio_postgres::Error), + + #[error("Failed to begin txn for replication")] + BeginReplication, + + #[error("Failed to begin txn for replication")] + CommitReplication, + + #[error("Fetch of replication slot info failed. Error: {0}")] + FetchReplicationSlotError(#[source] tokio_postgres::Error), + + #[error("No slots available or all available slots are used")] + NoAvailableSlotsError, + + #[error("Slot {0} not found")] + SlotNotExistError(String), + + #[error("Slot {0} is already used by another process")] + SlotIsInUseError(String), + + #[error("Table {0} changes is not replicated to slot")] + MissingTableInReplicationSlot(String), + + #[error("Start lsn is before first available lsn - {0} < {1}")] + StartLsnIsBeforeLastFlushedLsnError(String, String), + + #[error("fetch of replication slot info failed. Error: {0}")] + SyncWithSnapshotError(String), + + #[error("Replication stream error. Error: {0}")] + ReplicationStreamError(tokio_postgres::Error), + + #[error("Received unexpected message in replication stream")] + UnexpectedReplicationMessageError, + + #[error("Replication stream error")] + ReplicationStreamEndError, + + #[error(transparent)] + PostgresSchemaError(#[from] PostgresSchemaError), + + #[error("LSN not stored for replication slot")] + LSNNotStoredError, + + #[error("LSN parse error. Given lsn: {0}")] + LsnParseError(String), + + #[error("LSN not returned from replication slot creation query")] + LsnNotReturnedFromReplicationSlot, + + #[error("Table name \"{0}\" not valid")] + TableNameNotValid(String), + + #[error("Column name \"{0}\" not valid")] + ColumnNameNotValid(String), + + #[error("Relation not found in replication: {0}")] + RelationNotFound(#[source] std::io::Error), + + #[error("Failed to send message on snapshot read channel")] + SnapshotReadError, + + #[error("Failed to load native certs: {0}")] + LoadNativeCerts(#[source] std::io::Error), + + #[error("Non utf8 column name in table {table_index} column {column_index}")] + NonUtf8ColumnName { + table_index: usize, + column_index: usize, + }, + + #[error("Column type changed in table {table_index} column {column_name} from {old_type} to {new_type}")] + ColumnTypeChanged { + table_index: usize, + column_name: String, + old_type: postgres_types::Type, + new_type: postgres_types::Type, + }, + + #[error("Unexpected query message")] + UnexpectedQueryMessageError, +} + +#[derive(Error, Debug)] +pub enum PostgresSchemaError { + #[error("Schema's '{0}' doesn't have primary key")] + PrimaryKeyIsMissingInSchema(String), + + #[error("Table: '{0}' replication identity settings are not correct. It is either not set or NOTHING. Missing a primary key ?")] + SchemaReplicationIdentityError(String), + + #[error("Column type {0} not supported")] + ColumnTypeNotSupported(String), + + #[error("Custom type {0:?} is not supported yet. 
Join our Discord at https://discord.com/invite/3eWXBgJaEQ - we're here to help with your use case!")] + CustomTypeNotSupported(String), + + #[error("ColumnTypeNotFound")] + ColumnTypeNotFound, + + #[error("Invalid column type of column {0}")] + InvalidColumnType(String), + + #[error("Value conversion error: {0}")] + ValueConversionError(String), + + #[error("String parse failed")] + StringParseError(#[source] FromUtf8Error), + + #[error("JSONB parse failed: {0}")] + JSONBParseError(String), + + #[error("Point parse failed")] + PointParseError, + + #[error("Unsupported replication type - '{0}'")] + UnsupportedReplicationType(String), + + #[error( + "Table type '{0}' of '{1}' table is not supported. Only 'BASE TABLE' type is supported" + )] + UnsupportedTableType(String, String), + + #[error("Table type cannot be determined")] + TableTypeNotFound, + + #[error("Column not found")] + ColumnNotFound, + + #[error("Type error: {0}")] + TypeError(#[from] TypeError), + + #[error("Failed to read string from utf8. Error: {0}")] + StringReadError(#[from] FromUtf8Error), + + #[error("Failed to read date. Error: {0}")] + DateReadError(#[from] chrono::ParseError), +} diff --git a/dozer-ingestion/src/connectors/postgres/readme.md b/dozer-ingestion/postgres/src/readme.md similarity index 100% rename from dozer-ingestion/src/connectors/postgres/readme.md rename to dozer-ingestion/postgres/src/readme.md diff --git a/dozer-ingestion/src/connectors/postgres/readme_replication.md b/dozer-ingestion/postgres/src/readme_replication.md similarity index 100% rename from dozer-ingestion/src/connectors/postgres/readme_replication.md rename to dozer-ingestion/postgres/src/readme_replication.md diff --git a/dozer-ingestion/postgres/src/replication_slot_helper.rs b/dozer-ingestion/postgres/src/replication_slot_helper.rs new file mode 100644 index 0000000000..0b57d59a30 --- /dev/null +++ b/dozer-ingestion/postgres/src/replication_slot_helper.rs @@ -0,0 +1,249 @@ +use crate::PostgresConnectorError; + +use super::connection::client::Client; +use dozer_ingestion_connector::dozer_types::log::debug; +use tokio_postgres::{Error, SimpleQueryMessage}; + +pub struct ReplicationSlotHelper {} + +impl ReplicationSlotHelper { + pub async fn drop_replication_slot( + client: &mut Client, + slot_name: &str, + ) -> Result, Error> { + let res = client + .simple_query(format!("select pg_drop_replication_slot('{slot_name}');").as_ref()) + .await; + match res { + Ok(_) => debug!("dropped replication slot {}", slot_name), + Err(_) => debug!("failed to drop replication slot..."), + }; + + res + } + + pub async fn create_replication_slot( + client: &mut Client, + slot_name: &str, + ) -> Result, PostgresConnectorError> { + let create_replication_slot_query = + format!(r#"CREATE_REPLICATION_SLOT {slot_name:?} LOGICAL "pgoutput" USE_SNAPSHOT"#); + + let slot_query_row = client + .simple_query(&create_replication_slot_query) + .await + .map_err(|e| { + debug!("failed to create replication slot {}", slot_name); + PostgresConnectorError::CreateSlotError(slot_name.to_string(), e) + })?; + + if let SimpleQueryMessage::Row(row) = &slot_query_row[0] { + Ok(row.get("consistent_point").map(|lsn| lsn.to_string())) + } else { + Err(PostgresConnectorError::UnexpectedQueryMessageError) + } + } + + pub async fn replication_slot_exists( + client: &mut Client, + slot_name: &str, + ) -> Result { + let replication_slot_info_query = + format!(r#"SELECT * FROM pg_replication_slots where slot_name = '{slot_name}';"#); + + let slot_query_row = client + 
.simple_query(&replication_slot_info_query) + .await + .map_err(PostgresConnectorError::FetchReplicationSlotError)?; + + Ok(matches!( + slot_query_row.get(0), + Some(SimpleQueryMessage::Row(_)) + )) + } + + pub async fn clear_inactive_slots( + client: &mut Client, + slot_name_prefix: &str, + ) -> Result<(), PostgresConnectorError> { + let inactive_slots_query = format!( + r#"SELECT * FROM pg_replication_slots where active = false AND slot_name LIKE '{slot_name_prefix}%';"# + ); + + let slots = client + .simple_query(&inactive_slots_query) + .await + .map_err(PostgresConnectorError::FetchReplicationSlotError)?; + + let column_index = if let Some(SimpleQueryMessage::Row(row)) = slots.get(0) { + row.columns().iter().position(|c| c.name() == "slot_name") + } else { + None + }; + + for slot_message in slots { + if let SimpleQueryMessage::Row(row) = slot_message { + if let Some(index) = column_index { + let slot_name = row.get(index); + + if let Some(name) = slot_name { + Self::drop_replication_slot(client, name) + .await + .map_err(PostgresConnectorError::InvalidQueryError)?; + } + } + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use dozer_ingestion_connector::tokio; + use serial_test::serial; + use tokio_postgres::config::ReplicationMode; + + use crate::{ + connection::helper::{connect, map_connection_config}, + test_utils::load_test_connection_config, + PostgresConnectorError, + }; + + use super::ReplicationSlotHelper; + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_replication_slot_create_successfully() { + let config = load_test_connection_config().await; + let mut config = map_connection_config(&config).unwrap(); + config.replication_mode(ReplicationMode::Logical); + + let mut client = connect(config).await.unwrap(); + + client + .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") + .await + .unwrap(); + + let actual = ReplicationSlotHelper::create_replication_slot(&mut client, "test").await; + + assert!(actual.is_ok()); + + match actual { + Err(_) => panic!("Validation should fail"), + Ok(result) => { + if let Some(address) = result { + assert_ne!(address, "") + } else { + panic!("Validation should fail") + } + } + } + } + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_replication_slot_create_failed_if_existed() { + let slot_name = "test"; + let config = load_test_connection_config().await; + let mut config = map_connection_config(&config).unwrap(); + config.replication_mode(ReplicationMode::Logical); + + let mut client = connect(config).await.unwrap(); + + client + .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") + .await + .unwrap(); + + let create_replication_slot_query = + format!(r#"CREATE_REPLICATION_SLOT {slot_name:?} LOGICAL "pgoutput" USE_SNAPSHOT"#); + + client + .simple_query(&create_replication_slot_query) + .await + .expect("failed"); + + let actual = ReplicationSlotHelper::create_replication_slot(&mut client, slot_name).await; + + assert!(actual.is_err()); + + match actual { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + if let PostgresConnectorError::CreateSlotError(_, err) = e { + assert_eq!( + err.as_db_error().unwrap().message(), + format!("replication slot \"{slot_name}\" already exists") + ); + } else { + panic!("Unexpected error occurred"); + } + } + } + } + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_replication_slot_drop_successfully() { + let slot_name = "test"; + let config = load_test_connection_config().await; + let mut config = 
map_connection_config(&config).unwrap(); + config.replication_mode(ReplicationMode::Logical); + + let mut client = connect(config).await.unwrap(); + + client + .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") + .await + .unwrap(); + + let create_replication_slot_query = + format!(r#"CREATE_REPLICATION_SLOT {slot_name:?} LOGICAL "pgoutput" USE_SNAPSHOT"#); + + client + .simple_query(&create_replication_slot_query) + .await + .expect("failed"); + + let actual = ReplicationSlotHelper::drop_replication_slot(&mut client, slot_name).await; + + assert!(actual.is_ok()); + } + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_replication_slot_drop_failed_if_slot_not_exist() { + let slot_name = "test"; + let config = load_test_connection_config().await; + let mut config = map_connection_config(&config).unwrap(); + config.replication_mode(ReplicationMode::Logical); + + let mut client = connect(config).await.unwrap(); + + client + .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") + .await + .unwrap(); + + let actual = ReplicationSlotHelper::drop_replication_slot(&mut client, slot_name).await; + + assert!(actual.is_err()); + + match actual { + Ok(_) => panic!("Validation should fail"), + Err(e) => { + assert_eq!( + e.as_db_error().unwrap().message(), + format!("replication slot \"{slot_name}\" does not exist") + ); + } + } + } +} diff --git a/dozer-ingestion/src/connectors/postgres/replicator.rs b/dozer-ingestion/postgres/src/replicator.rs similarity index 81% rename from dozer-ingestion/src/connectors/postgres/replicator.rs rename to dozer-ingestion/postgres/src/replicator.rs index cf68848dac..27b5bbbf61 100644 --- a/dozer-ingestion/src/connectors/postgres/replicator.rs +++ b/dozer-ingestion/postgres/src/replicator.rs @@ -1,25 +1,22 @@ -use crate::connectors::postgres::connection::client::Client; -use crate::connectors::postgres::connection::helper::{self, is_network_failure}; -use crate::connectors::postgres::xlog_mapper::XlogMapper; -use crate::errors::ConnectorError; -use crate::errors::ConnectorError::PostgresConnectorError; -use crate::errors::PostgresConnectorError::{ - ReplicationStreamEndError, ReplicationStreamError, UnexpectedReplicationMessageError, -}; -use crate::ingestion::Ingestor; -use dozer_types::bytes; -use dozer_types::chrono::{TimeZone, Utc}; -use dozer_types::log::{error, info}; -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::node::OpIdentifier; -use futures::StreamExt; +use dozer_ingestion_connector::dozer_types::bytes; +use dozer_ingestion_connector::dozer_types::chrono::{TimeZone, Utc}; +use dozer_ingestion_connector::dozer_types::log::{error, info}; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::IngestionMessage; +use dozer_ingestion_connector::dozer_types::node::OpIdentifier; +use dozer_ingestion_connector::futures::StreamExt; +use dozer_ingestion_connector::Ingestor; use postgres_protocol::message::backend::ReplicationMessage::*; use postgres_protocol::message::backend::{LogicalReplicationMessage, ReplicationMessage}; use postgres_types::PgLsn; +use tokio_postgres::Error; use std::pin::Pin; use std::time::SystemTime; -use tokio_postgres::Error; + +use crate::connection::client::Client; +use crate::connection::helper::{self, is_network_failure}; +use crate::xlog_mapper::XlogMapper; +use crate::PostgresConnectorError; use super::schema::helper::PostgresTableInfo; use super::xlog_mapper::MappedReplicationMessage; @@ -42,9 +39,12 @@ pub struct CDCHandler<'a> { } impl<'a> 
CDCHandler<'a> { - pub async fn start(&mut self, tables: Vec) -> Result<(), ConnectorError> { + pub async fn start( + &mut self, + tables: Vec, + ) -> Result<(), PostgresConnectorError> { let replication_conn_config = self.replication_conn_config.clone(); - let client: Client = helper::connect(replication_conn_config).await?; + let client = helper::connect(replication_conn_config).await?; info!( "[{}] Starting Replication: {:?}, {:?}", @@ -65,7 +65,7 @@ impl<'a> CDCHandler<'a> { let mut stream = LogicalReplicationStream::new(client, self.slot_name.clone(), lsn, options) .await - .map_err(|e| ConnectorError::InternalError(Box::new(e)))?; + .map_err(PostgresConnectorError::ReplicationStreamError)?; let tables_columns = tables .into_iter() @@ -109,13 +109,11 @@ impl<'a> CDCHandler<'a> { &mut self, message: Option, Error>>, mapper: &mut XlogMapper, - ) -> Result<(), ConnectorError> { + ) -> Result<(), PostgresConnectorError> { match message { Some(Ok(XLogData(body))) => { let lsn = body.wal_start(); - let message = mapper - .handle_message(body) - .map_err(PostgresConnectorError)?; + let message = mapper.handle_message(body)?; match message { Some(MappedReplicationMessage::Commit(commit)) => { @@ -127,15 +125,19 @@ impl<'a> CDCHandler<'a> { } Some(MappedReplicationMessage::Operation { table_index, op }) => { self.seq_no += 1; - if self.begin_lsn != self.offset_lsn || self.offset < self.seq_no { - self.ingestor + if (self.begin_lsn != self.offset_lsn || self.offset < self.seq_no) + && self + .ingestor .handle_message(IngestionMessage::OperationEvent { table_index, op, id: Some(OpIdentifier::new(self.begin_lsn, self.seq_no)), }) .await - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If the ingestion channel is closed, we should stop the replication + return Ok(()); } } None => {} @@ -145,12 +147,10 @@ impl<'a> CDCHandler<'a> { } Some(Ok(msg)) => { error!("Unexpected message: {:?}", msg); - Err(PostgresConnectorError(UnexpectedReplicationMessageError)) + Err(PostgresConnectorError::UnexpectedReplicationMessageError) } - Some(Err(e)) => Err(PostgresConnectorError(ReplicationStreamError( - e.to_string(), - ))), - None => Err(PostgresConnectorError(ReplicationStreamEndError)), + Some(Err(e)) => Err(PostgresConnectorError::ReplicationStreamError(e)), + None => Err(PostgresConnectorError::ReplicationStreamEndError), } } } diff --git a/dozer-ingestion/src/connectors/postgres/schema/helper.rs b/dozer-ingestion/postgres/src/schema/helper.rs similarity index 91% rename from dozer-ingestion/src/connectors/postgres/schema/helper.rs rename to dozer-ingestion/postgres/src/schema/helper.rs index 99a4b55b21..b0a037f521 100644 --- a/dozer-ingestion/src/connectors/postgres/schema/helper.rs +++ b/dozer-ingestion/postgres/src/schema/helper.rs @@ -1,19 +1,19 @@ use std::collections::HashMap; -use crate::connectors::{CdcType, ListOrFilterColumns, SourceSchema, SourceSchemaResult}; -use crate::errors::{ConnectorError, PostgresConnectorError, PostgresSchemaError}; -use dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}; - -use crate::connectors::postgres::connection::helper; -use crate::connectors::postgres::helper::postgres_type_to_dozer_type; -use crate::errors::PostgresSchemaError::{InvalidColumnType, ValueConversionError}; - +use dozer_ingestion_connector::{ + dozer_types::types::{FieldDefinition, FieldType, Schema, SourceDefinition}, + utils::ListOrFilterColumns, + CdcType, SourceSchema, +}; use postgres_types::Type; - -use 
crate::connectors::postgres::schema::sorter::sort_schemas; use tokio_postgres::Row; -use PostgresSchemaError::TableTypeNotFound; +use crate::{ + connection::helper, helper::postgres_type_to_dozer_type, PostgresConnectorError, + PostgresSchemaError, +}; + +use super::sorter::sort_schemas; #[derive(Debug)] pub struct SchemaHelper { @@ -96,7 +96,7 @@ impl SchemaHelper { pub async fn get_tables( &self, tables: Option<&[ListOrFilterColumns]>, - ) -> Result, ConnectorError> { + ) -> Result, PostgresConnectorError> { let (results, tables_columns_map) = self.get_columns(tables).await?; let mut table_columns_map: HashMap)> = @@ -209,7 +209,7 @@ impl SchemaHelper { pub async fn get_schemas( &self, tables: &[ListOrFilterColumns], - ) -> Result, PostgresConnectorError> { + ) -> Result>, PostgresConnectorError> { let (results, tables_columns_map) = self.get_columns(Some(tables)).await?; let mut columns_map: HashMap = HashMap::new(); @@ -250,15 +250,12 @@ impl SchemaHelper { fn map_columns_to_schemas( postgres_tables: Vec<(SchemaTableIdentifier, PostgresTable)>, - ) -> Vec { + ) -> Vec> { postgres_tables .into_iter() .map(|((_, table_name), table)| { - Self::map_schema(&table_name, table).map_err(|e| { - ConnectorError::PostgresConnectorError( - PostgresConnectorError::PostgresSchemaError(e), - ) - }) + Self::map_schema(&table_name, table) + .map_err(PostgresConnectorError::PostgresSchemaError) }) .collect() } @@ -323,7 +320,7 @@ impl SchemaHelper { return Err(PostgresSchemaError::UnsupportedTableType(typ, table_name)); } } else { - return Err(TableTypeNotFound); + return Err(PostgresSchemaError::TableTypeNotFound); } let column_name: String = row.get(1); @@ -338,13 +335,15 @@ impl SchemaHelper { } else { let oid_typ = Type::from_oid(type_oid); oid_typ.map_or_else( - || Err(InvalidColumnType(column_name.clone())), + || Err(PostgresSchemaError::InvalidColumnType(column_name.clone())), postgres_type_to_dozer_type, )? 
}; - let replication_type = String::from_utf8(vec![replication_type_int as u8]) - .map_err(|_e| ValueConversionError("Replication type".to_string()))?; + let replication_type = + String::from_utf8(vec![replication_type_int as u8]).map_err(|_e| { + PostgresSchemaError::ValueConversionError("Replication type".to_string()) + })?; Ok(PostgresTableRow { schema, diff --git a/dozer-ingestion/src/connectors/postgres/schema/mod.rs b/dozer-ingestion/postgres/src/schema/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/postgres/schema/mod.rs rename to dozer-ingestion/postgres/src/schema/mod.rs diff --git a/dozer-ingestion/src/connectors/postgres/schema/sorter.rs b/dozer-ingestion/postgres/src/schema/sorter.rs similarity index 92% rename from dozer-ingestion/src/connectors/postgres/schema/sorter.rs rename to dozer-ingestion/postgres/src/schema/sorter.rs index 7d6d41d347..8f8aec97e8 100644 --- a/dozer-ingestion/src/connectors/postgres/schema/sorter.rs +++ b/dozer-ingestion/postgres/src/schema/sorter.rs @@ -1,12 +1,10 @@ use std::collections::HashMap; -use crate::connectors::postgres::schema::helper::PostgresTable; -use crate::connectors::ListOrFilterColumns; -use crate::errors::PostgresSchemaError; -use crate::errors::PostgresSchemaError::ColumnNotFound; -use dozer_types::types::FieldDefinition; +use dozer_ingestion_connector::{dozer_types::types::FieldDefinition, utils::ListOrFilterColumns}; -use super::helper::{SchemaTableIdentifier, DEFAULT_SCHEMA_NAME}; +use crate::PostgresSchemaError; + +use super::helper::{PostgresTable, SchemaTableIdentifier, DEFAULT_SCHEMA_NAME}; pub fn sort_schemas( expected_tables_order: &[ListOrFilterColumns], @@ -21,7 +19,9 @@ pub fn sort_schemas( .unwrap_or(DEFAULT_SCHEMA_NAME.to_string()), table.name.clone(), ); - let postgres_table = mapped_tables.get(&table_identifier).ok_or(ColumnNotFound)?; + let postgres_table = mapped_tables + .get(&table_identifier) + .ok_or(PostgresSchemaError::ColumnNotFound)?; let sorted_table = table.columns.as_ref().map_or_else( || Ok::(postgres_table.clone()), @@ -57,14 +57,14 @@ fn sort_fields( .fields() .iter() .position(|f| c == &f.name) - .ok_or(ColumnNotFound)?; + .ok_or(PostgresSchemaError::ColumnNotFound)?; let field = postgres_table .get_field(current_index) - .ok_or(ColumnNotFound)?; + .ok_or(PostgresSchemaError::ColumnNotFound)?; let is_index_field = postgres_table .is_index_field(current_index) - .ok_or(ColumnNotFound)?; + .ok_or(PostgresSchemaError::ColumnNotFound)?; sorted_fields.push((field.clone(), *is_index_field)); } @@ -74,13 +74,16 @@ fn sort_fields( #[cfg(test)] mod tests { - use dozer_types::types::{FieldType, SourceDefinition}; + use dozer_ingestion_connector::{ + dozer_types::types::{FieldDefinition, FieldType, SourceDefinition}, + utils::ListOrFilterColumns, + }; use std::collections::HashMap; - use crate::connectors::postgres::schema::helper::PostgresTable; - use crate::connectors::postgres::schema::sorter::{sort_fields, sort_schemas}; - use crate::connectors::ListOrFilterColumns; - use dozer_types::types::FieldDefinition; + use crate::schema::{ + helper::PostgresTable, + sorter::{sort_fields, sort_schemas}, + }; fn generate_postgres_table() -> PostgresTable { let mut postgres_table = PostgresTable::new("d".to_string()); diff --git a/dozer-ingestion/postgres/src/schema/tests.rs b/dozer-ingestion/postgres/src/schema/tests.rs new file mode 100644 index 0000000000..a17bf03b1f --- /dev/null +++ b/dozer-ingestion/postgres/src/schema/tests.rs @@ -0,0 +1,169 @@ +use dozer_ingestion_connector::tokio; 
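Aside on the error handling shown above: connector-internal failures are kept as fully qualified enum variants (PostgresSchemaError wrapped into PostgresConnectorError) and are only erased into a boxed error at the public Connector trait boundary. The following is a minimal, self-contained sketch of that conversion chain, not the crate's actual API: it assumes the thiserror crate is available, the enum and function names are illustrative, and the BoxedError alias stands in for a boxed std error.

use std::error::Error;

type BoxedError = Box<dyn Error + Send + Sync + 'static>;

#[derive(Debug, thiserror::Error)]
enum SchemaError {
    #[error("column not found")]
    ColumnNotFound,
}

#[derive(Debug, thiserror::Error)]
enum ConnectorError {
    // `#[from]` lets `?` lift a SchemaError into a ConnectorError.
    #[error(transparent)]
    Schema(#[from] SchemaError),
}

fn sort_columns() -> Result<(), SchemaError> {
    Err(SchemaError::ColumnNotFound)
}

fn get_schemas() -> Result<(), ConnectorError> {
    // The schema error is promoted by the derived From impl.
    Ok(sort_columns()?)
}

fn validate() -> Result<(), BoxedError> {
    // At the trait boundary the concrete error is boxed away.
    get_schemas().map_err(Into::into)
}

fn main() {
    println!("{:?}", validate().err());
}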
+use dozer_ingestion_connector::utils::ListOrFilterColumns; +use rand::Rng; +use serial_test::serial; +use std::collections::HashSet; +use std::hash::Hash; + +use crate::schema::helper::SchemaHelper; +use crate::test_utils::load_test_connection_config; +use crate::tests::client::TestPostgresClient; +use crate::{PostgresConnectorError, PostgresSchemaError}; + +fn assert_vec_eq(a: &[T], b: &[T]) -> bool +where + T: Eq + Hash, +{ + let a: HashSet<_> = a.iter().collect(); + let b: HashSet<_> = b.iter().collect(); + + a == b +} + +#[tokio::test] +#[ignore] +#[serial] +async fn test_connector_get_tables() { + let config = load_test_connection_config().await; + let mut client = TestPostgresClient::new(&config).await; + + let mut rng = rand::thread_rng(); + + let schema = format!("schema_helper_test_{}", rng.gen::()); + let table_name = format!("products_test_{}", rng.gen::()); + + client.create_schema(&schema).await; + client.create_simple_table(&schema, &table_name).await; + + let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); + let result = schema_helper.get_tables(None).await.unwrap(); + + let table = result.get(0).unwrap(); + assert_eq!(table_name, table.name); + assert!(assert_vec_eq( + &[ + "name".to_string(), + "description".to_string(), + "weight".to_string(), + "id".to_string(), + ], + &table.columns + )); + + client.drop_schema(&schema).await; +} + +#[tokio::test] +#[ignore] +#[serial] +async fn test_connector_get_schema_with_selected_columns() { + let config = load_test_connection_config().await; + let mut client = TestPostgresClient::new(&config).await; + + let mut rng = rand::thread_rng(); + + let schema = format!("schema_helper_test_{}", rng.gen::()); + let table_name = format!("products_test_{}", rng.gen::()); + + client.create_schema(&schema).await; + client.create_simple_table(&schema, &table_name).await; + + let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); + let table_info = ListOrFilterColumns { + schema: Some(schema.clone()), + name: table_name.clone(), + columns: Some(vec!["name".to_string(), "id".to_string()]), + }; + let result = schema_helper.get_tables(Some(&[table_info])).await.unwrap(); + + let table = result.get(0).unwrap(); + assert_eq!(table_name, table.name); + assert!(assert_vec_eq( + &["name".to_string(), "id".to_string()], + &table.columns + )); + + client.drop_schema(&schema).await; +} + +#[tokio::test] +#[ignore] +#[serial] +async fn test_connector_get_schema_without_selected_columns() { + let config = load_test_connection_config().await; + let mut client = TestPostgresClient::new(&config).await; + + let mut rng = rand::thread_rng(); + + let schema = format!("schema_helper_test_{}", rng.gen::()); + let table_name = format!("products_test_{}", rng.gen::()); + + client.create_schema(&schema).await; + client.create_simple_table(&schema, &table_name).await; + + let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); + let table_info = ListOrFilterColumns { + name: table_name.clone(), + schema: Some(schema.clone()), + columns: Some(vec![]), + }; + let result = schema_helper.get_tables(Some(&[table_info])).await.unwrap(); + + let table = result.get(0).unwrap(); + assert_eq!(table_name, table.name.clone()); + assert!(assert_vec_eq( + &[ + "id".to_string(), + "name".to_string(), + "description".to_string(), + "weight".to_string(), + ], + &table.columns + )); + + client.drop_schema(&schema).await; +} + +#[tokio::test] +#[ignore] +#[serial] +async fn test_connector_view_cannot_be_used() { + let 
config = load_test_connection_config().await; + let mut client = TestPostgresClient::new(&config).await; + + let mut rng = rand::thread_rng(); + + let schema = format!("schema_helper_test_{}", rng.gen::()); + let table_name = format!("products_test_{}", rng.gen::()); + let view_name = format!("products_view_test_{}", rng.gen::()); + + client.create_schema(&schema).await; + client.create_simple_table(&schema, &table_name).await; + client.create_view(&schema, &table_name, &view_name).await; + + let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); + let table_info = ListOrFilterColumns { + name: view_name, + schema: Some(schema.clone()), + columns: Some(vec![]), + }; + + let result = schema_helper.get_schemas(&[table_info]).await; + assert!(result.is_err()); + assert!(matches!( + result, + Err(PostgresConnectorError::PostgresSchemaError( + PostgresSchemaError::UnsupportedTableType(_, _) + )) + )); + + let table_info = ListOrFilterColumns { + name: table_name, + schema: Some(schema.clone()), + columns: Some(vec![]), + }; + let result = schema_helper.get_schemas(&[table_info]).await; + assert!(result.is_ok()); + + client.drop_schema(&schema).await; +} diff --git a/dozer-ingestion/postgres/src/snapshotter.rs b/dozer-ingestion/postgres/src/snapshotter.rs new file mode 100644 index 0000000000..d6c8a77d54 --- /dev/null +++ b/dozer-ingestion/postgres/src/snapshotter.rs @@ -0,0 +1,295 @@ +use dozer_ingestion_connector::{ + dozer_types::{ + models::ingestion_types::IngestionMessage, + types::{Operation, Schema}, + }, + futures::StreamExt, + tokio::{ + self, + sync::mpsc::{channel, Sender}, + task::JoinSet, + }, + utils::ListOrFilterColumns, + Ingestor, SourceSchema, +}; + +use crate::{ + connection::helper as connection_helper, schema::helper::SchemaHelper, PostgresConnectorError, +}; + +use super::helper; + +pub struct PostgresSnapshotter<'a> { + pub conn_config: tokio_postgres::Config, + pub ingestor: &'a Ingestor, + pub schema: Option, +} + +impl<'a> PostgresSnapshotter<'a> { + pub async fn get_tables( + &self, + tables: &[ListOrFilterColumns], + ) -> Result>, PostgresConnectorError> { + let helper = SchemaHelper::new(self.conn_config.clone(), self.schema.clone()); + helper.get_schemas(tables).await + } + + pub async fn sync_table( + schema: Schema, + schema_name: String, + table_name: String, + table_index: usize, + conn_config: tokio_postgres::Config, + sender: Sender>, + ) -> Result<(), PostgresConnectorError> { + let mut client_plain = connection_helper::connect(conn_config).await?; + + let column_str: Vec = schema + .fields + .iter() + .map(|f| format!("\"{0}\"", f.name)) + .collect(); + + let column_str = column_str.join(","); + let query = format!("select {column_str} from {schema_name}.{table_name}"); + let stmt = client_plain + .prepare(&query) + .await + .map_err(PostgresConnectorError::InvalidQueryError)?; + let columns = stmt.columns(); + + let empty_vec: Vec = Vec::new(); + let row_stream = client_plain + .query_raw(query, empty_vec) + .await + .map_err(PostgresConnectorError::InvalidQueryError)?; + tokio::pin!(row_stream); + while let Some(msg) = row_stream.next().await { + match msg { + Ok(msg) => { + let evt = helper::map_row_to_operation_event(&msg, columns) + .map_err(PostgresConnectorError::PostgresSchemaError)?; + + let Ok(_) = sender.send(Ok((table_index, evt))).await else { + // If we can't send, the parent task has quit. There is + // no use in going on, but if there was an error, it was + // handled by the parent. 
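The snapshotter code above streams rows from each table task to the ingestion loop over a bounded channel, and a closed channel (every receiver dropped) is treated as a request to stop rather than as a failure, which is why a failed send returns Ok(()) here. A minimal sketch of that convention follows; it is illustrative only, assumes the tokio runtime with the macros feature enabled, and the names are made up.

use tokio::sync::mpsc;

// Producer: a failed send means every receiver is gone, so stop quietly.
async fn produce_rows(tx: mpsc::Sender<u64>) {
    for n in 0..1_000u64 {
        if tx.send(n).await.is_err() {
            return; // receiver dropped: graceful stop, not an error
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(16);
    tokio::spawn(produce_rows(tx));

    // Consume a few rows, then let `rx` drop; the producer exits on its
    // next failed `send` instead of propagating an error.
    for _ in 0..3 {
        if let Some(row) = rx.recv().await {
            println!("got row {row}");
        }
    }
}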
+ return Ok(()); + }; + } + Err(e) => return Err(PostgresConnectorError::SyncWithSnapshotError(e.to_string())), + } + } + + Ok(()) + } + + pub async fn sync_tables( + &self, + tables: &[ListOrFilterColumns], + ) -> Result<(), PostgresConnectorError> { + let schemas = self.get_tables(tables).await?; + + let (tx, mut rx) = channel(16); + + let mut joinset = JoinSet::new(); + for (table_index, (schema, table)) in schemas.into_iter().zip(tables).enumerate() { + let schema = schema?; + let schema = schema.schema; + let schema_name = table.schema.clone().unwrap_or("public".to_string()); + let table_name = table.name.clone(); + let conn_config = self.conn_config.clone(); + let sender = tx.clone(); + joinset.spawn(async move { + if let Err(e) = Self::sync_table( + schema, + schema_name, + table_name, + table_index, + conn_config, + sender.clone(), + ) + .await + { + sender.send(Err(e)).await.unwrap(); + } + }); + } + // Make sure the last sender is dropped so receiving on the channel doesn't + // deadlock + drop(tx); + + if self + .ingestor + .handle_message(IngestionMessage::SnapshottingStarted) + .await + .is_err() + { + // If receiving side is closed, we can stop + return Ok(()); + } + + while let Some(message) = rx.recv().await { + let (table_index, evt) = message?; + if self + .ingestor + .handle_message(IngestionMessage::OperationEvent { + table_index, + op: evt, + id: None, + }) + .await + .is_err() + { + // If receiving side is closed, we can stop + return Ok(()); + } + } + + if self + .ingestor + .handle_message(IngestionMessage::SnapshottingDone) + .await + .is_err() + { + // If receiving side is closed, we can stop + return Ok(()); + } + + // All tasks in the joinset should have finished (because they have dropped their senders) + // Otherwise, they will be aborted when the joinset is dropped + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use dozer_ingestion_connector::{tokio, utils::ListOrFilterColumns, IngestionConfig, Ingestor}; + use rand::Rng; + use serial_test::serial; + + use crate::{ + connection::helper::map_connection_config, test_utils::load_test_connection_config, + tests::client::TestPostgresClient, + }; + + use super::PostgresSnapshotter; + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_snapshotter_sync_tables_successfully_1_requested_table() { + let config = load_test_connection_config().await; + + let mut test_client = TestPostgresClient::new(&config).await; + + let mut rng = rand::thread_rng(); + let table_name = format!("test_table_{}", rng.gen::()); + + test_client.create_simple_table("public", &table_name).await; + test_client.insert_rows(&table_name, 2, None).await; + + let conn_config = map_connection_config(&config).unwrap(); + + let input_tables = vec![ListOrFilterColumns { + name: table_name, + schema: Some("public".to_string()), + columns: None, + }]; + + let ingestion_config = IngestionConfig::default(); + let (ingestor, mut iterator) = Ingestor::initialize_channel(ingestion_config); + + let snapshotter = PostgresSnapshotter { + conn_config, + ingestor: &ingestor, + schema: None, + }; + + let actual = snapshotter.sync_tables(&input_tables).await; + + assert!(actual.is_ok()); + + let mut i = 0; + while i < 2 { + if iterator + .next_timeout(Duration::from_secs(1)) + .await + .is_none() + { + panic!("Unexpected operation"); + } + i += 1; + } + } + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_snapshotter_sync_tables_successfully_not_match_table() { + let config = 
load_test_connection_config().await; + + let mut test_client = TestPostgresClient::new(&config).await; + + let mut rng = rand::thread_rng(); + let table_name = format!("test_table_{}", rng.gen::()); + + test_client.create_simple_table("public", &table_name).await; + test_client.insert_rows(&table_name, 2, None).await; + + let conn_config = map_connection_config(&config).unwrap(); + + let input_table_name = String::from("not_existing_table"); + let input_tables = vec![ListOrFilterColumns { + name: input_table_name, + schema: Some("public".to_string()), + columns: None, + }]; + + let ingestion_config = IngestionConfig::default(); + let (ingestor, mut _iterator) = Ingestor::initialize_channel(ingestion_config); + + let snapshotter = PostgresSnapshotter { + conn_config, + ingestor: &ingestor, + schema: None, + }; + + let actual = snapshotter.sync_tables(&input_tables).await; + + assert!(actual.is_err()); + } + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_snapshotter_sync_tables_successfully_table_not_exist() { + let config = load_test_connection_config().await; + + let mut rng = rand::thread_rng(); + let table_name = format!("test_table_{}", rng.gen::()); + + let conn_config = map_connection_config(&config).unwrap(); + + let input_tables = vec![ListOrFilterColumns { + name: table_name, + schema: Some("public".to_string()), + columns: None, + }]; + + let ingestion_config = IngestionConfig::default(); + let (ingestor, mut _iterator) = Ingestor::initialize_channel(ingestion_config); + + let snapshotter = PostgresSnapshotter { + conn_config, + ingestor: &ingestor, + schema: None, + }; + + let actual = snapshotter.sync_tables(&input_tables).await; + + assert!(actual.is_err()); + } +} diff --git a/dozer-ingestion/src/connectors/postgres/test_utils.rs b/dozer-ingestion/postgres/src/test_utils.rs similarity index 55% rename from dozer-ingestion/src/connectors/postgres/test_utils.rs rename to dozer-ingestion/postgres/src/test_utils.rs index 44a2237ee1..f8dba354ec 100644 --- a/dozer-ingestion/src/connectors/postgres/test_utils.rs +++ b/dozer-ingestion/postgres/src/test_utils.rs @@ -1,19 +1,14 @@ -use crate::connectors::postgres::tests::client::TestPostgresClient; +use dozer_ingestion_connector::dozer_types::models::connection::ConnectionConfig; use postgres_types::PgLsn; -use std::ops::Deref; -use std::{error::Error, panic}; - -use crate::connectors::postgres::replication_slot_helper::ReplicationSlotHelper; -use dozer_types::models::{config::Config, connection::ConnectionConfig}; +use std::error::Error; use std::str::FromStr; use tokio_postgres::{error::DbError, Error as PostgresError, SimpleQueryMessage}; -use super::connection::client::Client; +use crate::connection::helper::map_connection_config; +use crate::replication_slot_helper::ReplicationSlotHelper; +use crate::tests::client::TestPostgresClient; -pub async fn get_client(app_config: Config) -> TestPostgresClient { - let config = &app_config.connections[0].config; - TestPostgresClient::new(config).await -} +use super::connection::client::Client; pub async fn create_slot(client_mut: &mut Client, slot_name: &str) -> PgLsn { client_mut @@ -54,20 +49,24 @@ pub async fn retry_drop_active_slot( } } -pub fn get_config(app_config: Config) -> tokio_postgres::Config { - if let ConnectionConfig::Postgres(connection) = &app_config.connections.get(0).unwrap().config { - let config_replenished = connection.replenish().unwrap(); - let mut config = tokio_postgres::Config::new(); - config - .dbname(&config_replenished.database) - 
.user(&config_replenished.user) - .host(&config_replenished.host) - .password(&config_replenished.password) - .port(config_replenished.port as u16) - .ssl_mode(config_replenished.sslmode) - .deref() - .clone() - } else { - panic!("Postgres config was expected") - } +pub async fn load_test_connection_config() -> ConnectionConfig { + let config = dozer_ingestion_connector::test_util::load_test_connection_config(); + let postgres_config = map_connection_config(&config).unwrap(); + // We're going to drop `dozer_test` so connect to another database. + let mut connect_config = postgres_config.clone(); + connect_config.dbname("postgres"); + let mut client = TestPostgresClient::new_with_postgres_config(connect_config).await; + client + .execute_query(&format!( + "DROP DATABASE IF EXISTS {}", + postgres_config.get_dbname().unwrap() + )) + .await; + client + .execute_query(&format!( + "CREATE DATABASE {}", + postgres_config.get_dbname().unwrap() + )) + .await; + config } diff --git a/dozer-ingestion/src/connectors/postgres/tests/client.rs b/dozer-ingestion/postgres/src/tests/client.rs similarity index 91% rename from dozer-ingestion/src/connectors/postgres/tests/client.rs rename to dozer-ingestion/postgres/src/tests/client.rs index 1abc40987f..feed792ec5 100644 --- a/dozer-ingestion/src/connectors/postgres/tests/client.rs +++ b/dozer-ingestion/postgres/src/tests/client.rs @@ -1,9 +1,14 @@ -use crate::connectors::postgres::connection::client::Client; -use crate::connectors::postgres::connection::helper::{connect, map_connection_config}; -use dozer_types::models::connection::ConnectionConfig; -use dozer_types::rust_decimal::Decimal; use std::fmt::Write; +use dozer_ingestion_connector::dozer_types::{ + models::connection::ConnectionConfig, rust_decimal::Decimal, +}; + +use crate::connection::{ + client::Client, + helper::{connect, map_connection_config}, +}; + pub struct TestPostgresClient { client: Client, pub postgres_config: tokio_postgres::Config, diff --git a/dozer-ingestion/postgres/src/tests/continue_replication_tests.rs b/dozer-ingestion/postgres/src/tests/continue_replication_tests.rs new file mode 100644 index 0000000000..926df7e579 --- /dev/null +++ b/dozer-ingestion/postgres/src/tests/continue_replication_tests.rs @@ -0,0 +1,163 @@ +#[cfg(test)] +mod tests { + use dozer_ingestion_connector::{tokio, TableIdentifier}; + // use crate::connectors::Connector; + // use crate::ingestion::IngestionConfig; + // use dozer_types::models::ingestion_types::IngestionMessage; + // use dozer_types::node::OpIdentifier; + use rand::Rng; + use serial_test::serial; + use tokio_postgres::config::ReplicationMode; + + use crate::{ + connection::helper::{self, map_connection_config}, + connector::{PostgresConfig, PostgresConnector}, + replication_slot_helper::ReplicationSlotHelper, + test_utils::{create_slot, load_test_connection_config, retry_drop_active_slot}, + tests::client::TestPostgresClient, + }; + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_continue_replication() { + let config = load_test_connection_config().await; + let conn_config = map_connection_config(&config).unwrap(); + let postgres_config = PostgresConfig { + name: "test".to_string(), + config: conn_config.clone(), + schema: None, + }; + + let connector = PostgresConnector::new(postgres_config); + + // let result = connector.can_start_from((1, 0)).unwrap(); + // assert!(!result, "Cannot continue, because slot doesnt exist"); + + let mut replication_conn_config = conn_config; + 
replication_conn_config.replication_mode(ReplicationMode::Logical); + + // Creating publication + let client = helper::connect(replication_conn_config.clone()) + .await + .unwrap(); + connector.create_publication(client, None).await.unwrap(); + + // Creating slot + let mut client = helper::connect(replication_conn_config.clone()) + .await + .unwrap(); + let slot_name = connector.get_slot_name(); + let _parsed_lsn = create_slot(&mut client, &slot_name).await; + + // let result = connector + // .can_start_from((u64::from(parsed_lsn), 0)) + // .unwrap(); + + ReplicationSlotHelper::drop_replication_slot(&mut client, &slot_name) + .await + .unwrap(); + // assert!( + // result, + // "Replication slot is created and it should be possible to continue" + // ); + } + + #[tokio::test] + #[ignore] + #[serial] + async fn test_connector_continue_replication_from_lsn() { + let config = load_test_connection_config().await; + + let mut test_client = TestPostgresClient::new(&config).await; + let mut rng = rand::thread_rng(); + let table_name = format!("test_table_{}", rng.gen::()); + let connector_name = format!("pg_connector_{}", rng.gen::()); + test_client.create_simple_table("public", &table_name).await; + + let conn_config = map_connection_config(&config).unwrap(); + let postgres_config = PostgresConfig { + name: connector_name, + config: conn_config.clone(), + schema: None, + }; + + let connector = PostgresConnector::new(postgres_config); + + let mut replication_conn_config = conn_config; + replication_conn_config.replication_mode(ReplicationMode::Logical); + + // Creating publication + let client = helper::connect(replication_conn_config.clone()) + .await + .unwrap(); + let table_identifier = TableIdentifier { + schema: Some("public".to_string()), + name: table_name.clone(), + }; + connector + .create_publication(client, Some(&[table_identifier])) + .await + .unwrap(); + + // Creating slot + let mut client = helper::connect(replication_conn_config.clone()) + .await + .unwrap(); + + let slot_name = connector.get_slot_name(); + let _parsed_lsn = create_slot(&mut client, &slot_name).await; + + // let config = IngestionConfig::default(); + // let (ingestor, mut iterator) = Ingestor::initialize_channel(config); + + test_client.insert_rows(&table_name, 4, None).await; + + // assume that we already received two rows + // let last_parsed_position = 2_u64; + // thread::spawn(move || { + // let connector = PostgresConnector::new(postgres_config); + // let _ = connector.start( + // Some((u64::from(parsed_lsn), last_parsed_position)), + // &ingestor, + // tables, + // ); + // }); + + // let mut i = last_parsed_position; + // while i < 4 { + // i += 1; + // if let Some(IngestionMessage { + // identifier: OpIdentifier { seq_in_tx, .. }, + // .. + // }) = iterator.next() + // { + // assert_eq!(i, seq_in_tx); + // } else { + // panic!("Unexpected operation"); + // } + // } + + // test_client.insert_rows(&table_name, 3, None); + // let mut i = 0; + // while i < 3 { + // i += 1; + // if let Some(IngestionMessage { + // identifier: OpIdentifier { seq_in_tx, .. }, + // .. 
+ // }) = iterator.next() + // { + // assert_eq!(i, seq_in_tx); + // } else { + // panic!("Unexpected operation"); + // } + // } + + if let Err(e) = ReplicationSlotHelper::drop_replication_slot(&mut client, &slot_name).await + { + retry_drop_active_slot(e, &mut client, &slot_name) + .await + .unwrap(); + } + } +} diff --git a/dozer-ingestion/src/tests/cases/postgres/dozer-config.yaml b/dozer-ingestion/postgres/src/tests/dozer-config.yaml similarity index 100% rename from dozer-ingestion/src/tests/cases/postgres/dozer-config.yaml rename to dozer-ingestion/postgres/src/tests/dozer-config.yaml diff --git a/dozer-ingestion/src/connectors/postgres/tests/e2e.rs b/dozer-ingestion/postgres/src/tests/e2e.rs similarity index 100% rename from dozer-ingestion/src/connectors/postgres/tests/e2e.rs rename to dozer-ingestion/postgres/src/tests/e2e.rs diff --git a/dozer-ingestion/src/connectors/postgres/tests/mod.rs b/dozer-ingestion/postgres/src/tests/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/postgres/tests/mod.rs rename to dozer-ingestion/postgres/src/tests/mod.rs diff --git a/dozer-ingestion/src/connectors/postgres/xlog_mapper.rs b/dozer-ingestion/postgres/src/xlog_mapper.rs similarity index 97% rename from dozer-ingestion/src/connectors/postgres/xlog_mapper.rs rename to dozer-ingestion/postgres/src/xlog_mapper.rs index 3e0ebaf119..08b0303e6d 100644 --- a/dozer-ingestion/src/connectors/postgres/xlog_mapper.rs +++ b/dozer-ingestion/postgres/src/xlog_mapper.rs @@ -1,8 +1,7 @@ -use crate::connectors::postgres::helper; -use crate::errors::{PostgresConnectorError, PostgresSchemaError}; -use dozer_types::node::OpIdentifier; -use dozer_types::types::{Field, Operation, Record}; -use helper::postgres_type_to_dozer_type; +use dozer_ingestion_connector::dozer_types::{ + node::OpIdentifier, + types::{Field, Operation, Record}, +}; use postgres_protocol::message::backend::LogicalReplicationMessage::{ Begin, Commit, Delete, Insert, Relation, Update, }; @@ -13,6 +12,11 @@ use postgres_types::Type; use std::collections::hash_map::Entry; use std::collections::HashMap; +use crate::{ + helper::{self, postgres_type_to_dozer_type}, + PostgresConnectorError, PostgresSchemaError, +}; + #[derive(Debug)] pub struct Table { columns: Vec, diff --git a/dozer-ingestion/snowflake/Cargo.toml b/dozer-ingestion/snowflake/Cargo.toml new file mode 100644 index 0000000000..fee5a9a1ee --- /dev/null +++ b/dozer-ingestion/snowflake/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "dozer-ingestion-snowflake" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dozer-ingestion-connector = { path = "../connector" } +odbc = "0.17.0" +include_dir = "0.7.3" +genawaiter = "0.99.1" +memchr = "2.6.4" +rand = "0.8.5" diff --git a/dozer-ingestion/src/connectors/snowflake/README.md b/dozer-ingestion/snowflake/src/README.md similarity index 100% rename from dozer-ingestion/src/connectors/snowflake/README.md rename to dozer-ingestion/snowflake/src/README.md diff --git a/dozer-ingestion/src/connectors/snowflake/connection/client.rs b/dozer-ingestion/snowflake/src/connection/client.rs similarity index 86% rename from dozer-ingestion/src/connectors/snowflake/connection/client.rs rename to dozer-ingestion/snowflake/src/connection/client.rs index 43baed4a70..b5c4af758e 100644 --- a/dozer-ingestion/src/connectors/snowflake/connection/client.rs +++ b/dozer-ingestion/snowflake/src/connection/client.rs @@ -1,20 +1,14 @@ -use 
dozer_types::log::debug; -use dozer_types::models::ingestion_types::SnowflakeConfig; - -use crate::errors::{ConnectorError, SnowflakeError, SnowflakeSchemaError}; - -use crate::connectors::snowflake::schema_helper::SchemaHelper; -use crate::connectors::{CdcType, SourceSchema}; -use crate::errors::SnowflakeError::{NonResumableQuery, QueryError, SnowflakeStreamError}; -use crate::errors::SnowflakeSchemaError::SchemaConversionError; -use crate::errors::SnowflakeSchemaError::{ - DecimalConvertError, InvalidDateError, InvalidTimeError, +use dozer_ingestion_connector::{ + dozer_types::{ + chrono::{NaiveDate, NaiveDateTime, NaiveTime}, + indexmap::IndexMap, + log::debug, + models::ingestion_types::SnowflakeConfig, + rust_decimal::Decimal, + types::*, + }, + CdcType, SourceSchema, }; -use crate::errors::SnowflakeStreamError::TimeTravelNotAvailableError; -use dozer_types::chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use dozer_types::indexmap::IndexMap; -use dozer_types::rust_decimal::Decimal; -use dozer_types::types::*; use odbc::ffi::{SqlDataType, SQL_DATE_STRUCT, SQL_TIMESTAMP_STRUCT}; use odbc::odbc_safe::{AutocommitOn, Odbc3}; use odbc::{ColumnDescriptor, Cursor, DiagnosticRecord, Environment, Executed, HasResult}; @@ -24,6 +18,10 @@ use std::collections::HashMap; use std::fmt::Write; use std::ops::Deref; +use crate::{ + schema_helper::SchemaHelper, SnowflakeError, SnowflakeSchemaError, SnowflakeStreamError, +}; + use super::helpers::is_network_failure; use super::pool::{Conn, Pool}; @@ -43,7 +41,8 @@ fn convert_decimal(bytes: &[u8], scale: u16) -> Result { } impl<'env> Client<'env> { - pub fn new(config: &SnowflakeConfig, env: &'env Environment) -> Self { + pub fn new(config: SnowflakeConfig, env: &'env Environment) -> Self { let mut conn_hashmap: HashMap = HashMap::new(); let driver = match &config.driver { None => "Snowflake".to_string(), @@ -165,14 +164,14 @@ impl<'env> Client<'env> { }; conn_hashmap.insert("Driver".to_string(), driver); - conn_hashmap.insert("Server".to_string(), config.clone().server); - conn_hashmap.insert("Port".to_string(), config.clone().port); - conn_hashmap.insert("Uid".to_string(), config.clone().user); - conn_hashmap.insert("Pwd".to_string(), config.clone().password); - conn_hashmap.insert("Schema".to_string(), config.clone().schema); - conn_hashmap.insert("Warehouse".to_string(), config.clone().warehouse); - conn_hashmap.insert("Database".to_string(), config.clone().database); - conn_hashmap.insert("Role".to_string(), config.clone().role); + conn_hashmap.insert("Server".to_string(), config.server); + conn_hashmap.insert("Port".to_string(), config.port); + conn_hashmap.insert("Uid".to_string(), config.user); + conn_hashmap.insert("Pwd".to_string(), config.password); + conn_hashmap.insert("Schema".to_string(), config.schema); + conn_hashmap.insert("Warehouse".to_string(), config.warehouse); + conn_hashmap.insert("Database".to_string(), config.database); + conn_hashmap.insert("Role".to_string(), config.role); let mut parts = vec![]; conn_hashmap.keys().for_each(|k| { @@ -196,7 +195,7 @@ impl<'env> Client<'env> { } pub fn exec(&self, query: &str) -> Result<(), SnowflakeError> { - exec_drop(&self.pool, query).map_err(QueryError) + exec_drop(&self.pool, query).map_err(SnowflakeError::QueryError) } pub fn exec_stream_creation(&self, query: String) -> Result { @@ -206,9 +205,11 @@ impl<'env> Client<'env> { if e.get_native_error() == 2203 { Ok(false) } else if e.get_native_error() == 707 { - Err(SnowflakeStreamError(TimeTravelNotAvailableError)) + 
Err(SnowflakeError::SnowflakeStreamError( + SnowflakeStreamError::TimeTravelNotAvailableError, + )) } else { - Err(QueryError(e)) + Err(SnowflakeError::QueryError(e)) } }, |_| Ok(true), @@ -219,7 +220,7 @@ impl<'env> Client<'env> { if e.get_native_error() == 2203 { Ok(false) } else { - Err(QueryError(e)) + Err(SnowflakeError::QueryError(e)) } } @@ -227,7 +228,7 @@ impl<'env> Client<'env> { if e.get_native_error() == 2003 { Ok(false) } else { - Err(QueryError(e)) + Err(SnowflakeError::QueryError(e)) } } @@ -253,7 +254,7 @@ impl<'env> Client<'env> { tables_indexes: Option>, keys: HashMap>, schema_name: String, - ) -> Result>, SnowflakeError> { + ) -> Result>, SnowflakeError> { let tables_condition = tables_indexes.as_ref().map_or("".to_string(), |tables| { let mut buf = String::new(); buf.write_str(" AND TABLE_NAME IN(").unwrap(); @@ -369,9 +370,7 @@ impl<'env> Client<'env> { Ok((name, SourceSchema::new(schema, cdc_type))) } - Err(e) => Err(ConnectorError::SnowflakeError( - SnowflakeError::SnowflakeSchemaError(e), - )), + Err(e) => Err(SnowflakeError::SnowflakeSchemaError(e)), }) .collect()) } @@ -383,7 +382,7 @@ impl<'env> Client<'env> { let mut keys: HashMap> = HashMap::new(); for result in results { let row_data = match result { - Err(NonResumableQuery(_)) => continue 'retry, + Err(SnowflakeError::NonResumableQuery(_)) => continue 'retry, result => result?, }; let empty = "".to_string(); @@ -427,7 +426,7 @@ fn add_query_offset(query: &str, offset: u64) -> Result "{query} LIMIT 18446744073709551615 OFFSET {offset}" )) } else { - Err(NonResumableQuery(query.to_string())) + Err(SnowflakeError::NonResumableQuery(query.to_string())) } } } @@ -478,28 +477,32 @@ fn exec_iter(pool: Pool, query: String) -> Result { let mut generator: Gen = gen!({ let mut cursor_position = 0u64; 'retry: loop { - let conn = pool.get_conn().map_err(QueryError)?; + let conn = pool.get_conn().map_err(SnowflakeError::QueryError)?; { let mut data = match exec_helper(&conn, &add_query_offset(&query, cursor_position)?) - .map_err(QueryError)? + .map_err(SnowflakeError::QueryError)? { Some(data) => data, None => break, }; - let cols = data.num_result_cols().map_err(|e| QueryError(e.into()))?; + let cols = data + .num_result_cols() + .map_err(|e| SnowflakeError::QueryError(e.into()))?; let mut schema = Vec::new(); for i in 1..(cols + 1) { let value = i.try_into(); let column_descriptor = match value { - Ok(v) => data.describe_col(v).map_err(|e| QueryError(e.into()))?, - Err(e) => Err(SchemaConversionError(e))?, + Ok(v) => data + .describe_col(v) + .map_err(|e| SnowflakeError::QueryError(e.into()))?, + Err(e) => Err(SnowflakeSchemaError::SchemaConversionError(e))?, }; schema.push(column_descriptor) } yield_!(Schema(schema.clone())); while let Some(cursor) = - retry!(data.fetch(),'retry).map_err(|e| QueryError(e.into()))? + retry!(data.fetch(),'retry).map_err(|e| SnowflakeError::QueryError(e.into()))? 
{ let fields = get_fields_from_cursor(cursor, cols, &schema)?; yield_!(Row(fields)); diff --git a/dozer-ingestion/src/connectors/snowflake/connection/helpers.rs b/dozer-ingestion/snowflake/src/connection/helpers.rs similarity index 100% rename from dozer-ingestion/src/connectors/snowflake/connection/helpers.rs rename to dozer-ingestion/snowflake/src/connection/helpers.rs diff --git a/dozer-ingestion/src/connectors/snowflake/connection/mod.rs b/dozer-ingestion/snowflake/src/connection/mod.rs similarity index 100% rename from dozer-ingestion/src/connectors/snowflake/connection/mod.rs rename to dozer-ingestion/snowflake/src/connection/mod.rs diff --git a/dozer-ingestion/src/connectors/snowflake/connection/pool.rs b/dozer-ingestion/snowflake/src/connection/pool.rs similarity index 96% rename from dozer-ingestion/src/connectors/snowflake/connection/pool.rs rename to dozer-ingestion/snowflake/src/connection/pool.rs index 86405bf25a..9e3d0600ba 100644 --- a/dozer-ingestion/src/connectors/snowflake/connection/pool.rs +++ b/dozer-ingestion/snowflake/src/connection/pool.rs @@ -5,13 +5,12 @@ use std::{ rc::Rc, }; +use dozer_ingestion_connector::{blocking_retry_on_network_failure, dozer_types}; use odbc::{ odbc_safe::{AutocommitOn, Odbc3}, Connection, DiagnosticRecord, Environment, }; -use crate::blocking_retry_on_network_failure; - use super::helpers::is_network_failure; const MAX_POOL_SIZE: usize = 16; diff --git a/dozer-ingestion/snowflake/src/connector/mod.rs b/dozer-ingestion/snowflake/src/connector/mod.rs new file mode 100644 index 0000000000..4c719cecdd --- /dev/null +++ b/dozer-ingestion/snowflake/src/connector/mod.rs @@ -0,0 +1,2 @@ +mod snowflake; +pub use snowflake::SnowflakeConnector; diff --git a/dozer-ingestion/src/connectors/snowflake/connector/snowflake.rs b/dozer-ingestion/snowflake/src/connector/snowflake.rs similarity index 75% rename from dozer-ingestion/src/connectors/snowflake/connector/snowflake.rs rename to dozer-ingestion/snowflake/src/connector/snowflake.rs index c889d92945..dde8c76d28 100644 --- a/dozer-ingestion/src/connectors/snowflake/connector/snowflake.rs +++ b/dozer-ingestion/snowflake/src/connector/snowflake.rs @@ -1,21 +1,21 @@ -use crate::connectors::snowflake::connection::client::Client; -use crate::connectors::{ - Connector, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, TableToIngest, +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + errors::internal::BoxedError, + log::{info, warn}, + models::ingestion_types::{default_snowflake_poll_interval, SnowflakeConfig}, + node::OpIdentifier, + types::FieldType, + }, + tokio, Connector, Ingestor, SourceSchema, SourceSchemaResult, TableIdentifier, TableInfo, + TableToIngest, }; -use crate::errors::ConnectorError; -use crate::ingestion::Ingestor; -use dozer_types::models::ingestion_types::{default_snowflake_poll_interval, SnowflakeConfig}; -use dozer_types::node::OpIdentifier; -use dozer_types::tonic::async_trait; use odbc::create_environment_v3; -use crate::connectors::snowflake::stream_consumer::StreamConsumer; - -use dozer_types::log::{info, warn}; - -use crate::connectors::snowflake::schema_helper::SchemaHelper; - -use crate::errors::{SnowflakeError, SnowflakeStreamError}; +use crate::{ + connection::client::Client, schema_helper::SchemaHelper, stream_consumer::StreamConsumer, + SnowflakeError, SnowflakeStreamError, +}; #[derive(Debug)] pub struct SnowflakeConnector { @@ -31,26 +31,27 @@ impl SnowflakeConnector { async fn get_schemas_async( &self, table_names: Option>, - ) -> Result>, 
ConnectorError> { + ) -> Result>, SnowflakeError> { let config = self.config.clone(); - spawn_blocking(move || SchemaHelper::get_schema(&config, table_names.as_deref())).await + spawn_blocking(move || SchemaHelper::get_schema(config, table_names.as_deref())).await } } #[async_trait] impl Connector for SnowflakeConnector { - fn types_mapping() -> Vec<(String, Option)> + fn types_mapping() -> Vec<(String, Option)> where Self: Sized, { todo!() } - async fn validate_connection(&self) -> Result<(), ConnectorError> { - self.get_schemas_async(None).await.map(|_| ()) + async fn validate_connection(&self) -> Result<(), BoxedError> { + self.get_schemas_async(None).await?; + Ok(()) } - async fn list_tables(&self) -> Result, ConnectorError> { + async fn list_tables(&self) -> Result, BoxedError> { let schemas = self.get_schemas_async(None).await?; let mut tables = vec![]; for schema in schemas { @@ -59,7 +60,7 @@ impl Connector for SnowflakeConnector { Ok(tables) } - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError> { + async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { let table_names = tables .iter() .map(|table| table.name.clone()) @@ -74,7 +75,7 @@ impl Connector for SnowflakeConnector { async fn list_columns( &self, tables: Vec, - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { let table_names = tables .iter() .map(|table| table.name.clone()) @@ -101,7 +102,7 @@ impl Connector for SnowflakeConnector { async fn get_schemas( &self, table_infos: &[TableInfo], - ) -> Result, ConnectorError> { + ) -> Result, BoxedError> { warn!("TODO: respect `column_names` in `table_infos`"); let table_names = table_infos .iter() @@ -111,7 +112,7 @@ impl Connector for SnowflakeConnector { .get_schemas_async(Some(table_names)) .await? 
.into_iter() - .map(|schema_result| schema_result.map(|(_, schema)| schema)) + .map(|schema_result| schema_result.map(|(_, schema)| schema).map_err(Into::into)) .collect()) } @@ -119,7 +120,7 @@ impl Connector for SnowflakeConnector { &self, ingestor: &Ingestor, tables: Vec, - ) -> Result<(), ConnectorError> { + ) -> Result<(), BoxedError> { spawn_blocking({ let name = self.name.clone(); let config = self.config.clone(); @@ -127,6 +128,7 @@ impl Connector for SnowflakeConnector { move || run(name, config, tables, ingestor) }) .await + .map_err(Into::into) } } @@ -135,13 +137,13 @@ fn run( config: SnowflakeConfig, tables: Vec, ingestor: Ingestor, -) -> Result<(), ConnectorError> { +) -> Result<(), SnowflakeError> { // SNAPSHOT part - run it when stream table doesn't exist let env = create_environment_v3().unwrap(); - let stream_client = Client::new(&config, &env); let interval = config .poll_interval_seconds .unwrap_or_else(default_snowflake_poll_interval); + let stream_client = Client::new(config, &env); let mut consumer = StreamConsumer::new(); let mut iteration = 0; @@ -163,10 +165,8 @@ fn run( if let Ok(false) = StreamConsumer::is_stream_created(&stream_client, &table.name) { - return Err(ConnectorError::SnowflakeError( - SnowflakeError::SnowflakeStreamError( - SnowflakeStreamError::StreamNotFound, - ), + return Err(SnowflakeError::SnowflakeStreamError( + SnowflakeStreamError::StreamNotFound, )); } } diff --git a/dozer-ingestion/src/connectors/snowflake/flow.png b/dozer-ingestion/snowflake/src/flow.png similarity index 100% rename from dozer-ingestion/src/connectors/snowflake/flow.png rename to dozer-ingestion/snowflake/src/flow.png diff --git a/dozer-ingestion/snowflake/src/lib.rs b/dozer-ingestion/snowflake/src/lib.rs new file mode 100644 index 0000000000..38f38124c8 --- /dev/null +++ b/dozer-ingestion/snowflake/src/lib.rs @@ -0,0 +1,70 @@ +use std::num::TryFromIntError; + +use dozer_ingestion_connector::dozer_types::{ + rust_decimal, + thiserror::{self, Error}, +}; +use odbc::DiagnosticRecord; + +pub mod connection; +pub mod connector; +mod schema_helper; +pub mod stream_consumer; +pub mod test_utils; + +#[cfg(test)] +mod tests; + +#[derive(Error, Debug)] +pub enum SnowflakeError { + #[error("Snowflake query error")] + QueryError(#[source] Box), + + #[error("Snowflake connection error")] + ConnectionError(#[source] Box), + + #[error(transparent)] + SnowflakeSchemaError(#[from] SnowflakeSchemaError), + + #[error(transparent)] + SnowflakeStreamError(#[from] SnowflakeStreamError), + + #[error("A network error occurred, but this query is not resumable. 
query: {0}")] + NonResumableQuery(String), +} + +#[derive(Error, Debug)] +pub enum SnowflakeSchemaError { + #[error("Column type {0} not supported")] + ColumnTypeNotSupported(String), + + #[error("Value conversion Error")] + ValueConversionError(#[source] Box), + + #[error("Invalid date")] + InvalidDateError, + + #[error("Invalid time")] + InvalidTimeError, + + #[error("Schema conversion Error: {0}")] + SchemaConversionError(#[source] TryFromIntError), + + #[error("Decimal convert error")] + DecimalConvertError(#[source] rust_decimal::Error), +} + +#[derive(Error, Debug)] +pub enum SnowflakeStreamError { + #[error("Time travel not available for table")] + TimeTravelNotAvailableError, + + #[error("Unsupported \"{0}\" action in stream")] + UnsupportedActionInStream(String), + + #[error("Cannot determine action")] + CannotDetermineAction, + + #[error("Stream not found")] + StreamNotFound, +} diff --git a/dozer-ingestion/src/connectors/snowflake/schema_helper.rs b/dozer-ingestion/snowflake/src/schema_helper.rs similarity index 70% rename from dozer-ingestion/src/connectors/snowflake/schema_helper.rs rename to dozer-ingestion/snowflake/src/schema_helper.rs index 5d7abce3d7..31eb7085cf 100644 --- a/dozer-ingestion/src/connectors/snowflake/schema_helper.rs +++ b/dozer-ingestion/snowflake/src/schema_helper.rs @@ -1,26 +1,24 @@ -use crate::errors::{ConnectorError, SnowflakeSchemaError}; -use dozer_types::models::ingestion_types::SnowflakeConfig; +use dozer_ingestion_connector::{ + dozer_types::{models::ingestion_types::SnowflakeConfig, types::FieldType}, + SourceSchema, +}; use odbc::create_environment_v3; use std::collections::HashMap; -use crate::connectors::snowflake::connection::client::Client; -use crate::connectors::SourceSchema; -use dozer_types::types::FieldType; +use crate::{connection::client::Client, SnowflakeError, SnowflakeSchemaError}; pub struct SchemaHelper {} impl SchemaHelper { #[allow(clippy::type_complexity)] pub fn get_schema( - config: &SnowflakeConfig, + config: SnowflakeConfig, table_names: Option<&[String]>, - ) -> Result>, ConnectorError> { + ) -> Result>, SnowflakeError> { let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); - let client = Client::new(config, &env); + let client = Client::new(config.clone(), &env); - let keys = client - .fetch_keys() - .map_err(ConnectorError::SnowflakeError)?; + let keys = client.fetch_keys()?; let tables_indexes = table_names.map(|table_names| { let mut result = HashMap::new(); @@ -31,9 +29,7 @@ impl SchemaHelper { result }); - client - .fetch_tables(tables_indexes, keys, config.schema.to_string()) - .map_err(ConnectorError::SnowflakeError) + client.fetch_tables(tables_indexes, keys, config.schema) } pub fn map_schema_type( diff --git a/dozer-ingestion/src/connectors/snowflake/stream_consumer.rs b/dozer-ingestion/snowflake/src/stream_consumer.rs similarity index 76% rename from dozer-ingestion/src/connectors/snowflake/stream_consumer.rs rename to dozer-ingestion/snowflake/src/stream_consumer.rs index c092491a55..b1b4ed4dfd 100644 --- a/dozer-ingestion/src/connectors/snowflake/stream_consumer.rs +++ b/dozer-ingestion/snowflake/src/stream_consumer.rs @@ -1,12 +1,13 @@ -use crate::connectors::snowflake::connection::client::Client; +use dozer_ingestion_connector::{ + dozer_types::{ + models::ingestion_types::IngestionMessage, + node::OpIdentifier, + types::{Field, Operation, Record}, + }, + Ingestor, +}; -use crate::errors::{ConnectorError, SnowflakeError}; -use crate::ingestion::Ingestor; -use 
dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::node::OpIdentifier; - -use crate::errors::SnowflakeStreamError::{CannotDetermineAction, UnsupportedActionInStream}; -use dozer_types::types::{Field, Operation, Record}; +use crate::{connection::client::Client, SnowflakeError, SnowflakeStreamError}; #[derive(Default)] pub struct StreamConsumer {} @@ -24,10 +25,8 @@ impl StreamConsumer { format!("dozer_{table_name}_{client_name}_stream_temp") } - pub fn is_stream_created(client: &Client, table_name: &str) -> Result { - client - .stream_exist(&Self::get_stream_table_name(table_name, &client.get_name())) - .map_err(ConnectorError::SnowflakeError) + pub fn is_stream_created(client: &Client, table_name: &str) -> Result { + client.stream_exist(&Self::get_stream_table_name(table_name, &client.get_name())) } pub fn drop_stream(client: &Client, table_name: &str) -> Result<(), SnowflakeError> { @@ -39,7 +38,7 @@ impl StreamConsumer { client.exec(&query) } - pub fn create_stream(client: &Client, table_name: &String) -> Result<(), ConnectorError> { + pub fn create_stream(client: &Client, table_name: &String) -> Result<(), SnowflakeError> { let query = format!( "CREATE STREAM {} on table {} SHOW_INITIAL_ROWS = TRUE", Self::get_stream_table_name(table_name, &client.get_name()), @@ -64,7 +63,7 @@ impl StreamConsumer { row: Vec, action_idx: usize, used_columns_for_schema: usize, - ) -> Result { + ) -> Result { if let Field::String(action) = row.get(action_idx).unwrap() { let mut row_mut = row.clone(); let insert_action = &"INSERT"; @@ -81,13 +80,13 @@ impl StreamConsumer { old: Record::new(row_mut), }) } else { - Err(ConnectorError::SnowflakeError( - SnowflakeError::SnowflakeStreamError(UnsupportedActionInStream(action.clone())), + Err(SnowflakeError::SnowflakeStreamError( + SnowflakeStreamError::UnsupportedActionInStream(action.clone()), )) } } else { - Err(ConnectorError::SnowflakeError( - SnowflakeError::SnowflakeStreamError(CannotDetermineAction), + Err(SnowflakeError::SnowflakeStreamError( + SnowflakeStreamError::CannotDetermineAction, )) } } @@ -99,7 +98,7 @@ impl StreamConsumer { ingestor: &Ingestor, table_index: usize, iteration: u64, - ) -> Result<(), ConnectorError> { + ) -> Result<(), SnowflakeError> { let temp_table_name = Self::get_stream_temp_table_name(table_name, &client.get_name()); let stream_name = Self::get_stream_table_name(table_name, &client.get_name()); @@ -123,18 +122,22 @@ impl StreamConsumer { for (idx, result) in rows.enumerate() { let row = result?; let op = Self::get_operation(row, action_idx, used_columns_for_schema)?; - ingestor + if ingestor .blocking_handle_message(IngestionMessage::OperationEvent { table_index, op, id: Some(OpIdentifier::new(iteration, idx as u64)), }) - .map_err(|_| ConnectorError::IngestorError)?; + .is_err() + { + // If receiver is dropped, we can stop processing + return Ok(()); + } } } let query = format!("DROP TABLE {temp_table_name};"); - client.exec(&query).map_err(ConnectorError::SnowflakeError) + client.exec(&query) } } diff --git a/dozer-ingestion/src/connectors/snowflake/test_utils.rs b/dozer-ingestion/snowflake/src/test_utils.rs similarity index 56% rename from dozer-ingestion/src/connectors/snowflake/test_utils.rs rename to dozer-ingestion/snowflake/src/test_utils.rs index 1b80d6c152..d2d6defe1f 100644 --- a/dozer-ingestion/src/connectors/snowflake/test_utils.rs +++ b/dozer-ingestion/snowflake/src/test_utils.rs @@ -1,11 +1,10 @@ -use crate::connectors::snowflake::connection::client::Client; -use 
crate::connectors::snowflake::stream_consumer::StreamConsumer; -use crate::errors::SnowflakeError; -use dozer_types::models::ingestion_types::SnowflakeConfig; +use dozer_ingestion_connector::dozer_types::models::ingestion_types::SnowflakeConfig; use odbc::create_environment_v3; +use crate::{connection::client::Client, stream_consumer::StreamConsumer, SnowflakeError}; + pub fn remove_streams( - connection: &SnowflakeConfig, + connection: SnowflakeConfig, table_name: &str, ) -> Result { let env = create_environment_v3().unwrap(); diff --git a/dozer-ingestion/src/tests/cases/snowflake/dozer-config.yaml b/dozer-ingestion/snowflake/src/tests/dozer-config.yaml similarity index 100% rename from dozer-ingestion/src/tests/cases/snowflake/dozer-config.yaml rename to dozer-ingestion/snowflake/src/tests/dozer-config.yaml diff --git a/dozer-ingestion/snowflake/src/tests/mod.rs b/dozer-ingestion/snowflake/src/tests/mod.rs new file mode 100644 index 0000000000..372926d59d --- /dev/null +++ b/dozer-ingestion/snowflake/src/tests/mod.rs @@ -0,0 +1,206 @@ +use std::time::Duration; + +use dozer_ingestion_connector::{ + dozer_types::{ + models::connection::ConnectionConfig, + types::FieldType::{Binary, Boolean, Date, Decimal, Float, Int, String, Timestamp}, + }, + test_util::{create_test_runtime, load_test_connection_config, spawn_connector}, + tokio, Connector, TableIdentifier, +}; +use odbc::create_environment_v3; +use rand::Rng; + +use crate::{ + connection::client::Client, connector::SnowflakeConnector, stream_consumer::StreamConsumer, + test_utils::remove_streams, +}; + +const TABLE_NAME: &str = "CUSTOMERS"; + +#[tokio::test] +#[ignore] +async fn test_disabled_connector_and_read_from_stream() { + let config = load_test_connection_config(); + let ConnectionConfig::Snowflake(connection) = config else { + panic!("Snowflake config expected"); + }; + + let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); + let client = Client::new(connection.clone(), &env); + + let mut rng = rand::thread_rng(); + let table_name = format!("CUSTOMER_TEST_{}", rng.gen::()); + + client + .exec(&format!( + "CREATE TABLE {table_name} LIKE SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.CUSTOMER;" + )) + .unwrap(); + client.exec(&format!("ALTER TABLE PUBLIC.{table_name} ADD CONSTRAINT {table_name}_PK PRIMARY KEY (C_CUSTKEY);")).unwrap(); + client.exec(&format!("INSERT INTO {table_name} SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.CUSTOMER LIMIT 100")).unwrap(); + + remove_streams(connection.clone(), TABLE_NAME).unwrap(); + + let runtime = create_test_runtime(); + let connector = SnowflakeConnector::new("snowflake".to_string(), connection.clone()); + let tables = runtime + .block_on( + connector.list_columns(vec![TableIdentifier::from_table_name(table_name.clone())]), + ) + .unwrap(); + + let (mut iterator, _) = spawn_connector(runtime, connector, tables); + + let mut i = 0; + while i < 100 { + iterator.next_timeout(Duration::from_secs(10)).await; + i += 1; + } + + assert_eq!(100, i); + + client.exec(&format!("INSERT INTO {table_name} SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.CUSTOMER LIMIT 100 OFFSET 100")).unwrap(); + + let mut i = 0; + while i < 100 { + iterator.next_timeout(Duration::from_secs(10)).await; + i += 1; + } + + assert_eq!(100, i); +} + +#[tokio::test] +#[ignore] +async fn test_disabled_connector_get_schemas_test() { + let config = load_test_connection_config(); + let ConnectionConfig::Snowflake(connection) = config else { + panic!("Snowflake config expected"); + }; + let connector = 
SnowflakeConnector::new("snowflake".to_string(), connection.clone()); + let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); + let client = Client::new(connection, &env); + + let mut rng = rand::thread_rng(); + let table_name = format!("SCHEMA_MAPPING_TEST_{}", rng.gen::()); + + client + .exec(&format!( + "create table {table_name} + ( + integer_column integer, + float_column float, + text_column varchar, + binary_column binary, + boolean_column boolean, + date_column date, + datetime_column datetime, + decimal_column decimal(5, 2) + ) + data_retention_time_in_days = 0; + + " + )) + .unwrap(); + + let table_infos = connector + .list_columns(vec![TableIdentifier::from_table_name(table_name.clone())]) + .await + .unwrap(); + let schemas = connector.get_schemas(&table_infos).await.unwrap(); + + let source_schema = schemas[0].as_ref().unwrap(); + + for field in &source_schema.schema.fields { + let expected_type = match field.name.as_str() { + "INTEGER_COLUMN" => Int, + "FLOAT_COLUMN" => Float, + "TEXT_COLUMN" => String, + "BINARY_COLUMN" => Binary, + "BOOLEAN_COLUMN" => Boolean, + "DATE_COLUMN" => Date, + "DATETIME_COLUMN" => Timestamp, + "DECIMAL_COLUMN" => Decimal, + _ => { + panic!("Unexpected column: {}", field.name) + } + }; + + assert_eq!(expected_type, field.typ); + } + + client.exec(&format!("DROP TABLE {table_name};")).unwrap(); +} + +#[tokio::test] +#[ignore] +async fn test_disabled_connector_missing_table_validator() { + let config = load_test_connection_config(); + let ConnectionConfig::Snowflake(connection) = config else { + panic!("Snowflake config expected"); + }; + let connector = SnowflakeConnector::new("snowflake".to_string(), connection.clone()); + + let not_existing_table = "not_existing_table".to_string(); + let result = connector + .list_columns(vec![TableIdentifier::from_table_name(not_existing_table)]) + .await; + + assert!(result + .unwrap_err() + .to_string() + .starts_with("table not found")); + + let table_infos = connector + .list_columns(vec![TableIdentifier::from_table_name( + TABLE_NAME.to_string(), + )]) + .await + .unwrap(); + let result = connector.get_schemas(&table_infos).await.unwrap(); + + assert!(result[0].is_ok()); +} + +#[tokio::test] +#[ignore] +async fn test_disabled_connector_is_stream_created() { + let config = load_test_connection_config(); + let ConnectionConfig::Snowflake(connection) = config else { + panic!("Snowflake config expected"); + }; + + let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); + let client = Client::new(connection, &env); + + let mut rng = rand::thread_rng(); + let table_name = format!("STREAM_EXIST_TEST_{}", rng.gen::()); + + client + .exec(&format!( + "CREATE TABLE {table_name} (id INTEGER) + data_retention_time_in_days = 0; " + )) + .unwrap(); + + let result = StreamConsumer::is_stream_created(&client, &table_name).unwrap(); + assert!( + !result, + "Stream was not created yet, so result of check should be false" + ); + + StreamConsumer::create_stream(&client, &table_name).unwrap(); + let result = StreamConsumer::is_stream_created(&client, &table_name).unwrap(); + assert!( + result, + "Stream is created, so result of check should be true" + ); + + StreamConsumer::drop_stream(&client, &table_name).unwrap(); + let result = StreamConsumer::is_stream_created(&client, &table_name).unwrap(); + assert!( + !result, + "Stream was dropped, so result of check should be false" + ); +} diff --git a/dozer-ingestion/src/connectors/dozer/mod.rs b/dozer-ingestion/src/connectors/dozer/mod.rs deleted file mode 
100644 index ace94384df..0000000000 --- a/dozer-ingestion/src/connectors/dozer/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -mod connector; -pub use connector::NestedDozerConnector; diff --git a/dozer-ingestion/src/connectors/ethereum/log/mod.rs b/dozer-ingestion/src/connectors/ethereum/log/mod.rs deleted file mode 100644 index 980ad53896..0000000000 --- a/dozer-ingestion/src/connectors/ethereum/log/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -mod connector; -mod helper; -mod sender; -pub use connector::EthLogConnector; - -#[cfg(test)] -mod tests; diff --git a/dozer-ingestion/src/connectors/grpc/mod.rs b/dozer-ingestion/src/connectors/grpc/mod.rs deleted file mode 100644 index 7a94cd234a..0000000000 --- a/dozer-ingestion/src/connectors/grpc/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -#[allow(dead_code)] -pub mod connector; -mod ingest; - -mod adapter; -pub use adapter::{ArrowAdapter, DefaultAdapter, GrpcIngestMessage, GrpcIngestor, IngestAdapter}; - -#[cfg(test)] -mod tests; diff --git a/dozer-ingestion/src/connectors/kafka/mod.rs b/dozer-ingestion/src/connectors/kafka/mod.rs deleted file mode 100644 index e7fb6222e8..0000000000 --- a/dozer-ingestion/src/connectors/kafka/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -pub mod connector; -pub mod debezium; -pub mod no_schema_registry_basic; -pub mod schema_registry_basic; -pub mod stream_consumer; -pub mod stream_consumer_basic; -mod stream_consumer_helper; -#[cfg(any(test, feature = "debezium_bench"))] -pub mod test_utils; -#[cfg(test)] -mod tests; diff --git a/dozer-ingestion/src/connectors/mod.rs b/dozer-ingestion/src/connectors/mod.rs deleted file mode 100644 index f1a1877cd7..0000000000 --- a/dozer-ingestion/src/connectors/mod.rs +++ /dev/null @@ -1,334 +0,0 @@ -pub mod dozer; -#[cfg(feature = "ethereum")] -pub mod ethereum; -pub mod grpc; -#[cfg(feature = "kafka")] -pub mod kafka; -pub mod mysql; -pub mod object_store; -pub mod postgres; - -#[cfg(feature = "mongodb")] -pub mod mongodb; - -use crate::connectors::postgres::connection::helper::map_connection_config; - -use std::fmt::Debug; - -#[cfg(feature = "mongodb")] -use self::mongodb::MongodbConnector; -#[cfg(feature = "kafka")] -use crate::connectors::kafka::connector::KafkaConnector; -use crate::connectors::postgres::connector::{PostgresConfig, PostgresConnector}; - -use crate::errors::ConnectorError; -use crate::ingestion::Ingestor; - -use dozer_types::log::debug; -use dozer_types::models::connection::Connection; -use dozer_types::models::connection::ConnectionConfig; -use dozer_types::models::ingestion_types::default_grpc_adapter; -use dozer_types::node::OpIdentifier; -use dozer_types::tonic::async_trait; - -use crate::connectors::object_store::connector::ObjectStoreConnector; - -use crate::connectors::delta_lake::DeltaLakeConnector; -use dozer_types::prettytable::Table; -use dozer_types::serde; -use dozer_types::serde::{Deserialize, Serialize}; -use dozer_types::types::{FieldType, Schema}; - -pub mod delta_lake; -pub mod snowflake; - -use self::dozer::NestedDozerConnector; -#[cfg(feature = "ethereum")] -use self::ethereum::{EthLogConnector, EthTraceConnector}; -#[cfg(feature = "ethereum")] -use dozer_types::models::ingestion_types::EthProviderConfig; - -use self::grpc::connector::GrpcConnector; -use self::grpc::{ArrowAdapter, DefaultAdapter}; -use self::mysql::connector::{mysql_connection_opts_from_url, MySQLConnector}; -#[cfg(feature = "snowflake")] -use crate::connectors::snowflake::connector::SnowflakeConnector; - -#[derive(Clone, Copy, Serialize, Deserialize, Debug, Eq, PartialEq, Default)] 
-#[serde(crate = "dozer_types::serde")] -/// A source table's CDC event type. -pub enum CdcType { - /// Connector gets old record on delete/update operations. - FullChanges, - /// Connector only gets PK of old record on delete/update operations. - OnlyPK, - #[default] - /// Connector cannot get any info about old records. In other words, the table is append-only. - Nothing, -} - -#[derive(Clone, Serialize, Deserialize, Debug, Eq, PartialEq)] -#[serde(crate = "dozer_types::serde")] -/// A source table's schema and CDC type. -pub struct SourceSchema { - /// Dozer schema mapped from the source table. Columns are already filtered based on `TableInfo.column_names`. - pub schema: Schema, - #[serde(default)] - /// The source table's CDC type. - pub cdc_type: CdcType, -} - -impl SourceSchema { - pub fn new(schema: Schema, cdc_type: CdcType) -> Self { - Self { schema, cdc_type } - } -} - -/// Result of mapping one source table schema to Dozer schema. -pub type SourceSchemaResult = Result; - -#[async_trait] -pub trait Connector: Send + Sync + Debug { - /// Returns all the external types and their corresponding Dozer types. - /// If the external type is not supported, None should be returned. - fn types_mapping() -> Vec<(String, Option)> - where - Self: Sized; - - /// Validates the connector's connection level properties. - async fn validate_connection(&self) -> Result<(), ConnectorError>; - - /// Lists all the table names in the connector. - async fn list_tables(&self) -> Result, ConnectorError>; - - /// Validates the connector's table level properties for each table. - async fn validate_tables(&self, tables: &[TableIdentifier]) -> Result<(), ConnectorError>; - - /// Lists all the column names for each table. - async fn list_columns( - &self, - tables: Vec, - ) -> Result, ConnectorError>; - - /// Gets the schema for each table. Only requested columns need to be mapped. - /// - /// If this function fails at the connector level, such as a network error, it should return a outer level `Err`. - /// Otherwise the outer level `Ok` should always contain the same number of elements as `table_infos`. - /// - /// If it fails at the table or column level, such as a unsupported data type, one of the elements should be `Err`. - async fn get_schemas( - &self, - table_infos: &[TableInfo], - ) -> Result, ConnectorError>; - - /// Lists all tables and columns and gets the schema for each table. - async fn list_all_schemas( - &self, - ) -> Result<(Vec, Vec), ConnectorError> { - let tables = self.list_tables().await?; - let table_infos = self.list_columns(tables).await?; - let schemas = self - .get_schemas(&table_infos) - .await? - .into_iter() - .collect::, _>>()?; - Ok((table_infos, schemas)) - } - - /// Starts outputting data from `tables` to `ingestor`. This method should never return unless there is an unrecoverable error. - async fn start( - &self, - ingestor: &Ingestor, - tables: Vec, - ) -> Result<(), ConnectorError>; -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -/// Unique identifier of a source table. A source table must have a `name`, optionally under a `schema` scope. -pub struct TableIdentifier { - /// The `schema` scope of the table. - /// - /// Connector that supports schema scope must decide on a default schema, that doesn't must assert that `schema.is_none()`. - pub schema: Option, - /// The table name, must be unique under the `schema` scope, or global scope if `schema` is `None`. 
- pub name: String, -} - -impl TableIdentifier { - pub fn new(schema: Option, name: String) -> Self { - Self { schema, name } - } - - pub fn from_table_name(name: String) -> Self { - Self { schema: None, name } - } -} - -#[derive(Serialize, Deserialize, Clone, Debug, Eq, PartialEq)] -#[serde(crate = "self::serde")] -/// `TableIdentifier` with column names. -pub struct TableInfo { - /// The `schema` scope of the table. - pub schema: Option, - /// The table name, must be unique under the `schema` scope, or global scope if `schema` is `None`. - pub name: String, - /// The column names to be mapped. - pub column_names: Vec, -} - -#[derive(Debug, Clone)] -/// `TableInfo` with an optional checkpoint info. -pub struct TableToIngest { - /// The `schema` scope of the table. - pub schema: Option, - /// The table name, must be unique under the `schema` scope, or global scope if `schema` is `None`. - pub name: String, - /// The column names to be mapped. - pub column_names: Vec, - /// The checkpoint to start after. - pub checkpoint: Option, -} - -impl TableToIngest { - pub fn from_scratch(table_info: TableInfo) -> Self { - Self { - schema: table_info.schema, - name: table_info.name, - column_names: table_info.column_names, - checkpoint: None, - } - } -} - -pub fn get_connector(connection: Connection) -> Result, ConnectorError> { - let config = connection.config; - match config.clone() { - ConnectionConfig::Postgres(c) => { - let config = map_connection_config(&config)?; - let postgres_config = PostgresConfig { - name: connection.name, - config, - schema: c.schema, - }; - - if let Some(dbname) = postgres_config.config.get_dbname() { - debug!("Connecting to postgres database - {}", dbname.to_string()); - } - Ok(Box::new(PostgresConnector::new(postgres_config))) - } - #[cfg(feature = "ethereum")] - ConnectionConfig::Ethereum(eth_config) => match eth_config.provider { - EthProviderConfig::Log(log_config) => { - Ok(Box::new(EthLogConnector::new(log_config, connection.name))) - } - EthProviderConfig::Trace(trace_config) => Ok(Box::new(EthTraceConnector::new( - trace_config, - connection.name, - ))), - }, - #[cfg(not(feature = "ethereum"))] - ConnectionConfig::Ethereum(_) => Err(ConnectorError::EthereumFeatureNotEnabled), - ConnectionConfig::Grpc(grpc_config) => { - match grpc_config - .adapter - .clone() - .unwrap_or_else(default_grpc_adapter) - .as_str() - { - "arrow" => Ok(Box::new(GrpcConnector::::new( - connection.name, - grpc_config, - )?)), - "default" => Ok(Box::new(GrpcConnector::::new( - connection.name, - grpc_config, - )?)), - _ => Err(ConnectorError::UnsupportedGrpcAdapter( - connection.name, - grpc_config.adapter, - )), - } - } - #[cfg(feature = "snowflake")] - ConnectionConfig::Snowflake(snowflake) => { - let snowflake_config = snowflake; - - Ok(Box::new(SnowflakeConnector::new( - connection.name, - snowflake_config, - ))) - } - #[cfg(not(feature = "snowflake"))] - ConnectionConfig::Snowflake(_) => Err(ConnectorError::SnowflakeFeatureNotEnabled), - #[cfg(feature = "kafka")] - ConnectionConfig::Kafka(kafka_config) => Ok(Box::new(KafkaConnector::new(kafka_config))), - #[cfg(not(feature = "kafka"))] - ConnectionConfig::Kafka(_) => Err(ConnectorError::KafkaFeatureNotEnabled), - ConnectionConfig::S3Storage(object_store_config) => { - Ok(Box::new(ObjectStoreConnector::new(object_store_config))) - } - ConnectionConfig::LocalStorage(object_store_config) => { - Ok(Box::new(ObjectStoreConnector::new(object_store_config))) - } - ConnectionConfig::DeltaLake(delta_lake_config) => { - 
Ok(Box::new(DeltaLakeConnector::new(delta_lake_config))) - } - #[cfg(feature = "mongodb")] - ConnectionConfig::MongoDB(mongodb_config) => { - let connection_string = mongodb_config.connection_string; - Ok(Box::new(MongodbConnector::new(connection_string)?)) - } - #[cfg(not(feature = "mongodb"))] - ConnectionConfig::MongoDB(_) => Err(ConnectorError::MongodbFeatureNotEnabled), - ConnectionConfig::MySQL(mysql_config) => { - let opts = mysql_connection_opts_from_url(&mysql_config.url)?; - Ok(Box::new(MySQLConnector::new( - mysql_config.url, - opts, - mysql_config.server_id, - ))) - } - ConnectionConfig::Dozer(dozer_config) => { - Ok(Box::new(NestedDozerConnector::new(dozer_config))) - } - } -} - -pub fn get_connector_info_table(connection: &Connection) -> Option { - match &connection.config { - ConnectionConfig::Postgres(config) => match config.replenish() { - Ok(conf) => Some(conf.convert_to_table()), - Err(_) => None, - }, - ConnectionConfig::Ethereum(config) => Some(config.convert_to_table()), - ConnectionConfig::Snowflake(config) => Some(config.convert_to_table()), - ConnectionConfig::Kafka(config) => Some(config.convert_to_table()), - ConnectionConfig::S3Storage(config) => Some(config.convert_to_table()), - ConnectionConfig::LocalStorage(config) => Some(config.convert_to_table()), - _ => None, - } -} - -fn table_name(schema: Option<&str>, name: &str) -> String { - if let Some(schema) = &schema { - format!("{}.{}", schema, name) - } else { - name.to_string() - } -} - -#[derive(Debug, Clone)] -pub struct ListOrFilterColumns { - pub schema: Option, - pub name: String, - pub columns: Option>, -} - -pub(crate) fn warn_dropped_primary_index(table_name: &str) { - dozer_types::log::warn!( - "One or more primary index columns from the source table are \ - not part of the defined schema for table: '{0}'. 
\ - The primary index will therefore not be present in the Dozer table", - table_name - ); -} diff --git a/dozer-ingestion/src/connectors/mysql/mod.rs b/dozer-ingestion/src/connectors/mysql/mod.rs deleted file mode 100644 index 5c023b4704..0000000000 --- a/dozer-ingestion/src/connectors/mysql/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -mod binlog; -mod connection; -pub mod connector; -mod conversion; -pub(crate) mod helpers; -mod schema; -#[cfg(test)] -mod tests; diff --git a/dozer-ingestion/src/connectors/object_store/mod.rs b/dozer-ingestion/src/connectors/object_store/mod.rs deleted file mode 100644 index 9c63260dcf..0000000000 --- a/dozer-ingestion/src/connectors/object_store/mod.rs +++ /dev/null @@ -1,14 +0,0 @@ -mod adapters; -mod connection; -pub mod connector; -mod csv; -mod delta; -mod helper; -mod parquet; -mod schema_helper; -pub mod schema_mapper; -mod table_reader; -pub(crate) mod table_watcher; -#[cfg(test)] -mod tests; -mod watcher; diff --git a/dozer-ingestion/src/connectors/object_store/table_reader.rs b/dozer-ingestion/src/connectors/object_store/table_reader.rs deleted file mode 100644 index 7d75c929ae..0000000000 --- a/dozer-ingestion/src/connectors/object_store/table_reader.rs +++ /dev/null @@ -1,144 +0,0 @@ -use crate::connectors::object_store::adapters::DozerObjectStore; -use crate::connectors::TableInfo; -use crate::errors::ObjectStoreConnectorError::TableReaderError; -use crate::errors::ObjectStoreTableReaderError::{ - ColumnsSelectFailed, StreamExecutionError, TableReadFailed, -}; -use crate::errors::{ConnectorError, ObjectStoreConnectorError}; -use crate::ingestion::Ingestor; -use deltalake::datafusion::datasource::listing::{ - ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, -}; - -use deltalake::datafusion::prelude::SessionContext; -use dozer_types::arrow_types::from_arrow::{map_schema_to_dozer, map_value_to_dozer_field}; -use dozer_types::log::error; -use dozer_types::models::ingestion_types::IngestionMessage; -use dozer_types::tonic::async_trait; -use dozer_types::types::{Operation, Record}; -use futures::StreamExt; -use std::sync::Arc; -use tokio::sync::mpsc::Sender; - -pub struct TableReader { - pub(crate) config: T, -} - -impl TableReader { - pub fn _new(config: T) -> TableReader { - Self { config } - } - - pub async fn read( - table_index: usize, - ctx: SessionContext, - table_path: ListingTableUrl, - listing_options: ListingOptions, - table: &TableInfo, - sender: Sender, ObjectStoreConnectorError>>, - ) -> Result<(), ObjectStoreConnectorError> { - let resolved_schema = listing_options - .infer_schema(&ctx.state(), &table_path) - .await - .map_err(ObjectStoreConnectorError::InternalDataFusionError)?; - - let fields = resolved_schema.all_fields(); - - let config = ListingTableConfig::new(table_path.clone()) - .with_listing_options(listing_options) - .with_schema(resolved_schema.clone()); - - let provider = Arc::new( - ListingTable::try_new(config) - .map_err(ObjectStoreConnectorError::InternalDataFusionError)?, - ); - - let cols: Vec<&str> = if table.column_names.is_empty() { - fields.iter().map(|f| f.name().as_str()).collect() - } else { - table.column_names.iter().map(|c| c.as_str()).collect() - }; - let data = ctx - .read_table(provider.clone()) - .map_err(|e| TableReaderError(TableReadFailed(e)))? - .select_columns(&cols) - .map_err(|e| TableReaderError(ColumnsSelectFailed(e)))? 
- .execute_stream() - .await - .map_err(|e| TableReaderError(StreamExecutionError(e)))?; - - tokio::pin!(data); - - while let Some(batch) = data.next().await { - let batch = match batch { - Ok(batch) => batch, - Err(e) => { - error!("Error reading record batch from {table_path:?}: {e}"); - continue; - } - }; - - let batch_schema = batch.schema(); - let dozer_schema = map_schema_to_dozer(&batch_schema)?; - - for row in 0..batch.num_rows() { - let fields = batch - .columns() - .iter() - .enumerate() - .map(|(col, column)| { - map_value_to_dozer_field( - column, - row, - resolved_schema.field(col).name(), - &dozer_schema, - ) - }) - .collect::, _>>()?; - - let evt = Operation::Insert { - new: Record { - values: fields, - lifetime: None, - }, - }; - - if sender - .send(Ok(Some(IngestionMessage::OperationEvent { - table_index, - op: evt, - id: None, - }))) - .await - .is_err() - { - break; - } - } - } - - // sender.send(Ok(None)).await.unwrap(); - - Ok(()) - } -} - -#[async_trait] -pub trait Reader { - async fn read_tables( - &self, - tables: &[TableInfo], - ingestor: &Ingestor, - ) -> Result<(), ConnectorError>; -} - -#[async_trait] -impl Reader for TableReader { - async fn read_tables( - &self, - _tables: &[TableInfo], - _ingestor: &Ingestor, - ) -> Result<(), ConnectorError> { - Ok(()) - } -} diff --git a/dozer-ingestion/src/connectors/postgres/mod.rs b/dozer-ingestion/src/connectors/postgres/mod.rs deleted file mode 100644 index bb579b8dcd..0000000000 --- a/dozer-ingestion/src/connectors/postgres/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -pub mod connection; -pub mod connector; -pub mod helper; -pub mod iterator; -mod replication_slot_helper; -pub mod replicator; -mod schema; -pub mod snapshotter; -#[cfg(test)] -pub mod test_utils; -#[cfg(test)] -pub mod tests; -pub mod xlog_mapper; diff --git a/dozer-ingestion/src/connectors/postgres/replication_slot_helper.rs b/dozer-ingestion/src/connectors/postgres/replication_slot_helper.rs deleted file mode 100644 index f0e0253e4d..0000000000 --- a/dozer-ingestion/src/connectors/postgres/replication_slot_helper.rs +++ /dev/null @@ -1,265 +0,0 @@ -use super::connection::client::Client; -use crate::errors::ConnectorError::UnexpectedQueryMessageError; -use crate::errors::PostgresConnectorError::{FetchReplicationSlotError, InvalidQueryError}; -use crate::errors::{ConnectorError, PostgresConnectorError}; -use dozer_types::log::debug; -use tokio_postgres::{Error, SimpleQueryMessage}; - -pub struct ReplicationSlotHelper {} - -impl ReplicationSlotHelper { - pub async fn drop_replication_slot( - client: &mut Client, - slot_name: &str, - ) -> Result, Error> { - let res = client - .simple_query(format!("select pg_drop_replication_slot('{slot_name}');").as_ref()) - .await; - match res { - Ok(_) => debug!("dropped replication slot {}", slot_name), - Err(_) => debug!("failed to drop replication slot..."), - }; - - res - } - - pub async fn create_replication_slot( - client: &mut Client, - slot_name: &str, - ) -> Result, ConnectorError> { - let create_replication_slot_query = - format!(r#"CREATE_REPLICATION_SLOT {slot_name:?} LOGICAL "pgoutput" USE_SNAPSHOT"#); - - let slot_query_row = client - .simple_query(&create_replication_slot_query) - .await - .map_err(|e| { - debug!("failed to create replication slot {}", slot_name); - ConnectorError::PostgresConnectorError(PostgresConnectorError::CreateSlotError( - slot_name.to_string(), - e, - )) - })?; - - if let SimpleQueryMessage::Row(row) = &slot_query_row[0] { - Ok(row.get("consistent_point").map(|lsn| lsn.to_string())) - 
} else { - Err(UnexpectedQueryMessageError) - } - } - - pub async fn replication_slot_exists( - client: &mut Client, - slot_name: &str, - ) -> Result { - let replication_slot_info_query = - format!(r#"SELECT * FROM pg_replication_slots where slot_name = '{slot_name}';"#); - - let slot_query_row = client - .simple_query(&replication_slot_info_query) - .await - .map_err(FetchReplicationSlotError)?; - - Ok(matches!( - slot_query_row.get(0), - Some(SimpleQueryMessage::Row(_)) - )) - } - - pub async fn clear_inactive_slots( - client: &mut Client, - slot_name_prefix: &str, - ) -> Result<(), PostgresConnectorError> { - let inactive_slots_query = format!( - r#"SELECT * FROM pg_replication_slots where active = false AND slot_name LIKE '{slot_name_prefix}%';"# - ); - - let slots = client - .simple_query(&inactive_slots_query) - .await - .map_err(FetchReplicationSlotError)?; - - let column_index = if let Some(SimpleQueryMessage::Row(row)) = slots.get(0) { - row.columns().iter().position(|c| c.name() == "slot_name") - } else { - None - }; - - for slot_message in slots { - if let SimpleQueryMessage::Row(row) = slot_message { - if let Some(index) = column_index { - let slot_name = row.get(index); - - if let Some(name) = slot_name { - Self::drop_replication_slot(client, name) - .await - .map_err(InvalidQueryError)?; - } - } - } - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - use tokio_postgres::config::ReplicationMode; - - use crate::connectors::postgres::connection::helper::connect; - use crate::connectors::postgres::test_utils::get_config; - use crate::errors::{ConnectorError, PostgresConnectorError}; - use crate::test_util::run_connector_test; - - use super::ReplicationSlotHelper; - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_replication_slot_create_successfully() { - run_connector_test("postgres", |app_config| async move { - let mut config = get_config(app_config); - config.replication_mode(ReplicationMode::Logical); - - let mut client = connect(config).await.unwrap(); - - client - .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") - .await - .unwrap(); - - let actual = ReplicationSlotHelper::create_replication_slot(&mut client, "test").await; - - assert!(actual.is_ok()); - - match actual { - Err(_) => panic!("Validation should fail"), - Ok(result) => { - if let Some(address) = result { - assert_ne!(address, "") - } else { - panic!("Validation should fail") - } - } - } - }) - .await - } - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_replication_slot_create_failed_if_existed() { - run_connector_test("postgres", |app_config| async move { - let slot_name = "test"; - let mut config = get_config(app_config); - config.replication_mode(ReplicationMode::Logical); - - let mut client = connect(config).await.unwrap(); - - client - .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") - .await - .unwrap(); - - let create_replication_slot_query = - format!(r#"CREATE_REPLICATION_SLOT {slot_name:?} LOGICAL "pgoutput" USE_SNAPSHOT"#); - - client - .simple_query(&create_replication_slot_query) - .await - .expect("failed"); - - let actual = - ReplicationSlotHelper::create_replication_slot(&mut client, slot_name).await; - - assert!(actual.is_err()); - - match actual { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert!(matches!(e, ConnectorError::PostgresConnectorError(_))); - - if let ConnectorError::PostgresConnectorError( - PostgresConnectorError::CreateSlotError(_, err), - ) = e - { - assert_eq!( 
- err.as_db_error().unwrap().message(), - format!("replication slot \"{slot_name}\" already exists") - ); - } else { - panic!("Unexpected error occurred"); - } - } - } - }) - .await - } - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_replication_slot_drop_successfully() { - run_connector_test("postgres", |app_config| async move { - let slot_name = "test"; - let mut config = get_config(app_config); - config.replication_mode(ReplicationMode::Logical); - - let mut client = connect(config).await.unwrap(); - - client - .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") - .await - .unwrap(); - - let create_replication_slot_query = - format!(r#"CREATE_REPLICATION_SLOT {slot_name:?} LOGICAL "pgoutput" USE_SNAPSHOT"#); - - client - .simple_query(&create_replication_slot_query) - .await - .expect("failed"); - - let actual = ReplicationSlotHelper::drop_replication_slot(&mut client, slot_name).await; - - assert!(actual.is_ok()); - }) - .await - } - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_replication_slot_drop_failed_if_slot_not_exist() { - run_connector_test("postgres", |app_config| async move { - let slot_name = "test"; - let mut config = get_config(app_config); - config.replication_mode(ReplicationMode::Logical); - - let mut client = connect(config).await.unwrap(); - - client - .simple_query("BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;") - .await - .unwrap(); - - let actual = ReplicationSlotHelper::drop_replication_slot(&mut client, slot_name).await; - - assert!(actual.is_err()); - - match actual { - Ok(_) => panic!("Validation should fail"), - Err(e) => { - assert_eq!( - e.as_db_error().unwrap().message(), - format!("replication slot \"{slot_name}\" does not exist") - ); - } - } - }) - .await - } -} diff --git a/dozer-ingestion/src/connectors/postgres/schema/tests.rs b/dozer-ingestion/src/connectors/postgres/schema/tests.rs deleted file mode 100644 index 1bd5a2b009..0000000000 --- a/dozer-ingestion/src/connectors/postgres/schema/tests.rs +++ /dev/null @@ -1,174 +0,0 @@ -use crate::connectors::postgres::schema::helper::SchemaHelper; -use crate::connectors::postgres::test_utils::get_client; -use crate::connectors::ListOrFilterColumns; -use crate::errors::PostgresConnectorError::PostgresSchemaError; -use crate::errors::PostgresSchemaError::UnsupportedTableType; -use crate::test_util::run_connector_test; -use rand::Rng; -use serial_test::serial; -use std::collections::HashSet; -use std::hash::Hash; - -fn assert_vec_eq(a: &[T], b: &[T]) -> bool -where - T: Eq + Hash, -{ - let a: HashSet<_> = a.iter().collect(); - let b: HashSet<_> = b.iter().collect(); - - a == b -} - -#[tokio::test] -#[ignore] -#[serial] -async fn test_connector_get_tables() { - run_connector_test("postgres", |app_config| async move { - let mut client = get_client(app_config).await; - - let mut rng = rand::thread_rng(); - - let schema = format!("schema_helper_test_{}", rng.gen::()); - let table_name = format!("products_test_{}", rng.gen::()); - - client.create_schema(&schema).await; - client.create_simple_table(&schema, &table_name).await; - - let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); - let result = schema_helper.get_tables(None).await.unwrap(); - - let table = result.get(0).unwrap(); - assert_eq!(table_name, table.name); - assert!(assert_vec_eq( - &[ - "name".to_string(), - "description".to_string(), - "weight".to_string(), - "id".to_string(), - ], - &table.columns - )); - - client.drop_schema(&schema).await; - }) - .await -} - 
-#[tokio::test] -#[ignore] -#[serial] -async fn test_connector_get_schema_with_selected_columns() { - run_connector_test("postgres", |app_config| async move { - let mut client = get_client(app_config).await; - - let mut rng = rand::thread_rng(); - - let schema = format!("schema_helper_test_{}", rng.gen::()); - let table_name = format!("products_test_{}", rng.gen::()); - - client.create_schema(&schema).await; - client.create_simple_table(&schema, &table_name).await; - - let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); - let table_info = ListOrFilterColumns { - schema: Some(schema.clone()), - name: table_name.clone(), - columns: Some(vec!["name".to_string(), "id".to_string()]), - }; - let result = schema_helper.get_tables(Some(&[table_info])).await.unwrap(); - - let table = result.get(0).unwrap(); - assert_eq!(table_name, table.name); - assert!(assert_vec_eq( - &["name".to_string(), "id".to_string()], - &table.columns - )); - - client.drop_schema(&schema).await; - }) - .await -} - -#[tokio::test] -#[ignore] -#[serial] -async fn test_connector_get_schema_without_selected_columns() { - run_connector_test("postgres", |app_config| async move { - let mut client = get_client(app_config).await; - - let mut rng = rand::thread_rng(); - - let schema = format!("schema_helper_test_{}", rng.gen::()); - let table_name = format!("products_test_{}", rng.gen::()); - - client.create_schema(&schema).await; - client.create_simple_table(&schema, &table_name).await; - - let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); - let table_info = ListOrFilterColumns { - name: table_name.clone(), - schema: Some(schema.clone()), - columns: Some(vec![]), - }; - let result = schema_helper.get_tables(Some(&[table_info])).await.unwrap(); - - let table = result.get(0).unwrap(); - assert_eq!(table_name, table.name.clone()); - assert!(assert_vec_eq( - &[ - "id".to_string(), - "name".to_string(), - "description".to_string(), - "weight".to_string(), - ], - &table.columns - )); - - client.drop_schema(&schema).await; - }) - .await -} - -#[tokio::test] -#[ignore] -#[serial] -async fn test_connector_view_cannot_be_used() { - run_connector_test("postgres", |app_config| async move { - let mut client = get_client(app_config).await; - - let mut rng = rand::thread_rng(); - - let schema = format!("schema_helper_test_{}", rng.gen::()); - let table_name = format!("products_test_{}", rng.gen::()); - let view_name = format!("products_view_test_{}", rng.gen::()); - - client.create_schema(&schema).await; - client.create_simple_table(&schema, &table_name).await; - client.create_view(&schema, &table_name, &view_name).await; - - let schema_helper = SchemaHelper::new(client.postgres_config.clone(), None); - let table_info = ListOrFilterColumns { - name: view_name, - schema: Some(schema.clone()), - columns: Some(vec![]), - }; - - let result = schema_helper.get_schemas(&[table_info]).await; - assert!(result.is_err()); - assert!(matches!( - result, - Err(PostgresSchemaError(UnsupportedTableType(_, _))) - )); - - let table_info = ListOrFilterColumns { - name: table_name, - schema: Some(schema.clone()), - columns: Some(vec![]), - }; - let result = schema_helper.get_schemas(&[table_info]).await; - assert!(result.is_ok()); - - client.drop_schema(&schema).await; - }) - .await -} diff --git a/dozer-ingestion/src/connectors/postgres/snapshotter.rs b/dozer-ingestion/src/connectors/postgres/snapshotter.rs deleted file mode 100644 index 933cb56801..0000000000 --- 
a/dozer-ingestion/src/connectors/postgres/snapshotter.rs +++ /dev/null @@ -1,310 +0,0 @@ -use crate::connectors::{ListOrFilterColumns, SourceSchemaResult}; -use crate::ingestion::Ingestor; - -use super::helper; -use crate::connectors::postgres::connection::helper as connection_helper; -use crate::errors::ConnectorError; -use crate::errors::PostgresConnectorError::SyncWithSnapshotError; -use crate::errors::PostgresConnectorError::{InvalidQueryError, PostgresSchemaError}; - -use crate::connectors::postgres::schema::helper::SchemaHelper; -use crate::errors::ConnectorError::PostgresConnectorError; -use dozer_types::types::Schema; - -use dozer_types::models::ingestion_types::IngestionMessage; - -use dozer_types::types::Operation; -use futures::StreamExt; -use tokio::sync::mpsc::{channel, Sender}; -use tokio::task::JoinSet; - -pub struct PostgresSnapshotter<'a> { - pub conn_config: tokio_postgres::Config, - pub ingestor: &'a Ingestor, - pub schema: Option, -} - -impl<'a> PostgresSnapshotter<'a> { - pub async fn get_tables( - &self, - tables: &[ListOrFilterColumns], - ) -> Result, ConnectorError> { - let helper = SchemaHelper::new(self.conn_config.clone(), self.schema.clone()); - helper - .get_schemas(tables) - .await - .map_err(PostgresConnectorError) - } - - pub async fn sync_table( - schema: Schema, - schema_name: String, - table_name: String, - table_index: usize, - conn_config: tokio_postgres::Config, - sender: Sender>, - ) -> Result<(), ConnectorError> { - let mut client_plain = connection_helper::connect(conn_config) - .await - .map_err(PostgresConnectorError)?; - - let column_str: Vec = schema - .fields - .iter() - .map(|f| format!("\"{0}\"", f.name)) - .collect(); - - let column_str = column_str.join(","); - let query = format!("select {column_str} from {schema_name}.{table_name}"); - let stmt = client_plain - .prepare(&query) - .await - .map_err(|e| PostgresConnectorError(InvalidQueryError(e)))?; - let columns = stmt.columns(); - - let empty_vec: Vec = Vec::new(); - let row_stream = client_plain - .query_raw(query, empty_vec) - .await - .map_err(|e| PostgresConnectorError(InvalidQueryError(e)))?; - tokio::pin!(row_stream); - while let Some(msg) = row_stream.next().await { - match msg { - Ok(msg) => { - let evt = helper::map_row_to_operation_event(&msg, columns) - .map_err(|e| PostgresConnectorError(PostgresSchemaError(e)))?; - - let Ok(_) = sender.send(Ok((table_index, evt))).await else { - // If we can't send, the parent task has quit. There is - // no use in going on, but if there was an error, it was - // handled by the parent. 
- return Ok(()); - }; - } - Err(e) => return Err(PostgresConnectorError(SyncWithSnapshotError(e.to_string()))), - } - } - - Ok(()) - } - - pub async fn sync_tables(&self, tables: &[ListOrFilterColumns]) -> Result<(), ConnectorError> { - let schemas = self.get_tables(tables).await?; - - let (tx, mut rx) = channel(16); - - let mut joinset = JoinSet::new(); - for (table_index, (schema, table)) in schemas.into_iter().zip(tables).enumerate() { - let schema = schema?; - let schema = schema.schema; - let schema_name = table.schema.clone().unwrap_or("public".to_string()); - let table_name = table.name.clone(); - let conn_config = self.conn_config.clone(); - let sender = tx.clone(); - joinset.spawn(async move { - if let Err(e) = Self::sync_table( - schema, - schema_name, - table_name, - table_index, - conn_config, - sender.clone(), - ) - .await - { - sender.send(Err(e)).await.unwrap(); - } - }); - } - // Make sure the last sender is dropped so receiving on the channel doesn't - // deadlock - drop(tx); - - self.ingestor - .handle_message(IngestionMessage::SnapshottingStarted) - .await - .map_err(|_| ConnectorError::IngestorError)?; - - while let Some(message) = rx.recv().await { - let (table_index, evt) = message?; - self.ingestor - .handle_message(IngestionMessage::OperationEvent { - table_index, - op: evt, - id: None, - }) - .await - .map_err(|_| ConnectorError::IngestorError)?; - } - - self.ingestor - .handle_message(IngestionMessage::SnapshottingDone) - .await - .map_err(|_| ConnectorError::IngestorError)?; - - // All tasks in the joinset should have finished (because they have dropped their senders) - // Otherwise, they will be aborted when the joinset is dropped - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use rand::Rng; - use serial_test::serial; - - use crate::{ - connectors::{ - postgres::{ - connection::helper::map_connection_config, tests::client::TestPostgresClient, - }, - ListOrFilterColumns, - }, - errors::ConnectorError, - ingestion::{IngestionConfig, Ingestor}, - test_util::run_connector_test, - }; - - use super::PostgresSnapshotter; - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_snapshotter_sync_tables_successfully_1_requested_table() { - run_connector_test("postgres", |app_config| async move { - let config = &app_config.connections[0].config; - - let mut test_client = TestPostgresClient::new(config).await; - - let mut rng = rand::thread_rng(); - let table_name = format!("test_table_{}", rng.gen::()); - - test_client.create_simple_table("public", &table_name).await; - test_client.insert_rows(&table_name, 2, None).await; - - let conn_config = map_connection_config(config).unwrap(); - - let input_tables = vec![ListOrFilterColumns { - name: table_name, - schema: Some("public".to_string()), - columns: None, - }]; - - let ingestion_config = IngestionConfig::default(); - let (ingestor, mut iterator) = Ingestor::initialize_channel(ingestion_config); - - let snapshotter = PostgresSnapshotter { - conn_config, - ingestor: &ingestor, - schema: None, - }; - - let actual = snapshotter.sync_tables(&input_tables).await; - - assert!(actual.is_ok()); - - let mut i = 0; - while i < 2 { - if iterator - .next_timeout(Duration::from_secs(1)) - .await - .is_none() - { - panic!("Unexpected operation"); - } - i += 1; - } - }) - .await - } - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_snapshotter_sync_tables_successfully_not_match_table() { - run_connector_test("postgres", |app_config| async move { - let config = 
&app_config.connections[0].config; - - let mut test_client = TestPostgresClient::new(config).await; - - let mut rng = rand::thread_rng(); - let table_name = format!("test_table_{}", rng.gen::()); - - test_client.create_simple_table("public", &table_name).await; - test_client.insert_rows(&table_name, 2, None).await; - - let conn_config = map_connection_config(config).unwrap(); - - let input_table_name = String::from("not_existing_table"); - let input_tables = vec![ListOrFilterColumns { - name: input_table_name, - schema: Some("public".to_string()), - columns: None, - }]; - - let ingestion_config = IngestionConfig::default(); - let (ingestor, mut _iterator) = Ingestor::initialize_channel(ingestion_config); - - let snapshotter = PostgresSnapshotter { - conn_config, - ingestor: &ingestor, - schema: None, - }; - - let actual = snapshotter.sync_tables(&input_tables).await; - - assert!(actual.is_err()); - - match actual { - Ok(_) => panic!("Test failed"), - Err(e) => { - assert!(matches!(e, ConnectorError::PostgresConnectorError(_))); - } - } - }) - .await - } - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_snapshotter_sync_tables_successfully_table_not_exist() { - run_connector_test("postgres", |app_config| async move { - let config = &app_config.connections[0].config; - - let mut rng = rand::thread_rng(); - let table_name = format!("test_table_{}", rng.gen::()); - - let conn_config = map_connection_config(config).unwrap(); - - let input_tables = vec![ListOrFilterColumns { - name: table_name, - schema: Some("public".to_string()), - columns: None, - }]; - - let ingestion_config = IngestionConfig::default(); - let (ingestor, mut _iterator) = Ingestor::initialize_channel(ingestion_config); - - let snapshotter = PostgresSnapshotter { - conn_config, - ingestor: &ingestor, - schema: None, - }; - - let actual = snapshotter.sync_tables(&input_tables).await; - - assert!(actual.is_err()); - - match actual { - Ok(_) => panic!("Test failed"), - Err(e) => { - assert!(matches!(e, ConnectorError::PostgresConnectorError(_))); - } - } - }) - .await - } -} diff --git a/dozer-ingestion/src/connectors/postgres/tests/continue_replication_tests.rs b/dozer-ingestion/src/connectors/postgres/tests/continue_replication_tests.rs deleted file mode 100644 index e51a1239b3..0000000000 --- a/dozer-ingestion/src/connectors/postgres/tests/continue_replication_tests.rs +++ /dev/null @@ -1,169 +0,0 @@ -#[cfg(test)] -mod tests { - use crate::connectors::postgres::connection::helper; - use crate::connectors::postgres::connection::helper::map_connection_config; - use crate::connectors::postgres::connector::{PostgresConfig, PostgresConnector}; - use crate::connectors::postgres::replication_slot_helper::ReplicationSlotHelper; - use crate::connectors::postgres::test_utils::{create_slot, retry_drop_active_slot}; - use crate::connectors::postgres::tests::client::TestPostgresClient; - use crate::connectors::TableIdentifier; - use crate::test_util::run_connector_test; - // use crate::connectors::Connector; - // use crate::ingestion::IngestionConfig; - // use dozer_types::models::ingestion_types::IngestionMessage; - // use dozer_types::node::OpIdentifier; - use rand::Rng; - use serial_test::serial; - use tokio_postgres::config::ReplicationMode; - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_continue_replication() { - run_connector_test("postgres", |app_config| async move { - let config = &app_config.connections[0].config; - let conn_config = map_connection_config(config).unwrap(); - let 
postgres_config = PostgresConfig { - name: "test".to_string(), - config: conn_config.clone(), - schema: None, - }; - - let connector = PostgresConnector::new(postgres_config); - - // let result = connector.can_start_from((1, 0)).unwrap(); - // assert!(!result, "Cannot continue, because slot doesnt exist"); - - let mut replication_conn_config = conn_config; - replication_conn_config.replication_mode(ReplicationMode::Logical); - - // Creating publication - let client = helper::connect(replication_conn_config.clone()) - .await - .unwrap(); - connector.create_publication(client, None).await.unwrap(); - - // Creating slot - let mut client = helper::connect(replication_conn_config.clone()) - .await - .unwrap(); - let slot_name = connector.get_slot_name(); - let _parsed_lsn = create_slot(&mut client, &slot_name).await; - - // let result = connector - // .can_start_from((u64::from(parsed_lsn), 0)) - // .unwrap(); - - ReplicationSlotHelper::drop_replication_slot(&mut client, &slot_name) - .await - .unwrap(); - // assert!( - // result, - // "Replication slot is created and it should be possible to continue" - // ); - }) - .await - } - - #[tokio::test] - #[ignore] - #[serial] - async fn test_connector_continue_replication_from_lsn() { - run_connector_test("postgres", |app_config| async move { - let config = &app_config.connections[0].config; - - let mut test_client = TestPostgresClient::new(config).await; - let mut rng = rand::thread_rng(); - let table_name = format!("test_table_{}", rng.gen::()); - let connector_name = format!("pg_connector_{}", rng.gen::()); - test_client.create_simple_table("public", &table_name).await; - - let conn_config = map_connection_config(config).unwrap(); - let postgres_config = PostgresConfig { - name: connector_name, - config: conn_config.clone(), - schema: None, - }; - - let connector = PostgresConnector::new(postgres_config); - - let mut replication_conn_config = conn_config; - replication_conn_config.replication_mode(ReplicationMode::Logical); - - // Creating publication - let client = helper::connect(replication_conn_config.clone()) - .await - .unwrap(); - let table_identifier = TableIdentifier { - schema: Some("public".to_string()), - name: table_name.clone(), - }; - connector - .create_publication(client, Some(&[table_identifier])) - .await - .unwrap(); - - // Creating slot - let mut client = helper::connect(replication_conn_config.clone()) - .await - .unwrap(); - - let slot_name = connector.get_slot_name(); - let _parsed_lsn = create_slot(&mut client, &slot_name).await; - - // let config = IngestionConfig::default(); - // let (ingestor, mut iterator) = Ingestor::initialize_channel(config); - - test_client.insert_rows(&table_name, 4, None).await; - - // assume that we already received two rows - // let last_parsed_position = 2_u64; - // thread::spawn(move || { - // let connector = PostgresConnector::new(postgres_config); - // let _ = connector.start( - // Some((u64::from(parsed_lsn), last_parsed_position)), - // &ingestor, - // tables, - // ); - // }); - - // let mut i = last_parsed_position; - // while i < 4 { - // i += 1; - // if let Some(IngestionMessage { - // identifier: OpIdentifier { seq_in_tx, .. }, - // .. - // }) = iterator.next() - // { - // assert_eq!(i, seq_in_tx); - // } else { - // panic!("Unexpected operation"); - // } - // } - - // test_client.insert_rows(&table_name, 3, None); - // let mut i = 0; - // while i < 3 { - // i += 1; - // if let Some(IngestionMessage { - // identifier: OpIdentifier { seq_in_tx, .. }, - // .. 
- // }) = iterator.next() - // { - // assert_eq!(i, seq_in_tx); - // } else { - // panic!("Unexpected operation"); - // } - // } - - if let Err(e) = - ReplicationSlotHelper::drop_replication_slot(&mut client, &slot_name).await - { - retry_drop_active_slot(e, &mut client, &slot_name) - .await - .unwrap(); - } - }) - .await - } -} diff --git a/dozer-ingestion/src/connectors/snowflake/connector/mod.rs b/dozer-ingestion/src/connectors/snowflake/connector/mod.rs deleted file mode 100644 index 4cf288862d..0000000000 --- a/dozer-ingestion/src/connectors/snowflake/connector/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -#[cfg(feature = "snowflake")] -mod snowflake; -#[cfg(feature = "snowflake")] -pub use snowflake::SnowflakeConnector; diff --git a/dozer-ingestion/src/connectors/snowflake/mod.rs b/dozer-ingestion/src/connectors/snowflake/mod.rs deleted file mode 100644 index 071e7c70d4..0000000000 --- a/dozer-ingestion/src/connectors/snowflake/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -#[cfg(feature = "snowflake")] -pub mod connection; -pub mod connector; -#[cfg(feature = "snowflake")] -mod schema_helper; -#[cfg(feature = "snowflake")] -pub mod stream_consumer; -#[cfg(feature = "snowflake")] -pub mod test_utils; - -#[cfg(test)] -#[cfg(feature = "snowflake")] -mod tests; diff --git a/dozer-ingestion/src/connectors/snowflake/tests.rs b/dozer-ingestion/src/connectors/snowflake/tests.rs deleted file mode 100644 index 5d6b02e0b0..0000000000 --- a/dozer-ingestion/src/connectors/snowflake/tests.rs +++ /dev/null @@ -1,210 +0,0 @@ -use std::time::Duration; - -use crate::connectors::snowflake::connector::SnowflakeConnector; -use crate::connectors::snowflake::test_utils::remove_streams; -use crate::connectors::{get_connector, Connector, TableIdentifier}; - -use dozer_types::types::FieldType::{ - Binary, Boolean, Date, Decimal, Float, Int, String, Timestamp, -}; - -use dozer_types::models::connection::ConnectionConfig; -use odbc::create_environment_v3; -use rand::Rng; - -use crate::errors::ConnectorError::TableNotFound; -use crate::test_util::{create_test_runtime, run_connector_test, spawn_connector}; - -use crate::connectors::snowflake::connection::client::Client; -use crate::connectors::snowflake::stream_consumer::StreamConsumer; - -#[tokio::test] -#[ignore] -async fn test_disabled_connector_and_read_from_stream() { - run_connector_test("snowflake", |config| async move { - let ConnectionConfig::Snowflake(connection) = &config.connections[0].config else { - panic!("Snowflake config expected"); - }; - let source = config.sources[0].clone(); - - let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); - let client = Client::new(connection, &env); - - let mut rng = rand::thread_rng(); - let table_name = format!("CUSTOMER_TEST_{}", rng.gen::()); - - client - .exec( - &format!( - "CREATE TABLE {table_name} LIKE SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.CUSTOMER;" - ), - ) - .unwrap(); - client.exec(&format!("ALTER TABLE PUBLIC.{table_name} ADD CONSTRAINT {table_name}_PK PRIMARY KEY (C_CUSTKEY);")).unwrap(); - client.exec(&format!("INSERT INTO {table_name} SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.CUSTOMER LIMIT 100")).unwrap(); - - remove_streams(connection, &source.table_name).unwrap(); - - let runtime = create_test_runtime(); - let connector = SnowflakeConnector::new("snowflake".to_string(), connection.clone()); - let tables = runtime.block_on(connector.list_columns(vec![TableIdentifier::from_table_name(table_name.clone())])).unwrap(); - - let (mut iterator, _) = spawn_connector(runtime, connector, tables); - - let 
mut i = 0; - while i < 100 { - iterator.next_timeout(Duration::from_secs(10)).await; - i += 1; - } - - assert_eq!(100, i); - - client.exec(&format!("INSERT INTO {table_name} SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.CUSTOMER LIMIT 100 OFFSET 100")).unwrap(); - - let mut i = 0; - while i < 100 { - iterator.next_timeout(Duration::from_secs(10)).await; - i += 1; - } - - assert_eq!(100, i); - }).await -} - -#[tokio::test] -#[ignore] -async fn test_disabled_connector_get_schemas_test() { - run_connector_test("snowflake", |config| async move { - let ConnectionConfig::Snowflake(connection) = &config.connections[0].config else { - panic!("Snowflake config expected"); - }; - let connector = SnowflakeConnector::new("snowflake".to_string(), connection.clone()); - let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); - let client = Client::new(connection, &env); - - let mut rng = rand::thread_rng(); - let table_name = format!("SCHEMA_MAPPING_TEST_{}", rng.gen::()); - - client - .exec(&format!( - "create table {table_name} - ( - integer_column integer, - float_column float, - text_column varchar, - binary_column binary, - boolean_column boolean, - date_column date, - datetime_column datetime, - decimal_column decimal(5, 2) - ) - data_retention_time_in_days = 0; - - " - )) - .unwrap(); - - let table_infos = connector - .list_columns(vec![TableIdentifier::from_table_name(table_name.clone())]) - .await - .unwrap(); - let schemas = connector.get_schemas(&table_infos).await.unwrap(); - - let source_schema = schemas[0].as_ref().unwrap(); - - for field in &source_schema.schema.fields { - let expected_type = match field.name.as_str() { - "INTEGER_COLUMN" => Int, - "FLOAT_COLUMN" => Float, - "TEXT_COLUMN" => String, - "BINARY_COLUMN" => Binary, - "BOOLEAN_COLUMN" => Boolean, - "DATE_COLUMN" => Date, - "DATETIME_COLUMN" => Timestamp, - "DECIMAL_COLUMN" => Decimal, - _ => { - panic!("Unexpected column: {}", field.name) - } - }; - - assert_eq!(expected_type, field.typ); - } - - client.exec(&format!("DROP TABLE {table_name};")).unwrap(); - }) - .await -} - -#[tokio::test] -#[ignore] -async fn test_disabled_connector_missing_table_validator() { - run_connector_test("snowflake", |mut config| async move { - let connector = get_connector(config.connections.remove(0)).unwrap(); - - let not_existing_table = "not_existing_table".to_string(); - let result = connector - .list_columns(vec![TableIdentifier::from_table_name(not_existing_table)]) - .await; - - assert!(matches!(result.unwrap_err(), TableNotFound(_))); - - let existing_table = &config.sources[0].table_name; - let table_infos = connector - .list_columns(vec![TableIdentifier::from_table_name( - existing_table.clone(), - )]) - .await - .unwrap(); - let result = connector.get_schemas(&table_infos).await.unwrap(); - - assert!(result[0].is_ok()); - }) - .await -} - -#[tokio::test] -#[ignore] -async fn test_disabled_connector_is_stream_created() { - run_connector_test("snowflake", |mut config| async move { - let snowflake_config = match config.connections.remove(0).config { - ConnectionConfig::Snowflake(snowflake_config) => snowflake_config, - _ => { - panic!("Snowflake config expected"); - } - }; - - let env = create_environment_v3().map_err(|e| e.unwrap()).unwrap(); - let client = Client::new(&snowflake_config, &env); - - let mut rng = rand::thread_rng(); - let table_name = format!("STREAM_EXIST_TEST_{}", rng.gen::()); - - client - .exec(&format!( - "CREATE TABLE {table_name} (id INTEGER) - data_retention_time_in_days = 0; " - )) - .unwrap(); - - 
let result = StreamConsumer::is_stream_created(&client, &table_name).unwrap(); - assert!( - !result, - "Stream was not created yet, so result of check should be false" - ); - - StreamConsumer::create_stream(&client, &table_name).unwrap(); - let result = StreamConsumer::is_stream_created(&client, &table_name).unwrap(); - assert!( - result, - "Stream is created, so result of check should be true" - ); - - StreamConsumer::drop_stream(&client, &table_name).unwrap(); - let result = StreamConsumer::is_stream_created(&client, &table_name).unwrap(); - assert!( - !result, - "Stream was dropped, so result of check should be false" - ); - }) - .await -} diff --git a/dozer-ingestion/src/errors.rs b/dozer-ingestion/src/errors.rs index 08497a2b13..b35623b588 100644 --- a/dozer-ingestion/src/errors.rs +++ b/dozer-ingestion/src/errors.rs @@ -1,118 +1,19 @@ -#![allow(clippy::enum_variant_names)] - -use deltalake::arrow::error::ArrowError; -use dozer_log::errors::{ReaderBuilderError, ReaderError}; -use dozer_types::errors::internal::BoxedError; -use dozer_types::errors::types::{DeserializationError, SerializationError, TypeError}; -use dozer_types::thiserror; -use dozer_types::thiserror::Error; -use dozer_types::{bincode, serde_json}; - -#[cfg(feature = "kafka")] -use base64::DecodeError; - -use deltalake::datafusion::error::DataFusionError; -use deltalake::DeltaTableError; -use geozero::error::GeozeroError; -#[cfg(feature = "snowflake")] -use std::num::TryFromIntError; -#[cfg(feature = "kafka")] -use std::str::Utf8Error; -use std::string::FromUtf8Error; - -use dozer_types::log::error; -#[cfg(feature = "snowflake")] -use odbc::DiagnosticRecord; - -use dozer_types::arrow_types::errors::FromArrowError; -#[cfg(feature = "kafka")] -use schema_registry_converter::error::SRCError; -use tokio_postgres::config::SslMode; - -use tokio_postgres::Error; - -#[cfg(any(feature = "kafka", feature = "snowflake"))] -use dozer_types::rust_decimal::Error as RustDecimalError; - -#[cfg(feature = "mongodb")] -use crate::connectors::mongodb::MongodbConnectorError; +use dozer_ingestion_connector::dozer_types::thiserror::{self, Error}; #[derive(Error, Debug)] pub enum ConnectorError { - #[error("Failed to map configuration: {0}")] - WrongConnectionConfiguration(DeserializationError), - - #[error("Failed to map configuration: {0}")] - UnavailableConnectionConfiguration(String), - - #[error("Failed to map configuration: {0}")] - UnableToInferSchema(DataFusionError), - #[error("Unsupported grpc adapter: {0} {1:?}")] UnsupportedGrpcAdapter(String, Option), - #[error("Arrow error: {0}")] - Arrow(#[from] ArrowError), - - #[error("Table not found: {0}")] - TableNotFound(String), - - #[error("Failed to initialize connector {0}")] - InitializationError(String), - - #[error("This connector doesn't support this method: {0}")] - UnsupportedConnectorMethod(String), - - #[error("Unexpected query message")] - UnexpectedQueryMessageError, - - #[error("Schema Identifier is not present")] - SchemaIdentifierNotFound, - - #[error(transparent)] - PostgresConnectorError(#[from] PostgresConnectorError), - - #[cfg(feature = "snowflake")] - #[error(transparent)] - SnowflakeError(#[from] SnowflakeError), - - #[cfg(feature = "kafka")] - #[error(transparent)] - KafkaError(#[from] KafkaError), #[cfg(feature = "mongodb")] - #[error(transparent)] - MongodbError(#[from] MongodbConnectorError), - - #[error(transparent)] - ObjectStoreConnectorError(#[from] ObjectStoreConnectorError), - - #[error(transparent)] - NestedDozerConnectorError(#[from] 
NestedDozerConnectorError), - - #[error(transparent)] - TypeError(#[from] TypeError), - - #[error(transparent)] - InternalError(#[from] BoxedError), + #[error("mongodb config error: {0}")] + MongodbConfig(#[from] dozer_ingestion_mongodb::MongodbConnectorError), - #[error("Failed to send message on channel")] - IngestorError, + #[error("mysql config error: {0}")] + MysqlConfig(#[from] dozer_ingestion_mysql::MySQLConnectorError), - #[cfg(feature = "ethereum")] - #[error("Error in Eth Connection: {0}")] - EthError(#[source] web3::Error), - - #[error("Failed fetching after {0} recursions")] - EthTooManyRecurisions(usize), - - #[error("Received empty message in connector")] - EmptyMessage, - - #[error("Delta table error: {0}")] - DeltaTableError(#[from] DeltaTableError), - - #[error("Datafusion error: {0}")] - DataFusionError(#[from] DataFusionError), + #[error("postgres config error: {0}")] + PostgresConfig(#[from] dozer_ingestion_postgres::PostgresConnectorError), #[error("snowflake feature is not enabled")] SnowflakeFeatureNotEnabled, @@ -125,451 +26,4 @@ pub enum ConnectorError { #[error("mongodb feature is not enabled")] MongodbFeatureNotEnabled, - - #[error(transparent)] - MySQLConnectorError(#[from] MySQLConnectorError), -} - -impl ConnectorError { - pub fn map_serialization_error(e: serde_json::Error) -> ConnectorError { - ConnectorError::TypeError(TypeError::SerializationError(SerializationError::Json(e))) - } - - pub fn map_bincode_serialization_error(e: bincode::Error) -> ConnectorError { - ConnectorError::TypeError(TypeError::SerializationError(SerializationError::Bincode( - e, - ))) - } -} -#[derive(Error, Debug)] -pub enum ConfigurationError { - #[error("Missing `config` for connector {0}")] - MissingConfiguration(String), - - #[error("Failed to map configuration")] - WrongConnectionConfiguration, -} -#[derive(Error, Debug)] -pub enum NestedDozerConnectorError { - #[error("Failed to connect to upstream dozer at {0}: {1:?}")] - ConnectionError(String, #[source] dozer_types::tonic::transport::Error), - - #[error("Failed to query endpoints from upstream dozer app: {0}")] - DescribeEndpointsError(#[source] dozer_types::tonic::Status), - - #[error(transparent)] - ReaderError(#[from] ReaderError), - - #[error(transparent)] - ReaderBuilderError(#[from] ReaderBuilderError), - - #[error("Column {0} not found")] - ColumnNotFound(String), -} - -#[derive(Error, Debug)] -pub enum PostgresConnectorError { - #[error("Invalid SslMode: {0:?}")] - InvalidSslError(SslMode), - - #[error("Query failed in connector: {0}")] - InvalidQueryError(#[source] tokio_postgres::Error), - - #[error("Failed to connect to postgres with the specified configuration. {0}")] - ConnectionFailure(#[source] tokio_postgres::Error), - - #[error("Replication is not available for user")] - ReplicationIsNotAvailableForUserError, - - #[error("WAL level should be 'logical'")] - WALLevelIsNotCorrect(), - - #[error("Cannot find tables {0:?}")] - TablesNotFound(Vec<(String, String)>), - - #[error("Cannot find column {0} in {1}")] - ColumnNotFound(String, String), - - #[error("Cannot find columns {0}")] - ColumnsNotFound(String), - - #[error("Failed to create a replication slot \"{0}\". 
Error: {1}")] - CreateSlotError(String, #[source] Error), - - #[error("Failed to create publication: {0}")] - CreatePublicationError(#[source] Error), - - #[error("Failed to drop publication: {0}")] - DropPublicationError(#[source] Error), - - #[error("Failed to begin txn for replication")] - BeginReplication, - - #[error("Failed to begin txn for replication")] - CommitReplication, - - #[error("Fetch of replication slot info failed. Error: {0}")] - FetchReplicationSlotError(#[source] tokio_postgres::Error), - - #[error("No slots available or all available slots are used")] - NoAvailableSlotsError, - - #[error("Slot {0} not found")] - SlotNotExistError(String), - - #[error("Slot {0} is already used by another process")] - SlotIsInUseError(String), - - #[error("Table {0} changes is not replicated to slot")] - MissingTableInReplicationSlot(String), - - #[error("Start lsn is before first available lsn - {0} < {1}")] - StartLsnIsBeforeLastFlushedLsnError(String, String), - - #[error("fetch of replication slot info failed. Error: {0}")] - SyncWithSnapshotError(String), - - #[error("Replication stream error. Error: {0}")] - ReplicationStreamError(String), - - #[error("Received unexpected message in replication stream")] - UnexpectedReplicationMessageError, - - #[error("Replication stream error")] - ReplicationStreamEndError, - - #[error(transparent)] - PostgresSchemaError(#[from] PostgresSchemaError), - - #[error("LSN not stored for replication slot")] - LSNNotStoredError, - - #[error("LSN parse error. Given lsn: {0}")] - LsnParseError(String), - - #[error("LSN not returned from replication slot creation query")] - LsnNotReturnedFromReplicationSlot, - - #[error("Table name \"{0}\" not valid")] - TableNameNotValid(String), - - #[error("Column name \"{0}\" not valid")] - ColumnNameNotValid(String), - - #[error("Relation not found in replication: {0}")] - RelationNotFound(#[source] std::io::Error), - - #[error("Failed to send message on snapshot read channel")] - SnapshotReadError, - - #[error("Failed to load native certs: {0}")] - LoadNativeCerts(#[source] std::io::Error), - - #[error("Non utf8 column name in table {table_index} column {column_index}")] - NonUtf8ColumnName { - table_index: usize, - column_index: usize, - }, - - #[error("Column type changed in table {table_index} column {column_name} from {old_type} to {new_type}")] - ColumnTypeChanged { - table_index: usize, - column_name: String, - old_type: postgres_types::Type, - new_type: postgres_types::Type, - }, -} - -#[derive(Error, Debug)] -pub enum PostgresSchemaError { - #[error("Schema's '{0}' doesn't have primary key")] - PrimaryKeyIsMissingInSchema(String), - - #[error("Table: '{0}' replication identity settings are not correct. It is either not set or NOTHING. Missing a primary key ?")] - SchemaReplicationIdentityError(String), - - #[error("Column type {0} not supported")] - ColumnTypeNotSupported(String), - - #[error("Custom type {0:?} is not supported yet. 
Join our Discord at https://discord.com/invite/3eWXBgJaEQ - we're here to help with your use case!")] - CustomTypeNotSupported(String), - - #[error("ColumnTypeNotFound")] - ColumnTypeNotFound, - - #[error("Invalid column type of column {0}")] - InvalidColumnType(String), - - #[error("Value conversion error: {0}")] - ValueConversionError(String), - - #[error("String parse failed")] - StringParseError(#[source] FromUtf8Error), - - #[error("JSONB parse failed: {0}")] - JSONBParseError(String), - - #[error("Point parse failed")] - PointParseError, - - #[error("Unsupported replication type - '{0}'")] - UnsupportedReplicationType(String), - - #[error( - "Table type '{0}' of '{1}' table is not supported. Only 'BASE TABLE' type is supported" - )] - UnsupportedTableType(String, String), - - #[error("Table type cannot be determined")] - TableTypeNotFound, - - #[error("Column not found")] - ColumnNotFound, - - #[error("Type error: {0}")] - TypeError(#[from] TypeError), - - #[error("Failed to read string from utf8. Error: {0}")] - StringReadError(#[from] FromUtf8Error), - - #[error("Failed to read date. Error: {0}")] - DateReadError(#[from] dozer_types::chrono::ParseError), -} - -#[cfg(feature = "snowflake")] -#[derive(Error, Debug)] -pub enum SnowflakeError { - #[error("Snowflake query error")] - QueryError(#[source] Box), - - #[error("Snowflake connection error")] - ConnectionError(#[source] Box), - - #[cfg(feature = "snowflake")] - #[error(transparent)] - SnowflakeSchemaError(#[from] SnowflakeSchemaError), - - #[error(transparent)] - SnowflakeStreamError(#[from] SnowflakeStreamError), - - #[error("A network error occurred, but this query is not resumable. query: {0}")] - NonResumableQuery(String), -} - -#[cfg(feature = "snowflake")] -#[derive(Error, Debug)] -pub enum SnowflakeSchemaError { - #[error("Column type {0} not supported")] - ColumnTypeNotSupported(String), - - #[error("Value conversion Error")] - ValueConversionError(#[source] Box), - - #[error("Invalid date")] - InvalidDateError, - - #[error("Invalid time")] - InvalidTimeError, - - #[error("Schema conversion Error: {0}")] - SchemaConversionError(#[source] TryFromIntError), - - #[error("Decimal convert error")] - DecimalConvertError(#[source] RustDecimalError), -} - -#[derive(Error, Debug)] -pub enum SnowflakeStreamError { - #[error("Time travel not available for table")] - TimeTravelNotAvailableError, - - #[error("Unsupported \"{0}\" action in stream")] - UnsupportedActionInStream(String), - - #[error("Cannot determine action")] - CannotDetermineAction, - - #[error("Stream not found")] - StreamNotFound, -} -#[cfg(feature = "kafka")] -#[derive(Error, Debug)] -pub enum KafkaError { - #[error(transparent)] - KafkaSchemaError(#[from] KafkaSchemaError), - - #[error("Connection error. Error: {0}")] - KafkaConnectionError(#[from] rdkafka::error::KafkaError), - - #[error("JSON decode error. Error: {0}")] - JsonDecodeError(#[source] serde_json::Error), - - #[error("Bytes convert error")] - BytesConvertError(#[source] Utf8Error), - - #[error(transparent)] - KafkaStreamError(#[from] KafkaStreamError), - - #[error("Schema registry fetch failed. 
Error: {0}")] - SchemaRegistryFetchError(#[source] SRCError), - - #[error("Topic not defined")] - TopicNotDefined, -} - -#[cfg(feature = "kafka")] -#[derive(Error, Debug)] -pub enum KafkaStreamError { - #[error("Consume commit error")] - ConsumeCommitError(#[source] rdkafka::error::KafkaError), - - #[error("Message consume error")] - MessageConsumeError(#[source] rdkafka::error::KafkaError), - - #[error("Polling error")] - PollingError(#[source] rdkafka::error::KafkaError), -} - -#[cfg(feature = "kafka")] -#[derive(Error, Debug, PartialEq)] -pub enum KafkaSchemaError { - #[error("Schema definition not found")] - SchemaDefinitionNotFound, - - #[error("Unsupported \"{0}\" type")] - TypeNotSupported(String), - - #[error("Field \"{0}\" not found")] - FieldNotFound(String), - - #[error("Binary decode error")] - BinaryDecodeError(#[source] DecodeError), - - #[error("Scale not found")] - ScaleNotFound, - - #[error("Scale is invalid")] - ScaleIsInvalid, - - #[error("Decimal convert error")] - DecimalConvertError(#[source] RustDecimalError), - - #[error("Invalid date")] - InvalidDateError, - - #[error("Invalid json: {0}")] - InvalidJsonError(String), - - // #[error("Invalid time")] - // InvalidTimeError, - #[error("Invalid timestamp")] - InvalidTimestampError, -} - -#[derive(Error, Debug)] -pub enum ObjectStoreConnectorError { - #[error(transparent)] - DataFusionSchemaError(#[from] ObjectStoreSchemaError), - - #[error(transparent)] - DataFusionStorageObjectError(#[from] ObjectStoreObjectError), - - #[error("Internal data fusion error")] - InternalDataFusionError(#[source] DataFusionError), - - #[error(transparent)] - TableReaderError(#[from] ObjectStoreTableReaderError), - - #[error(transparent)] - FromArrowError(#[from] FromArrowError), - - #[error("Failed to send message on data read channel")] - SendError, - - #[error("Failed to receive message on data read channel")] - RecvError, -} - -#[derive(Error, Debug, PartialEq)] -pub enum ObjectStoreSchemaError { - #[error("Unsupported type of \"{0}\" field")] - FieldTypeNotSupported(String), - - #[error("Date time conversion failed")] - DateTimeConversionError, - - #[error("Date conversion failed")] - DateConversionError, - - #[error("Time conversion failed")] - TimeConversionError, - - #[error("Duration conversion failed")] - DurationConversionError, -} - -#[derive(Error, Debug)] -pub enum ObjectStoreObjectError { - #[error("Missing storage details")] - MissingStorageDetails, - - #[error("Table definition not found")] - TableDefinitionNotFound, - - #[error("Listing path {0} parsing error: {1}")] - ListingPathParsingError(String, #[source] DataFusionError), - - #[error("File format unsupported: {0}")] - FileFormatUnsupportedError(String), - - #[error("Listing path {0} error: {1}")] - ListingPathError(String, #[source] DataFusionError), -} - -#[derive(Error, Debug)] -pub enum ObjectStoreTableReaderError { - #[error("Table read failed: {0}")] - TableReadFailed(DataFusionError), - - #[error("Columns select failed: {0}")] - ColumnsSelectFailed(DataFusionError), - - #[error("Stream execution failed: {0}")] - StreamExecutionError(DataFusionError), -} - -#[derive(Error, Debug)] -pub enum MySQLConnectorError { - #[error("Invalid connection URL: {0:?}")] - InvalidConnectionURLError(#[source] mysql_async::UrlError), - - #[error("Failed to connect to mysql with the specified url {0}. {1}")] - ConnectionFailure(String, #[source] mysql_async::Error), - - #[error("Unsupported field type: {0}")] - UnsupportedFieldType(String), - - #[error("Invalid field value. 
{0}")]
-    InvalidFieldValue(#[from] mysql_common::FromValueError),
-
-    #[error("Invalid json value. {0}")]
-    JsonDeserializationError(#[from] DeserializationError),
-
-    #[error("Invalid geometric value. {0}")]
-    InvalidGeometricValue(#[from] GeozeroError),
-
-    #[error("Failed to open binlog. {0}")]
-    BinlogOpenError(#[source] mysql_async::Error),
-
-    #[error("Failed to read binlog. {0}")]
-    BinlogReadError(#[source] mysql_async::Error),
-
-    #[error("Binlog error: {0}")]
-    BinlogError(String),
-
-    #[error("Query failed. {0}")]
-    QueryExecutionError(#[source] mysql_async::Error),
-
-    #[error("Failed to fetch query result. {0}")]
-    QueryResultError(#[source] mysql_async::Error),
 }
diff --git a/dozer-ingestion/src/ingestion/mod.rs b/dozer-ingestion/src/ingestion/mod.rs
deleted file mode 100644
index 10e05d47ce..0000000000
--- a/dozer-ingestion/src/ingestion/mod.rs
+++ /dev/null
@@ -1,16 +0,0 @@
-mod ingestor;
-
-pub use ingestor::{IngestionIterator, Ingestor};
-
-#[derive(Debug, Clone)]
-pub struct IngestionConfig {
-    forwarder_channel_cap: usize,
-}
-
-impl Default for IngestionConfig {
-    fn default() -> Self {
-        Self {
-            forwarder_channel_cap: 100000,
-        }
-    }
-}
diff --git a/dozer-ingestion/src/lib.rs b/dozer-ingestion/src/lib.rs
index a715e024bb..60a86de284 100644
--- a/dozer-ingestion/src/lib.rs
+++ b/dozer-ingestion/src/lib.rs
@@ -1,5 +1,140 @@
-pub mod connectors;
+#[cfg(feature = "ethereum")]
+use dozer_ingestion_connector::dozer_types::models::ingestion_types::EthProviderConfig;
+use dozer_ingestion_connector::dozer_types::{
+    log::debug,
+    models::{
+        connection::{Connection, ConnectionConfig},
+        ingestion_types::default_grpc_adapter,
+    },
+    prettytable::Table,
+};
+use dozer_ingestion_deltalake::DeltaLakeConnector;
+use dozer_ingestion_dozer::NestedDozerConnector;
+#[cfg(feature = "ethereum")]
+use dozer_ingestion_ethereum::{EthLogConnector, EthTraceConnector};
+use dozer_ingestion_grpc::{connector::GrpcConnector, ArrowAdapter, DefaultAdapter};
+#[cfg(feature = "kafka")]
+use dozer_ingestion_kafka::connector::KafkaConnector;
+#[cfg(feature = "mongodb")]
+use dozer_ingestion_mongodb::MongodbConnector;
+use dozer_ingestion_mysql::connector::{mysql_connection_opts_from_url, MySQLConnector};
+use dozer_ingestion_object_store::connector::ObjectStoreConnector;
+use dozer_ingestion_postgres::{
+    connection::helper::map_connection_config,
+    connector::{PostgresConfig, PostgresConnector},
+};
+#[cfg(feature = "snowflake")]
+use dozer_ingestion_snowflake::connector::SnowflakeConnector;
+use errors::ConnectorError;
+
 pub mod errors;
-pub mod ingestion;
-pub mod test_util;
-mod utils;
+pub use dozer_ingestion_connector::*;
+
+pub fn get_connector(connection: Connection) -> Result<Box<dyn Connector>, ConnectorError> {
+    let config = connection.config;
+    match config.clone() {
+        ConnectionConfig::Postgres(c) => {
+            let config = map_connection_config(&config)?;
+            let postgres_config = PostgresConfig {
+                name: connection.name,
+                config,
+                schema: c.schema,
+            };
+
+            if let Some(dbname) = postgres_config.config.get_dbname() {
+                debug!("Connecting to postgres database - {}", dbname.to_string());
+            }
+            Ok(Box::new(PostgresConnector::new(postgres_config)))
+        }
+        #[cfg(feature = "ethereum")]
+        ConnectionConfig::Ethereum(eth_config) => match eth_config.provider {
+            EthProviderConfig::Log(log_config) => {
+                Ok(Box::new(EthLogConnector::new(log_config, connection.name)))
+            }
+            EthProviderConfig::Trace(trace_config) => Ok(Box::new(EthTraceConnector::new(
+                trace_config,
+                connection.name,
+            ))),
+        },
+        #[cfg(not(feature = "ethereum"))]
+        ConnectionConfig::Ethereum(_) => Err(ConnectorError::EthereumFeatureNotEnabled),
+        ConnectionConfig::Grpc(grpc_config) => {
+            match grpc_config
+                .adapter
+                .clone()
+                .unwrap_or_else(default_grpc_adapter)
+                .as_str()
+            {
+                "arrow" => Ok(Box::new(GrpcConnector::<ArrowAdapter>::new(
+                    connection.name,
+                    grpc_config,
+                ))),
+                "default" => Ok(Box::new(GrpcConnector::<DefaultAdapter>::new(
+                    connection.name,
+                    grpc_config,
+                ))),
+                _ => Err(ConnectorError::UnsupportedGrpcAdapter(
+                    connection.name,
+                    grpc_config.adapter,
+                )),
+            }
+        }
+        #[cfg(feature = "snowflake")]
+        ConnectionConfig::Snowflake(snowflake) => {
+            let snowflake_config = snowflake;
+
+            Ok(Box::new(SnowflakeConnector::new(
+                connection.name,
+                snowflake_config,
+            )))
+        }
+        #[cfg(not(feature = "snowflake"))]
+        ConnectionConfig::Snowflake(_) => Err(ConnectorError::SnowflakeFeatureNotEnabled),
+        #[cfg(feature = "kafka")]
+        ConnectionConfig::Kafka(kafka_config) => Ok(Box::new(KafkaConnector::new(kafka_config))),
+        #[cfg(not(feature = "kafka"))]
+        ConnectionConfig::Kafka(_) => Err(ConnectorError::KafkaFeatureNotEnabled),
+        ConnectionConfig::S3Storage(object_store_config) => {
+            Ok(Box::new(ObjectStoreConnector::new(object_store_config)))
+        }
+        ConnectionConfig::LocalStorage(object_store_config) => {
+            Ok(Box::new(ObjectStoreConnector::new(object_store_config)))
+        }
+        ConnectionConfig::DeltaLake(delta_lake_config) => {
+            Ok(Box::new(DeltaLakeConnector::new(delta_lake_config)))
+        }
+        #[cfg(feature = "mongodb")]
+        ConnectionConfig::MongoDB(mongodb_config) => {
+            let connection_string = mongodb_config.connection_string;
+            Ok(Box::new(MongodbConnector::new(connection_string)?))
+        }
+        #[cfg(not(feature = "mongodb"))]
+        ConnectionConfig::MongoDB(_) => Err(ConnectorError::MongodbFeatureNotEnabled),
+        ConnectionConfig::MySQL(mysql_config) => {
+            let opts = mysql_connection_opts_from_url(&mysql_config.url)?;
+            Ok(Box::new(MySQLConnector::new(
+                mysql_config.url,
+                opts,
+                mysql_config.server_id,
+            )))
+        }
+        ConnectionConfig::Dozer(dozer_config) => {
+            Ok(Box::new(NestedDozerConnector::new(dozer_config)))
+        }
+    }
+}
+
+pub fn get_connector_info_table(connection: &Connection) -> Option<Table>
{ + match &connection.config { + ConnectionConfig::Postgres(config) => match config.replenish() { + Ok(conf) => Some(conf.convert_to_table()), + Err(_) => None, + }, + ConnectionConfig::Ethereum(config) => Some(config.convert_to_table()), + ConnectionConfig::Snowflake(config) => Some(config.convert_to_table()), + ConnectionConfig::Kafka(config) => Some(config.convert_to_table()), + ConnectionConfig::S3Storage(config) => Some(config.convert_to_table()), + ConnectionConfig::LocalStorage(config) => Some(config.convert_to_table()), + _ => None, + } +} diff --git a/dozer-ingestion/src/test_util.rs b/dozer-ingestion/src/test_util.rs deleted file mode 100644 index d454371bfe..0000000000 --- a/dozer-ingestion/src/test_util.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::sync::Arc; - -use dozer_types::log::error; -use futures::stream::{AbortHandle, Abortable}; -use tokio::runtime::Runtime; - -use crate::{ - connectors::{Connector, TableInfo, TableToIngest}, - ingestion::{IngestionIterator, Ingestor}, -}; - -#[cfg(test)] -pub async fn run_connector_test< - F: futures::Future, - T: (FnOnce(dozer_types::models::config::Config) -> F) + std::panic::UnwindSafe, ->( - db_type: &str, - test: T, -) { - use dozer_types::{ - constants::DEFAULT_CONFIG_PATH, - models::{config::Config, connection::ConnectionConfig}, - }; - - use crate::connectors::postgres::tests::client::TestPostgresClient; - - let dozer_config_path = - std::path::PathBuf::from(format!("src/tests/cases/{db_type}/{DEFAULT_CONFIG_PATH}")); - - let dozer_config = std::fs::read_to_string(dozer_config_path).unwrap(); - let dozer_config = dozer_types::serde_yaml::from_str::(&dozer_config).unwrap(); - - let connection = dozer_config.connections.get(0).unwrap(); - if let ConnectionConfig::Postgres(connection_config) = connection.config.clone() { - let mut config = tokio_postgres::Config::new(); - let replenished_config = connection_config.replenish().unwrap(); - config - .user(&replenished_config.user) - .host(&replenished_config.host) - .password(&replenished_config.password) - .port(replenished_config.port as u16) - .ssl_mode(replenished_config.sslmode); - - let mut client = TestPostgresClient::new_with_postgres_config(config).await; - client - .execute_query(&format!( - "DROP DATABASE IF EXISTS {}", - replenished_config.database - )) - .await; - client - .execute_query(&format!("CREATE DATABASE {}", replenished_config.database)) - .await; - } - - test(dozer_config).await; -} - -pub fn create_test_runtime() -> Arc { - Arc::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(), - ) -} - -pub fn spawn_connector( - runtime: Arc, - connector: impl Connector + 'static, - tables: Vec, -) -> (IngestionIterator, AbortHandle) { - let (ingestor, iterator) = Ingestor::initialize_channel(Default::default()); - let (abort_handle, abort_registration) = AbortHandle::new_pair(); - let tables = tables - .into_iter() - .map(TableToIngest::from_scratch) - .collect(); - runtime.clone().spawn_blocking(move || { - runtime.block_on(async move { - if let Ok(Err(e)) = - Abortable::new(connector.start(&ingestor, tables), abort_registration).await - { - error!("Connector `start` returned error: {e}") - } - }) - }); - (iterator, abort_handle) -} - -pub fn spawn_connector_all_tables( - runtime: Arc, - connector: impl Connector + 'static, -) -> (IngestionIterator, AbortHandle) { - let tables = runtime.block_on(list_all_table(&connector)); - spawn_connector(runtime, connector, tables) -} - -pub fn create_runtime_and_spawn_connector_all_tables( - 
connector: impl Connector + 'static, -) -> (IngestionIterator, AbortHandle) { - let runtime = create_test_runtime(); - spawn_connector_all_tables(runtime.clone(), connector) -} - -async fn list_all_table(connector: &impl Connector) -> Vec { - let tables = connector.list_tables().await.unwrap(); - connector.list_columns(tables).await.unwrap() -} diff --git a/dozer-ingestion/src/tests/cases/snowflake/expectations.json b/dozer-ingestion/src/tests/cases/snowflake/expectations.json deleted file mode 100644 index f86b4172e3..0000000000 --- a/dozer-ingestion/src/tests/cases/snowflake/expectations.json +++ /dev/null @@ -1,3 +0,0 @@ -[ - "HealthyService" -] diff --git a/dozer-ingestion/tests/test_suite/basic.rs b/dozer-ingestion/tests/test_suite/basic.rs index 2da1d8da41..2fd1430e19 100644 --- a/dozer-ingestion/tests/test_suite/basic.rs +++ b/dozer-ingestion/tests/test_suite/basic.rs @@ -1,13 +1,13 @@ use std::{sync::Arc, time::Duration}; -use dozer_ingestion::{ - connectors::{CdcType, Connector, SourceSchema, TableIdentifier, TableInfo}, +use dozer_ingestion_connector::{ + dozer_types::{ + log::warn, + models::ingestion_types::IngestionMessage, + types::{Field, FieldDefinition, FieldType, Operation, Record, Schema}, + }, test_util::spawn_connector, -}; -use dozer_types::{ - log::warn, - models::ingestion_types::IngestionMessage, - types::{Field, FieldDefinition, FieldType, Operation, Record, Schema}, + CdcType, Connector, SourceSchema, TableIdentifier, TableInfo, }; use tokio::runtime::Runtime; diff --git a/dozer-ingestion/tests/test_suite/connectors/arrow.rs b/dozer-ingestion/tests/test_suite/connectors/arrow.rs index 60e7bb1b67..e0452fa907 100644 --- a/dozer-ingestion/tests/test_suite/connectors/arrow.rs +++ b/dozer-ingestion/tests/test_suite/connectors/arrow.rs @@ -1,11 +1,13 @@ use std::sync::Arc; -use dozer_types::arrow::array::{ - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, -}; - -use dozer_types::{ - arrow, +use dozer_ingestion_connector::dozer_types::{ + arrow::{ + self, + array::{ + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, + }, + }, chrono::Datelike, types::{Field, FieldDefinition, FieldType}, }; diff --git a/dozer-ingestion/tests/test_suite/connectors/dozer.rs b/dozer-ingestion/tests/test_suite/connectors/dozer.rs index 1f8103de54..ffcdeaa10d 100644 --- a/dozer-ingestion/tests/test_suite/connectors/dozer.rs +++ b/dozer-ingestion/tests/test_suite/connectors/dozer.rs @@ -6,24 +6,27 @@ use std::time::Duration; use dozer_cli::shutdown::{self, ShutdownSender}; use dozer_cli::simple::SimpleOrchestrator; -use dozer_ingestion::connectors::dozer::NestedDozerConnector; -use dozer_ingestion::connectors::{CdcType, SourceSchema}; -use dozer_types::grpc_types::conversions::field_to_grpc; -use dozer_types::grpc_types::ingest::ingest_service_client::IngestServiceClient; -use dozer_types::grpc_types::ingest::{IngestRequest, OperationType}; -use dozer_types::grpc_types::types::Record; -use dozer_types::log::info; -use dozer_types::models::api_endpoint::ApiEndpoint; -use dozer_types::models::ingestion_types::GrpcConfigSchemas; -use dozer_types::models::source::Source; -use dozer_types::types::{Field, FieldDefinition, FieldType}; -use dozer_types::{ - models::ingestion_types::{GrpcConfig, NestedDozerConfig, NestedDozerLogOptions}, +use dozer_ingestion_connector::dozer_types::{ + grpc_types::{ + conversions::field_to_grpc, + ingest::{ingest_service_client::IngestServiceClient, IngestRequest, OperationType}, + 
types::Record, + }, + log::info, + models::{ + api_endpoint::ApiEndpoint, + ingestion_types::{ + GrpcConfig, GrpcConfigSchemas, NestedDozerConfig, NestedDozerLogOptions, + }, + source::Source, + }, serde_json, + tonic::transport::Channel, + types::{Field, FieldDefinition, FieldType}, }; +use dozer_ingestion_connector::{async_trait, dozer_types, CdcType, SourceSchema}; +use dozer_ingestion_dozer::NestedDozerConnector; -use dozer_types::tonic::async_trait; -use dozer_types::tonic::transport::Channel; use futures::lock::Mutex; use tempdir::TempDir; use tokio::runtime::Runtime; diff --git a/dozer-ingestion/tests/test_suite/connectors/mongodb.rs b/dozer-ingestion/tests/test_suite/connectors/mongodb.rs index 0c85af2561..7dfe0c9af6 100644 --- a/dozer-ingestion/tests/test_suite/connectors/mongodb.rs +++ b/dozer-ingestion/tests/test_suite/connectors/mongodb.rs @@ -1,8 +1,13 @@ -use bson::doc; -use dozer_ingestion::connectors::mongodb::MongodbConnector; -use dozer_types::tonic::async_trait; +use dozer_ingestion_connector::async_trait; +use dozer_ingestion_mongodb::{ + bson::{self, doc}, + mongodb::{ + self, + options::{ClientOptions, InsertOneOptions, WriteConcern}, + }, + MongodbConnector, +}; use dozer_utils::{process::run_docker_compose, Cleanup}; -use mongodb::options::{ClientOptions, InsertOneOptions, WriteConcern}; use tempdir::TempDir; use crate::test_suite::DataReadyConnectorTest; diff --git a/dozer-ingestion/tests/test_suite/connectors/object_store/local_storage.rs b/dozer-ingestion/tests/test_suite/connectors/object_store/local_storage.rs index 2c33587b0f..e91b83e642 100644 --- a/dozer-ingestion/tests/test_suite/connectors/object_store/local_storage.rs +++ b/dozer-ingestion/tests/test_suite/connectors/object_store/local_storage.rs @@ -1,10 +1,12 @@ -use dozer_ingestion::connectors::object_store::connector::ObjectStoreConnector; - -use dozer_types::tonic::async_trait; -use dozer_types::{ - arrow, - models::ingestion_types::{LocalDetails, LocalStorage, ParquetConfig, Table, TableConfig}, - types::Field, +use dozer_ingestion_object_store::connector::ObjectStoreConnector; + +use dozer_ingestion_connector::{ + async_trait, + dozer_types::{ + arrow, + models::ingestion_types::{LocalDetails, LocalStorage, ParquetConfig, Table, TableConfig}, + types::Field, + }, }; use tempdir::TempDir; diff --git a/dozer-ingestion/tests/test_suite/connectors/postgres.rs b/dozer-ingestion/tests/test_suite/connectors/postgres.rs index 10ecad3ec1..6965a3623f 100644 --- a/dozer-ingestion/tests/test_suite/connectors/postgres.rs +++ b/dozer-ingestion/tests/test_suite/connectors/postgres.rs @@ -1,9 +1,9 @@ -use dozer_ingestion::connectors::postgres::{ +use dozer_ingestion_connector::{async_trait, dozer_types::types::Field}; +use dozer_ingestion_postgres::{ connection::{client::Client, helper::connect}, connector::{PostgresConfig, PostgresConnector}, + tokio_postgres, }; -use dozer_types::tonic::async_trait; -use dozer_types::types::Field; use dozer_utils::{process::run_docker_compose, Cleanup}; use tempdir::TempDir; diff --git a/dozer-ingestion/tests/test_suite/connectors/sql.rs b/dozer-ingestion/tests/test_suite/connectors/sql.rs index e1105bed63..62d39b973c 100644 --- a/dozer-ingestion/tests/test_suite/connectors/sql.rs +++ b/dozer-ingestion/tests/test_suite/connectors/sql.rs @@ -1,4 +1,4 @@ -use dozer_types::types::{Field, FieldDefinition, FieldType}; +use dozer_ingestion_connector::dozer_types::types::{Field, FieldDefinition, FieldType}; use crate::test_suite::{records::Operation, FieldsAndPk}; diff --git 
a/dozer-ingestion/tests/test_suite/data.rs b/dozer-ingestion/tests/test_suite/data.rs index 7892a35689..d7554777bc 100644 --- a/dozer-ingestion/tests/test_suite/data.rs +++ b/dozer-ingestion/tests/test_suite/data.rs @@ -1,4 +1,4 @@ -use dozer_types::types::{Field, FieldDefinition, FieldType}; +use dozer_ingestion_connector::dozer_types::types::{Field, FieldDefinition, FieldType}; use super::{records::Operation, FieldsAndPk}; diff --git a/dozer-ingestion/tests/test_suite/mod.rs b/dozer-ingestion/tests/test_suite/mod.rs index 5a17099f45..078cbca8a0 100644 --- a/dozer-ingestion/tests/test_suite/mod.rs +++ b/dozer-ingestion/tests/test_suite/mod.rs @@ -1,5 +1,8 @@ -use dozer_ingestion::connectors::Connector; -use dozer_types::types::{Field, FieldDefinition}; +use dozer_ingestion_connector::{ + async_trait, + dozer_types::types::{Field, FieldDefinition}, + Connector, +}; #[async_trait] pub trait DataReadyConnectorTest: Send + Sized + 'static { @@ -51,4 +54,3 @@ pub use connectors::MongodbConnectorTest; pub use connectors::{ DozerConnectorTest, LocalStorageObjectStoreConnectorTest, PostgresConnectorTest, }; -use dozer_types::tonic::async_trait; diff --git a/dozer-ingestion/tests/test_suite/records.rs b/dozer-ingestion/tests/test_suite/records.rs index 1ba08b6431..8b709efa2d 100644 --- a/dozer-ingestion/tests/test_suite/records.rs +++ b/dozer-ingestion/tests/test_suite/records.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use dozer_types::types::Field; +use dozer_ingestion_connector::dozer_types::types::Field; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Operation {
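The connector error enums removed from `dozer-ingestion/src/errors.rs` above (and re-homed in the new per-connector crates) all lean on `thiserror`'s derive: `#[error(...)]` for the `Display` message, `#[source]` to chain an underlying error, and `#[from]` to get a `From` impl for use with `?`. A minimal, self-contained sketch of that pattern; the enum and variant names below are illustrative, not Dozer types.

```rust
use thiserror::Error;

#[derive(Error, Debug)]
pub enum ExampleConnectorError {
    // `{0}` interpolates the first tuple field into the Display message.
    #[error("Query failed: {0}")]
    QueryFailed(String),

    // `#[source]` records the underlying error for error-chain reporting
    // without generating a `From` impl.
    #[error("Failed to read stream")]
    StreamRead(#[source] std::io::Error),

    // `#[from]` generates `From<ParseIntError>`, so `?` converts automatically.
    #[error("Invalid numeric value: {0}")]
    InvalidNumber(#[from] std::num::ParseIntError),
}

fn parse_port(raw: &str) -> Result<u16, ExampleConnectorError> {
    // The ParseIntError is converted via the `#[from]` variant.
    Ok(raw.parse()?)
}

fn main() {
    let err = parse_port("not-a-port").unwrap_err();
    println!("{err}");
    if let ExampleConnectorError::InvalidNumber(source) = &err {
        println!("caused by: {source}");
    }
}
```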
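The deleted `src/ingestion/mod.rs` shows that `IngestionConfig` carried a single knob, `forwarder_channel_cap` (default 100000), which sizes the buffer between the `Ingestor` handle that connectors write to and the `IngestionIterator` the rest of the pipeline reads from (both now re-exported from `dozer-ingestion-connector`). A rough sketch of that bounded hand-off using a standard-library channel; this is only an analogy under the assumption that the capacity backs a bounded channel, not the real implementation.

```rust
// Illustrative stand-ins; the real Ingestor/IngestionIterator live in
// `dozer-ingestion-connector`.
use std::sync::mpsc;

struct IngestionConfig {
    forwarder_channel_cap: usize,
}

impl Default for IngestionConfig {
    fn default() -> Self {
        Self {
            forwarder_channel_cap: 100000,
        }
    }
}

fn initialize_channel(config: IngestionConfig) -> (mpsc::SyncSender<String>, mpsc::Receiver<String>) {
    // A bounded channel applies backpressure: senders block once the buffer
    // holds `forwarder_channel_cap` undelivered messages.
    mpsc::sync_channel(config.forwarder_channel_cap)
}

fn main() {
    let (ingestor, iterator) = initialize_channel(IngestionConfig::default());
    ingestor.send("operation #1".to_string()).unwrap();
    drop(ingestor); // closing the sending side ends the iterator
    for msg in iterator {
        println!("got {msg}");
    }
}
```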
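The new `get_connector` in `dozer-ingestion/src/lib.rs` is now a thin factory: it matches on `ConnectionConfig` and returns a `Box<dyn Connector>` built by the matching `dozer-ingestion-*` crate, with paired `#[cfg(feature = ...)]` / `#[cfg(not(feature = ...))]` arms so that a backend compiled out by Cargo features surfaces a `*FeatureNotEnabled` error instead of a compile failure. A reduced sketch of that dispatch shape, using hypothetical stand-in types (`Backend`, `Source`, `FactoryError`) rather than Dozer's.

```rust
trait Source {
    fn name(&self) -> &str;
}

#[derive(Debug)]
enum FactoryError {
    KafkaFeatureNotEnabled,
}

enum Backend {
    Postgres,
    Kafka,
}

struct PostgresSource;
impl Source for PostgresSource {
    fn name(&self) -> &str {
        "postgres"
    }
}

// Only compiled when the optional backend's feature is enabled.
#[cfg(feature = "kafka")]
struct KafkaSource;
#[cfg(feature = "kafka")]
impl Source for KafkaSource {
    fn name(&self) -> &str {
        "kafka"
    }
}

// Every optional backend gets a pair of arms: one compiled in when its Cargo
// feature is on, and a fallback that returns a descriptive error when it is
// not, so the match stays exhaustive either way.
fn get_source(backend: Backend) -> Result<Box<dyn Source>, FactoryError> {
    match backend {
        Backend::Postgres => Ok(Box::new(PostgresSource)),
        #[cfg(feature = "kafka")]
        Backend::Kafka => Ok(Box::new(KafkaSource)),
        #[cfg(not(feature = "kafka"))]
        Backend::Kafka => Err(FactoryError::KafkaFeatureNotEnabled),
    }
}

fn main() {
    for backend in [Backend::Postgres, Backend::Kafka] {
        match get_source(backend) {
            Ok(source) => println!("built a {} source", source.name()),
            Err(err) => println!("cannot build source: {err:?}"),
        }
    }
}
```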
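Alongside the factory, `get_connector_info_table` maps each connection config to its `convert_to_table()` output, a `prettytable::Table` re-exported through `dozer_types`, so connection settings can be rendered as a console table. A hedged sketch of what building such a table looks like with the prettytable API, assuming that crate; the rows below are made up and the real `convert_to_table` implementations live in dozer-types.

```rust
use prettytable::{Cell, Row, Table};

fn connection_info_table(name: &str, host: &str, port: u16) -> Table {
    let mut table = Table::new();
    table.set_titles(Row::new(vec![Cell::new("setting"), Cell::new("value")]));
    // Each setting becomes one two-column row; secrets would be redacted here.
    table.add_row(Row::new(vec![Cell::new("name"), Cell::new(name)]));
    table.add_row(Row::new(vec![Cell::new("host"), Cell::new(host)]));
    table.add_row(Row::new(vec![Cell::new("port"), Cell::new(&port.to_string())]));
    table
}

fn main() {
    // Prints an ASCII table to stdout.
    connection_info_table("users_db", "localhost", 5432).printstd();
}
```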
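The deleted `dozer-ingestion/src/test_util.rs` (its `spawn_connector` helpers now come from `dozer_ingestion_connector::test_util`, as the unchanged `test_util::spawn_connector` import in `tests/test_suite/basic.rs` shows) ran each connector under `futures`' `Abortable`, keeping the `AbortHandle` so a test could stop ingestion once it had seen enough records. A minimal sketch of that cancellation pattern, assuming tokio with the `rt`, `time`, and `macros` features; the looping future is a stand-in for `connector.start(&ingestor, tables)`, not a real connector.

```rust
use std::time::Duration;

use futures::future::{AbortHandle, Abortable, Aborted};

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let (abort_handle, abort_registration) = AbortHandle::new_pair();

    // Stand-in for a connector's `start` future: it would normally keep
    // pushing operations into the ingestion channel until stopped.
    let fake_connector = async {
        loop {
            tokio::time::sleep(Duration::from_millis(50)).await;
        }
    };

    let task: tokio::task::JoinHandle<Result<(), Aborted>> =
        tokio::spawn(Abortable::new(fake_connector, abort_registration));

    // ... a test would drain records from the ingestion iterator here ...

    // Aborting resolves the wrapped future early with `Err(Aborted)`.
    abort_handle.abort();
    assert!(matches!(task.await, Ok(Err(Aborted))));
    println!("connector task aborted cleanly");
}
```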
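Finally, the test-suite hunks only change where `async_trait` comes from: traits such as `DataReadyConnectorTest` used to pull it from `dozer_types::tonic::async_trait` and now use the re-export in `dozer_ingestion_connector::async_trait`. For readers unfamiliar with the macro, a small self-contained example of the pattern; the trait below is illustrative, not `DataReadyConnectorTest` itself.

```rust
use async_trait::async_trait;

// `async_trait` desugars each `async fn` into a method returning a boxed
// future, which is what lets traits have async methods on stable Rust.
#[async_trait]
pub trait DataReady: Send + Sized + 'static {
    async fn start(label: &str) -> Self;
}

struct Dummy;

#[async_trait]
impl DataReady for Dummy {
    async fn start(label: &str) -> Self {
        println!("starting fixture: {label}");
        Dummy
    }
}

#[tokio::main]
async fn main() {
    let _fixture = Dummy::start("local connector test").await;
}
```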