diff --git a/9.0/404.html b/9.0/404.html index fc6db3822..5b71f643d 100644 --- a/9.0/404.html +++ b/9.0/404.html @@ -633,6 +633,8 @@ + + @@ -957,6 +959,26 @@ +
clean(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, dry_run: bool = True, verbose: bool = True, parallel: bool = False)\n
Clean pipeline steps.
pipeline_path
Path to pipeline definition yaml file.
TYPE: Path
Path
dotenv
Paths to dotenv files.
TYPE: list[Path] | None DEFAULT: None
list[Path] | None
None
config
Path to the dir containing config.yaml files.
TYPE: Path DEFAULT: Path()
Path()
steps
Set of steps (components) to apply the command on.
TYPE: set[str] | None DEFAULT: None
set[str] | None
filter_type
Whether steps should include/exclude the steps.
TYPE: FilterType DEFAULT: INCLUDE
FilterType
INCLUDE
dry_run
Whether to dry run the command or execute it.
TYPE: bool DEFAULT: True
bool
True
environment
The environment to generate and deploy the pipeline to.
TYPE: str | None DEFAULT: None
str | None
verbose
Enable verbose printing.
parallel
Enable or disable parallel execution of pipeline steps.
TYPE: bool DEFAULT: False
False
kpops/api/__init__.py
def clean(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    dry_run: bool = True,
    verbose: bool = True,
    parallel: bool = False,
):
    """Clean pipeline steps.

    The pipeline is generated first; afterwards the ``clean`` coroutine of
    every component is awaited, walking the components back to front.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param dry_run: Whether to dry run the command or execute it.
    :param verbose: Enable verbose printing.
    :param parallel: Enable or disable parallel execution of pipeline steps.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
    )

    async def clean_component(component: PipelineComponent) -> None:
        log_action("Clean", component)
        await component.clean(dry_run)

    async def run_clean() -> None:
        if not parallel:
            # Sequential mode: last component first.
            for component in reversed(pipeline.components):
                await clean_component(component)
            return
        # Parallel mode: ordering is delegated to the pipeline's execution
        # graph, which is asked for the reversed order.
        await pipeline.build_execution_graph(clean_component, reverse=True)

    asyncio.run(run_clean())
deploy(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, dry_run: bool = True, verbose: bool = True, parallel: bool = False)\n
Deploy pipeline steps.
def deploy(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    dry_run: bool = True,
    verbose: bool = True,
    parallel: bool = False,
):
    """Deploy pipeline steps.

    The pipeline is generated first; afterwards the ``deploy`` coroutine of
    every component is awaited, walking the components front to back.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param dry_run: Whether to dry run the command or execute it.
    :param verbose: Enable verbose printing.
    :param parallel: Enable or disable parallel execution of pipeline steps.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
    )

    async def deploy_component(component: PipelineComponent) -> None:
        log_action("Deploy", component)
        await component.deploy(dry_run)

    async def run_deploy() -> None:
        if not parallel:
            # Sequential mode: components in pipeline order.
            for component in pipeline.components:
                await deploy_component(component)
            return
        # Parallel mode: ordering is delegated to the pipeline's execution graph.
        await pipeline.build_execution_graph(deploy_component)

    asyncio.run(run_deploy())
destroy(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, dry_run: bool = True, verbose: bool = True, parallel: bool = False)\n
Destroy pipeline steps.
def destroy(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    dry_run: bool = True,
    verbose: bool = True,
    parallel: bool = False,
):
    """Destroy pipeline steps.

    The pipeline is generated first; afterwards the ``destroy`` coroutine of
    every component is awaited, walking the components back to front.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param dry_run: Whether to dry run the command or execute it.
    :param verbose: Enable verbose printing.
    :param parallel: Enable or disable parallel execution of pipeline steps.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
    )

    async def destroy_component(component: PipelineComponent) -> None:
        log_action("Destroy", component)
        await component.destroy(dry_run)

    async def run_destroy() -> None:
        if not parallel:
            # Sequential mode: last component first.
            for component in reversed(pipeline.components):
                await destroy_component(component)
            return
        # Parallel mode: ordering is delegated to the pipeline's execution
        # graph, which is asked for the reversed order.
        await pipeline.build_execution_graph(destroy_component, reverse=True)

    asyncio.run(run_destroy())
generate(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, verbose: bool = False, operation_mode: OperationMode = OperationMode.MANAGED) -> Pipeline\n
Generate enriched pipeline representation.
operation_mode
How KPOps should operate.
TYPE: OperationMode DEFAULT: MANAGED
OperationMode
MANAGED
Pipeline
Generated Pipeline object.
def generate(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    verbose: bool = False,
    operation_mode: OperationMode = OperationMode.MANAGED,
) -> Pipeline:
    """Generate enriched pipeline representation.

    Builds the KPOps configuration, creates the pipeline from the definition
    file and, when ``steps`` is non-empty, filters the pipeline in place.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param verbose: Enable verbose printing.
    :param operation_mode: How KPOps should operate.
    :return: Generated `Pipeline` object.
    """
    kpops_config = KpopsConfig.create(
        config, dotenv, environment, verbose, operation_mode
    )
    pipeline = _create_pipeline(pipeline_path, kpops_config, environment)
    log.info(f"Picked up pipeline '{pipeline_path.parent.name}'")

    # No steps requested (``None`` or an empty set): return the full pipeline.
    if not steps:
        return pipeline

    component_names = steps
    log.debug(
        f"KPOPS_PIPELINE_STEPS is defined with values: {component_names} and filter type of {filter_type.value}"
    )
    pipeline.filter(
        filter_type.create_default_step_names_filter_predicate(component_names)
    )
    log.info(f"Filtered pipeline:\n{pipeline.step_names}")
    return pipeline
init(path: Path, config_include_optional: bool = False)\n
Initiate a default empty project.
path
Directory in which the project should be initiated.
config_include_optional
Whether to include non-required settings in the generated config file.
def init(
    path: Path,
    config_include_optional: bool = False,
):
    """Initiate a default empty project.

    A missing directory is created (its parent must already exist). An
    existing directory that contains at least one entry is rejected with a
    warning and nothing is initiated.

    :param path: Directory in which the project should be initiated.
    :param config_include_optional: Whether to include non-required settings
        in the generated config file.
    """
    if path.exists():
        # Peeking at the first entry is enough to know the directory is in use.
        if any(path.iterdir()):
            log.warning("Please provide a path to an empty directory.")
            return
    else:
        path.mkdir(parents=False)
    init_project(path, config_include_optional)
manifest_clean(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, verbose: bool = True, operation_mode: OperationMode = OperationMode.MANIFEST) -> Iterator[tuple[KubernetesManifest, ...]]\n
def manifest_clean(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    verbose: bool = True,
    operation_mode: OperationMode = OperationMode.MANIFEST,
) -> Iterator[tuple[KubernetesManifest, ...]]:
    """Yield the result of ``manifest_clean()`` for every pipeline component.

    This is a generator: nothing runs, not even the pipeline generation,
    until the returned iterator is consumed.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param verbose: Enable verbose printing.
    :param operation_mode: How KPOps should operate.
    :return: Iterator with one entry per component, in pipeline order.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
        operation_mode=operation_mode,
    )
    for component in pipeline.components:
        yield component.manifest_clean()
manifest_deploy(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, verbose: bool = True, operation_mode: OperationMode = OperationMode.MANIFEST) -> Iterator[tuple[KubernetesManifest, ...]]\n
def manifest_deploy(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    verbose: bool = True,
    operation_mode: OperationMode = OperationMode.MANIFEST,
) -> Iterator[tuple[KubernetesManifest, ...]]:
    """Yield the result of ``manifest_deploy()`` for every pipeline component.

    This is a generator: nothing runs, not even the pipeline generation,
    until the returned iterator is consumed.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param verbose: Enable verbose printing.
    :param operation_mode: How KPOps should operate.
    :return: Iterator with one entry per component, in pipeline order.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
        operation_mode=operation_mode,
    )
    for component in pipeline.components:
        yield component.manifest_deploy()
manifest_destroy(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, verbose: bool = True, operation_mode: OperationMode = OperationMode.MANIFEST) -> Iterator[tuple[KubernetesManifest, ...]]\n
def manifest_destroy(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    verbose: bool = True,
    operation_mode: OperationMode = OperationMode.MANIFEST,
) -> Iterator[tuple[KubernetesManifest, ...]]:
    """Yield the result of ``manifest_destroy()`` for every pipeline component.

    This is a generator: nothing runs, not even the pipeline generation,
    until the returned iterator is consumed.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param verbose: Enable verbose printing.
    :param operation_mode: How KPOps should operate.
    :return: Iterator with one entry per component, in pipeline order.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
        operation_mode=operation_mode,
    )
    for component in pipeline.components:
        yield component.manifest_destroy()
manifest_reset(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, verbose: bool = True, operation_mode: OperationMode = OperationMode.MANIFEST) -> Iterator[tuple[KubernetesManifest, ...]]\n
def manifest_reset(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    verbose: bool = True,
    operation_mode: OperationMode = OperationMode.MANIFEST,
) -> Iterator[tuple[KubernetesManifest, ...]]:
    """Yield the result of ``manifest_reset()`` for every pipeline component.

    This is a generator: nothing runs, not even the pipeline generation,
    until the returned iterator is consumed.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param verbose: Enable verbose printing.
    :param operation_mode: How KPOps should operate.
    :return: Iterator with one entry per component, in pipeline order.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
        operation_mode=operation_mode,
    )
    for component in pipeline.components:
        yield component.manifest_reset()
reset(pipeline_path: Path, dotenv: list[Path] | None = None, config: Path = Path(), steps: set[str] | None = None, filter_type: FilterType = FilterType.INCLUDE, environment: str | None = None, dry_run: bool = True, verbose: bool = True, parallel: bool = False)\n
Reset pipeline steps.
def reset(
    pipeline_path: Path,
    dotenv: list[Path] | None = None,
    config: Path = Path(),
    steps: set[str] | None = None,
    filter_type: FilterType = FilterType.INCLUDE,
    environment: str | None = None,
    dry_run: bool = True,
    verbose: bool = True,
    parallel: bool = False,
):
    """Reset pipeline steps.

    The pipeline is generated first; afterwards the ``reset`` coroutine of
    every component is awaited, walking the components back to front.

    :param pipeline_path: Path to pipeline definition yaml file.
    :param dotenv: Paths to dotenv files.
    :param config: Path to the dir containing config.yaml files.
    :param steps: Set of steps (components) to apply the command on.
    :param filter_type: Whether `steps` should include/exclude the steps.
    :param environment: The environment to generate and deploy the pipeline to.
    :param dry_run: Whether to dry run the command or execute it.
    :param verbose: Enable verbose printing.
    :param parallel: Enable or disable parallel execution of pipeline steps.
    """
    pipeline = generate(
        pipeline_path=pipeline_path,
        dotenv=dotenv,
        config=config,
        steps=steps,
        filter_type=filter_type,
        environment=environment,
        verbose=verbose,
    )

    async def reset_component(component: PipelineComponent) -> None:
        log_action("Reset", component)
        await component.reset(dry_run)

    async def run_reset() -> None:
        if not parallel:
            # Sequential mode: last component first.
            for component in reversed(pipeline.components):
                await reset_component(component)
            return
        # Parallel mode: ordering is delegated to the pipeline's execution
        # graph, which is asked for the reversed order.
        await pipeline.build_execution_graph(reset_component, reverse=True)

    asyncio.run(run_reset())
Bases: BaseModel
BaseModel
Pipeline representation.
kpops/pipeline/__init__.py
class Pipeline(BaseModel):
    """Pipeline representation.

    Components are kept in an index keyed by component id. In addition a
    directed graph is maintained whose nodes are component ids and topic ids:
    an edge points from an input topic to the component reading it, and from a
    component to each of its output topics.
    """

    _component_index: dict[str, PipelineComponent] = {}
    _graph: nx.DiGraph = nx.DiGraph()

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def step_names(self) -> list[str]:
        """Names of all indexed components, in index order."""
        return [step.name for step in self.components]

    @computed_field(title="Components")
    @property
    def components(self) -> list[SerializeAsAny[PipelineComponent]]:
        """Fresh list of the indexed components (safe to iterate while removing)."""
        return list(self._component_index.values())

    @property
    def last(self) -> PipelineComponent:
        """Last indexed component; raises ``IndexError`` on an empty pipeline."""
        return self.components[-1]

    def add(self, component: PipelineComponent) -> None:
        """Register a component in the index and in the graph.

        :param component: Component to add, its id must not be indexed yet
        :raises ValidationError: If a component with the same id already exists
        """
        if self._component_index.get(component.id) is not None:
            msg = (
                f"Pipeline steps must have unique id, '{component.id}' already exists."
            )
            raise ValidationError(msg)
        self._component_index[component.id] = component
        self.__add_to_graph(component)

    def remove(self, component_id: str) -> None:
        """Drop a component from the index.

        Only the index is updated; the graph keeps its nodes and edges.

        :param component_id: Id of the component to remove
        :raises KeyError: If no component with this id is indexed
        """
        self._component_index.pop(component_id)

    def get(self, component_id: str) -> PipelineComponent | None:
        """Return the component with the given id, or ``None`` if it is not indexed."""
        return self._component_index.get(component_id)

    def find(self, predicate: ComponentFilterPredicate) -> Iterator[PipelineComponent]:
        """Find pipeline components matching a custom predicate.

        :param predicate: Filter function,
            returns boolean value whether the component should be kept or removed
        :returns: Iterator of components matching the predicate
        """
        for component in self.components:
            if predicate(component):
                yield component

    def filter(self, predicate: ComponentFilterPredicate) -> None:
        """Filter pipeline components using a custom predicate.

        :param predicate: Filter function,
            returns boolean value whether the component should be kept or removed
        """
        for component in self.components:
            # filter out components not matching the predicate
            if not predicate(component):
                self.remove(component.id)

    def validate(self) -> None:  # pyright: ignore [reportIncompatibleMethodOverride]
        """Validate the pipeline graph.

        :raises ValueError: If the graph is not a directed acyclic graph
        """
        self.__validate_graph()

    def to_yaml(self) -> str:
        """Dump the ``components`` entry of the JSON-mode model dump as YAML."""
        return yaml.dump(
            self.model_dump(mode="json", by_alias=True, exclude_none=True)["components"]
        )

    def build_execution_graph(
        self,
        runner: Callable[[PipelineComponent], Coroutine[Any, Any, None]],
        /,
        reverse: bool = False,
    ) -> Awaitable[None]:
        """Build an awaitable that runs ``runner`` on every component in dependency order.

        The graph is split into layers. The components of one layer run
        concurrently, and the layers are awaited one after the other.

        :param runner: Coroutine function called once per indexed component
        :param reverse: Await the layers in reverse order (downstream first)
        :returns: Awaitable executing all layers sequentially
        :raises networkx.NetworkXUnfeasible: If the graph contains a cycle,
            see `validate`
        """

        async def run_parallel_tasks(
            coroutines: list[Coroutine[Any, Any, None]],
        ) -> None:
            tasks: list[asyncio.Task[None]] = [
                asyncio.create_task(coro) for coro in coroutines
            ]
            await asyncio.gather(*tasks)

        async def run_graph_tasks(pending_tasks: list[Awaitable[None]]) -> None:
            for pending_task in pending_tasks:
                await pending_task

        # Topological generations put every node strictly after ALL of its
        # predecessors. Breadth-first layers only reflect the shortest distance
        # from the sources, so a component reachable through both a short and a
        # long path could be scheduled together with (or before) an upstream
        # component it depends on.
        layers_graph: list[list[str]] = list(nx.topological_generations(self._graph))

        sorted_tasks: list[Awaitable[None]] = []
        for layer in layers_graph:
            # layers consisting only of topics yield no tasks and are skipped
            if parallel_tasks := self.__get_parallel_tasks_from(layer, runner):
                sorted_tasks.append(run_parallel_tasks(parallel_tasks))

        if reverse:
            sorted_tasks.reverse()

        return run_graph_tasks(sorted_tasks)

    def __getitem__(self, component_id: str) -> PipelineComponent:
        """Return the component with the given id.

        :raises ValueError: If no component with this id is indexed
        """
        try:
            return self._component_index[component_id]
        except KeyError as exc:
            msg = f"Component {component_id} not found"
            raise ValueError(msg) from exc

    def __bool__(self) -> bool:
        """A pipeline is truthy when it holds at least one component."""
        return bool(self._component_index)

    def __iter__(self) -> Iterator[PipelineComponent]:  # pyright: ignore [reportIncompatibleMethodOverride]
        """Iterate over the indexed components."""
        yield from self._component_index.values()

    def __len__(self) -> int:
        """Number of indexed components."""
        return len(self._component_index)

    def __add_to_graph(self, component: PipelineComponent):
        """Add the component node plus one node and edge per input/output topic."""
        self._graph.add_node(component.id)

        for input_topic in component.inputs:
            self.__add_input(input_topic.id, component.id)

        for output_topic in component.outputs:
            self.__add_output(output_topic.id, component.id)

    def __add_output(self, topic_id: str, source: str) -> None:
        """Add an edge from the producing component ``source`` to the topic."""
        self._graph.add_node(topic_id)
        self._graph.add_edge(source, topic_id)

    def __add_input(self, topic_id: str, target: str) -> None:
        """Add an edge from the topic to the consuming component ``target``."""
        self._graph.add_node(topic_id)
        self._graph.add_edge(topic_id, target)

    def __get_parallel_tasks_from(
        self,
        layer: list[str],
        runner: Callable[[PipelineComponent], Coroutine[Any, Any, None]],
    ) -> list[Coroutine[Any, Any, None]]:
        """Call ``runner`` for each node of the layer that is an indexed component."""
        return [
            runner(component)
            for node_in_layer in layer
            # check if component, skip topics
            if (component := self._component_index.get(node_in_layer)) is not None
        ]

    def __validate_graph(self) -> None:
        """Raise ``ValueError`` unless the graph is a directed acyclic graph."""
        if not nx.is_directed_acyclic_graph(self._graph):
            msg = "Pipeline is not a valid DAG."
            raise ValueError(msg)
property
components: list[SerializeAsAny[PipelineComponent]]\n
last: PipelineComponent\n
step_names: list[str]\n
add(component: PipelineComponent) -> None\n
def add(self, component: PipelineComponent) -> None:
    """Register a component in the index and in the graph.

    :param component: Component to add, its id must not be indexed yet
    :raises ValidationError: If a component with the same id already exists
    """
    already_indexed = self._component_index.get(component.id)
    if already_indexed is None:
        self._component_index[component.id] = component
        self.__add_to_graph(component)
        return
    msg = f"Pipeline steps must have unique id, '{component.id}' already exists."
    raise ValidationError(msg)
build_execution_graph(runner: Callable[[PipelineComponent], Coroutine[Any, Any, None]], /, reverse: bool = False) -> Awaitable[None]\n
def build_execution_graph(
    self,
    runner: Callable[[PipelineComponent], Coroutine[Any, Any, None]],
    /,
    reverse: bool = False,
) -> Awaitable[None]:
    """Build an awaitable that runs ``runner`` on every component in dependency order.

    The graph is split into layers. The components of one layer run
    concurrently, and the layers are awaited one after the other.

    :param runner: Coroutine function called once per indexed component
    :param reverse: Await the layers in reverse order (downstream first)
    :returns: Awaitable executing all layers sequentially
    :raises networkx.NetworkXUnfeasible: If the graph contains a cycle
    """

    async def run_parallel_tasks(
        coroutines: list[Coroutine[Any, Any, None]],
    ) -> None:
        tasks: list[asyncio.Task[None]] = [
            asyncio.create_task(coro) for coro in coroutines
        ]
        await asyncio.gather(*tasks)

    async def run_graph_tasks(pending_tasks: list[Awaitable[None]]) -> None:
        for pending_task in pending_tasks:
            await pending_task

    # Topological generations put every node strictly after ALL of its
    # predecessors. Breadth-first layers only reflect the shortest distance
    # from the sources, so a component reachable through both a short and a
    # long path could be scheduled together with (or before) an upstream
    # component it depends on.
    layers_graph: list[list[str]] = list(nx.topological_generations(self._graph))

    sorted_tasks: list[Awaitable[None]] = []
    for layer in layers_graph:
        # layers consisting only of topics yield no tasks and are skipped
        if parallel_tasks := self.__get_parallel_tasks_from(layer, runner):
            sorted_tasks.append(run_parallel_tasks(parallel_tasks))

    if reverse:
        sorted_tasks.reverse()

    return run_graph_tasks(sorted_tasks)
filter(predicate: ComponentFilterPredicate) -> None\n
Filter pipeline components using a custom predicate.
predicate
Filter function, returns boolean value whether the component should be kept or removed
TYPE: ComponentFilterPredicate
ComponentFilterPredicate
def filter(self, predicate: ComponentFilterPredicate) -> None:
    """Drop every component the predicate rejects.

    :param predicate: Filter function,
        returns boolean value whether the component should be kept or removed
    """
    for candidate in self.components:
        if predicate(candidate):
            # component matches, keep it
            continue
        self.remove(candidate.id)
find(predicate: ComponentFilterPredicate) -> Iterator[PipelineComponent]\n
Find pipeline components matching a custom predicate.
Iterator[PipelineComponent]
Iterator of components matching the predicate
def find(self, predicate: ComponentFilterPredicate) -> Iterator[PipelineComponent]:
    """Lazily yield the components accepted by a custom predicate.

    :param predicate: Filter function,
        returns boolean value whether the component should be kept or removed
    :returns: Iterator of components matching the predicate
    """
    yield from (
        candidate for candidate in self.components if predicate(candidate)
    )
get(component_id: str) -> PipelineComponent | None\n
def get(self, component_id: str) -> PipelineComponent | None:
    """Return the component with the given id, or ``None`` if it is not indexed."""
    index = self._component_index
    return index.get(component_id)
remove(component_id: str) -> None\n
def remove(self, component_id: str) -> None:
    """Drop the component with the given id from the index.

    :param component_id: Id of the component to remove
    :raises KeyError: If no component with this id is indexed
    """
    del self._component_index[component_id]
to_yaml() -> str\n
def to_yaml(self) -> str:
    """Dump the ``components`` entry of the JSON-mode model dump as YAML."""
    dumped = self.model_dump(mode="json", by_alias=True, exclude_none=True)
    return yaml.dump(dumped["components"])
validate() -> None\n
def validate(self) -> None:  # pyright: ignore [reportIncompatibleMethodOverride]
    """Validate the pipeline by delegating to the private graph check."""
    self.__validate_graph()
Auto generation happens mostly with pre-commit hooks. You can find the pre-commit configuration here. These pre-commit hooks call different Python scripts to auto generate code for the documentation.
pre-commit
cli_env_vars.env
cli_env_vars.md
config_env_vars.env
KpopsConfig
env
topic_name_config
variable_substitution.yaml
./tests/pipeline/resources/component-type-substitution/pipeline.yaml
Generated by typer-cli from the code in main.py. It is called with Python's subprocess module.
typer-cli
main.py
subprocess
Generates example pipeline.yaml and defaults.yaml for each individual component, stores them and also concatenates them into 1 big pipeline definition and 1 big pipeline defaults definition.
pipeline.yaml
defaults.yaml
User input
headers/*\\.yaml
type
name
pipeline-components
pipeline-defaults
sections/*\\.yaml
Generated
pipeline-components/dependencies/*
pipeline_component_dependencies.yaml
sections
defaults_pipeline_component_dependencies.yaml
kpops_structure.yaml
pipeline-components/*\\.yaml
pipeline-defaults/*\\.yaml
Welcome! We are glad to have you visit our contributing guide!
If you find any bugs or have suggestions for improvements, please open an issue and optionally a pull request (PR). In the case of a PR, we would appreciate it if you preface it with an issue outlining your goal and means of achieving it.
We are using git submodules to import the KPOps examples repository. You need to fetch the repository locally on your machine. To do so use this command:
git submodule init\ngit submodule update --recursive\n
This will fetch the resources under the examples folder.
examples
We advise that you stick to our pre-commit hooks for code linting, formatting, and auto-generation of documentation. After you install them using poetry run pre-commit install they're triggered automatically during git commit. Additionally, you can manually invoke them with poetry run pre-commit run -a. In order for dprint to work, you have to manually install it locally. It will work in the CI, so it is also possible to manually carry out formatting changes flagged by dprint in the CI and skip installing it locally.
poetry run pre-commit install
git commit
poetry run pre-commit run -a
dprint
To ensure a consistent Python code style, we use Ruff for both linting and formatting. The official docs contain a guide on editor integration.
Our configuration can be found in KPOps' top-level pyproject.toml.
pyproject.toml
To ensure a consistent markdown style, we use dprint's Markdown code formatter. Our configuration can be found here.
To ensure a consistent CSS style, we use dprint's Malva plugin. Our configuration can be found here.
To ensure a consistent TOML style, we use dprint's TOML code formatter. Our configuration can be found here.
Welcome! We are glad to have you visit our developer guide! If you find any bugs or have suggestions for improvements, please open an issue and optionally a pull request (PR). In the case of a PR, we would appreciate it if you preface it with an issue outlining your goal and means of achieving it.
Find more about our code-style or insights into KPOps' code base here in our developer guide.
Work in progress
The developer guide is still under construction. If you have a question left unanswered here, feel free to ask it by opening an issue.
Introduce KPOps operation and manifest resources for deployment - #541
Drop support for Python 3.10 - #561
KPOps V9 - #558
Define Pydantic model to represent Kubernetes manifest - #546
Manifest toSection with Strimzi KafkaTopic - #545
Manifest Kubernetes resources for destroy command - #552
destroy
Manifest Kubernetes resources for clean command - #559
clean
Manifest Kubernetes resources for reset command - #563
reset
Add documentation for operation-mode in KPOps - #565
Add migration guide v8-v9 - #562
SerializeAsOptional
Fix Kubernetes memory not accepting decimal values - #568
Add ephemeral storage to Kubernetes resource requests and limits - #569
Bump streams-bootstrap to 3.1.0 - #557
Add Pydantic models for Kubernetes Affinity - #555
kpops --version
Fix files field value type in StreamsBootstrap component - #542
files
Fix: Use enum values when dumping models - #543
Improve incomplete type hints - #515
Fallback to user defined model when the validation of cluster model fails - #521
Replace kubernetes-asyncio with lightkube - #517
Automatic loading of namespaced custom components - #500
Call destroy from inside of reset or clean - #501
Rename app field - #506
Add image tag field to streams-bootstrap app values - #499
Delete ignored keys from diff - #510
6.0.0
Update Ruff - #475
Set Pyright to warn on unknown types - #480
Quiet faker debug logs in tests - #483
Add pyright matcher - #481
from.components.<component-name>.type
Add support for Python 3.12 - #467
Update Pyright - #468
Remove package classifiers that are automatically assigned by Poetry - #469
Validate autoscaling mandatory fields when enabled - #470
Fix docs CI to include the latest changes to a tagged version in the changelog - #459
Fix tempfile creation - #461
Fix symbolic link to CONTRIBUTING.md and parallel option in action.yaml - #462
Refactor Kafka topics - #447
Refactor PipelineGenerator to use component ids - #460
Fix order of pipeline steps for clean/reset - #450
Fix substitution - #449
Fix cleaner inheritance, parent model should be aliased during instantiation - #452
Refactor enrichment using Pydantic model validator - #444
Refactor pipeline filter and add to public API - #405
Add custom PascalCase to snake_case alias generator - #436
Add parallel flag support to kpops runner - #439
Add message if examples git submodule is not initialized - #432
Update type annotation for deserialized pipeline - #433
Fix broken doc link - #427
Add warning log if SR handler is disabled but URL is set - #428
Update docs of word-count example for v3 & new folder structure - #423
Move ATM fraud to examples repo - #425
Update pydantic dependency - #422
Add git submodule instructions to the contributing.md - #429
Move GitHub action to repository root - #356
Make Kafka REST Proxy & Kafka Connect hosts default and improve Schema Registry config - #354
Create HelmApp component - #370
Change substitution variables separator to . - #388
.
Refactor pipeline generator & representation - #392
Define custom components module & pipeline base dir globally - #387
Use hash and trim long Helm release names instead of only trimming - #390
Refactor generate template for Python API usage - #380
Namespace substitution vars - #408
Refactor streams-bootstrap cleanup jobs as individual HelmApp - #398
Refactor Kafka Connector resetter as individual HelmApp - #400
Fix wrong Helm release name character limit - #418
Allow overriding config files - #391
Generate defaults schema - #402
Fix missing component type in pipeline schema - #401
Fix enrichment of nested Pydantic BaseModel - #415
Update release workflow template to support custom changelog file path - #421
Migrate to Pydantic v2 - #347
Refactor Helm nameOverride - #397
nameOverride
Mark component type as computed Pydantic field - #399
Support multiple inheritance for doc generation - #406
Update docs for substitution variable usage in v3 - #409
Update docs for v3 - #416
Update tests resources - #417
Summarize all breaking changes in diffs at the top of the migration guide - #419
Replace black with ruff - #365
Add toml formatter to dprint - #386
Add malva to dprint - #385
Update KPOps runner with the new options - #395
Fix KPOps action to get package from testPyPI - #396
KPOps 3.0 - #420
Fix early exit upon Helm exit code 1 - #376
Fix docs setup page list indentation - #377
Migrate deprecated mkdocs-material-extensions - #378
Exclude resources from docs search - #371
Fix environment variables documentation generation - #362
Introduce ruff - #363
Print details on connector name mismatch error - #369
Enable transparent OS environment lookups from internal environment - #368
Refactor component prefix & name - #326
Remove unnecessary condition during inflate - #328
--template
Add dprint as the markdown formatter - #337
Publish pre-release docs for PRs & main branch - #339
Align docs colours - #345
Add version dropdown to the documentation - #336
Break the documentation down into smaller subsection - #329
Remove camel case conversion of internal models - #308
Derive component type automatically from class name - #309
Refactor input/output types - #232
v2 - #321
Automatically support schema generation for custom components - #307
Add KPOps Runner GitHub Action to the documentation - #325
Remove :type and :rtype from docstrings - #324
:type
:rtype
Modularize and autogenerate examples for the documentation - #267
Update the variable documentation - #266
--set-file
Refactor Helm wrapper and add --set-file flag - #311
Set default for ToSection topics - #313
Annotate types for ToSection models mapping - #315
Order PipelineComponent fields - #290
Migrate requests to httpx - #302
Refactor CLI using dtyper - #306
Update Black - #294
Fix vulnerability in mkdocs-material - #295
Move breaking changes section upper in the change log config - #287
Update codeowners - #281
Reactivate Windows CI - #255
Downgrade Poetry version on the Windows CI pipeline - #286
Set ANSI theme for output of kpops generate - #289
kpops generate
Create workflow to lint CI - #260
Fix update docs when releasing - #261
Rename change log message for uncategorized issues - #262
helm repo update <repo-name>
add --namespace option to Helm template command - #237
Add missing type annotation for Pydantic attributes - #238
Fix helm version check - #242
Fix Helm Version Check - #244
Fix import from external module - #256
Remove enable option from helm diff - #235
Refactor variable substitution - #198
Add background to docs home page - #236
Update Poetry version in CI - #247
Add pip cache in KPOps runner action - #249
Check types using Pyright - #251
Remove MyPy - #252
Disable broken Windows CI temporarily - #253
Update release and publish workflows - #254
Fix release & publish workflows - #257
With a couple of easy commands in the shell, and a pipeline.yaml of under 30 lines, KPOps can not only deploy a Kafka pipeline1 to a Kubernetes cluster, but also reset, clean or destroy it!
deploy
- type: producer-app\n name: data-producer\n values:\n image: bakdata/kpops-demo-sentence-producer\n imageTag: \"2.0.0\"\n\n- type: streams-app\n name: word-count-app\n values:\n image: bakdata/kpops-demo-word-count-app\n imageTag: \"2.0.0\"\n replicaCount: 1\n to:\n topics:\n ${output_topic_name}:\n type: output\n configs:\n cleanup.policy: compact\n\n- type: kafka-sink-connector\n name: redis-sink-connector\n config:\n connector.class: com.github.jcustenborder.kafka.connect.redis.RedisSinkConnector\n redis.hosts: redis-headless:6379\n redis.database: 0\n tasks.max: 1\n key.converter: org.apache.kafka.connect.storage.StringConverter\n value.converter: org.apache.kafka.connect.storage.StringConverter\n
A Kafka pipeline can consist of consecutive streaming applications, producers, and connectors.\u00a0\u21a9
KPOps reads its global configuration that is unrelated to a pipeline's components from config.yaml.
config.yaml
Consider enabling KPOps' editor integration feature to enjoy the benefits of autocompletion and validation when configuring your pipeline.
To learn about any of the available settings, take a look at the example below.
# CONFIGURATION\n#\n# Base directory to the pipelines (default is current working directory)\npipeline_base_dir: .\n# The Kafka brokers address.\n# REQUIRED\nkafka_brokers: \"http://broker1:9092,http://broker2:9092\"\n# Configure the topic name variables you can use in the pipeline definition.\ntopic_name_config:\n # Configures the value for the variable ${output_topic_name}\n default_output_topic_name: ${pipeline.name}-${component.name}\n # Configures the value for the variable ${error_topic_name}\n default_error_topic_name: ${pipeline.name}-${component.name}-error\n# Configuration for Schema Registry.\nschema_registry:\n # Whether the Schema Registry handler should be initialized.\n enabled: false\n # Address of the Schema Registry.\n url: \"http://localhost:8081\"\n# Configuration for the Kafka REST Proxy.\nkafka_rest:\n # Address of the Kafka REST Proxy.\n url: \"http://localhost:8082\"\n# Configuration for Kafka Connect.\nkafka_connect:\n # Address of Kafka Connect.\n url: \"http://localhost:8083\"\n# Flag for `helm upgrade --install`.\n# Create the release namespace if not present.\ncreate_namespace: false\n# Global flags for Helm.\nhelm_config:\n # Name of kubeconfig context (`--kube-context`)\n context: name\n # Run Helm in Debug mode.\n debug: false\n # Kubernetes API version used for Capabilities.APIVersions\n api_version: null\n# Configure Helm Diff.\nhelm_diff_config:\n # Set of keys that should not be checked.\n ignore:\n - name\n - imageTag\n# Whether to retain clean up jobs in the cluster or uninstall the, after\n# completion.\nretain_clean_jobs: false\n
Environment-specific pipeline definitions
Similarly to defaults, it is possible to have an unlimited number of additional environment-specific pipeline definitions. The naming convention is the same: add a suffix of the form _{environment} to the filename.
_{environment}
KPOps has a very efficient way of dealing with repeating settings which manifests as defaults.yaml. This file provides the user with the power to set defaults for any and all components, thus omitting the need to repeat the same settings in pipeline.yaml.
See real-world examples for defaults.
defaults
An important mechanic of KPOps is that defaults set for a component apply to all components that inherit from it.
It is possible, although not recommended, to add settings that are specific to a component's subclass. An example would be configuring offset_topic under kafka-connector instead of kafka-source-connector.
offset_topic
kafka-connector
kafka-source-connector
KPOps allows using multiple default values. The defaults.yaml (or defaults_<env>.yaml) settings can be distributed across multiple files. These will be picked up by KPOps and merged into a single pipeline.yaml file. KPOps starts reading the defaults files in the directory where the pipeline path is defined and picks up every defaults file on its way up to the directory configured as pipeline_base_dir.
defaults_<env>.yaml
pipeline_base_dir
The deepest defaults.yaml file in the folder hierarchy (i.e., the closest one to the pipeline.yaml) overwrites the higher-level defaults' values.
It is important to note that defaults_{environment}.yaml overrides only the settings that are explicitly set to be different from the ones in the base defaults file.
defaults_{environment}.yaml
Imagine the following folder structure, where the pipeline_base_dir is configured to pipelines:
pipelines
\u2514\u2500 pipelines\n \u2514\u2500\u2500 distributed-defaults\n \u251c\u2500\u2500 defaults.yaml\n \u251c\u2500\u2500 defaults_dev.yaml\n \u2514\u2500\u2500 pipeline-deep\n \u251c\u2500\u2500 defaults.yaml\n \u2514\u2500\u2500 pipeline.yaml\n
KPOps picks up the defaults in the following order (high to low priority):
./pipelines/distributed-defaults/pipeline-deep/defaults.yaml
./pipelines/distributed-defaults/defaults_dev.yaml
./pipelines/distributed-defaults/defaults.yaml
The defaults codeblocks in this section contain the full set of settings that are specific to the component. If a setting already exists in a parent config, it will not be included in the child's.
# Base Kubernetes App\n#\n# Parent of: HelmApp\n# Child of: PipelineComponent\nkubernetes-app:\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n from: # Must not be null\n topics: # read from topic\n ${pipeline.name}-input-topic:\n type: input # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra\n ${pipeline.name}-input-pattern-topic:\n type: pattern # Implied to be an input pattern if `role` is undefined\n ${pipeline.name}-extra-pattern-topic:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n components: # read from specific component\n account-producer:\n type: input # Implied when role is NOT specified\n other-producer:\n role: some-role # Implies `type` to be extra\n component-as-input-pattern:\n type: pattern # Implied to be an input pattern if `role` is undefined\n component-as-extra-pattern:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n namespace: namespace # required\n values: # required\n image: exampleImage # Example\n debug: false # Example\n commandLine: {} # Example\n
# StreamsApp component that configures a streams bootstrap app.\n#\n# Child of: KafkaApp\n# More documentation on StreamsApp: https://github.com/bakdata/streams-bootstrap\nstreams-app:\n # No arbitrary keys are allowed under `app`here\n # Allowed configs:\n # https://github.com/bakdata/streams-bootstrap/tree/master/charts/streams-app\n values: # required\n # Streams Bootstrap streams section\n streams: # required, streams-app-specific\n brokers: ${config.kafka_brokers} # required\n schemaRegistryUrl: ${config.schema_registry.url}\n inputTopics:\n - topic1\n - topic2\n outputTopic: output-topic\n inputPattern: input-pattern\n extraInputTopics:\n input_role1:\n - input_topic1\n - input_topic2\n input_role2:\n - input_topic3\n - input_topic4\n extraInputPatterns:\n pattern_role1: input_pattern1\n extraOutputTopics:\n output_role1: output_topic1\n output_role2: output_topic2\n errorTopic: error-topic\n config:\n my.streams.config: my.value\n nameOverride: override-with-this-name # streams-app-specific\n autoscaling: # streams-app-specific\n consumerGroup: consumer-group # required\n lagThreshold: 0 # Average target value to trigger scaling actions.\n enabled: false # Whether to enable auto-scaling using KEDA.\n # This is the interval to check each trigger on.\n # https://keda.sh/docs/2.9/concepts/scaling-deployments/#pollinginterval\n pollingInterval: 30\n # The period to wait after the last trigger reported active before scaling\n # the resource back to 0. 
https://keda.sh/docs/2.9/concepts/scaling-deployments/#cooldownperiod\n cooldownPeriod: 300\n # The offset reset policy for the consumer if the consumer group is\n # not yet subscribed to a partition.\n offsetResetPolicy: earliest\n # This setting is passed to the HPA definition that KEDA will create for a\n # given resource and holds the maximum number of replicas of the target resource.\n # https://keda.sh/docs/2.9/concepts/scaling-deployments/#maxreplicacount\n maxReplicas: 1\n # Minimum number of replicas KEDA will scale the resource down to.\n # https://keda.sh/docs/2.7/concepts/scaling-deployments/#minreplicacount\n minReplicas: 0\n # If this property is set, KEDA will scale the resource down to this\n # number of replicas.\n # https://keda.sh/docs/2.9/concepts/scaling-deployments/#idlereplicacount\n idleReplicas: 0\n topics: # List of auto-generated Kafka Streams topics used by the streams app.\n - topic1\n - topic2\n
\n
# Kafka connector\n#\n# Parent of: KafkaSinkConnector, KafkaSourceConnector\n# Child of: PipelineComponent\nkafka-connector:\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n from: # Must not be null\n topics: # read from topic\n ${pipeline.name}-input-topic:\n type: input # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra\n ${pipeline.name}-input-pattern-topic:\n type: pattern # Implied to be an input pattern if `role` is undefined\n ${pipeline.name}-extra-pattern-topic:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n components: # read from specific component\n account-producer:\n type: input # Implied when role is NOT specified\n other-producer:\n role: some-role # Implies `type` to be extra\n component-as-input-pattern:\n type: pattern # Implied to be an input pattern if `role` is undefined\n component-as-extra-pattern:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n # Full documentation on connectors: https://kafka.apache.org/documentation/#connectconfigs\n config: # required\n tasks.max: 1\n # Overriding Kafka Connect Resetter 
Helm values. E.g. to override the\n # Image Tag etc.\n resetter_values:\n imageTag: \"1.2.3\"\n
# Kafka source connector\n#\n# Child of: KafkaConnector\nkafka-source-connector:\n # The source connector has no `from` section\n # from:\n # offset.storage.topic\n # https://kafka.apache.org/documentation/#connect_running\n offset_topic: offset_topic\n
# Kafka sink connector\n#\n# Child of: KafkaConnector\nkafka-sink-connector:\n # No settings differ from `kafka-connector`\n
KPOps supports three operation modes\u2014managed, manifest, and argo. These modes determine how resources are managed and allow users to tailor their deployment strategy.
managed
manifest
argo
You can configure the operation mode using one of the following methods:
Command-Line Option: Pass the --operation-mode <OPERATION> flag when running a CLI command. Refer to the CLI commands documentation for more details.
--operation-mode <OPERATION>
Environment Variable: Set the operation mode by defining the KPOPS_OPERATION_MODE environment variable.
KPOPS_OPERATION_MODE
Job
Deployment
ConfigMap
Service
KafkaTopic
sync-wave
sync-wave=0
sync-wave>0
PostDelete
Can be used to deploy any app in Kubernetes using Helm, for example, a REST service that serves Kafka data.
# Kubernetes app managed through Helm with an associated Helm chart\n- type: helm-app\n name: helm-app # required\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n from: # Must not be null\n topics: # read from topic\n ${pipeline.name}-input-topic:\n type: input # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra\n ${pipeline.name}-input-pattern-topic:\n type: pattern # Implied to be an input pattern if `role` is undefined\n ${pipeline.name}-extra-pattern-topic:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n components: # read from specific component\n account-producer:\n type: input # Implied when role is NOT specified\n other-producer:\n role: some-role # Implies `type` to be extra\n component-as-input-pattern:\n type: pattern # Implied to be an input pattern if `role` is undefined\n component-as-extra-pattern:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n namespace: namespace # required\n values: # required\n image: exampleImage # Example\n debug: false # Example\n commandLine: {} # Example\n # Helm repository configuration 
(optional)\n # If not set the helm repo add will not be called. Useful when using local Helm charts\n repo_config:\n repository_name: bakdata-streams-bootstrap # required\n url: https://bakdata.github.io/streams-bootstrap/ # required\n repo_auth_flags:\n username: user\n password: pass\n ca_file: /home/user/path/to/ca-file\n insecure_skip_tls_verify: false\n version: \"1.0.0\" # Helm chart version\n
Deploy using Helm.
Uninstall Helm release.
Do nothing.
KafkaConnector is a component that deploys Kafka Connectors. Since a connector must be either a sink or a source, it is not recommended to use KafkaConnector for deployment in pipeline.yaml. Instead, KafkaConnector should be used in defaults.yaml to set defaults for all connectors in the pipeline as they can share some common settings.
KafkaConnector
Subclass of KafkaConnector.
Lets other systems pull data from Apache Kafka.
# Kafka sink connector\n- type: kafka-sink-connector\n name: kafka-sink-connector # required\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n from: # Must not be null\n topics: # read from topic\n ${pipeline.name}-input-topic:\n type: input # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra\n ${pipeline.name}-input-pattern-topic:\n type: pattern # Implied to be an input pattern if `role` is undefined\n ${pipeline.name}-extra-pattern-topic:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n components: # read from specific component\n account-producer:\n type: input # Implied when role is NOT specified\n other-producer:\n role: some-role # Implies `type` to be extra\n component-as-input-pattern:\n type: pattern # Implied to be an input pattern if `role` is undefined\n component-as-extra-pattern:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n # Full documentation on connectors: https://kafka.apache.org/documentation/#connectconfigs\n config: # required\n tasks.max: 1\n # Overriding Kafka Connect Resetter Helm values. E.g. 
to override the\n # Image Tag etc.\n resetter_values:\n imageTag: \"1.2.3\"\n
The associated sink connector is removed from the Kafka Connect cluster.
Reset the consumer group offsets using bakdata's sink resetter.
Manages source connectors in your Kafka Connect cluster.
# Kafka source connector\n- type: kafka-source-connector # required\n name: kafka-source-connector # required\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n # The source connector has no `from` section\n # from:\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n # Full documentation on connectors: https://kafka.apache.org/documentation/#connectconfigs\n config: # required\n tasks.max: 1\n # Overriding Kafka Connect Resetter Helm values. E.g. to override the\n # Image Tag etc.\n resetter_values:\n imageTag: \"1.2.3\"\n # offset.storage.topic\n # https://kafka.apache.org/documentation/#connect_running\n offset_topic: offset_topic\n
Remove the source connector from the Kafka Connect cluster.
Delete state associated with the connector using bakdata's source resetter.
Can be used to create components for any Kubernetes app.
# Base Kubernetes App\n- type: kubernetes-app\n name: kubernetes-app # required\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n from: # Must not be null\n topics: # read from topic\n ${pipeline.name}-input-topic:\n type: input # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra\n ${pipeline.name}-input-pattern-topic:\n type: pattern # Implied to be an input pattern if `role` is undefined\n ${pipeline.name}-extra-pattern-topic:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n components: # read from specific component\n account-producer:\n type: input # Implied when role is NOT specified\n other-producer:\n role: some-role # Implies `type` to be extra\n component-as-input-pattern:\n type: pattern # Implied to be an input pattern if `role` is undefined\n component-as-extra-pattern:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n namespace: namespace # required\n values: # required\n image: exampleImage # Example\n debug: false # Example\n commandLine: {} # Example\n
This section explains the different components of KPOps, their usage and configuration in the pipeline definition pipeline.yaml.
flowchart BT\n KubernetesApp --> PipelineComponent\n HelmApp --> KubernetesApp\n StreamsBootstrap --> HelmApp\n StreamsApp --> StreamsBootstrap\n ProducerApp --> StreamsBootstrap\n KafkaConnector --> PipelineComponent\n KafkaSourceConnector --> KafkaConnector\n KafkaSinkConnector --> KafkaConnector\n\n click KubernetesApp \"./../kubernetes-app\"\n click HelmApp \"./../helm-app\"\n click StreamsBootstrap \"./../streams-bootstrap\"\n click StreamsApp \"./../streams-app\"\n click ProducerApp \"./../producer-app\"\n click KafkaConnector \"./../kafka-connector\"\n click KafkaSourceConnector \"./../kafka-source-connector\"\n click KafkaSinkConnector \"./../kafka-sink-connector\"
KPOps component hierarchy
Subclass of StreamsBootstrap.
Configures a streams-bootstrap Kafka producer app
# Holds configuration to use as values for the streams bootstrap producer-app Helm\n# chart.\n# More documentation on ProducerApp:\n# https://github.com/bakdata/streams-bootstrap\n- type: producer-app\n name: producer-app # required\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n # from: # While the producer-app does inherit from kafka-app, it does not need a\n # `from` section, hence it does not support it.\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n namespace: namespace # required\n # Allowed configs:\n # https://github.com/bakdata/streams-bootstrap/tree/master/charts/producer-app\n values: # required\n streams: # required, producer-app-specific\n brokers: ${config.kafka_brokers} # required\n schemaRegistryUrl: ${config.schema_registry.url}\n outputTopic: output_topic\n extraOutputTopics:\n output_role1: output_topic1\n output_role2: output_topic2\n nameOverride: override-with-this-name # kafka-app-specific\n # Helm repository configuration (optional)\n # If not set the helm repo add will not be called. 
Useful when using local Helm charts\n repo_config:\n repository_name: bakdata-streams-bootstrap # required\n url: https://bakdata.github.io/streams-bootstrap/ # required\n repo_auth_flags:\n username: user\n password: pass\n ca_file: /home/user/path/to/ca-file\n insecure_skip_tls_verify: false\n version: \"2.12.0\" # Helm chart version\n
In addition to KubernetesApp's deploy:
Do nothing, producers are stateless.
Subclass of StreamsBootstrap.
Configures a streams-bootstrap Kafka Streams app
# StreamsApp component that configures a streams bootstrap app.\n# More documentation on StreamsApp: https://github.com/bakdata/streams-bootstrap\n- type: streams-app # required\n name: streams-app # required\n # Pipeline prefix that will prefix every component name. If you wish to not\n # have any prefix you can specify an empty string.\n prefix: ${pipeline.name}-\n from: # Must not be null\n topics: # read from topic\n ${pipeline.name}-input-topic:\n type: input # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra\n ${pipeline.name}-input-pattern-topic:\n type: pattern # Implied to be an input pattern if `role` is undefined\n ${pipeline.name}-extra-pattern-topic:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n components: # read from specific component\n account-producer:\n type: input # Implied when role is NOT specified\n other-producer:\n role: some-role # Implies `type` to be extra\n component-as-input-pattern:\n type: pattern # Implied to be an input pattern if `role` is undefined\n component-as-extra-pattern:\n type: pattern # Implied to be an extra pattern if `role` is defined\n role: some-role\n # Topic(s) into which the component will write output\n to:\n topics:\n ${pipeline.name}-output-topic:\n type: output # Implied when role is NOT specified\n ${pipeline.name}-extra-topic:\n role: topic-role # Implies `type` to be extra; Will throw an error if `type` is defined\n ${pipeline.name}-error-topic:\n type: error\n # Currently KPOps supports Avro and JSON schemas.\n key_schema: key-schema # must implement SchemaProvider to use\n value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs: # https://kafka.apache.org/documentation/#topicconfigs\n cleanup.policy: compact\n models: # SchemaProvider is initiated with the values given here\n model: model\n namespace: namespace # required\n # No arbitrary keys are allowed under 
`app`here\n # Allowed configs:\n # https://github.com/bakdata/streams-bootstrap/tree/master/charts/streams-app\n values: # required\n # Streams Bootstrap streams section\n streams: # required, streams-app-specific\n brokers: ${config.kafka_brokers} # required\n schemaRegistryUrl: ${config.schema_registry.url}\n inputTopics:\n - topic1\n - topic2\n outputTopic: output-topic\n inputPattern: input-pattern\n extraInputTopics:\n input_role1:\n - input_topic1\n - input_topic2\n input_role2:\n - input_topic3\n - input_topic4\n extraInputPatterns:\n pattern_role1: input_pattern1\n extraOutputTopics:\n output_role1: output_topic1\n output_role2: output_topic2\n errorTopic: error-topic\n config:\n my.streams.config: my.value\n nameOverride: override-with-this-name # streams-app-specific\n autoscaling: # streams-app-specific\n consumerGroup: consumer-group # required\n lagThreshold: 0 # Average target value to trigger scaling actions.\n enabled: false # Whether to enable auto-scaling using KEDA.\n # This is the interval to check each trigger on.\n # https://keda.sh/docs/2.9/concepts/scaling-deployments/#pollinginterval\n pollingInterval: 30\n # The period to wait after the last trigger reported active before scaling\n # the resource back to 0. 
https://keda.sh/docs/2.9/concepts/scaling-deployments/#cooldownperiod\n cooldownPeriod: 300\n # The offset reset policy for the consumer if the consumer group is\n # not yet subscribed to a partition.\n offsetResetPolicy: earliest\n # This setting is passed to the HPA definition that KEDA will create for a\n # given resource and holds the maximum number of replicas of the target resource.\n # https://keda.sh/docs/2.9/concepts/scaling-deployments/#maxreplicacount\n maxReplicas: 1\n # Minimum number of replicas KEDA will scale the resource down to.\n # https://keda.sh/docs/2.7/concepts/scaling-deployments/#minreplicacount\n minReplicas: 0\n # If this property is set, KEDA will scale the resource down to this\n # number of replicas.\n # https://keda.sh/docs/2.9/concepts/scaling-deployments/#idlereplicacount\n idleReplicas: 0\n topics: # List of auto-generated Kafka Streams topics used by the streams app.\n - topic1\n - topic2\n # Helm repository configuration (optional)\n # If not set the helm repo add will not be called. Useful when using local Helm charts\n repo_config:\n repository_name: bakdata-streams-bootstrap # required\n url: https://bakdata.github.io/streams-bootstrap/ # required\n repo_auth_flags:\n username: user\n password: pass\n ca_file: /home/user/path/to/ca-file\n insecure_skip_tls_verify: false\n version: \"2.12.0\" # Helm chart version\n
Similar to reset with two additional steps:
Subclass of HelmApp.
Defines a streams-bootstrap component
Often used in defaults.yaml
Environment variables can be set by using the export command in Linux or the set command in Windows.
dotenv files
KPOps currently supports .env files only for variables related to the config. Full support for .env files is on the roadmap. One of the possible ways to use one and export the contents manually is with the following command: export $(xargs < .env). This works in bash, provided there are no spaces inside the values.
.env
export $(xargs < .env)
bash
These variables take precedence over the settings in config.yaml. Variables marked as required can instead be set in the global config.
helm upgrade --install
--kube-context
Capabilities.APIVersions
# Global config environment variables\n#\n# The default setup is shown. These variables take precedence over the\n# settings in `config.yaml`. Variables marked as required can instead\n# be set in the global config.\n#\n# pipeline_base_dir\n# Base directory to the pipelines (default is current working\n# directory)\nKPOPS_PIPELINE_BASE_DIR=.\n# kafka_brokers\n# The comma separated Kafka brokers address.\nKPOPS_KAFKA_BROKERS # No default value, required\n# topic_name_config.default_output_topic_name\n# Configures the value for the variable ${output_topic_name}\nKPOPS_TOPIC_NAME_CONFIG__DEFAULT_OUTPUT_TOPIC_NAME=${pipeline.name}-${component.name}\n# topic_name_config.default_error_topic_name\n# Configures the value for the variable ${error_topic_name}\nKPOPS_TOPIC_NAME_CONFIG__DEFAULT_ERROR_TOPIC_NAME=${pipeline.name}-${component.name}-error\n# schema_registry.enabled\n# Whether the Schema Registry handler should be initialized.\nKPOPS_SCHEMA_REGISTRY__ENABLED=False\n# schema_registry.url\n# Address of the Schema Registry.\nKPOPS_SCHEMA_REGISTRY__URL=http://localhost:8081/\n# schema_registry.timeout\n# Operation timeout in seconds.\nKPOPS_SCHEMA_REGISTRY__TIMEOUT=30\n# kafka_rest.url\n# Address of the Kafka REST Proxy.\nKPOPS_KAFKA_REST__URL=http://localhost:8082/\n# kafka_rest.timeout\n# Operation timeout in seconds.\nKPOPS_KAFKA_REST__TIMEOUT=30\n# kafka_connect.url\n# Address of Kafka Connect.\nKPOPS_KAFKA_CONNECT__URL=http://localhost:8083/\n# kafka_connect.timeout\n# Operation timeout in seconds.\nKPOPS_KAFKA_CONNECT__TIMEOUT=30\n# create_namespace\n# Flag for `helm upgrade --install`. 
Create the release namespace if\n# not present.\nKPOPS_CREATE_NAMESPACE=False\n# helm_config.context\n# Name of kubeconfig context (`--kube-context`)\nKPOPS_HELM_CONFIG__CONTEXT # No default value, not required\n# helm_config.debug\n# Run Helm in Debug mode\nKPOPS_HELM_CONFIG__DEBUG=False\n# helm_config.api_version\n# Kubernetes API version used for `Capabilities.APIVersions`\nKPOPS_HELM_CONFIG__API_VERSION # No default value, not required\n# helm_diff_config.ignore\n# Set of keys that should not be checked.\nKPOPS_HELM_DIFF_CONFIG__IGNORE # No default value, required\n# retain_clean_jobs\n# Whether to retain clean up jobs in the cluster or uninstall them\n# after completion.\nKPOPS_RETAIN_CLEAN_JOBS=False\n# strimzi_topic\n# Configuration for Strimzi Kafka Topics.\nKPOPS_STRIMZI_TOPIC # No default value, not required\n# operation_mode\n# The operation mode of KPOps (managed, manifest, argo).\nKPOPS_OPERATION_MODE=managed\n
These variables take precedence over the commands' flags. If a variable is set, the corresponding flag does not have to be specified in commands. Variables marked as required can instead be set as flags.
# CLI Environment variables\n#\n# The default setup is shown. These variables take precedence over the\n# commands' flags. If a variable is set, the corresponding flag does\n# not have to be specified in commands. Variables marked as required\n# can instead be set as flags.\n#\n# Path to the dir containing config.yaml files\nKPOPS_CONFIG_PATH=.\n# Path to dotenv file. Multiple files can be provided. The files will\n# be loaded in order, with each file overriding the previous one.\nKPOPS_DOTENV_PATH # No default value, not required\n# The environment you want to generate and deploy the pipeline to.\n# Suffix your environment files with this value (e.g.\n# defaults_development.yaml for environment=development).\nKPOPS_ENVIRONMENT # No default value, not required\n# How KPOps should operate.\nKPOPS_OPERATION_MODE=managed\n# Paths to dir containing 'pipeline.yaml' or files named\n# 'pipeline.yaml'.\nKPOPS_PIPELINE_PATHS # No default value, required\n# Comma separated list of steps to apply the command on\nKPOPS_PIPELINE_STEPS # No default value, not required\n
KPOps supports the usage of placeholders and environment variables in pipeline definition and defaults.
These variables can be used in a component's definition to refer to any of its attributes, including ones that the user has defined in the defaults.
All of them are prefixed with component. and take the following form: component.{attribute_name}. If the attribute itself contains attributes, they can be referred to like this: component.{attribute_name}.{subattribute_name}.
component.
component.{attribute_name}
component.{attribute_name}.{subattribute_name}
- type: scheduled-producer\n values:\n labels:\n app_type: \"${component.type}\"\n app_name: \"${component.name}\"\n app_schedule: \"${component.values.schedule}\"\n commandLine:\n FAKE_ARG: \"fake-arg-value\"\n schedule: \"30 3/8 * * *\"\n- type: converter\n values:\n commandLine:\n CONVERT_XML: true\n resources:\n limits:\n memory: 2G\n requests:\n memory: 2G\n- type: filter\n name: \"filter-app\"\n values:\n labels:\n app_type: \"${component.type}\"\n app_name: \"${component.name}\"\n app_resources_requests_memory: \"${component.values.resources.requests.memory}\"\n ${component.type}: \"${component.values.labels.app_name}-${component.values.labels.app_type}\"\n test_placeholder_in_placeholder: \"${component.values.labels.${component.type}}\"\n commandLine:\n TYPE: \"nothing\"\n resources:\n requests:\n memory: 3G\n replicaCount: 4\n autoscaling:\n minReplicas: 4\n maxReplicas: 4\n
These variables include all fields in the config and refer to the pipeline configuration that is independent of the components.
All such variables are prefixed with config. and are of the same form as the component-specific variables.
config.
Info
error_topic_name is an alias for config.topic_name_config.default_error_topic_name, and output_topic_name is an alias for config.topic_name_config.default_output_topic_name.
error_topic_name
config.topic_name_config.default_error_topic_name
output_topic_name
config.topic_name_config.default_output_topic_name
Environment variables such as $PATH can be used in the pipeline definition and defaults without any transformation following the form ${ENV_VAR_NAME}. This, of course, includes variables like the ones relevant to the KPOps cli that are exported by the user.
$PATH
${ENV_VAR_NAME}
See all KPOps environment variables
These are special variables that refer to the name and path of a pipeline.
${pipeline.name}: Concatenated path of the parent directory in which pipeline.yaml is defined. For instance, for ./data/pipelines/v1/pipeline.yaml the value of the variable would be data-pipelines-v1.
${pipeline.name}
./data/pipelines/v1/pipeline.yaml
data-pipelines-v1
${pipeline_name_<level>}: Similar to the previous variable, each <level> contains a part of the path to the pipeline.yaml file. In the previous example, ${pipeline_name_0} would be data, ${pipeline_name_1} would be pipelines, and ${pipeline_name_2} would be v1.
${pipeline_name_<level>}
<level>
${pipeline_name_0}
data
${pipeline_name_1}
${pipeline_name_2}
v1
ATM fraud is a demo pipeline for ATM fraud detection. The original by Confluent is written in KSQL and outlined in this blogpost. The one used in this example is re-built from scratch using bakdata's streams-bootstrap library.
streams-bootstrap
Completed all steps in the setup.
Deploy PostgreSQL using the Bitnami Helm chart: Add the helm repository:
helm repo add bitnami https://charts.bitnami.com/bitnami && \\\nhelm repo update\n
Install PostgreSQL with Helm:
helm upgrade --install -f ./postgresql.yaml \\\n--namespace kpops \\\npostgresql bitnami/postgresql\n
postgresql.yaml
auth:\n database: app_db\n enablePostgresUser: true\n password: AppPassword\n postgresPassword: StrongPassword\n username: app1\nprimary:\n persistence:\n enabled: false\n existingClaim: postgresql-data-claim\nvolumePermissions:\n enabled: true\n
Before we deploy the pipeline, we need to forward the ports of kafka-rest-proxy and kafka-connect. Run the following commands in two different terminals.
kafka-rest-proxy
kafka-connect
kubectl port-forward --namespace kpops service/k8kafka-cp-rest 8082:8082\n
kubectl port-forward --namespace kpops service/k8kafka-cp-kafka-connect 8083:8083\n
Clone the kpops-examples repository and cd into the directory.
cd
Install KPOps pip install -r requirements.txt.
pip install -r requirements.txt
Export environment variables in your terminal:
export DOCKER_REGISTRY=bakdata && \\\nexport NAMESPACE=kpops\n
Deploy the pipeline
kpops deploy atm-fraud/pipeline.yaml --execute\n
Note
You can use the --dry-run flag instead of the --execute flag and check the logs to see whether your pipeline will be deployed correctly.
--dry-run
--execute
You can use the Streams Explorer to see the deployed pipeline. To do so, port-forward the service in a separate terminal session using the command below:
kubectl port-forward -n kpops service/streams-explorer 8080:8080\n
After that, open http://localhost:8080 in your browser. You should be able to see the pipeline shown in the image below:
Attention
Kafka Connect needs some time to set up the connector. Moreover, Streams Explorer needs a while to scrape the information from Kafka Connect. Therefore, it might take a bit until you see the whole graph.
PostgreSQL can be uninstalled by running the following command:
helm --namespace kpops uninstall postgresql\n
Export environment variables in your terminal.
Remove the pipeline
kpops clean atm-fraud/pipeline.yaml --verbose --execute\n
You can use the --dry-run flag instead of the --execute flag and check the logs to see whether your pipeline will be destroyed correctly.
If you face any issues destroying this example, see Teardown for manual deletion.
deploy --dry-run
Word-count is a demo pipeline consisting of a producer producing words to Kafka, a Kafka streams app counting the number of times each word occurs, and finally a Redis database into which the words are exported.
Deploy Redis using the Bitnami Helm chart: Add the Helm repository:
Install Redis with Helm:
helm upgrade --install -f ./values-redis.yaml \\\n--namespace kpops \\\nredis bitnami/redis\n
values-redis.yaml
architecture: standalone\nauth:\n enabled: false\nmaster:\n count: 1\n configuration: \"databases 1\"\nimage:\n tag: 7.0.8\n
kpops deploy word-count/pipeline.yaml --execute\n
You can use the Streams Explorer to inspect the deployed pipeline. To do so, port-forward the service in a separate terminal session using the command below:
After that open http://localhost:8080 in your browser.
You should be able to see the pipeline shown in the image below:
Kafka Connect needs some time to set up the connector. Moreover, Streams Explorer needs a while to scrape the information from Kafka Connect. Therefore, it might take a bit until you see the whole graph.
Redis can be uninstalled by running the following command:
helm --namespace kpops uninstall redis\n
kpops clean word-count/pipeline.yaml --verbose --execute\n
In this part, you will set up KPOps. This includes:
If you don't have access to an existing Kubernetes cluster, this section will guide you through creating a local cluster. We recommend the lightweight Kubernetes distribution k3s for this. k3d is a wrapper around k3s in Docker that lets you get started fast.
You can install k3d with its installation script:
wget -q -O - https://raw.githubusercontent.com/k3d-io/k3d/v5.4.6/install.sh | bash\n
For other ways of installing k3d, you can have a look at their installation guide.
The Kafka deployment needs a modified Docker image. In that case the image is built and pushed to a Docker registry that holds it. If you do not have access to an existing Docker registry, you can use k3d's Docker registry:
k3d registry create kpops-registry.localhost --port 12345\n
Now you can create a new cluster called kpops that uses the previously created Docker registry:
kpops
k3d cluster create kpops --k3s-arg \"--no-deploy=traefik@server:*\" --registry-use k3d-kpops-registry.localhost:12345\n
Creating a new k3d cluster automatically configures kubectl to connect to the local cluster by modifying your ~/.kube/config. In case you manually set the KUBECONFIG variable or don't want k3d to modify your config, k3d offers many other options.
kubectl
~/.kube/config
KUBECONFIG
You can check the cluster status with kubectl get pods -n kube-system. If all returned elements have a STATUS of Running or Completed, then the cluster is up and running.
kubectl get pods -n kube-system
STATUS
Running
Completed
Kafka is an open-source data streaming platform. More information about Kafka can be found in the documentation. To deploy Kafka, this guide uses Confluent's Helm chart.
To allow connectivity to other systems Kafka Connect needs to be extended with drivers. You can install a JDBC driver for Kafka Connect by creating a new Docker image:
Create a Dockerfile with the following content:
Dockerfile
FROM confluentinc/cp-kafka-connect:7.1.3\n\nRUN confluent-hub install --no-prompt confluentinc/kafka-connect-jdbc:10.6.0\n
Build and push the modified image to your private Docker registry:
docker build . --tag localhost:12345/kafka-connect-jdbc:7.1.3 && \\\ndocker push localhost:12345/kafka-connect-jdbc:7.1.3\n
Detailed instructions on building, tagging and pushing a docker image can be found in Docker docs.
Add Confluent's Helm chart repository and update the index:
helm repo add confluentinc https://confluentinc.github.io/cp-helm-charts/ && \nhelm repo update\n
Install Kafka, Zookeeper, Confluent's Schema Registry, Kafka Rest Proxy, and Kafka Connect. A single Helm chart installs all five components. Below you can find an example for the --values ./kafka.yaml file configuring the deployment accordingly. Deploy the services:
--values ./kafka.yaml
helm upgrade \\\n --install \\\n --version 0.6.1 \\\n --values ./kafka.yaml \\\n --namespace kpops \\\n --create-namespace \\\n --wait \\\n k8kafka confluentinc/cp-helm-charts\n
kafka.yaml
An example value configuration for Confluent's Helm chart. This configuration deploys a single Kafka Broker, a Schema Registry, Zookeeper, Kafka Rest Proxy, and Kafka Connect with minimal resources.
cp-zookeeper:\n enabled: true\n servers: 1\n imageTag: 7.1.3\n heapOptions: \"-Xms124M -Xmx124M\"\n overrideGroupId: k8kafka\n fullnameOverride: \"k8kafka-cp-zookeeper\"\n resources:\n requests:\n cpu: 50m\n memory: 0.2G\n limits:\n cpu: 250m\n memory: 0.2G\n prometheus:\n jmx:\n enabled: false\n\ncp-kafka:\n enabled: true\n brokers: 1\n imageTag: 7.1.3\n podManagementPolicy: Parallel\n configurationOverrides:\n \"auto.create.topics.enable\": false\n \"offsets.topic.replication.factor\": 1\n \"transaction.state.log.replication.factor\": 1\n \"transaction.state.log.min.isr\": 1\n \"confluent.metrics.reporter.topic.replicas\": 1\n resources:\n requests:\n cpu: 50m\n memory: 0.5G\n limits:\n cpu: 250m\n memory: 0.5G\n prometheus:\n jmx:\n enabled: false\n persistence:\n enabled: false\n\ncp-schema-registry:\n enabled: true\n imageTag: 7.1.3\n fullnameOverride: \"k8kafka-cp-schema-registry\"\n overrideGroupId: k8kafka\n kafka:\n bootstrapServers: \"PLAINTEXT://k8kafka-cp-kafka-headless:9092\"\n resources:\n requests:\n cpu: 50m\n memory: 0.25G\n limits:\n cpu: 250m\n memory: 0.25G\n prometheus:\n jmx:\n enabled: false\n\ncp-kafka-connect:\n enabled: true\n replicaCount: 1\n image: k3d-kpops-registry.localhost:12345/kafka-connect-jdbc\n imageTag: 7.1.3\n fullnameOverride: \"k8kafka-cp-kafka-connect\"\n overrideGroupId: k8kafka\n kafka:\n bootstrapServers: \"PLAINTEXT://k8kafka-cp-kafka-headless:9092\"\n heapOptions: \"-Xms256M -Xmx256M\"\n resources:\n requests:\n cpu: 500m\n memory: 0.25G\n limits:\n cpu: 500m\n memory: 0.25G\n configurationOverrides:\n \"consumer.max.poll.records\": \"10\"\n \"consumer.max.poll.interval.ms\": \"900000\"\n \"config.storage.replication.factor\": \"1\"\n \"offset.storage.replication.factor\": \"1\"\n \"status.storage.replication.factor\": \"1\"\n cp-schema-registry:\n url: http://k8kafka-cp-schema-registry:8081\n prometheus:\n jmx:\n enabled: false\n\ncp-kafka-rest:\n enabled: true\n imageTag: 7.1.3\n fullnameOverride: 
\"k8kafka-cp-rest\"\n heapOptions: \"-Xms256M -Xmx256M\"\n resources:\n requests:\n cpu: 50m\n memory: 0.25G\n limits:\n cpu: 250m\n memory: 0.5G\n prometheus:\n jmx:\n enabled: false\n\ncp-ksql-server:\n enabled: false\ncp-control-center:\n enabled: false\n
Streams Explorer allows examining Apache Kafka data pipelines in a Kubernetes cluster including the inspection of schemas and monitoring of metrics. First, add the Helm repository:
helm repo add streams-explorer https://bakdata.github.io/streams-explorer && \\\nhelm repo update\n
Below you can find an example for the --values ./streams-explorer.yaml file configuring the deployment accordingly. Now, deploy the service:
--values ./streams-explorer.yaml
helm upgrade \\\n --install \\\n --version 0.2.3 \\\n --values ./streams-explorer.yaml \\\n --namespace kpops \\\n streams-explorer streams-explorer/streams-explorer\n
streams-explorer.yaml
An example value configuration for the Streams Explorer Helm chart.
imageTag: \"v2.1.2\"\nconfig:\n K8S__deployment__cluster: true\n SCHEMAREGISTRY__url: http://k8kafka-cp-schema-registry.kpops.svc.cluster.local:8081\n KAFKACONNECT__url: http://k8kafka-cp-kafka-connect.kpops.svc.cluster.local:8083\nresources:\n requests:\n cpu: 200m\n memory: 300Mi\n limits:\n cpu: 200m\n memory: 300Mi\n
Now we will check if all the pods are running in our namespace. You can list all pods in the namespace with this command:
kubectl --namespace kpops get pods\n
Then you should see the following output in your terminal:
NAME READY STATUS RESTARTS AGE\nk8kafka-cp-kafka-connect-8fc7d544f-8pjnt 1/1 Running 0 15m\nk8kafka-cp-zookeeper-0 1/1 Running 0 15m\nk8kafka-cp-kafka-0 1/1 Running 0 15m\nk8kafka-cp-schema-registry-588f8c65db-jdwbq 1/1 Running 0 15m\nk8kafka-cp-rest-6bbfd7b645-nwkf8 1/1 Running 0 15m\nstreams-explorer-54db878c67-s8wbz 1/1 Running 0 15m\n
Pay attention to the STATUS column. The pods should have a status of Running.
KPOps comes as a PyPI package. You can install it with pip:
pip
pip install kpops\n
The kpops CLI can be used to destroy a pipeline that was previously deployed with KPOps. In case that doesn't work, the pipeline can always be taken down manually with helm (see section Infrastructure).
helm
Export environment variables.
Navigate to the examples folder. Replace the <name-of-the-example-directory> with the example you want to tear down, for example atm-fraud-detection.
<name-of-the-example-directory>
atm-fraud-detection
# Uncomment 1 line to either destroy, reset or clean.\n\n# poetry run kpops destroy <name-of-the-example-directory>/pipeline.yaml \\\n# poetry run kpops reset <name-of-the-example-directory>/pipeline.yaml \\\n# poetry run kpops clean <name-of-the-example-directory>/pipeline.yaml \\\n--config <name-of-the-example-directory>/config.yaml \\\n--execute\n
Delete namespace:
kubectl delete namespace kpops\n
In case kpops destroy is not working one can uninstall the pipeline services one by one. This is equivalent to running kpops destroy. In case a clean uninstall (like the one kpops clean does) is needed, one needs to also delete the topics and schemas created by deployment of the pipeline.
kpops destroy
kpops clean
Delete local cluster:
k3d cluster delete kpops\n
Delete local registry:
k3d registry delete k3d-kpops-registry.localhost\n
KPOps automatically infers the component type from the class name. Therefore, the type and schema_type attributes can be removed from your custom components. By convention, the type is the lowercase, kebab-cased name of the class.
schema_type
class MyCoolStreamApp(StreamsApp):\n- type = \"my-cool-stream-app\"\n+ ...\n
Because of this new convention producer has been renamed to producer-app. This must be addressed in your pipeline.yaml and defaults.yaml.
producer
producer-app
- producer:\n+ producer-app:\n app:\n streams:\n outputTopic: output_topic\n extraOutputTopics:\n output_role1: output_topic1\n output_role2: output_topic2\n
In the to section these have changed:
to
output
role
extra
error
to:\n topics:\n ${pipeline_name}-topic-1:\n- type: extra\n role: \"role-1\"\n ...\n ${pipeline_name}-topic-2:\n- type: output\n ...\n ${pipeline_name}-topic-3:\n type: error\n ...\n
In the from section these have changed:
from
input
input-pattern
pattern
extra-pattern
from:\n topics:\n ${pipeline_name}-input-topic:\n- type: input\n ...\n ${pipeline_name}-extra-topic:\n- type: extra\n role: topic-role\n ...\n ${pipeline_name}-input-pattern-topic:\n- type: input-pattern\n+ type: pattern\n ...\n ${pipeline_name}-extra-pattern-topic:\n- type: extra-pattern\n+ type: pattern\n role: some-role\n ...\n
All the internal KPOps models are now snake_case, and only Helm/Kubernetes values require camel casing. You can find an example of a pipeline.yaml in the following. Notice that the app section here remains untouched.
app
...\ntype: streams-app\n name: streams-app\n namespace: namespace\n app:\n streams:\n brokers: ${brokers}\n schemaRegistryUrl: ${schema_registry_url}\n autoscaling:\n consumerGroup: consumer-group\n lagThreshold: 0\n enabled: false\n pollingInterval: 30\n\n to:\n topics:\n ${pipeline_name}-output-topic:\n type: error\n- keySchema: key-schema\n+ key_schema: key-schema\n- valueSchema: value-schema\n+ value_schema: value-schema\n partitions_count: 1\n replication_factor: 1\n configs:\n cleanup.policy: compact\n models:\n model: model\n prefix: ${pipeline_name}-\n- repoConfig:\n+ repo_config:\n- repositoryName: bakdata-streams-bootstrap\n+ repository_name: bakdata-streams-bootstrap\n url: https://bakdata.github.io/streams-bootstrap/\n- repoAuthFlags:\n+ repo_auth_flags:\n username: user\n password: pass\n ca_file: /home/user/path/to/ca-file\n insecure_skip_tls_verify: false\n version: \"1.0.4\"\n...\n
If you are using the KubernetesApp class to define your own Kubernetes resource to deploy, the abstract function get_helm_chart that returns the chart for deploying the app using Helm is now a Python property and renamed to helm_chart.
KubernetesApp
get_helm_chart
helm_chart
class MyCoolApp(KubernetesApp):\n\n+ @property\n @override\n- def get_helm_chart(self) -> str:\n+ def helm_chart(self) -> str:\n return \"./charts/charts-folder\"\n
Since you can pass a comma-separated string of broker addresses, the broker field in KPOps is now plural. The pluralization has affected multiple areas:
environment: development\n- broker: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n+ brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n kafka_connect_host: \"http://localhost:8083\"\n kafka_rest_host: \"http://localhost:8082\"\n schema_registry_url: \"http://localhost:8081\"\n
The variable is now called brokers.
brokers
...\n app:\n streams:\n- brokers: ${broker}\n+ brokers: ${brokers}\n schemaRegistryUrl: ${schema_registry_url}\n nameOverride: override-with-this-name\n imageTag: \"1.0.0\"\n...\n
If you previously set the environment variable KPOPS_KAFKA_BROKER, you now need to replace it with KPOPS_KAFKA_BROKERS.
KPOPS_KAFKA_BROKER
KPOPS_KAFKA_BROKERS
Jump to the summary
KPOps handles long (more than 53 characters) Helm release names differently. Helm will not find your (long) old release names anymore. Therefore, it is recommended that you destroy your pipeline once with KPOps v2 to remove old Helm release names. After a clean destroy, re-deploy your pipeline with KPOps v3.
For example, suppose you have a component with the Helm release name example-component-name-too-long-fake-fakefakefakefakefake. The new release name will shorten the original name to 53 characters and then replace the last 6 characters of the trimmed name with the first 5 characters of the result of SHA-1(helm_release_name).
example-component-name-too-long-fake-fakefakefakefakefake
example-component-name-too-long-fake-fakefakef-0a7fc ----> 53 chars\n---------------------------------------------- -----\n ^Shortened helm_release_name ^first 5 characters of SHA1(helm_release_name)\n
All Helm-specific parts of the built-in KubernetesApp have been extracted to a new child component that is more appropriately named HelmApp. It has to be renamed in your existing pipeline definitions and custom components module.
HelmApp
-- type: kubernetes-app\n+- type: helm-app\n name: foo\n
- from kpops.components import KubernetesApp\n+ from kpops.components import HelmApp\n\n\n- class CustomHelmApp(KubernetesApp):\n+ class CustomHelmApp(HelmApp):\n ...\n
Previously the default KafkaApp component configured the streams-bootstrap Helm Charts. Now, this component is no longer tied to Helm (or Kubernetes). Instead, there is a new StreamsBootstrap component that configures the Helm Chart repository for the components that use it, e.g. StreamsApp and ProducerApp. If you are using non-default values for the Helm Chart repository or version, it has to be updated as shown below.
KafkaApp
StreamsBootstrap
StreamsApp
ProducerApp
kafka-app:\n app:\n streams: ...\n\n+ streams-bootstrap:\n repo_config: ...\n version: ...\n
Internally, the Kafka Connector resetter is now its own standard HelmApp, removing a lot of the shared code. It is configured using the resetter_namespace (formerly namespace) and resetter_values attributes.
resetter_namespace
namespace
resetter_values
kafka-connector:\n- namespace: my-namespace\n+ resetter_namespace: my-namespace\n
The breaking changes target the config.yaml file:
The schema_registry_url is replaced with schema_registry.url (default http://localhost:8081) and schema_registry.enabled (default false).
schema_registry_url
schema_registry.url
http://localhost:8081
schema_registry.enabled
false
kafka_rest_host is renamed to kafka_rest.url (default http://localhost:8082).
kafka_rest_host
kafka_rest.url
http://localhost:8082
kafka_connect_host is replaced with kafka_connect.url (default http://localhost:8083).
kafka_connect_host
kafka_connect.url
http://localhost:8083
brokers is renamed to kafka_brokers.
kafka_brokers
The environment variable names of these config fields changed respectively. Please refer to the environment variables documentation page to see the newest changes.
environment: development\n- brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n- kafka_rest_host: \"http://my-custom-rest.url:8082\"\n- kafka_connect_host: \"http://my-custom-connect.url:8083\"\n- schema_registry_url: \"http://my-custom-sr.url:8081\"\n+ kafka_brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n+ kafka_rest:\n+ url: \"http://my-custom-rest.url:8082\"\n+ kafka_connect:\n+ url: \"http://my-custom-connect.url:8083\"\n+ schema_registry:\n+ enabled: true\n+ url: \"http://my-custom-sr.url:8081\"\n
The variable is now called kafka_brokers.
...\n app:\n streams:\n- brokers: ${brokers}\n+ brokers: ${kafka_brokers}\n schemaRegistryUrl: ${schema_registry_url}\n nameOverride: override-with-this-name\n imageTag: \"1.0.0\"\n...\n
Warning
The previous CLI parameters have been removed.
The options for a custom components_module and pipeline_base_dir are now global settings, defined in config.yaml.
components_module
kafka_brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n environment: development\n+ components_module: components\n+ pipeline_base_dir: pipelines\n
The location of the GitHub action has changed, and it's now available directly as bakdata/kpops.
bakdata/kpops
You'll need to change it in your GitHub CI workflows.
steps:\n - name: kpops deploy\n- uses: bakdata/kpops/actions/kpops-runner@main\n+ uses: bakdata/kpops@main\n with:\n command: deploy --execute\n # ...\n
Specifying the environment is no longer mandatory. If not defined, only the global files will be used.
environment is no longer specified in config.yaml. Instead, it can be either set via the CLI flag --environment or with the environment variable KPOPS_ENVIRONMENT.
--environment
KPOPS_ENVIRONMENT
The --config flag in the CLI now points to the directory that contains config*.yaml files. The files to be used are resolved based on the provided (or not) environment.
--config
config*.yaml
- environment: development\n kafka_brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n schema_registry:\n enabled: true\n url: \"http://my-custom-sr.url:8081\"\n
The delimiter in the substitution variables has changed to a dot (.).
steps:\n - type: scheduled-producer\n app:\n labels:\n- app_type: \"${component_type}\"\n- app_name: \"${component_name}\"\n- app_schedule: \"${component_app_schedule}\"\n+ app_type: \"${component.type}\"\n+ app_name: \"${component.name}\"\n+ app_schedule: \"${component.app.schedule}\"\n
topic_name_config:\n- default_error_topic_name: \"${pipeline_name}-${component_name}-dead-letter-topic\"\n- default_output_topic_name: \"${pipeline_name}-${component_name}-topic\"\n+ default_error_topic_name: \"${pipeline_name}-${component.name}-dead-letter-topic\"\n+ default_output_topic_name: \"${pipeline_name}-${component.name}-topic\"\n
The template method of every pipeline component has been renamed to manifest as it is no longer strictly tied to Helm template. Instead, it can be used to render the final resources of a component, such as Kubernetes manifests.
template
There is also a new kpops manifest command replacing the existing kpops generate --template flag.
kpops manifest
kpops generate --template
If you're using this functionality in your custom components, it needs to be updated.
from kpops.components.base_components.models.resource import Resource\n\n @override\n- def template(self) -> None:\n+ def manifest(self) -> Resource:\n \"\"\"Render final component resources, e.g. Kubernetes manifests.\"\"\"\n return [] # list of manifests\n
The global configuration variables are now namespaced under the config key, such as ${config.kafka_brokers}, ${config.schema_registry.url}. Same with pipeline variables, e.g. ${pipeline_name} \u2192 ${pipeline.name}. This would make it more uniform with the existing ${component.<key>} variables.
${config.kafka_brokers}
${config.schema_registry.url}
${pipeline_name} \u2192 ${pipeline.name}
${component.<key>}
name: kafka-app\n- prefix: ${pipeline_name}-\n+ prefix: ${pipeline.name}-\n app:\n streams:\n- brokers: ${kafka_brokers}\n- schemaRegistryUrl: ${schema_registry.url}\n+ brokers: ${config.kafka_brokers}\n+ schemaRegistryUrl: ${config.schema_registry.url}\n
Helm will not find your (long) old release names anymore.
- - type: kubernetes-app\n+ - type: helm-app\n ...\n - type: kafka-app\n app:\n- brokers: ${brokers}\n+ brokers: ${config.kafka_brokers}\n labels:\n- app_schedule: \"${component_app_schedule}\"\n+ app_schedule: \"${component.app.schedule}\"\n ...\n - type: kafka-connector\n- namespace: my-namespace\n+ resetter_namespace: my-namespace\n ...\n
- environment: development\n\n+ components_module: components\n\n+ pipeline_base_dir: pipelines\n\n- brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n+ kafka_brokers: \"http://k8kafka-cp-kafka-headless.kpops.svc.cluster.local:9092\"\n\n- kafka_rest_host: \"http://my-custom-rest.url:8082\"\n+ kafka_rest:\n+ url: \"http://my-custom-rest.url:8082\"\n\n- kafka_connect_host: \"http://my-custom-connect.url:8083\"\n+ kafka_connect:\n+ url: \"http://my-custom-connect.url:8083\"\n\n- schema_registry_url: \"http://my-custom-sr.url:8081\"\n+ schema_registry:\n+ enabled: true\n+ url: \"http://my-custom-sr.url:8081\"\n\n topic_name_config:\n- default_error_topic_name: \"${pipeline_name}-${component_name}-dead-letter-topic\"\n+ default_error_topic_name: \"${pipeline.name}-${component.name}-dead-letter-topic\"\n ...\n
- from kpops.components import KubernetesApp\n+ from kpops.components import HelmApp\n+ from kpops.components.base_components.models.resource import Resource\n\n- class CustomHelmApp(KubernetesApp):\n+ class CustomHelmApp(HelmApp):\n\n @override\n- def template(self) -> None:\n+ def manifest(self) -> Resource:\n \"\"\"Render final component resources, e.g. Kubernetes manifests.\"\"\"\n return [] # list of manifests\n ...\n
steps:\n - name: ...\n- uses: bakdata/kpops/actions/kpops-runner@main\n+ uses: bakdata/kpops@main\n ...\n
The --defaults flag is removed
--defaults
It is now possible to use multiple default values. The defaults.yaml (or defaults_<env>.yaml) files can be distributed across multiple files. These will be picked up by KPOps and get merged into a single pipeline.yaml file. KPOps starts reading the defaults files from where the pipeline path is defined and picks up every defaults file on its way to where the pipeline_base_dir is defined.
For example, imagine the following folder structure:
The pipeline_base_dir is configured to pipelines. Now if we generate this pipeline with the following command:
kpops generate \\\n --environment dev\n ./pipelines/distributed-defaults/pipeline-deep/pipeline.yaml\n
The defaults would be picked in the following order (high to low priority):
The global timeout setting has been removed. Instead, an individual timeout can be set for each external service. The default is 30 seconds.
timeout
- timeout: 300\n\n kafka_rest:\n url: \"http://my-custom-rest.url:8082\"\n+ timeout: 30\n kafka_connect:\n url: \"http://my-custom-connect.url:8083\"\n+ timeout: 30\n schema_registry:\n enabled: true\n url: \"http://my-custom-sr.url:8081\"\n+ timeout: 30\n
KPOps can now deploy multiple pipelines in a single command. It is possible to pass one or many pipeline.yaml files or pass a directory with many pipeline.yaml files within it.
The environment variable KPOPS_PIPELINE_PATH is changed to KPOPS_PIPELINE_PATHS.
KPOPS_PIPELINE_PATH
KPOPS_PIPELINE_PATHS
Read more:
KPOps Python API is now stable and separated from the CLI! \ud83c\udf89
KPOps is now distributed as a Python namespace package (as defined by PEP 420). This allows us to standardize the namespace kpops.components for both builtin and custom pipeline components.
kpops.components
As a result of the restructure, some imports need to be adjusted:
KPOps Python API
- import kpops\n+ import kpops.api as kpops\n
builtin KPOps components
- from kpops.components import (\n- HelmApp,\n- KafkaApp,\n- KafkaConnector,\n- KafkaSinkConnector,\n- KafkaSourceConnector,\n- KubernetesApp,\n- StreamsBootstrap,\n- ProducerApp,\n- StreamsApp,\n- PipelineComponent,\n- StreamsApp,\n- ProducerApp,\n- )\n+ from kpops.components.base_components import (\n+ HelmApp,\n+ KafkaApp,\n+ KafkaConnector,\n+ KafkaSinkConnector,\n+ KafkaSourceConnector,\n+ KubernetesApp,\n+ PipelineComponent,\n+ )\n+ from kpops.components.streams_bootstrap import (\n+ StreamsBootstrap,\n+ StreamsApp,\n+ ProducerApp,\n+ )\n
- components_module: components\n
- components/__init__.py\n+ kpops/components/custom/__init__.py\n
The app attribute of the builtin KPOps components has been renamed to better differentiate them. Both your pipeline.yaml and defaults.yaml files have to be updated, e.g.:
kubernetes-app:\n- app: {}\n+ values: {}\n\n helm-app:\n- app: {}\n+ values: {}\n\n kafka-app:\n- app: {}\n+ values: {}\n\n streams-app:\n- app: {}\n+ values: {}\n\n producer-app:\n- app: {}\n+ values: {}\n\n kafka-connector:\n- app: {}\n+ config: {}\n\n kafka-source-connector:\n- app: {}\n+ config: {}\n\n kafka-sink-connector:\n- app: {}\n+ config: {}\n
Before v7, the KPOps CLI executed destroy before running reset/clean to ensure the component was destroyed.
reset/clean
This logic has changed. The destroy method is now called within the PipelineComponent's reset/clean.
PipelineComponent
When migrating to v7, you should check your custom components and see if they override the reset/clean methods. If so, you need to call the super method reset/clean to trigger the destroy inside the parent class. Alternatively, if you are implementing the PipelineComponent class, you need to call the destroy method at the beginning of the method.
For example, when creating a custom StreamsApp or ProducerApp (or any other custom component), you must call the supermethod reset/clean to execute the destroy in the parent class. Otherwise, the logic of destroy will not be executed!
class MyStreamsApp(StreamsApp):\n\n @override\n async def clean(self, dry_run: bool) -> None:\n+ await super().clean(dry_run)\n # Some custom clean logic\n # ...\n\n\nclass MyCustomComponent(PipelineComponent):\n \n @override\n async def destroy(self, dry_run: bool) -> None:\n # Some custom destroy logic\n # ...\n\n @override\n async def clean(self, dry_run: bool) -> None:\n+ await super().clean(dry_run)\n # Some custom clean logic\n # ...\n
From now on KPOps supports streams-bootstrap v3 as its default component. The previous streams-bootstrap version (below 3.x.x) is marked as deprecated and will be removed in a future version of KPOps. If you don't want to migrate your producer or streams app to v3, you should suffix your components with -v2. Here is an example of a pipeline.yaml file.
-v2
- - type: producer-app\n+ - type: producer-app-v2\n\n- - type: streams-app\n+ - type: streams-app-v2\n\n# rest of your pipeline\n
- class MyStreamsApp(StreamsApp):\n+ class MyStreamsApp(StreamsAppV2):\n ...\n
The streams-bootstrap, streams-app, and producer-app now all take the Helm values of streams-bootstrap version 3. You can find these values under the Helm charts documentation or by referring to the Base model definitions.
streams-bootstrap
streams-app
The keyword role is renamed to label. You need to replace it in your pipeline.yaml, defaults.yaml, and the Python components definition files. Here is a simple example of the defaults.yaml.
label
streams-app-v2:\n values:\n streams:\n brokers: localhost:9092\n from:\n topics:\n my-labeled-input-topic:\n- role: my-input-topic-label\n+ label: my-input-topic-label\n my-labeled-input-pattern:\n type: pattern\n- role: my-input-topic-labeled-pattern\n+ label: my-input-topic-labeled-pattern\n\n to:\n topics:\n my-labeled-topic-output:\n- role: my-output-topic-label\n+ label: my-output-topic-label\n\n# rest of your pipeline\n
The KafkaApp component now only contains the deployment logic of the streams-bootstrap applications (streams-app, producer-app). It should be used in neither the defaults.yaml nor the pipeline.yaml. If you are using it, it should be replaced by streams-bootstrap.
- kafka-app:\n+ streams-bootstrap-v2:\n values:\n streams:\n brokers: 127.0.0.1:9092\n schemaRegistryUrl: 127.0.0.1:8081\n
The kpops manifest command and kpops.manifest() API have been removed.
kpops.manifest()
Resource manifesting is now integrated into the operation commands (deploy, destroy, reset, clean) through the new operation mode feature.
To manifest resources, you can:
--operation-mode manifest
KPOps now supports generating valid Kubernetes KafkaTopic resources compatible with Strimzi. When using manifest or argo as the operation_mode, you must specify the Strimzi cluster label to ensure the topics are recognized by the deployed Strimzi Topic Operator.
operation_mode: manifest\n\n+ strimzi_topic:\n+ label:\n+ strimzi.io/cluster: my-cluster\n\n# rest of your config\n
Refer to the Strimzi documentation on deploying a standalone topic operator for more details.
KPOps V9 no longer supports Python 3.10. Ensure your environment is running Python 3.11 to 3.12.
Upgrade your Python version to a supported version (3.11 or 3.12). Update your virtual environments and CI pipelines to reflect this change.
Usage:
$ kpops [OPTIONS] COMMAND [ARGS]...\n
Options:
-V, --version
--install-completion
--show-completion
--help
Commands:
generate
init
schema
Clean pipeline steps
$ kpops clean [OPTIONS] PIPELINE_PATHS...\n
Arguments:
PIPELINE_PATHS...
--dotenv FILE
--config DIRECTORY
--steps TEXT
--filter-type [include|exclude]
--environment TEXT
--dry-run / --execute
--verbose / --no-verbose
--parallel / --no-parallel
--operation-mode [argo|manifest|managed]
kpops deploy
Deploy pipeline steps
$ kpops deploy [OPTIONS] PIPELINE_PATHS...\n
Destroy pipeline steps
$ kpops destroy [OPTIONS] PIPELINE_PATHS...\n
Enrich pipeline steps with defaults. The enriched pipeline is used for all KPOps operations (deploy, destroy, ...).
$ kpops generate [OPTIONS] PIPELINE_PATHS...\n
kpops init
Initialize a new KPOps project.
$ kpops init [OPTIONS] PATH\n
PATH
--config-include-optional / --no-config-include-optional
kpops reset
Reset pipeline steps
$ kpops reset [OPTIONS] PIPELINE_PATHS...\n
kpops schema
Generate JSON schema.
The schemas can be used to enable support for KPOps files in a text editor.
$ kpops schema [OPTIONS] SCOPE:{pipeline|defaults|config}\n
SCOPE:{pipeline|defaults|config}
- pipeline: Schema of PipelineComponents for KPOps pipeline.yaml\n\n- defaults: Schema of PipelineComponents for KPOps defaults.yaml\n\n- config: Schema for KPOps config.yaml [required]\n
We are working towards first-class editor support by providing plugins that work out of the box.
settings.json
{\n \"yaml.schemas\": {\n \"https://bakdata.github.io/kpops/4.0/schema/pipeline.json\": [\n \"pipeline.yaml\",\n \"pipeline_*.yaml\"\n ],\n \"https://bakdata.github.io/kpops/4.0/schema/defaults.json\": [\n \"defaults.yaml\",\n \"defaults_*.yaml\"\n ],\n \"https://bakdata.github.io/kpops/4.0/schema/config.json\": [\n \"config.yaml\",\n \"config_*.yaml\"\n ]\n }\n}\n
Advanced usage
It is possible to generate schemas with the kpops schema command. Useful for including custom components or when using a pre-release version of KPOps.
KPOps provides JSON schemas that enable autocompletion and validation for all YAML files that the user must work with.
We provide a GitHub composite action bakdata/kpops that installs and executes KPOps commands with the given parameters.
steps:\n # ...\n # This step is useful for debugging reasons\n - name: Generate Kafka pipeline\n uses: bakdata/kpops@main\n with:\n command: generate\n working-directory: home/my-kpops-root-dir\n pipeline: pipelines/my-pipeline-file.yaml\n kpops-version: 1.2.3\n\n # It is possible to use a pre-release KPOps version from TestPyPI https://test.pypi.org/project/kpops/#history\n - name: Deploy Kafka pipeline\n uses: bakdata/kpops@main\n with:\n command: deploy --execute\n working-directory: home/my-kpops-root-dir\n pipeline: pipelines/my-pipeline-file.yaml\n kpops-version: 1.2.5.dev20230707132709\n # ...\n