diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index acd3c12d..8a51feed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: uses: actions/checkout@v4 - name: Setup Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: ${{ env.go-version }} @@ -81,35 +81,42 @@ jobs: uses: actions/checkout@v4 - name: Setup Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: ${{ env.go-version }} - - - name: Echo Go version - run: go version - name: Build with different arch run: | export GOOS=$(echo ${{ matrix.platforms }} | cut -d '/' -f 1) export GOARCH=$(echo ${{ matrix.platforms }} | cut -d '/' -f 2) export GOARM=$(echo ${{ matrix.platforms }} | cut -d '/' -f 3 | cut -d 'v' -f 2) - if [[ "x$GOOS" == "xwindows" ]]; then - make build CMD_NAME="builds/${{ env.cmd-name }}_${GOOS}_${GOARCH}.exe" - elif [[ "x$GOARM" != "x" ]]; then - make build CMD_NAME="builds/${{ env.cmd-name }}_${GOOS}_${GOARCH}v${GOARM}" + if [[ "$GOOS" == "windows" ]]; then + make build CMD_NAME="builds/${{ env.cmd-name }}.exe" else - make build CMD_NAME="builds/${{ env.cmd-name }}_${GOOS}_${GOARCH}" + make build CMD_NAME="builds/${{ env.cmd-name }}" fi - - name: Create checksums + - name: Create checksum if: startsWith(github.ref, 'refs/tags/v') working-directory: builds run: | find . 
-type f -exec shasum -a 256 -b {} + | sed 's# \*\./# *#' | while read sum file; do echo "$sum $file" > "${file#\*}".sha256; done - - name: List artifacts + - name: Create archive if: startsWith(github.ref, 'refs/tags/v') - run: tree -nh builds + run: | + export GOOS=$(echo ${{ matrix.platforms }} | cut -d '/' -f 1) + export GOARCH=$(echo ${{ matrix.platforms }} | cut -d '/' -f 2) + export GOARM=$(echo ${{ matrix.platforms }} | cut -d '/' -f 3 | cut -d 'v' -f 2) + export ARCHIVE_NAME=$(echo "${{ env.cmd-name }}-${GOOS}-${GOARCH}$(if [ -n "${GOARM}" ]; then echo v${GOARM}; fi).$(if [ "${GOOS}" = "windows" ]; then echo "zip"; else echo "tar.gz"; fi)") + cp LICENSE builds/ + cd builds + if [[ "$GOOS" == "windows" ]]; then + zip "${ARCHIVE_NAME}" * + else + tar -czvf "${ARCHIVE_NAME}" * + fi + find . -maxdepth 1 -type f ! -name ${ARCHIVE_NAME} -exec rm -f {} + - name: GitHub Release if: startsWith(github.ref, 'refs/tags/v') diff --git a/.gitignore b/.gitignore index ec63b7bd..5e82a3dd 100644 --- a/.gitignore +++ b/.gitignore @@ -18,11 +18,11 @@ vendor/ .idea .vscode scratch - config.yml !playground/config.yml # Project exclusion +site venv .cache # Binaries diff --git a/cmd/greenmask/cmd/dump/dump.go b/cmd/greenmask/cmd/dump/dump.go index 6c4782f2..21708fed 100644 --- a/cmd/greenmask/cmd/dump/dump.go +++ b/cmd/greenmask/cmd/dump/dump.go @@ -64,7 +64,7 @@ var ( ) // TODO: Check how does work mixed options - use-list + tables, etc. 
-// TODO: Option that currently does not implemented: +// TODO: Options currently are not implemented: // - encoding // - disable-triggers // - lock-wait-timeout diff --git a/cmd/greenmask/cmd/list_dump/list_dump.go b/cmd/greenmask/cmd/list_dumps/list_dumps.go similarity index 98% rename from cmd/greenmask/cmd/list_dump/list_dump.go rename to cmd/greenmask/cmd/list_dumps/list_dumps.go index a2c498d9..d30c6f85 100644 --- a/cmd/greenmask/cmd/list_dump/list_dump.go +++ b/cmd/greenmask/cmd/list_dumps/list_dumps.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package list_dump +package list_dumps import ( "context" @@ -43,7 +43,7 @@ var ( } if err := listDumps(); err != nil { - log.Err(err).Msg("") + log.Fatal().Err(err).Msg("") } }, } diff --git a/cmd/greenmask/cmd/list_transformers/list_transformers.go b/cmd/greenmask/cmd/list_transformers/list_transformers.go index 78a60f2c..d9f3cb41 100644 --- a/cmd/greenmask/cmd/list_transformers/list_transformers.go +++ b/cmd/greenmask/cmd/list_transformers/list_transformers.go @@ -20,9 +20,9 @@ import ( "fmt" "os" "slices" - "strconv" "strings" + "github.com/greenmaskio/greenmask/pkg/toolkit" "github.com/olekukonko/tablewriter" "github.com/rs/zerolog/log" "github.com/spf13/cobra" @@ -42,8 +42,8 @@ var ( log.Err(err).Msg("") } - if err := run(args); err != nil { - log.Err(err).Msg("") + if err := run(); err != nil { + log.Fatal().Err(err).Msg("") } }, } @@ -56,7 +56,20 @@ const ( TextFormatName = "text" ) -func run(transformerNames []string) error { +const anyTypesValue = "any" + +type parameter struct { + Name string `json:"name,omitempty"` + SupportedTypes []string `json:"supported_types,omitempty"` +} + +type jsonResponse struct { + Name string `json:"name,omitempty"` + Description string `json:"description,omitempty"` + Parameters []*parameter `json:"parameters,omitempty"` +} + +func run() error { ctx, cancel := 
context.WithCancel(context.Background()) defer cancel() err := custom.BootstrapCustomTransformers(ctx, utils.DefaultTransformerRegistry, Config.CustomTransformers) @@ -64,11 +77,14 @@ func run(transformerNames []string) error { return fmt.Errorf("error registering custom transformer: %w", err) } + // TODO: Consider about listing format. The transformer can have one and more columns as an input + // and + switch format { case JsonFormatName: - err = listTransformersJson(utils.DefaultTransformerRegistry, transformerNames) + err = listTransformersJson(utils.DefaultTransformerRegistry) case TextFormatName: - err = listTransformersText(utils.DefaultTransformerRegistry, transformerNames) + err = listTransformersText(utils.DefaultTransformerRegistry) default: return fmt.Errorf(`unknown format %s`, format) } @@ -79,92 +95,73 @@ func run(transformerNames []string) error { return nil } -func listTransformersJson(registry *utils.TransformerRegistry, transformerNames []string) error { - var transformers []*utils.Definition - - if len(transformerNames) > 0 { +func listTransformersJson(registry *utils.TransformerRegistry) error { + var transformers []*jsonResponse - for _, name := range transformerNames { - def, ok := registry.M[name] - if ok { - transformers = append(transformers, def) - } else { - return fmt.Errorf("unknown transformer name \"%s\"", name) + for _, def := range registry.M { + var params []*parameter + for _, p := range def.Parameters { + if !p.IsColumn && !p.IsColumnContainer { + continue } + supportedTypes := getColumnTypes(p) + params = append(params, ¶meter{Name: p.Name, SupportedTypes: supportedTypes}) } - } else { - for _, def := range registry.M { - transformers = append(transformers, def) - } + transformers = append(transformers, &jsonResponse{ + Name: def.Properties.Name, + Description: def.Properties.Description, + Parameters: params, + }) } + slices.SortFunc(transformers, func(a, b *jsonResponse) int { + return strings.Compare(a.Name, b.Name) + }) + 
if err := json.NewEncoder(os.Stdout).Encode(transformers); err != nil { return err } return nil } -func listTransformersText(registry *utils.TransformerRegistry, transformerNames []string) error { +func listTransformersText(registry *utils.TransformerRegistry) error { var data [][]string table := tablewriter.NewWriter(os.Stdout) var names []string - if len(transformerNames) > 0 { - for _, name := range transformerNames { - _, ok := registry.M[name] - if ok { - names = append(names, name) - } else { - return fmt.Errorf("unknown transformer name \"%s\"", name) - } - } - - } else { - for name := range registry.M { - names = append(names, name) - } - slices.Sort(names) + for name := range registry.M { + names = append(names, name) } - + slices.Sort(names) + table.SetHeader([]string{"name", "description", "column parameter name", "supported types"}) for _, name := range names { def := registry.M[name] - data = append(data, []string{def.Properties.Name, "description", def.Properties.Description, "", "", ""}) + //allowedTypes := getAllowedTypesList(def) for _, p := range def.Parameters { - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "description", p.Description, ""}) - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "required", strconv.FormatBool(p.Required), ""}) - if p.DefaultValue != nil { - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "default", string(p.DefaultValue), ""}) - } - if p.LinkParameter != "" { - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "linked_parameter", p.LinkParameter, ""}) - } - if p.CastDbType != "" { - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "cast_to_db_type", p.CastDbType, ""}) + if !p.IsColumn && !p.IsColumnContainer { + continue } - if p.ColumnProperties != nil { - if len(p.ColumnProperties.AllowedTypes) > 0 { - allowedTypes := strings.Join(p.ColumnProperties.AllowedTypes, ", ") - data = append(data, 
[]string{def.Properties.Name, "parameters", p.Name, "column_properties", "allowed_types", allowedTypes}) - } - isAffected := strconv.FormatBool(p.ColumnProperties.Affected) - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "is_affected", isAffected}) - skipOriginalData := strconv.FormatBool(p.ColumnProperties.SkipOriginalData) - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "skip_original_data", skipOriginalData}) - skipOnNull := strconv.FormatBool(p.ColumnProperties.SkipOnNull) - data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "skip_on_null", skipOnNull}) - } - + supportedTypes := getColumnTypes(p) + data = append(data, []string{def.Properties.Name, def.Properties.Description, p.Name, strings.Join(supportedTypes, ", ")}) } } + table.AppendBulk(data) table.SetRowLine(true) - table.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3}) + table.SetAutoMergeCellsByColumnIndex([]int{0, 1}) table.Render() return nil } +func getColumnTypes(p *toolkit.Parameter) []string { + if p.ColumnProperties != nil && len(p.ColumnProperties.AllowedTypes) > 0 { + return p.ColumnProperties.AllowedTypes + } + return []string{anyTypesValue} +} + func init() { Cmd.Flags().StringVarP(&format, "format", "f", TextFormatName, "output format [text|json]") } diff --git a/cmd/greenmask/cmd/restore/restore.go b/cmd/greenmask/cmd/restore/restore.go index 9b7b7541..704e7aca 100644 --- a/cmd/greenmask/cmd/restore/restore.go +++ b/cmd/greenmask/cmd/restore/restore.go @@ -20,6 +20,7 @@ import ( "path" "slices" + "github.com/greenmaskio/greenmask/internal/storages" "github.com/rs/zerolog/log" "github.com/spf13/cobra" "github.com/spf13/viper" @@ -36,7 +37,6 @@ var ( Args: cobra.ExactArgs(1), Short: "restore dump with ID or the latest to the target database", Run: func(cmd *cobra.Command, args []string) { - var dumpId string if err := logger.SetLogLevel(Config.Log.Level, 
Config.Log.Format); err != nil { log.Fatal().Err(err).Msg("fatal") @@ -49,41 +49,9 @@ var ( log.Fatal().Err(err).Msg("fatal") } - if args[0] == "latest" { - var backupNames []string - - _, dirs, err := st.ListDir(ctx) - if err != nil { - log.Fatal().Err(err).Msg("cannot walk through directory") - } - for _, dir := range dirs { - exists, err := dir.Exists(ctx, "metadata.json") - if err != nil { - log.Fatal().Err(err).Msg("cannot check file existence") - } - if exists { - backupNames = append(backupNames, dir.Dirname()) - } - } - - slices.SortFunc( - backupNames, func(a, b string) int { - if a > b { - return -1 - } - return 1 - }, - ) - dumpId = backupNames[0] - } else { - dumpId = args[0] - exists, err := st.Exists(ctx, path.Join(dumpId, "metadata.json")) - if err != nil { - log.Fatal().Err(err).Msg("cannot check file existence") - } - if !exists { - log.Fatal().Err(err).Msg("choose another dump is failed") - } + dumpId, err := getDumpId(ctx, st, args[0]) + if err != nil { + log.Fatal().Err(err).Msg("") } st = st.SubStorage(dumpId, true) @@ -104,7 +72,51 @@ var ( Config = pgDomains.NewConfig() ) -// TODO: Option that currently does not implemented: +func getDumpId(ctx context.Context, st storages.Storager, dumpId string) (string, error) { + if dumpId == "latest" { + var backupNames []string + + _, dirs, err := st.ListDir(ctx) + if err != nil { + log.Fatal().Err(err).Msg("cannot walk through directory") + } + for _, dir := range dirs { + exists, err := dir.Exists(ctx, "metadata.json") + if err != nil { + log.Fatal().Err(err).Msg("cannot check file existence") + } + if exists { + backupNames = append(backupNames, dir.Dirname()) + } + } + + slices.SortFunc( + backupNames, func(a, b string) int { + if a > b { + return -1 + } + return 1 + }, + ) + dumpId = backupNames[0] + } else { + exists, err := st.Exists(ctx, path.Join(dumpId, "metadata.json")) + if err != nil { + log.Fatal(). + Err(err). + Msg("cannot check file existence") + } + if !exists { + log.Fatal(). 
+ Err(err). + Str("DumpId", dumpId). + Msg("dump with provided id is not found") + } + } + return dumpId, nil +} + +// TODO: Options currently are not implemented: // * data-only // * exit-on-error // * use-list diff --git a/cmd/greenmask/cmd/root.go b/cmd/greenmask/cmd/root.go index 34763879..429af065 100644 --- a/cmd/greenmask/cmd/root.go +++ b/cmd/greenmask/cmd/root.go @@ -26,10 +26,11 @@ import ( "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/delete_backup" "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/dump" - "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/list_dump" + "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/list_dumps" "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/list_transformers" "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/restore" "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/show_dump" + "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/show_transformer" "github.com/greenmaskio/greenmask/cmd/greenmask/cmd/validate" pgDomains "github.com/greenmaskio/greenmask/internal/domains" configUtils "github.com/greenmaskio/greenmask/internal/utils/config" @@ -90,12 +91,13 @@ func init() { ) RootCmd.AddCommand(dump.Cmd) - RootCmd.AddCommand(list_dump.Cmd) + RootCmd.AddCommand(list_dumps.Cmd) RootCmd.AddCommand(restore.Cmd) RootCmd.AddCommand(delete_backup.Cmd) RootCmd.AddCommand(show_dump.Cmd) RootCmd.AddCommand(list_transformers.Cmd) RootCmd.AddCommand(validate.Cmd) + RootCmd.AddCommand(show_transformer.Cmd) if err := RootCmd.MarkPersistentFlagRequired("config"); err != nil { log.Fatal().Err(err).Msg("") diff --git a/cmd/greenmask/cmd/show_transformer/show_transformer.go b/cmd/greenmask/cmd/show_transformer/show_transformer.go new file mode 100644 index 00000000..663e3b74 --- /dev/null +++ b/cmd/greenmask/cmd/show_transformer/show_transformer.go @@ -0,0 +1,159 @@ +// Copyright 2023 Greenmask +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package show_transformer + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strconv" + "strings" + + "github.com/olekukonko/tablewriter" + "github.com/rs/zerolog/log" + "github.com/spf13/cobra" + + "github.com/greenmaskio/greenmask/internal/db/postgres/transformers/custom" + "github.com/greenmaskio/greenmask/internal/db/postgres/transformers/utils" + "github.com/greenmaskio/greenmask/internal/domains" + "github.com/greenmaskio/greenmask/internal/utils/logger" +) + +var ( + Cmd = &cobra.Command{ + Use: "show-transformer", + Args: cobra.ExactArgs(1), + Short: "show transformer details", + Run: func(cmd *cobra.Command, args []string) { + if err := logger.SetLogLevel(Config.Log.Level, Config.Log.Format); err != nil { + log.Err(err).Msg("") + } + + if err := run(args[0]); err != nil { + log.Fatal().Err(err).Msg("") + } + }, + } + Config = domains.NewConfig() + format string +) + +const ( + JsonFormatName = "json" + TextFormatName = "text" +) + +const anyTypesValue = "any" + +func run(name string) error { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + err := custom.BootstrapCustomTransformers(ctx, utils.DefaultTransformerRegistry, Config.CustomTransformers) + if err != nil { + return fmt.Errorf("error registering custom transformer: %w", err) + } + + switch format { + case JsonFormatName: + err = showTransformerJson(utils.DefaultTransformerRegistry, name) + case TextFormatName: + err = showTransformerText(utils.DefaultTransformerRegistry, name) + default: + return fmt.Errorf(`unknown format \"%s\"`, 
format) + } + if err != nil { + return fmt.Errorf("error listing transformers: %w", err) + } + + return nil +} + +func showTransformerJson(registry *utils.TransformerRegistry, transformerName string) error { + var transformers []*utils.Definition + + def, ok := registry.M[transformerName] + if ok { + transformers = append(transformers, def) + } else { + return fmt.Errorf("unknown transformer with name \"%s\"", transformerName) + } + + if err := json.NewEncoder(os.Stdout).Encode(transformers); err != nil { + return err + } + return nil +} + +func showTransformerText(registry *utils.TransformerRegistry, name string) error { + + var data [][]string + table := tablewriter.NewWriter(os.Stdout) + + def, err := getTransformerDefinition(registry, name) + if err != nil { + return err + } + + data = append(data, []string{def.Properties.Name, "description", def.Properties.Description, "", "", ""}) + for _, p := range def.Parameters { + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "description", p.Description, ""}) + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "required", strconv.FormatBool(p.Required), ""}) + if p.DefaultValue != nil { + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "default", string(p.DefaultValue), ""}) + } + if p.LinkParameter != "" { + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "linked_parameter", p.LinkParameter, ""}) + } + if p.CastDbType != "" { + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "cast_to_db_type", p.CastDbType, ""}) + } + if p.IsColumnContainer { + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "allowed_types", anyTypesValue}) + } + if p.ColumnProperties != nil { + allowedTypes := []string{anyTypesValue} + if len(p.ColumnProperties.AllowedTypes) > 0 { + allowedTypes = p.ColumnProperties.AllowedTypes + } + data = append(data, []string{def.Properties.Name, 
"parameters", p.Name, "column_properties", "allowed_types", strings.Join(allowedTypes, ", ")}) + isAffected := strconv.FormatBool(p.ColumnProperties.Affected) + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "is_affected", isAffected}) + skipOriginalData := strconv.FormatBool(p.ColumnProperties.SkipOriginalData) + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "skip_original_data", skipOriginalData}) + skipOnNull := strconv.FormatBool(p.ColumnProperties.SkipOnNull) + data = append(data, []string{def.Properties.Name, "parameters", p.Name, "column_properties", "skip_on_null", skipOnNull}) + } + } + + table.AppendBulk(data) + table.SetRowLine(true) + table.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3}) + table.Render() + + return nil +} + +func getTransformerDefinition(registry *utils.TransformerRegistry, name string) (*utils.Definition, error) { + def, ok := registry.M[name] + if ok { + return def, nil + } + return nil, fmt.Errorf("unknown transformer \"%s\"", name) +} + +func init() { + Cmd.Flags().StringVarP(&format, "format", "f", TextFormatName, "output format [text|json]") +} diff --git a/docker-compose.yml b/docker-compose.yml index 8038ed12..0999f41e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,7 +14,6 @@ services: -c 'mkdir -p /export/adventureworks && minio server /export --console-address :9001' healthcheck: test: timeout 5s bash -c ':> /dev/tcp/127.0.0.1/9000' || exit 1 - start_period: 5s interval: 10s timeout: 5s retries: 2 @@ -54,6 +53,11 @@ services: - ./playground:/var/lib/playground image: "greenmask/greenmask:latest" working_dir: /var/lib/playground + environment: + PGPASSWORD: example + ORIGINAL_DB_NAME: "original" + TRANSFORMED_DB_NAME: "transformed" + DATABASE_HOST: "playground-db" depends_on: playground-dbs-filler: condition: service_completed_successfully @@ -68,6 +72,11 @@ services: build: dockerfile: docker/greenmask/Dockerfile 
context: ./ + environment: + PGPASSWORD: example + ORIGINAL_DB_NAME: "original" + TRANSFORMED_DB_NAME: "transformed" + DATABASE_HOST: "playground-db" depends_on: playground-dbs-filler: condition: service_completed_successfully diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 00000000..c317cd65 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,64 @@ +# Architecture + +## Introduction + +It is evident that the most appropriate approach for executing logical backup dumping and restoration is by leveraging the core PostgreSQL utilities, specifically `pg_dump` and `pg_restore`. Greenmask has been purposefully designed to align with PostgreSQL's native utilities, ensuring compatibility. Greenmask primarily handles data dumping operations independently and delegates the responsibilities of schema dumping and restoration to `pg_dump` and `pg_restore` respectively, maintaining seamless integration with PostgreSQL's standard tools. + +## Backup process + +The process of backing up PostgreSQL databases is divided into three distinct sections: + +* **Pre-data** — this section encompasses the raw schema of tables, excluding primary keys (PK) and foreign keys (FK). +* **Data** — the data section contains the actual table data in COPY format, including information about sequence current values and Large Objects data. +* **Post-data** — in this section, you'll find the definitions of indexes, triggers, rules, and constraints (such as PK and FK). + +Greenmask focuses exclusively on the data section during runtime. It delegates the handling of the `pre-data` and `post-data` sections to the core PostgreSQL utilities, `pg_dump` and `pg_restore`. + +Greenmask employs the directory format of `pg_dump` and `pg_restore`. This format is particularly suitable for +parallel execution and partial restoration, and it includes clear metadata files that aid in determining the backup and restoration steps. 
Greenmask has been optimized to work seamlessly with remote storage systems and obfuscation procedures. + +When performing data dumping, Greenmask utilizes the COPY command in TEXT format, maintaining reliability and +compatibility with the vanilla PostgreSQL utilities. + +Additionally, Greenmask supports parallel execution, significantly reducing the time required for the dumping process. + +## Storage options + +The core PostgreSQL utilities, `pg_dump` and `pg_restore`, traditionally operate with files in a directory format, offering no alternative methods. To meet modern backup requirements and provide flexible approaches, +Greenmask introduces the concept of storages. + +* `s3` — this option supports any S3-like storage system, including AWS S3, which makes it versatile and adaptable to various cloud-based storage solutions. +* `directory` — this is the standard choice, representing the ordinary filesystem directory for local storage. + +!!! note + If you have suggestions for additional storage options that would be valuable to implement, feel free to + share your ideas with us. Greenmask aims to accommodate a wide range of storage preferences to suit diverse backup needs. + +## Restoration process + +In the restoration process, Greenmask combines the capabilities of different tools: + +* For **schema restoration** Greenmask utilizes `pg_restore` to restore the database schema. This ensures that the schema is accurately reconstructed. +* For **data restoration** Greenmask independently applies the data using the COPY protocol. This allows Greenmask to handle the data efficiently, especially when working with various storage solutions. Greenmask is aware of the restoration metadata, which enables it to download only the necessary data. This feature is particularly useful for partial restoration scenarios, such as restoring a single table from a complete backup. 
+ +Greenmask also supports **parallel restoration**, which can significantly reduce the time required to complete the restoration process. This parallel execution enhances the efficiency of restoring large datasets. + +## Data obfuscation and validation + +Greenmask works with COPY lines, collects schema metadata using the Golang driver, and employs this driver in the encoding and decoding process. The **validate command** offers a way to assess the impact on both schema +(**validation warnings**) and data (**transformation and displaying differences**). This command allows you to validate the schema and data transformations, ensuring the desired outcomes during the obfuscation process. + +## Customization + +If your table schema relies on functional dependencies between columns, you can address this challenge using the [**TemplateRecord**](built_in_transformers/advanced_transformers/template_record.md) transformer. This transformer enables you to define transformation logic for entire tables, offering type-safe operations when assigning new values. + +Greenmask provides a framework for creating your custom transformers, which can be reused efficiently. These +transformers can be seamlessly integrated without requiring recompilation, thanks to the PIPE (stdin/stdout) +interaction. + +!!! note + Furthermore, Greenmask's architecture is designed to be highly extensible, making it possible to introduce other interaction protocols, such as HTTP or Socket, for conducting obfuscation procedures. + +## PostgreSQL version compatibility + +Greenmask is compatible with PostgreSQL versions **11 and higher**. 
diff --git a/docs/assets/built_in_transformers/orders-schema.png b/docs/assets/built_in_transformers/orders-schema.png new file mode 100644 index 00000000..58a8e7f7 Binary files /dev/null and b/docs/assets/built_in_transformers/orders-schema.png differ diff --git a/docs/assets/built_in_transformers/person-person-schema.png b/docs/assets/built_in_transformers/person-person-schema.png new file mode 100644 index 00000000..277d25a2 Binary files /dev/null and b/docs/assets/built_in_transformers/person-person-schema.png differ diff --git a/docs/assets/getting_started/list-dumps.png b/docs/assets/getting_started/list-dumps.png new file mode 100644 index 00000000..2f16c565 Binary files /dev/null and b/docs/assets/getting_started/list-dumps.png differ diff --git a/docs/assets/getting_started/list-transformers-example.png b/docs/assets/getting_started/list-transformers-example.png new file mode 100644 index 00000000..4c13aaed Binary files /dev/null and b/docs/assets/getting_started/list-transformers-example.png differ diff --git a/docs/assets/getting_started/validate-result.png b/docs/assets/getting_started/validate-result.png new file mode 100644 index 00000000..d4102980 Binary files /dev/null and b/docs/assets/getting_started/validate-result.png differ diff --git a/docs/assets/list_dumps_screen.png b/docs/assets/list_dumps_screen.png new file mode 100644 index 00000000..ac266015 Binary files /dev/null and b/docs/assets/list_dumps_screen.png differ diff --git a/docs/assets/list_transformers_screen_1.png b/docs/assets/list_transformers_screen_1.png new file mode 100644 index 00000000..d4bd91e3 Binary files /dev/null and b/docs/assets/list_transformers_screen_1.png differ diff --git a/docs/assets/list_transformers_screen_2.png b/docs/assets/list_transformers_screen_2.png new file mode 100644 index 00000000..8e8371bb Binary files /dev/null and b/docs/assets/list_transformers_screen_2.png differ diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 
00000000..52e5e884 Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/assets/show_transformer.png b/docs/assets/show_transformer.png new file mode 100644 index 00000000..16015cdd Binary files /dev/null and b/docs/assets/show_transformer.png differ diff --git a/docs/assets/validate_horizontal_diff.png b/docs/assets/validate_horizontal_diff.png new file mode 100644 index 00000000..c34fc97a Binary files /dev/null and b/docs/assets/validate_horizontal_diff.png differ diff --git a/docs/assets/validate_vertical_diff.png b/docs/assets/validate_vertical_diff.png new file mode 100644 index 00000000..ae9c6351 Binary files /dev/null and b/docs/assets/validate_vertical_diff.png differ diff --git a/docs/built_in_transformers/advanced_transformers/custom_functions/core_functions.md b/docs/built_in_transformers/advanced_transformers/custom_functions/core_functions.md new file mode 100644 index 00000000..e470d482 --- /dev/null +++ b/docs/built_in_transformers/advanced_transformers/custom_functions/core_functions.md @@ -0,0 +1,246 @@ +# Core functions + +Below you can find custom core functions which are divided into categories based on the transformation purpose. + +## PostgreSQL driver functions + +| Function | Description | +|---------------|-----------------------------------------------------------------------------------------------------------------| +| `null` | Returns the `NULL` value that can be used for the driver encoding-decoding operations | +| `isNull` | Returns `true` if the checked value is `NULL` | +| `isNotNull` | Returns `true` if the checked value is *not* `NULL` | +| `sqlCoalesce` | Works as a standard SQL `coalesce` function. It allows you to choose the first non-NULL argument from the list. | + +## JSON output function + +| Function | Description | +|------------------|------------------------------------------------------------------------------------------| +| `jsonExists` | Checks if the path value exists in JSON. 
Returns `true` if the path exists. | +| `mustJsonGet` | Gets the JSON attribute value by path and throws an error if the path does not exist | +| `mustJsonGetRaw` | Gets the JSON attribute raw value by path and throws an error if the path does not exist | +| `jsonGet` | Gets the JSON attribute value by path and returns nil if the path does not exist | +| `jsonGetRaw` | Gets the JSON attribute raw value by path and returns nil if the path does not exist | +| `jsonSet` | Sets the value for the JSON document by path | +| `jsonSetRaw` | Sets the raw value for the JSON document by path | +| `jsonDelete` | Deletes an attribute from the JSON document by path | +| `jsonValidate` | Validates the JSON document syntax and throws an error if there are any issues | +| `jsonIsValid` | Checks the JSON document for validity and returns `true` if it is valid | +| `toJsonRawValue` | Casts any type of value to the raw JSON value | + +## Testing functions + +| Function | Description | +|------------|----------------------------------------| +| `isInt` | Checks if the value of an integer type | +| `isFloat` | Checks if the value of a float type | +| `isNil` | Checks if the value is nil | +| `isString` | Checks if the value of a string type | +| `isMap` | Checks if the value of a map type | +| `isSlice` | Checks if the value of a slice type | +| `isBool` | Checks if the value of a boolean type | + +## Transformation and generators + +### masking + +Replaces characters with asterisk `*` symbols depending on the provided masking rule. If the +value is `NULL`, it is kept unchanged. This function is based on [ggwhite/go-masker](https://github.com/ggwhite/go-masker). 
+ +=== "Masking rules" + + | Rule | Description | Example input | Example output | + |---------------|------------------------------------------------------------------------------------------------------------------|----------------------------------------------------|-----------------------------------------| + | `default` | Returns the sequence of `*` symbols of the same length | `test1234` | `********` | + | `name` | Masks the second and the third letters | `ABCD` | `A**D` | + | `password` | Always returns a sequence of `*` | | | + | `address` | Keeps first 6 letters, masks the rest | `Larnaca, makarios st` | `Larnac*************` | + | `email` | Keeps a domain and the first 3 letters, masks the rest | `ggw.chang@gmail.com` | `ggw****@gmail.com` | + | `mobile` | Masks 3 digits starting from the 4th digit | `0987654321` | `0987***321` | + | `telephone` | Removes `(`, `)`, ` `, `-` symbols, masks last 4 digits of a telephone number, and formats it to `(??)????-????` | `0227993078` | `(02)2799-****` | + | `id` | Masks last 4 digits of an ID | `A123456789` | `A12345****` | + | `credit_card` | Masks 6 digits starting from the 7th digit | `1234567890123456` | `123456******3456` | + | `url` | Masks the password part of the URL (if applicable) | `http://admin:mysecretpassword@localhost:1234/uri` | `http://admin:xxxxx@localhost:1234/uri` | + +=== "Signature" + + `masking(dataType string, value string) (res string, err error)` + +=== "Parameters" + + * `dataType` — one of the masking rules (see previous tab) + * `value` — the original string value + +=== "Return values" + + * `res` — a masked string + * `err` — an error if there is an issue + +### truncateDate + +Truncates datetime up to the provided `part`. + +=== "Signature" + + `truncateDate(part string, original time.Time) (res time.Time, err error)` + +=== "Parameters" + + * `part` — the truncation part. 
Must be one of `nano`, `second`, `minute`, `hour`, `day`, `month`, or `year` + * `original` — the original datetime value + +=== "Return values" + + * `res` — a truncated datetime + * `err` — an error if there is an issue + +### noiseDatePgInterval + +Adds or subtracts a random duration in the provided `interval` to or from the original date value. + +=== "Signature" + + `noiseDate(interval string, original time.Time) (res time.Time, err error)` + +=== "Parameters" + + * `interval` — the maximum value of `ratio` that is added to the original value. The format is the same as in the [PostgreSQL interval format](https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-INTERVAL-INPUT). + * `original` — the original time value + +=== "Return values" + + * `res` — a noised date + * `err` — an error if there is an issue + +### noiseFloat + +Adds or subtracts a random fraction to or from the original float value. Multiplies the original float value by a provided random value that is not higher than the `ratio` parameter and adds it to the original value with the option to specify the precision via the `precision` parameter. + +=== "Signature" + + `noiseFloat(ratio float, precision int, value float) (res float64, err error)` + +=== "Parameters" + + * `ratio` — the maximum multiplier value in the interval (0:1). The value will be randomly generated up to `ratio`, multiplied by the original value, and the result will be added to the original value. + * `precision` — the precision of the resulted value + * `value` — the original value + +=== "Return values" + + * `res` — a noised float value + * `err` — an error if there is an issue + +### noiseInt + +Adds or subtracts a random fraction to or from the original integer value. Multiplies the original integer value by a provided random value that is not higher than the `ratio` parameter and adds it to the original value. 
+ +=== "Signature" + + `noiseInt(ratio float, value float) (res int, err error)` + +=== "Parameters" + + * `ratio` — the max multiplier value in the interval (0:1). The value will be generated randomly up to `ratio`, multiplied by the original value, and the result will be added to the original value. + * `value` — the original value + +=== "Return values" + + * `res` — a noised integer value + * `err` — an error if there is an issue + +### randomBool + +Generates a random boolean value. + +### randomDate + +Generates a random date within the provided interval. + +=== "Signature" + + `randomDate(min time.Time, max time.Time) (res time.Time, err error)` + +=== "Parameters" + + * `min` — the minimum random value threshold + * `max` — the maximum random value threshold + +=== "Return values" + + * `res` — a randomly generated date value + * `err` — an error if there is an issue + +### randomFloat + +Generates a random float value within the provided interval. + +=== "Signature" + + `randomFloat(min any, max any, precision int) (res float, err error)` + +=== "Parameters" + + * `min` — the minimum random value threshold + * `max` — the maximum random value threshold + * `precision` — the precision of the resulted value + +=== "Return values" + + * `res` — a randomly generated float value + * `err` — an error if there is an issue + +### randomInt + +Generates a random integer value within the provided interval. + +=== "Signature" + + `randomInt(min int, max int) (res int, err error)` + +=== "Parameters" + + * `min` — the minimum random value threshold + * `max` — the maximum random value threshold + +=== "Return values" + + * `res` — a randomly generated int value + * `err` — an error if there is an issue + +### randomString + +Generates a random string using the provided characters within the specified length range. 
+ +=== "Signature" + + `randomString(minLength int, maxLength int, symbols string) (res string, err error)` + +=== "Parameters" + + * `minLength` — the minimum string length + * `maxLength` — the maximum string length + * `symbols` — a string with a set of symbols which can be used. The default value is + `abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890` + +=== "Return values" + + * `res` — a randomly generated string value + * `err` — an error if there is an issue + +### roundFloat + +Rounds a float value up to provided precision. + +=== "Signature" + + `roundFloat(precision int, original float) (res float, err error)` + +=== "Parameters" + + * `precision` — the precision of the value + * `original` — the original float value + +=== "Return values" + + * `res` — a rounded float value + * `err` — an error if there is an issue diff --git a/docs/built_in_transformers/advanced_transformers/custom_functions/faker_function.md b/docs/built_in_transformers/advanced_transformers/custom_functions/faker_function.md new file mode 100644 index 00000000..ff93ceda --- /dev/null +++ b/docs/built_in_transformers/advanced_transformers/custom_functions/faker_function.md @@ -0,0 +1,84 @@ +# Faker functions + +Greenmask uses [go-faker/faker](https://github.com/go-faker/faker) under the hood for generating of synthetic data. 
+
+## Faker functions: Address
+
+| Function           | Description                                                                                            | Signature                              |
+|--------------------|--------------------------------------------------------------------------------------------------------|----------------------------------------|
+| `fakerRealAddress` | Generates a random real-world address that includes: city, state, postal code, latitude, and longitude | `fakerRealAddress() (res RealAddress)` |
+| `fakerLatitude`    | Generates random fake latitude                                                                         | `fakerLatitude() (res float64)`        |
+| `fakerLongitude`   | Generates random fake longitude                                                                        | `fakerLongitude() (res float64)`       |
+
+## Faker functions: Datetime
+
+| Function          | Description                                                            | Signature                        |
+|-------------------|------------------------------------------------------------------------|----------------------------------|
+| `fakerUnixTime`   | Generates random Unix time in seconds                                  | `fakerUnixTime() (res int64)`    |
+| `fakerDate`       | Generates random date with the pattern of `YYYY-MM-DD`                 | `fakerDate() (res string)`       |
+| `fakerTimeString` | Generates random time                                                  | `fakerTimeString() (res string)` |
+| `fakerMonthName`  | Generates a random month                                               | `fakerMonthName() (res string)`  |
+| `fakerYearString` | Generates a random year                                                | `fakerYearString() (res string)` |
+| `fakerDayOfWeek`  | Generates a random day of a week                                       | `fakerDayOfWeek() (res string)`  |
+| `fakerDayOfMonth` | Generates a random day of a month                                      | `fakerDayOfMonth() (res string)` |
+| `fakerTimestamp`  | Generates a random timestamp with the pattern of `YYYY-MM-DD HH:MM:SS` | `fakerTimestamp() (res string)`  |
+| `fakerCentury`    | Generates a random century                                             | `fakerCentury() (res string)`    |
+| `fakerTimezone`   | Generates a random timezone name                                       | `fakerTimezone() (res string)`   |
+| `fakerTimeperiod` | Generates a random time period with the pattern of either `AM` or `PM` | `fakerTimeperiod() (res string)` |
+
+## Faker functions: Internet
+
+| Function | Description | Signature | 
+|-------------------|-----------------------------------------------------------------------------------|----------------------------------| +| `fakerEmail` | Generates a random email | `fakerEmail() (res string)` | +| `fakerMacAddress` | Generates a random MAC address | `fakerMacAddress() (res string)` | +| `fakerDomainName` | Generates a random domain name | `fakerDomainName() (res string)` | +| `fakerURL` | Generates a random URL with the pattern of `https://www.domainname.some/somepath` | `fakerURL() (res string)` | +| `fakerUsername` | Generates a random username | `fakerUsername() (res string)` | +| `fakerIPv4` | Generates a random IPv4 address | `fakerIPv4() (res string)` | +| `fakerIPv6` | Generates a random IPv6 address | `fakerIPv6() (res string)` | +| `fakerPassword` | Generates a random password | `fakerPassword() (res string)` | + +## Faker functions: words and sentences + +| Function | Description | Signature | +|------------------|---------------------------------------------------------|---------------------------------| +| `fakerWord` | Generates a random word | `fakerWord() (res string)` | +| `fakerSentence` | Generates a random sentence | `fakerSentence() (res string)` | +| `fakerParagraph` | Generates a random sequence of sentences as a paragraph | `fakerParagraph() (res string)` | + +## Faker functions: Payment + +| Function | Description | Signature | +|---------------------------|------------------------------------------------------------------|------------------------------------------| +| `fakerCCType` | Generates a random credit card type, e.g. VISA, MasterCard, etc. 
| `fakerCCType() (res string)` | +| `fakerCCNumber` | Generates a random credit card number | `fakerCCNumber() (res string)` | +| `fakerCurrency` | Generates a random currency name | `fakerCurrency() (res string)` | +| `fakerAmountWithCurrency` | Generates random amount preceded with random currency | `fakerAmountWithCurrency() (res string)` | + +## Faker functions: Person + +| Function | Description | Signature | +|------------------------|----------------------------------------------------------|---------------------------------------| +| `fakerTitleMale` | Generates a random male title from the predefined list | `fakerTitleMale() (res string)` | +| `fakerTitleFemale` | Generates a random female title from the predefined list | `fakerTitleFemale() (res string)` | +| `fakerFirstName` | Generates a random first name | `fakerFirstName() (res string)` | +| `fakerFirstNameMale` | Generates a random male first name | `fakerFirstNameMale() (res string)` | +| `fakerFirstNameFemale` | Generates a random female first name | `fakerFirstNameFemale() (res string)` | +| `fakerFirstLastName` | Generates a random last name | `fakerFirstLastName() (res string)` | +| `fakerName` | Generates a random full name preceded with a title | `fakerName() (res string)` | + +## Faker functions: Phone + +| Function | Description | Signature | +|----------------------------|----------------------------------------------------------------------|-------------------------------------------| +| `fakerPhoneNumber` | Generates a random phone number | `fakerPhoneNumber() (res string)` | +| `fakerTollFreePhoneNumber` | Generates a random phone number with the pattern of `(123) 456-7890` | `fakerTollFreePhoneNumber() (res string)` | +| `fakerE164PhoneNumber` | Generates a random phone number with the pattern of `+12345678900` | `fakerE164PhoneNumber() (res string)` | + +## Faker functions: UUID + +| Function | Description | Signature | 
+|-----------------------|--------------------------------------------------------|---------------------------------------|
+| `fakerUUIDHyphenated` | Generates a random unique user ID separated by hyphens | `fakerUUIDHyphenated() (res string)`  |
+| `fakerUUIDDigit`      | Generates a random unique user ID in the HEX format    | `fakerUUIDDigit() (res string)`       |
diff --git a/docs/built_in_transformers/advanced_transformers/custom_functions/index.md b/docs/built_in_transformers/advanced_transformers/custom_functions/index.md
new file mode 100644
index 00000000..42506cd7
--- /dev/null
+++ b/docs/built_in_transformers/advanced_transformers/custom_functions/index.md
@@ -0,0 +1,14 @@
+# Template custom functions
+
+Within Greenmask, custom functions play a crucial role, providing a wide array of options for implementing diverse logic. Under the hood, the custom functions are based on the [sprig Go's template functions](https://masterminds.github.io/sprig/). Greenmask enhances this capability by introducing additional functions and transformation functions. These extensions mirror the logic found in the [standard transformers](../../standard_transformers/index.md) but offer you the flexibility to implement intricate and comprehensive logic tailored to your specific needs.
+
+Currently, you can use template custom functions for the [advanced transformers](../index.md):
+
+* [Json](../json.md)
+* [Template](../template.md)
+* [TemplateRecord](../template_record.md)
+
+Custom functions are arbitrarily divided into 2 groups:
+
+- [Core functions](core_functions.md) — custom functions that vary in purpose and include PostgreSQL driver, JSON output, testing, and transformation functions.
+- [Faker functions](faker_function.md) — custom functions of a *faker* type which generate synthetic data.
diff --git a/docs/built_in_transformers/advanced_transformers/index.md b/docs/built_in_transformers/advanced_transformers/index.md new file mode 100644 index 00000000..e05552ed --- /dev/null +++ b/docs/built_in_transformers/advanced_transformers/index.md @@ -0,0 +1,10 @@ +# Advanced transformers + +Advanced transformers are modifiable obfuscation methods that users can adjust based on their needs by using [custom functions](custom_functions/index.md). + +Below you can find an index of all advanced transformers currently available in Greenmask. + +1. [Json](json.md) — changes a JSON content by using `delete` and `set` operations. +1. [Template](template.md) — executes a Go template of your choice and applies the result to a specified column. +1. [TemplateRecord](template_record.md) — modifies records by using a Go template of your choice and applies the changes via the PostgreSQL +driver. diff --git a/docs/built_in_transformers/advanced_transformers/json.md b/docs/built_in_transformers/advanced_transformers/json.md new file mode 100644 index 00000000..8a62fcce --- /dev/null +++ b/docs/built_in_transformers/advanced_transformers/json.md @@ -0,0 +1,59 @@ +Change a JSON document using `delete` and `set` operations. `NULL` values are kept. + +## Parameters + +| Name | Properties | Description | Default | Required | Supported DB types | +|------------|-----------------|--------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | | The name of the column to be affected | | Yes | json, jsonb | +| operations | | A list of operations that contains editing `delete` and `set` | | Yes | - | +| ∟ | operation | Specifies the operation type: `set` or `delete` | | Yes | - | +| ∟ | path | The path to an object to be modified. See path syntax below. 
| | Yes | - | +| ∟ | value | A value to be assigned to the provided path | | No | - | +| ∟ | value_template | A Golang template to be assigned to the provided path. See the list of template functions below. | | No | - | +| ∟ | error_not_exist | Throws an error if the key does not exist by the provided path. Disabled by default. | `false` | No | - | + +## Description + +The `Json` transformer applies a sequence of changing operations (`set` and/or `delete`) to a JSON document. The value can be static or dynamic. For the `set` operation type, a static value is provided in the `value` parameter, while a dynamic value is provided in the `value_template` parameter, taking the data received after template execution as a result. Both the `value` and `value_template` parameters are mandatory for the `set` operation. + +### Path syntax + +The Json transformer is based on [tidwall/sjson](https://github.com/tidwall/sjson) and supports the same path syntax. See their documentation for [syntax rules](https://github.com/tidwall/sjson#path-syntax). + +### Template functions + +| Function | Description | Signature | +|------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------| +| `.GetPath` | Returns the current path to which the operation is being applied | `.GetPath() (path string)` | +| `.GetOriginalValue` | Returns the original value to which the current operation path is pointing. If the value at the specified path does not exist, it returns `nil`. | `.GetOriginalValue() (value any)` | +| `.OriginalValueExists` | Returns a boolean value indicating whether the specified path exists or not. 
| `.OriginalValueExists() (exists bool)` | +| `.GetColumnValue` | Returns an encoded into Golang type value for a specified column or throws an error. A value can be any of `int`, `float`, `time`, `string`, `bool`, or `slice` or `map`. | `.GetColumnValue(name string) (value any, err error)` | +| `.GetRawColumnValue` | Returns a raw value for a specified column as a string or throws an error | `.GetRawColumnValue(name string) (value string, err error)` | +| `.EncodeValueByColumn` | Encodes a value of any type into its raw string representation using the specified column name. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValueByColumn(name string, value any) (res any, err error)` | +| `.DecodeValueByColumn` | Decodes a value from its raw string representation to a Golang type using the specified column name. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.DecodeValueByColumn(name string, value any) (res any, err error)` | +| `.EncodeValueByType` | Encodes a value of any type into its string representation using the specified type name. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValueByType(name string, value any) (res any, err error)` | +| `.DecodeValueByType` | Decodes a value from its raw string representation to a Golang type using the specified type name. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. 
| `.DecodeValueByType(name string, value any) (res any, err error)` | + +## Example: Changing JSON document + +``` yaml title="Json transformer example" +- schema: "bookings" + name: "aircrafts_data" + transformers: + - name: "Json" + params: + column: "model" + operations: + - operation: "set" + path: "en" + value: "Boeing 777-300-2023" + - operation: "set" + path: "seats" + error_not_exist: True + value_template: "{{ randomInt 100 400 }}" + - operation: "set" + path: "details.preperties.1" + value: {"name": "somename", "description": null} + - operation: "delete" + path: "values.:2" +``` diff --git a/docs/built_in_transformers/advanced_transformers/template.md b/docs/built_in_transformers/advanced_transformers/template.md new file mode 100644 index 00000000..7c921dea --- /dev/null +++ b/docs/built_in_transformers/advanced_transformers/template.md @@ -0,0 +1,87 @@ +Execute a Go template and automatically apply the result to a specified column. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | any | +| template | A Go template string | | Yes | - | +| validate | Validates the template result using the PostgreSQL driver decoding procedure. Throws an error if a custom type does not have an encode-decoder implementation. | false | No | - | + +## Description + +The `Template` transformer executes Go templates and automatically applies the template result to a specified column. Go template system is designed to be extensible, enabling developers to access data objects and incorporate custom functions programmatically. For more information, you can refer to the official [Go Template documentation](https://pkg.go.dev/text/template). 
+ +With the `Template` transformer, you can implement complicated transformation logic using basic or custom template functions. Below you can get familiar with the basic template functions for the `Template` transformer. For more information about available custom template functions, see [Custom functions](custom_functions/index.md). + +!!! warning + + Pay attention to the whitespaces in templates. Use dash-wrapped - brackets `{{- -}}` for + trimming the spaces. For example, the value `"2023-12-19"` is not the same as `" 2023-12-19 "` and it may throw an error when restoring. + +### Template functions + +| Function | Description | Signature | +|------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------| +| `.GetColumnType` | Returns a string with the column type. | `.GetColumnType(name string) (typeName string, err error)` | +| `.GetValue` | Returns the column value for column assigned in the `column` parameter, encoded by the PostgreSQL driver into any type along with any associated error. Supported types include `int`, `float`, `time`, `string`, `bool`, as well as `slice` or `map` of any type. | `.GetValue() (value any, err error)` | +| `.GetRawValue` | Returns a raw value as a string for column assigned in the `column` parameter. | `.GetRawColumnValue(name string) (value string, err error)` | +| `.GetColumnValue` | Returns an encoded value for a specified column or throws an error. A value can be any of `int`, `float`, `time`, `string`, `bool`, or `slice` or `map`. 
| `.GetColumnValue(name string) (value any, err error)` | +| `.GetRawColumnValue` | Returns a raw value for a specified column as a string or throws an error | `.GetRawColumnValue(name string) (value string, err error)` | +| `.EncodeValue` | Encodes a value of any type into its string representation using the type assigned to the table column specified in the `column` parameter. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValue(value any) (res any, err error)` | +| `.DecodeValue` | Decodes a value from its raw string representation to a Golang type using the data type assigned to the table column specified in the `column` parameter. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.DecodeValueByColumn(value any) (res any, err error)` | +| `.EncodeValueByColumn` | Encodes a value of any type into its raw string representation using the specified column name. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValueByColumn(name string, value any) (res any, err error)` | +| `.DecodeValueByColumn` | Decodes a value from its raw string representation to a Golang type using the specified column name. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.DecodeValueByColumn(name string, value any) (res any, err error)` | +| `.EncodeValueByType` | Encodes a value of any type into its string representation using the specified type name. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValueByType(name string, value any) (res any, err error)` | +| `.DecodeValueByType` | Decodes a value from its raw string representation to a Golang type using the specified type name. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. 
| `.DecodeValueByType(name string, value any) (res any, err error)`  |
+
+## Example: Update the `firstname` column
+
+Below you can see the table structure:
+
+![person-person-schema.png](../../assets/built_in_transformers/person-person-schema.png)
+
+#### Change rule
+
+The goal is to modify the `firstname` column based on the following conditions:
+
+* If the current value of the `firstname` column is equal to `Terri`, replace it with `Mary`.
+* For all other cases, generate a random name and append `Jr`.
+
+#### Using a template function
+
+[comment]: <> (To Do: Add a link to fakerFirstName)
+
+To generate random names, you can use the `fakerFirstName` template function, which is designed to create
+synthetic names.
+
+```yaml title="Template transformer example"
+- schema: "humanresources"
+  name: "employee"
+  transformation:
+    - name: "Template"
+      params:
+        column: "firstname"
+        template: >
+          {{- if eq .GetValue "Terri" -}}
+            Mary
+          {{- else -}}
+            {{- fakerFirstName -}} Jr
+          {{- end -}}
+
+        validate: true
+```
+
+Expected result:
+
+=== "Value = Terri"
+
+    | column name | original value | transformed |
+    |-------------|----------------|-------------|
+    | firstname   | Terri          | Mary        |
+
+=== "Value != Terri"
+
+    | column name | original value | transformed |
+    |-------------|----------------|-------------|
+    | firstname   | Ken            | Mike Jr     |
diff --git a/docs/built_in_transformers/advanced_transformers/template_record.md b/docs/built_in_transformers/advanced_transformers/template_record.md
new file mode 100644
index 00000000..716f0a66
--- /dev/null
+++ b/docs/built_in_transformers/advanced_transformers/template_record.md
@@ -0,0 +1,68 @@
+Modify records using a Go template and apply changes by using the PostgreSQL driver functions. This transformer provides a way to implement custom transformation logic. 
+
+## Parameters
+
+| Name     | Description                                                                                                                                                     | Default | Required | Supported DB types |
+|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|--------------------|
+| columns  | A list of columns to be affected by the template. The list of columns will be checked for constraint violations.                                                  |         | No       | any                |
+| template | A Go template string                                                                                                                                              |         | Yes      | -                  |
+| validate | Validate the template result via PostgreSQL driver decoding procedure. Throws an error if a custom type does not have an encode-decoder implementation.           | false   | No       | -                  |
+
+## Description
+
+`TemplateRecord` uses [Go templates](https://pkg.go.dev/text/template) to change data. However, while the [Template transformer](template.md) operates with a single column and automatically applies results, the `TemplateRecord` transformer can make changes to a set of columns in the record, and using driver functions `.SetColumnValue` or `.SetRawColumnValue` is mandatory to do that.
+
+With the `TemplateRecord` transformer, you can implement complicated transformation logic using basic or custom template functions. Below you can get familiar with the basic template functions for the `TemplateRecord` transformer. For more information about available custom template functions, see [Custom functions](custom_functions/index.md). 
+ +### Template functions + +| Function | Description | Signature | +|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------| +| `.GetColumnType` | Returns a string with the column type. | `.GetColumnType(name string) (typeName string, err error)` | +| `.GetColumnValue` | Returns an encoded value for a specified column or throws an error. A value can be any of `int`, `float`, `time`, `string`, `bool`, or `slice` or `map`. | `.GetColumnValue(name string) (value any, err error)` | +| `.GetRawColumnValue` | Returns a raw value for a specified column as a string or throws an error | `.GetRawColumnValue(name string) (value string, err error)` | +| `.SetColumnValue` | Sets a new value of a specific data type to the column. The value assigned must be compatible with the PostgreSQL data type of the column. For example, it is allowed to assign an `int` value to an `INTEGER` column, but you cannot assign a `float` value to a `timestamptz` column. | `SetColumnValue(name string, v any) (bool, error)` | +| `.SetRawColumnValue` | Sets a new raw value for a column, inheriting the column's existing data type, without performing data type validation. This can lead to errors when restoring the dump if the assigned value is not compatible with the column type. To ensure compatibility, consider using the `.DecodeValueByColumn` function followed by `.SetColumnValue`, for example, `{{ "13" \| .DecodeValueByColumn "items_amount" \| .SetColumnValue "items_amount" }}`. 
| `.SetRawColumnValue(name string, value any) (err error)` | +| `.EncodeValueByColumn` | Encodes a value of any type into its raw string representation using the specified column name. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValueByColumn(name string, value any) (res any, err error)` | +| `.DecodeValueByColumn` | Decodes a value from its raw string representation to a Golang type using the specified column name. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.DecodeValueByColumn(name string, value any) (res any, err error)` | +| `.EncodeValueByType` | Encodes a value of any type into its string representation using the specified type name. Encoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.EncodeValueByType(name string, value any) (res any, err error)` | +| `.DecodeValueByType` | Decodes a value from its raw string representation to a Golang type using the specified type name. Decoding is performed through the PostgreSQL driver. Throws an error if types are incompatible. | `.DecodeValueByType(name string, value any) (res any, err error)` | + +## Example: Generate a random `created_at` and `updated_at` dates + +Below you can see the table structure: + +![img.png](../../assets/built_in_transformers/orders-schema.png) + +The goal is to modify the `"created_at"` and `"updated_at"` columns based on the following rules: + +* Do not change the value if the `created_at` is Null. +* If the `created_at` is not Null, generate the current time and use it as the minimum threshold for randomly + generating the `updated_at` value. +* Assign all generated values using the `.SetColumnValue` function. 
+ + +```yaml title="Template transformer example" +- name: "TemplateRecord" + params: + columns: + - "created_at" + - "updated_at" + template: > + {{ $val := .GetColumnValue "created_at" }} + {{ if isNotNull $val }} + {{ $createdAtValue := now }} + {{ $maxUpdatedDate := date_modify "24h" $createdAtValue }} + {{ $updatedAtValue := randomDate $createdAtValue $maxUpdatedDate }} + {{ .SetColumnValue "created_at" $createdAtValue }} + {{ .SetColumnValue "updated_at" $updatedAtValue }} + {{ end }} + validate: true +``` + +Expected result: + +| column name | original value | transformed | +|-------------|-------------------------------|-----------------------------| +| created_at | 2021-01-20 07:01:00.513325+00 | 2023-12-17 19:37:29.910054Z | +| updated_at | 2021-08-09 21:27:00.513325+00 | 2023-12-18 10:05:25.828498Z | diff --git a/docs/built_in_transformers/index.md b/docs/built_in_transformers/index.md new file mode 100644 index 00000000..31fda4ed --- /dev/null +++ b/docs/built_in_transformers/index.md @@ -0,0 +1,7 @@ +# About transformers + +Transformers in Greenmask are methods which are applied to obfuscate sensitive data. All Greenmask transformers are split into the following groups: + +- [Standard transformers](standard_transformers/index.md) — transformers that require only an input of parameters. +- [Advanced transformers](advanced_transformers/index.md) — transformers that can be modified according to user's needs with the help of [custom functions](advanced_transformers/custom_functions/index.md). +- Custom transformers — coming soon... diff --git a/docs/built_in_transformers/standard_transformers/cmd.md b/docs/built_in_transformers/standard_transformers/cmd.md new file mode 100644 index 00000000..ddf2025c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/cmd.md @@ -0,0 +1,115 @@ +Transform data via external program using `stdin` and `stdout` interaction. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|----------|--------------------| +| columns | A list of column names to be affected. If empty, the entire tuple is used. Read about the structure further. | | Yes | Any | +| executable | The path to the `executable` parameter file | | Yes | - | +| args | A list of parameters for the executable | | No | - | +| driver | The row driver with parameters that is used for interacting with cmd. `{"name": "text, csv, json", "params": { "format": "[text bytes]"}}`. The `text` and `bytes` options are available only for `json`. | `{"name": "csv"}` | No | - | +| validate | Performs a decoding operation using the PostgreSQL driver for data received from the command to ensure the data format is correct | `false` | No | - | +| timeout | Timeout for sending and receiving data from the external command | `2s` | No | - | +| expected_exit_code | The expected exit code on SIGTERM signal. If the exit code is unexpected, the transformation exits with an error. | `0` | No | - | +| skip_on_behaviour | Skips transformation call if one of the provided columns has a `null` value (`any`) or each of the provided columns has `null` values (`all`). This option works together with the `skip_on_null_input` parameter on columns. Possible values: `all`, `any`. | `all` | No | - | + +!!! warning + + The parameter `validate_output=true` may cause an error if the type does not have a PostgreSQL driver decoder implementation. Most of the types, such as `int`, `float`, `text`, `varchar`, `date`, `timestamp`, etc., have encoders and decoders, as well as inherited types like domain types based on them. 
+ +## Description + +The `Cmd` transformer allows you to send original data to an external program via `stdin` and receive transformed data from `stdout`. It supports various interaction formats such as `json`, `csv`, or plain `text` for one-column transformations. The interaction is performed line by line, so at the end of each sent data, a new line symbol `\n` must be included. + +### Types of interaction modes + +#### text + +Textual driver that is used only for one column transformation, thus you cannot provide here more than one column. The value encodes into string laterally. For example, `2023-01-03 01:00:00.0+03`. + +#### json + +JSON line driver. It has two formats that can be passed through `driver.params`: `[text|bytes]`. Use the `bytes` format for binary datatypes. Use the `text` format for non-binary datatypes and for those that can be represented as string literals. + +=== "The text format example" + + ```json + { + 1: { + "d": "some_value1", + "n": false, + }, + 2: { + "d": "some_value2", + "n": false, + } + } + ``` + +=== "The bytes format example" + + ```json + { + 1: { + "d": "aGVsbG8gd29ybHNeODcxMjE5MCUlJSUlJQ==", + "n": false, + }, + 2: { + "d": "aGVsbG8gd29ybHNeODcxMjE5MCUlJSUlJQ==", + "n": false, + } + } + ``` + +where: + +* Each line is a JSON line with a map of attribute numbers to their values +* `d` — the raw data represented as base64 encoding for the bytes format or Unicode text for the text format. The base64 encoding is needed because data can be binary. +* `n` — indicates if NULL is present + +#### csv + +CSV driver (comma-separated). The number of attributes is the same as the number of table columns, but the +columns that were not mentioned in the `columns` list are empty. The `NULL` value is represented as `\N`. Each attribute is escaped by a quote (`"`). 
For example, if the transformed table has attributes `id`, `title`, and `created_at`, and only `id` and `created_at` require transformation, then the CSV line will look as follows: + +``` csv title="csv line example" +"123","","2023-01-03 01:00:00.0+03" +``` + +Column object attributes: + +* `name` — the name of the column. This value is required. Depending on the attributes that follow further, this column may be used just as a value and is not affected in any way. +* `not_affected` — indicates whether the column is affected in the transformation. This attribute is required for the validation procedure when Greenmask is called with `greenmask dump --validate`. Setting `not_affected=true` can be helpful when the command transformer transforms data depending on the value of another column. For example, if you want to generate an `updated_at` column value depending on the `created_at` column value, you can set `created_at` to `not_affected=true`. The default value is `false`. +* `skip_original_data` — indicates whether the original data is required for the transformer. This attribute can be helpful for decreasing the interaction time. One use case is when the command works as a generator and returns the value without relying on the original data. The default value is `false`. +* `skip_on_null_input` — specifies whether to skip transformation when the original value is `null`. This attribute works in conjunction with the `skip_on_behaviour` parameter. For example, if you have two affected columns with `skip_on_null_input=true` and one column is `null`, then, if `skip_on_behaviour=any`, the transformation will be skipped, or, if `skip_on_behaviour=all`, the transformation will be performed. The default is `false`. + +## Example: Apply transformation performed by external command + +In the following example, `actual_arrival` and `scheduled_arrival` columns are transformed via external command transformer. 
+ +```yaml title="Cmd transformer example" +- name: "Cmd" + params: + executable: "cmd_test.sh" + driver: + name: "json" + params: + format: "bytes" # (4) + timeout: "60s" + validate_output: true # (1) + expected_exit_code: -1 + skip_on_behaviour: "any" # (2) + columns: + - name: "actual_arrival" + skip_original_data: true + skip_on_null_input: true # (3) + - name: "scheduled_arrival" + skip_original_data: true + skip_on_null_input: true # (3) +``` +{ .annotate } + +1. Validate the received data via decode procedure using the PostgreSQL driver. Note that this may cause an error if the type is not supported in the PostgreSQL driver. +2. Skip transformation (keep the values) if one of the affected columns (`not_affected=false`) has a null value. +3. If a column has a null value, then skip it. This works in conjunction with `skip_on_behaviour`. Since it has the value any, if one of the columns (`actual_arrival` or `scheduled_arrival`) has a `null` value, then skip the transformation call. +4. The format of JSON can be either `text` or `bytes`. diff --git a/docs/built_in_transformers/standard_transformers/dict.md b/docs/built_in_transformers/standard_transformers/dict.md new file mode 100644 index 00000000..35c18dbd --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/dict.md @@ -0,0 +1,37 @@ +Replace values matched by dictionary keys. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------------|----------------------------------------------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | any | +| values | Value replace mapping as in: `{"string": "string"}`. The string with value `"\N"` is considered NULL. | | No | - | +| default | Shown if no value has been matched with dict. The string with value `"\N"` is considered NULL. By default is empty. 
| | No | - | +| fail_not_matched | When no value is matched with the dict, fails the replacement process if set to `true`, or keeps the current value, if set to `false`. | `true` | No | - | +| validate | Performs the encode-decode procedure using column type to ensure that values have correct type | `true` | No | - | + +## Description + +The `Dict` transformer uses a user-provided key-value dictionary to replace values based on matches specified in the `values` parameter mapping. These provided values must align with the PostgreSQL type format. To validate the values format before application, you can utilize the `validate` parameter, triggering a decoding procedure via the PostgreSQL driver. + +If there are no matches by key, an error will be raised according to a default `fail_not_matched: true` parameter. You can change this behaviour by providing the `default` parameter, value from which will be shown in case of a missing match. + +In certain cases where the driver type does not support the validation operation, an error may occur. For setting or matching a NULL value, use a string with the `\N` sequence. + +## Example: Replace marital status + +The following example replaces marital status from `S` to `M` or from `M` to `S` and raises an error if there is no match: + +``` yaml title="Dict transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "Dict" + params: + column: "maritalstatus" + values: + "S": "M" + "M": "S" + validate: true + fail_not_matched: true +``` diff --git a/docs/built_in_transformers/standard_transformers/hash.md b/docs/built_in_transformers/standard_transformers/hash.md new file mode 100644 index 00000000..4944983e --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/hash.md @@ -0,0 +1,32 @@ +Generate a hash of the text value using the `Scrypt` hash function under the hood. `NULL` values are kept. 
 + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|---------------------------------------------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| function | Hash algorithm to obfuscate data. Can be any of `md5`, `sha1`, `sha256`, `sha512`. | `sha1` | No | - | +| max_length | Indicates whether to truncate the hash tail and specifies at what length. Can be any integer number, where `0` means "no truncation". | `0` | No | - | + +## Example: Generate hash from job title + +The following example generates a sha1 hash of the `jobtitle` value and truncates the result after the 10th character. + +```yaml title="Hash transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "Hash" + params: + column: "jobtitle" + function: "sha1" + max_length: 10 +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|--------------------------|-------------| +| jobtitle | Research and Development | Zpmfe8F+LV | + +``` diff --git a/docs/built_in_transformers/standard_transformers/index.md b/docs/built_in_transformers/standard_transformers/index.md new file mode 100644 index 00000000..ccd1ce68 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/index.md @@ -0,0 +1,56 @@ +# Standard transformers + +Standard transformers are ready-to-use methods that require no customization and need only an input of parameters. Below you can find an index of all standard transformers currently available in Greenmask. + +1. [Cmd](cmd.md) — transforms data via external program using `stdin` and `stdout` interaction. +1. [Dict](dict.md) — replaces values matched by dictionary keys. +1. [Hash](hash.md) — generates a hash of the text value. +1. 
[Masking](masking.md) — masks a value using one of the masking behaviors depending on your domain. +1. [NoiseDate](noise_date.md) — randomly adds or subtracts a duration within the provided ratio interval to the original date value. +1. [NoiseFloat](noise_float.md) — adds or subtracts a random fraction to the original float value. +1. [NoiseInt](noise_int.md) — adds or subtracts a random fraction to the original integer value. +1. [RandomBool](random_bool.md) — generates random boolean values. +1. [RandomChoice](random_choice.md) — replaces values randomly chosen from a provided list. +1. [RandomDate](random_date.md) — generates a random date in a specified interval. +1. [RandomFloat](random_float.md) — generates a random float within the provided interval. +1. [RandomInt](random_int.md) — generates a random integer within the provided interval. +1. [RandomString](random_string.md) — generates a random string using the provided characters within the specified length range. +1. [RandomUuid](random_uuid.md) — generates a random unique user ID. +1. [RandomLatitude](random_latitude.md) — generates a random latitude value. +1. [RandomLongitude](random_longitude.md) — generates a random longitude value. +1. [RandomUnixTime](random_unix_time.md) — generates a random Unix timestamp. +1. [RandomDayOfWeek](random_day_of_week.md) — generates a random day of the week. +1. [RandomDayOfMonth](random_day_of_month.md) — generates a random day of the month. +1. [RandomMonthName](random_month_name.md) — generates the name of a random month. +1. [RandomYearString](random_year_string.md) — generates a random year as a string. +1. [RandomCentury](random_century.md) — generates a random century. +1. [RandomTimezone](random_timezone.md) — generates a random timezone. +1. [RandomEmail](random_email.md) — generates a random email address. +1. [RandomUsername](random_username.md) — generates a random username. +1. [RandomPassword](random_password.md) — generates a random password. +1. 
[RandomMacAddress](random_mac_address.md) — generates a random MAC address. +1. [RandomDomainName](random_domain_name.md) — generates a random domain name. +1. [RandomURL](random_url.md) — generates a random URL. +1. [RandomIPv4](random_ipv4.md) — generates a random IPv4 address. +1. [RandomIPv6](random_ipv6.md) — generates a random IPv6 address. +1. [RandomWord](random_word.md) — generates a random word. +1. [RandomSentence](random_sentence.md) — generates a random sentence. +1. [RandomParagraph](random_paragraph.md) — generates a random paragraph. +1. [RandomCCType](random_cc_type.md) — generates a random credit card type. +1. [RandomCCNumber](random_cc_number.md) — generates a random credit card number. +1. [RandomCurrency](random_currency.md) — generates a random currency code. +1. [RandomAmountWithCurrency](random_amount_with_currency.md) — generates a random monetary amount with currency. +1. [RandomName](random_name.md) — generates a full random name. +1. [RandomLastName](random_last_name.md) — generates a random last name. +1. [RandomFirstName](random_first_name.md) — generates a random first name. +1. [RandomFirstNameMale](random_first_name_male.md) — generates a random male first name. +1. [RandomFirstNameFemale](random_first_name_female.md) — generates a random female first name. +1. [RandomTitleMale](random_title_male.md) — generates a random male title. +1. [RandomTitleFemale](random_title_female.md) — generates a random female title. +1. [RandomPhoneNumber](random_phone_number.md) — generates a random phone number. +1. [RandomTollFreePhoneNumber](random_toll_free_phone_number.md) — generates a random toll-free phone number. +1. [RandomE164PhoneNumber](random_e164_phone_number.md) — generates a random phone number in E.164 format. +1. [RealAddress](real_address.md) — generates a real address. +1. [RegexpReplace](regexp_replace.md) — replaces a string using a regular expression. +1. [Replace](replace.md) — replaces an original value by the provided one. 
 +1. [SetNull](set_null.md) — sets `NULL` value to the column. diff --git a/docs/built_in_transformers/standard_transformers/masking.md b/docs/built_in_transformers/standard_transformers/masking.md new file mode 100644 index 00000000..f240cef5 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/masking.md @@ -0,0 +1,48 @@ +Mask a value using one of the masking rules depending on your domain. `NULL` values are kept. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|--------|-----------------------------------------------------------------------------------------------------------------|-----------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| type | Data type of attribute (`default`, `password`, `name`, `addr`, `email`, `mobile`, `tel`, `id`, `credit`, `url`) | `default` | No | - | + +## Description + +The `Masking` transformer replaces characters with asterisk `*` symbols depending on the provided data type. If the +value is `NULL`, it is kept unchanged. It is based on [ggwhite/go-masker](https://github.com/ggwhite/go-masker) and +supports the following masking rules: + +| Type | Description | +|:-----------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| default | Returns `*` symbols with the same length, e.g. input: `test1234` output: `********` | +| name | Masks the second and the third letters in a word, e. g. input: `ABCD` output: `A**D` | +| password | Always returns `************` | +| address | Keeps first 6 letters, masks the rest, e. g. input: `Larnaca, makarios st` output: `Larnac*************` | +| email | Keeps a domain and the first 3 letters, masks the rest, e. g. input: `ggw.chang@gmail.com` output: `ggw****@gmail.com` | +| mobile | Masks 3 digits starting from the 4th digit, e. g. 
input: `0987654321` output: `0987***321` | +| telephone | Removes `(`, `)`, ` `, `-` chart, and masks last 4 digits of telephone number, then formats it to `(??)????-????`, e. g. input: `0227993078` output: `(02)2799-****` | +| id | Masks last 4 digits of ID number, e. g. input: `A123456789` output: `A12345****` | +| credit_cart | Masks 6 digits starting from the 7th digit, e. g. input `1234567890123456` output `123456******3456` | +| url | Masks the password part of the URL, if applicable, e. g. `http://admin:mysecretpassword@localhost:1234/uri` output: `http://admin:xxxxx@localhost:1234/uri` | + +## Example: Masking employee national ID number + +In the following example, the national ID number of an employee is masked. + +``` yaml title="Masking transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "Masking" + params: + column: "nationalidnumber" + type: "id" +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|------------------|----------------|-------------| +| nationalidnumber | 295847284 | 295847**** | +``` diff --git a/docs/built_in_transformers/standard_transformers/noise_date.md b/docs/built_in_transformers/standard_transformers/noise_date.md new file mode 100644 index 00000000..1307da43 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/noise_date.md @@ -0,0 +1,31 @@ +Randomly add or subtract a duration within the provided `ratio` interval to the original date value. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|----------|-------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|------------------------------| +| column | The name of the column to be affected | | Yes | date, timestamp, timestamptz | +| ratio | The maximum random duration for noise. The value must be in PostgreSQL interval format, e. g. 
`1 year 2 mons 3 day 04:05:06.07` | | Yes | - | +| truncate | Truncate the date to the specified part (`nano`, `second`, `minute`, `hour`, `day`, `month`, `year`). The truncate operation is not applied by default. | | No | - | + +## Description + +The `NoiseDate` transformer randomly generates duration within the specified `ratio` parameter and adds it to or +subtracts it from the original date value. The `ratio` parameter must be written in +the [PostgreSQL interval format](https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-INTERVAL-INPUT). You can also truncate the date up to a specified part by setting the `truncate` parameter. + +## Example: Adding noise to the modified date + +In the following example, the original `timestamp` value of `modifieddate` will be noised up to `1 year 2 months 3 days 4 hours 5 +minutes 6 seconds and 7 milliseconds` with truncation up to the `nano` part. + +``` yaml title="NoiseDate transformer example" +- schema: "humanresources" + name: "jobcandidate" + transformers: + - name: "NoiseDate" + params: + column: "modifieddate" + ratio: "1 year 2 mons 3 day 04:05:06.07" + truncate: "nano" +``` 
`0.1` means "add noise up to 10%" | | Yes | - | +| precision | The precision of the noised float value (number of digits after the decimal point) | `4` | No | - | + +## Description + +The `NoiseFloat` transformer multiplies the original float value by a provided random value that is not higher than +the `ratio` parameter and adds it to or subtracts it from the original value. Additionally, you can specify the number of decimal digits by using the `precision` parameter. + +## Example: Adding noise to the purchase price + +In this example, the original value of `standardprice` will be noised up to `50%` and rounded up to `2` decimals. + +``` yaml title="NoiseFloat transformer example" +- schema: "purchasing" + name: "productvendor" + transformers: + - name: "NoiseFloat" + params: + column: "standardprice" + ratio: 0.5 + precision: 2 +``` diff --git a/docs/built_in_transformers/standard_transformers/noise_int.md b/docs/built_in_transformers/standard_transformers/noise_int.md new file mode 100644 index 00000000..759e633f --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/noise_int.md @@ -0,0 +1,27 @@ +Add or subtract a random fraction to the original integer value. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|--------|------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | int2, int4, int8 | +| ratio | The maximum random percentage for noise, from `0` to `1` | | Yes | - | + +## Description + +The `NoiseInt` transformer multiplies the original integer value by a provided random value that is not higher than the +`ratio` parameter and adds it to or subtracts it from the original value. + +## Example: Noise vacation hours of an employee + +In the following example, the original value of `vacationhours` will be noised up to 40%. 
+ +``` yaml title="NoiseInt transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "NoiseInt" + params: + column: "vacationhours" + ratio: 0.4 +``` diff --git a/docs/built_in_transformers/standard_transformers/random_amount_with_currency.md b/docs/built_in_transformers/standard_transformers/random_amount_with_currency.md new file mode 100644 index 00000000..a70d3d8c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_amount_with_currency.md @@ -0,0 +1,28 @@ +The `RandomAmountWithCurrency` transformer is specifically designed to populate specified database columns with random financial amounts accompanied by currency codes. Ideal for applications requiring the simulation of financial transactions, this utility enhances the realism of financial datasets by introducing variability in amounts and currencies. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|----------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +This transformer automatically generates random financial amounts along with corresponding global currency codes (e. g., `250.00 USD`, `300.00 EUR`), injecting them into the designated database column. It provides a straightforward solution for populating financial records with varied and realistic data, suitable for testing payment systems, data anonymization, and simulation of economic models. + +## Example: Populate the `payments` table with random amounts and currencies + +This example shows how to configure the `RandomAmountWithCurrency` transformer to populate the `payment_details` column in the `payments` table with random amounts and currencies. 
It is an effective approach to simulating a diverse range of payment transactions. + +```yaml title="RandomAmountWithCurrency transformer example" +- schema: "public" + name: "payments" + transformers: + - name: "RandomAmountWithCurrency" + params: + column: "payment_details" + keep_null: false +``` + +In this setup, the `payment_details` column will be updated with random financial amounts and currency codes for each entry, replacing any existing non-NULL values. The `keep_null` parameter, when set to `true`, ensures that existing NULL values in the column remain unchanged, preserving the integrity of records without specified payment details. diff --git a/docs/built_in_transformers/standard_transformers/random_bool.md b/docs/built_in_transformers/standard_transformers/random_bool.md new file mode 100644 index 00000000..89f51f07 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_bool.md @@ -0,0 +1,26 @@ +Generate random boolean values. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | bool | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | + +## Description + +The `RandomBool` transformer generates a random boolean value. The behaviour for NULL values can be +configured using the `keep_null` parameter. + +## Example: Generate a random boolean for a column + +In the following example, the `RandomBool` transformer generates a random boolean value for the `salariedflag` column. 
+ +``` yaml title="RandomBool transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "RandomBool" + params: + column: "salariedflag" +``` diff --git a/docs/built_in_transformers/standard_transformers/random_cc_number.md b/docs/built_in_transformers/standard_transformers/random_cc_number.md new file mode 100644 index 00000000..fe2bb128 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_cc_number.md @@ -0,0 +1,28 @@ +The `RandomCCNumber` transformer is specifically designed to populate specified database columns with random credit card numbers. This utility is crucial for applications that involve simulating financial data, testing payment systems, or anonymizing real credit card numbers in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +By leveraging algorithms capable of generating plausible credit card numbers that adhere to standard credit card validation rules (such as the Luhn algorithm), the `RandomCCNumber` transformer injects random credit card numbers into the designated database column. This approach ensures the generation of credit card numbers that are realistic for testing and development purposes, without compromising real-world applicability and security. + +## Example: Populate random credit card numbers for the `payment_information` table + +This example demonstrates configuring the `RandomCCNumber` transformer to populate the `cc_number` column in the `payment_information` table with random credit card numbers. It is an effective strategy for creating a realistic set of payment data for application testing or data anonymization. 
+ +```yaml title="RandomCCNumber transformer example" +- schema: "public" + name: "payment_information" + transformers: + - name: "RandomCCNumber" + params: + column: "cc_number" + keep_null: false +``` + +With this setup, the `cc_number` column will be updated with random credit card numbers for each entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, it will ensure that existing NULL values in the column are preserved, maintaining the integrity of records where credit card information is not applicable or available. diff --git a/docs/built_in_transformers/standard_transformers/random_cc_type.md b/docs/built_in_transformers/standard_transformers/random_cc_type.md new file mode 100644 index 00000000..d09b10b0 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_cc_type.md @@ -0,0 +1,28 @@ +The `RandomCCType` transformer is designed to populate specified database columns with random credit card types. This tool is essential for applications that require the simulation of financial transaction data, testing payment processing systems, or anonymizing credit card type information in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Utilizing a predefined list of credit card types (e.g., VISA, MasterCard, American Express, Discover), the `RandomCCType` transformer injects random credit card type names into the designated database column. This feature allows for the creation of realistic and varied financial transaction datasets by simulating a range of credit card types without using real card data. 
+ +## Example: Populate random credit card types for the `transactions` table + +This example shows how to configure the `RandomCCType` transformer to populate the `card_type` column in the `transactions` table with random credit card types. It is a straightforward method for simulating diverse payment methods across transactions. + +```yaml title="RandomCCType transformer example" +- schema: "public" + name: "transactions" + transformers: + - name: "RandomCCType" + params: + column: "card_type" + keep_null: false +``` + +In this configuration, the `card_type` column will be updated with random credit card types for each entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, maintaining the integrity of records where card type information is not applicable. diff --git a/docs/built_in_transformers/standard_transformers/random_century.md b/docs/built_in_transformers/standard_transformers/random_century.md new file mode 100644 index 00000000..e683bd45 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_century.md @@ -0,0 +1,28 @@ +The `RandomCentury` transformer is crafted to populate specified database columns with random century values. It is ideal for applications that require historical data simulation, such as generating random years within specific centuries for historical databases, testing datasets with temporal dimensions, or anonymizing dates in historical research data. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomCentury` transformer utilizes an algorithm to generate random century values. Each value represents a century (e.g., `19th`, `20th`, `21st`), providing a broad temporal range that can be used to enhance datasets requiring a distribution across different historical periods without the need for precise date information. + +## Example: Populate random centuries for the `historical_artifacts` table + +This example shows how to configure the `RandomCentury` transformer to populate the `century` column in a `historical_artifacts` table with random century values, adding an element of variability and historical context to the dataset. + +```yaml title="RandomCentury transformer example" +- schema: "public" + name: "historical_artifacts" + transformers: + - name: "RandomCentury" + params: + column: "century" + keep_null: false +``` + +In this setup, the `century` column will be filled with random century values, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, then existing NULL values in the column will remain untouched, preserving the original dataset's integrity where no temporal data is available. diff --git a/docs/built_in_transformers/standard_transformers/random_choice.md b/docs/built_in_transformers/standard_transformers/random_choice.md new file mode 100644 index 00000000..19d5f3fb --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_choice.md @@ -0,0 +1,33 @@ +Replace values randomly chosen from a provided list.
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|-----------------------------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | any | +| values | A list of values in any format. The string with value `\N` is considered NULL. | | Yes | - | +| validate | Performs a decoding procedure via the PostgreSQL driver using the column type to ensure that values have correct type | `true` | No | | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | | + +## Description + +The `RandomChoice` transformer replaces one randomly chosen value from the list provided in the `values` parameter. You +can use the `validate` parameter to ensure that values are correct before applying the transformation. The behaviour for NULL values can be configured using the `keep_null` parameter. + +## Example: Choosing randomly from provided dates + +In this example, the provided values undergo validation through PostgreSQL driver decoding, and one value is randomly +chosen from the list. + +```yaml title="RandomChoice transformer example" + - schema: "humanresources" + name: "jobcandidate" + transformers: + - name: "RandomChoice" + params: + column: "modifieddate" + validate: true + values: + - "2023-12-21 07:41:06.891" + - "2023-12-21 07:41:06.896" +``` diff --git a/docs/built_in_transformers/standard_transformers/random_currency.md b/docs/built_in_transformers/standard_transformers/random_currency.md new file mode 100644 index 00000000..aae5e83c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_currency.md @@ -0,0 +1,28 @@ +The `RandomCurrency` transformer is tailored to populate specified database columns with random currency codes. 
This tool is highly beneficial for applications involving the simulation of international financial data, testing currency conversion features, or anonymizing currency information in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Utilizing a comprehensive list of global currency codes (e.g., USD, EUR, JPY), the `RandomCurrency` transformer injects random currency codes into the designated database column. This feature allows for the creation of diverse and realistic financial transaction datasets by simulating a variety of currencies without relying on actual financial data. + +## Example: Populate random currency codes for the `transactions` table + +This example outlines configuring the `RandomCurrency` transformer to populate the `currency_code` column in a `transactions` table with random currency codes. It is an effective way to simulate international transactions across multiple currencies. + +```yaml title="RandomCurrency transformer example" +- schema: "public" + name: "transactions" + transformers: + - name: "RandomCurrency" + params: + column: "currency_code" + keep_null: false +``` + +In this configuration, the `currency_code` column will be updated with random currency codes for each entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where currency data may not be applicable. 
diff --git a/docs/built_in_transformers/standard_transformers/random_date.md b/docs/built_in_transformers/standard_transformers/random_date.md new file mode 100644 index 00000000..d8c796f5 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_date.md @@ -0,0 +1,43 @@ +Generate a random date in a specified interval. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|-----------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|------------------------------| +| column | Name of the column to be affected | | Yes | date, timestamp, timestamptz | +| min | The minimum threshold date for the random value. The format depends on the column type. | | Yes | - | +| max | The maximum threshold date for the random value. The format depends on the column type. | | Yes | - | +| truncate | Truncate the date to the specified part (`nano`, `second`, `minute`, `hour`, `day`, `month`, `year`). The truncate operation is not applied by default. | | No | - | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | + +## Description + +The `RandomDate` transformer generates a random date within the provided interval, starting from `min` to `max`. It +can also perform date truncation up to the specified part of the date. The format of dates in the `min` and `max` +parameters must adhere to PostgreSQL types, including `DATE`, `TIMESTAMP WITHOUT TIMEZONE`, +or `TIMESTAMP WITH TIMEZONE`. The behaviour for NULL values can be configured using the `keep_null` parameter. + +## Example: Generate `modifieddate` + +In the following example, a random timestamp without timezone is generated for the `modifieddate` column within the range from +`2011-05-31 00:00:00` to `2013-05-31 00:00:00`, and the part of the random value after `day` is truncated. 
+ +``` yaml title="RandomDate transformer example" +- schema: "sales" + name: "salesorderdetail" + transformers: + - name: "RandomDate" + params: + column: "modifieddate" + keep_null: false + min: "2011-05-31 00:00:00" + max: "2013-05-31 00:00:00" + truncate: "day" +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|--------------|---------------------|---------------------| +| modifieddate | 2007-06-23 00:00:00 | 2005-12-08 00:00:00 | +``` diff --git a/docs/built_in_transformers/standard_transformers/random_day_of_month.md b/docs/built_in_transformers/standard_transformers/random_day_of_month.md new file mode 100644 index 00000000..65471716 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_day_of_month.md @@ -0,0 +1,28 @@ +The `RandomDayOfMonth` transformer is designed to populate specified database columns with random day-of-the-month values. It is particularly useful for scenarios requiring the simulation of dates, such as generating random event dates, user sign-up dates, or any situation where the specific day of the month is needed without reference to the actual month or year. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar, int2, int4, int8, numeric | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Utilizing the `faker` library, the `RandomDayOfMonth` transformer generates random numerical values representing days of the month, ranging from 1 to 31. This allows for the easy insertion of random but plausible day-of-the-month data into a database, enhancing realism or anonymizing actual dates. 
+ +## Example: Populate random days of the month for the `events` table + +This example illustrates how to configure the `RandomDayOfMonth` transformer to fill the `event_day` column in the `events` table with random day-of-the-month values, facilitating the simulation of varied event scheduling. + +```yaml title="RandomDayOfMonth transformer example" +- schema: "public" + name: "events" + transformers: + - name: "RandomDayOfMonth" + params: + column: "event_day" + keep_null: false +``` + +With this setup, the `event_day` column will be updated with random day-of-the-month values, replacing any existing non-NULL values. Setting `keep_null` to `true` ensures that NULL values in the column are left unchanged, maintaining any existing gaps in the data. diff --git a/docs/built_in_transformers/standard_transformers/random_day_of_week.md b/docs/built_in_transformers/standard_transformers/random_day_of_week.md new file mode 100644 index 00000000..3caae1ee --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_day_of_week.md @@ -0,0 +1,28 @@ +The `RandomDayOfWeek` transformer is specifically designed to fill specified database columns with random day-of-the-week names. It is particularly useful for applications that require simulated weekly schedules, random event planning, or any scenario where the day of the week is relevant but the specific date is not. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Utilizing the `faker` library, the `RandomDayOfWeek` transformer generates names of days (e.g., Monday, Tuesday) at random.
This transformer can be applied to any text or varchar column in a database, introducing variability and realism into data sets that need to represent days of the week in a non-specific manner. + +## Example: Populate random days of the week for the `work_schedule` table + +This example demonstrates configuring the `RandomDayOfWeek` transformer to populate the `work_day` column in the `work_schedule` table with random days of the week. This setup can help simulate a diverse range of work schedules without tying them to specific dates. + +```yaml title="RandomDayOfWeek transformer example" +- schema: "public" + name: "work_schedule" + transformers: + - name: "RandomDayOfWeek" + params: + column: "work_day" + keep_null: false +``` + +In this configuration, every entry in the `work_day` column will be updated with a random day of the week, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, then existing NULL values within the column will remain unchanged. diff --git a/docs/built_in_transformers/standard_transformers/random_domain_name.md b/docs/built_in_transformers/standard_transformers/random_domain_name.md new file mode 100644 index 00000000..9850ccd6 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_domain_name.md @@ -0,0 +1,28 @@ +The `RandomDomainName` transformer is designed to populate specified database columns with random domain names. This tool is invaluable for simulating web data, testing applications that interact with domain names, or anonymizing real domain information in datasets. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +By leveraging an algorithm or library capable of generating believable domain names, the `RandomDomainName` transformer introduces random domain names into the specified database column. Each generated domain name includes a second-level domain (SLD) and a top-level domain (TLD), such as "example.com" or "website.org," providing a wide range of plausible web addresses for database enrichment. + +## Example: Populate random domain names for the `websites` table + +This example demonstrates configuring the `RandomDomainName` transformer to populate the `domain` column in the `websites` table with random domain names. This approach facilitates the creation of a diverse and realistic set of web addresses for testing, simulation, or data anonymization purposes. + +```yaml title="RandomDomainName transformer example" +- schema: "public" + name: "websites" + transformers: + - name: "RandomDomainName" + params: + column: "domain" + keep_null: false +``` + +In this setup, the `domain` column will be updated with random domain names for each entry, replacing any existing non-NULL values. If `keep_null` is set to `true`, the transformer will preserve existing NULL values in the column, maintaining the integrity of data where domain information is not applicable. 
diff --git a/docs/built_in_transformers/standard_transformers/random_e164_phone_number.md b/docs/built_in_transformers/standard_transformers/random_e164_phone_number.md new file mode 100644 index 00000000..aeb9cf90 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_e164_phone_number.md @@ -0,0 +1,28 @@ +The `RandomE164PhoneNumber` transformer is developed to populate specified database columns with random E.164 phone numbers. This tool is essential for applications requiring the simulation of contact information, testing phone number validation systems, or anonymizing phone number data in datasets while focusing on E.164 numbers. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomE164PhoneNumber` transformer utilizes algorithms capable of generating random E.164 phone numbers with the standard international format and injects them into the designated database column. This feature allows for the creation of diverse and realistic contact information in datasets for development, testing, or data anonymization purposes. + +## Example: Populate random E.164 phone numbers for the `contact_information` table + +This example demonstrates configuring the `RandomE164PhoneNumber` transformer to populate the `phone_number` column in the `contact_information` table with random E.164 phone numbers. It is an effective method for simulating a variety of contact information entries with E.164 numbers. 
+ +```yaml title="RandomE164PhoneNumber transformer example" +- schema: "public" + name: "contact_information" + transformers: + - name: "RandomE164PhoneNumber" + params: + column: "phone_number" + keep_null: false +``` + +In this configuration, the `phone_number` column will be updated with random E.164 phone numbers for each contact information entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where E.164 phone number information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_email.md b/docs/built_in_transformers/standard_transformers/random_email.md new file mode 100644 index 00000000..128413e7 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_email.md @@ -0,0 +1,28 @@ +The `RandomEmail` transformer is designed to populate specified database columns with random email addresses. This transformer is especially useful for applications requiring the simulation of user contact data, testing email functionalities, or anonymizing real user email addresses in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Leveraging a method or library capable of generating plausible email address strings, the `RandomEmail` transformer injects random email addresses into the specified database column. It generates email addresses with varied domains and user names, offering a realistic range of email patterns suitable for filling user tables, contact lists, or any other dataset requiring email addresses without utilizing real user data. 
+ +## Example: Populate random email addresses for the `users` table + +This example illustrates configuring the `RandomEmail` transformer to populate the `email` column in the `users` table with random email addresses, thereby simulating a diverse user base without exposing real contact information. + +```yaml title="RandomEmail transformer example" +- schema: "public" + name: "users" + transformers: + - name: "RandomEmail" + params: + column: "email" + keep_null: false +``` + +In this setup, the `email` column will receive random email addresses for each entry, replacing any existing non-NULL values. If `keep_null` is set to `true`, then the transformer will preserve existing NULL values, maintaining the integrity of the original dataset where email information is absent. diff --git a/docs/built_in_transformers/standard_transformers/random_first_name.md b/docs/built_in_transformers/standard_transformers/random_first_name.md new file mode 100644 index 00000000..f184e7f7 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_first_name.md @@ -0,0 +1,28 @@ +The `RandomFirstName` transformer is designed to populate specified database columns with random first names. This tool is indispensable for applications requiring the simulation of user profiles, testing user registration systems, or anonymizing user data in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomFirstName` transformer utilizes a comprehensive list of first names to inject random first names into the designated database column. 
This feature allows for the creation of diverse and realistic user profiles by simulating a variety of first names without using real user data. + +## Example: Populate random first names for the `user_profiles` table + +This example demonstrates configuring the `RandomFirstName` transformer to populate the `first_name` column in the `user_profiles` table with random first names. It is an effective method for simulating a variety of user profiles with diverse first names. + +```yaml title="RandomFirstName transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomFirstName" + params: + column: "first_name" + keep_null: false +``` + +In this configuration, the `first_name` column will be updated with random first names for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where first name information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_first_name_female.md b/docs/built_in_transformers/standard_transformers/random_first_name_female.md new file mode 100644 index 00000000..0921a2e6 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_first_name_female.md @@ -0,0 +1,28 @@ +The `RandomFirstNameFemale` transformer is designed to populate specified database columns with random female first names. This tool is crucial for applications requiring the simulation of user profiles, testing gender-specific features, or anonymizing user data in datasets while focusing on female names. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomFirstNameFemale` transformer utilizes a comprehensive list of female first names to inject random female first names into the designated database column. This feature allows for the creation of diverse and realistic user profiles with a focus on female names without using real user data. + +## Example: Populate random female first names for the `user_profiles` table + +This example demonstrates configuring the `RandomFirstNameFemale` transformer to populate the `first_name` column in the `user_profiles` table with random female first names. It is an effective method for simulating a variety of user profiles with diverse female first names. + +```yaml title="RandomFirstNameFemale transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomFirstNameFemale" + params: + column: "first_name" + keep_null: false +``` + +In this configuration, the `first_name` column will be updated with random female first names for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where female first name information is not applicable or provided. 
diff --git a/docs/built_in_transformers/standard_transformers/random_first_name_male.md b/docs/built_in_transformers/standard_transformers/random_first_name_male.md new file mode 100644 index 00000000..fc25b1fc --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_first_name_male.md @@ -0,0 +1,28 @@ +The `RandomFirstNameMale` transformer is developed to populate specified database columns with random male first names. This tool is essential for applications requiring the simulation of user profiles, testing gender-specific features, or anonymizing user data in datasets while focusing on male names. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|---------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomFirstNameMale` transformer utilizes a comprehensive list of male first names to inject random male first names into the designated database column. This feature allows for the creation of diverse and realistic user profiles with a focus on male names without using real user data. + +## Example: Populate random male first names for the `user_profiles` table + +This example demonstrates configuring the `RandomFirstNameMale` transformer to populate the `first_name` column in the `user_profiles` table with random male first names. It is an effective method for simulating a variety of user profiles with diverse male first names. 
+ +```yaml title="RandomFirstNameMale transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomFirstNameMale" + params: + column: "first_name" + keep_null: false +``` + +In this configuration, the `first_name` column will be updated with random male first names for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where male first name information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_float.md b/docs/built_in_transformers/standard_transformers/random_float.md new file mode 100644 index 00000000..c0db5b70 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_float.md @@ -0,0 +1,40 @@ +Generate a random float within the provided interval. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|----------------------------------------------------------------------------------------|---------|----------|---------------------------------------------------| +| column | The name of the column to be affected | | Yes | float4 (real), float8 (double precision), numeric | +| min | The minimum threshold for the random value. The value range depends on the column type. | | Yes | - | +| max | The maximum threshold for the random value. The value range depends on the column type. | | Yes | - | +| precision | The precision of the random float value (number of digits after the decimal point) | `4` | No | - | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | + +## Description + +The `RandomFloat` transformer generates a random float value within the provided interval, starting from `min` to +`max`, with the option to specify the number of decimal digits by using the `precision` parameter. 
The behaviour for NULL values can be configured using the `keep_null` parameter. + +## Example: Generate random price + +In this example, the `RandomFloat` transformer generates random prices in the range from `0.1` to `7000` while +maintaining a precision of up to 2 digits. + +``` yaml title="RandomFloat transformer example" +- schema: "sales" + name: "salesorderdetail" + transformers: + - name: "RandomFloat" + params: + column: "unitprice" + min: 0.1 + max: 7000 + precision: 2 +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|----------------|-------------| +| unitprice | 2024.994 | 6806.5 | +``` diff --git a/docs/built_in_transformers/standard_transformers/random_int.md b/docs/built_in_transformers/standard_transformers/random_int.md new file mode 100644 index 00000000..71db66fb --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_int.md @@ -0,0 +1,37 @@ +Generate a random integer within the provided interval. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|----------------------------------------------------------------------------------------|---------|----------|-----------------------------------------------------| +| column | The name of the column to be affected | | Yes | int2 (smallint), int4 (int), int8 (bigint), numeric | +| min | The minimum threshold for the random value. The value range depends on the column type. | | Yes | - | +| max | The maximum threshold for the random value. The value range depends on the column type. | | Yes | - | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | + +## Description + +The `RandomInt` transformer generates a random integer within the specified `min` and `max` thresholds. The behaviour for NULL values can be configured using the `keep_null` parameter. 
+ +## Example: Generate random item quantity + +In the following example, the `RandomInt` transformer generates a random value in the range from `1` to `30` and assigns it to +the `orderqty` column. + +``` yaml title="RandomInt transformer example" +- schema: "sales" + name: "salesorderdetail" + transformers: + - name: "RandomInt" + params: + column: "orderqty" + min: 1 + max: 30 +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|----------------|-------------| +| orderqty | 1 | 8 | +``` diff --git a/docs/built_in_transformers/standard_transformers/random_ipv4.md b/docs/built_in_transformers/standard_transformers/random_ipv4.md new file mode 100644 index 00000000..321554c6 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_ipv4.md @@ -0,0 +1,28 @@ +The `RandomIPv4` transformer is designed to populate specified database columns with random IPv4 addresses. This utility is essential for applications requiring the simulation of network data, testing systems that utilize IP addresses, or anonymizing real IP addresses in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|----------|----------|---------------------| +| column | The name of the column to be affected | | Yes | text, varchar, inet | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Utilizing a robust algorithm or library for generating IPv4 address strings, the `RandomIPv4` transformer injects random IPv4 addresses into the designated database column. Each generated address follows the standard IPv4 format, consisting of four octets separated by dots (e.g., "192.168.1.1"), ensuring a wide range of plausible network addresses for various use cases.
+ +## Example: Populate random IPv4 addresses for the `network_logs` table + +This example shows how to configure the `RandomIPv4` transformer to populate the `source_ip` column in the `network_logs` table with random IPv4 addresses, simulating diverse network traffic sources for analysis or testing purposes. + +```yaml title="RandomIPv4 transformer example" +- schema: "public" + name: "network_logs" + transformers: + - name: "RandomIPv4" + params: + column: "source_ip" + keep_null: false +``` + +With this setup, the `source_ip` column will be updated with random IPv4 addresses for each entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, it will ensure that existing NULL values in the column are preserved, accommodating scenarios where IP address data may be intentionally omitted. diff --git a/docs/built_in_transformers/standard_transformers/random_ipv6.md b/docs/built_in_transformers/standard_transformers/random_ipv6.md new file mode 100644 index 00000000..0226b98c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_ipv6.md @@ -0,0 +1,28 @@ +The `RandomIPv6` transformer is engineered to populate specified database columns with random IPv6 addresses. This tool is particularly useful for simulating modern network environments, testing systems that operate with IPv6 addresses, or anonymizing datasets containing real IPv6 addresses. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|---------------------| +| column | The name of the column to be affected | | Yes | text, varchar, inet | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Employing advanced algorithms or libraries capable of generating IPv6 address strings, the `RandomIPv6` transformer introduces random IPv6 addresses into the specified database column. 
IPv6 addresses, represented as eight groups of four hexadecimal digits separated by colons (e.g., "2001:0db8:85a3:0000:0000:8a2e:0370:7334"), provide a vast range of possible addresses, reflecting the extensive addressing capacity of the IPv6 standard.
+
+## Example: Populate random IPv6 addresses for the `devices` table
+
+This example illustrates configuring the `RandomIPv6` transformer to populate the `device_ip` column in the `devices` table with random IPv6 addresses, enhancing the dataset with a broad spectrum of network addresses for development, testing, or data protection purposes.
+
+```yaml title="RandomIPv6 transformer example"
+- schema: "public"
+  name: "devices"
+  transformers:
+    - name: "RandomIPv6"
+      params:
+        column: "device_ip"
+        keep_null: false
+```
+
+This configuration ensures that the `device_ip` column receives random IPv6 addresses for each entry, replacing any existing non-NULL values. Setting the `keep_null` parameter to `true` allows for the preservation of existing NULL values within the column, maintaining the integrity of records where IP address information is not applicable or available.
diff --git a/docs/built_in_transformers/standard_transformers/random_last_name.md b/docs/built_in_transformers/standard_transformers/random_last_name.md
new file mode 100644
index 00000000..8fd8aba9
--- /dev/null
+++ b/docs/built_in_transformers/standard_transformers/random_last_name.md
@@ -0,0 +1,28 @@
+The `RandomLastName` transformer is developed to populate specified database columns with random last names. This tool is essential for applications requiring the simulation of user profiles, testing user registration systems, or anonymizing user data in datasets.
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomLastName` transformer utilizes a comprehensive list of last names to inject random last names into the designated database column. This feature allows for the creation of diverse and realistic user profiles by simulating a variety of last names without using real user data. + +## Example: Populate random last names for the `user_profiles` table + +This example demonstrates configuring the `RandomLastName` transformer to populate the `last_name` column in the `user_profiles` table with random last names. It is an effective method for simulating a variety of user profiles with diverse last names. + +```yaml title="RandomLastName transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomLastName" + params: + column: "last_name" + keep_null: false +``` + +In this configuration, the `last_name` column will be updated with random last names for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where last name information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_latitude.md b/docs/built_in_transformers/standard_transformers/random_latitude.md new file mode 100644 index 00000000..a9222587 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_latitude.md @@ -0,0 +1,28 @@ +The `RandomLatitude` transformer generates random latitude values for specified database columns. 
It is designed to support geographical data enhancements, particularly useful for applications requiring randomized but plausible geographical coordinates. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | float4, float8, numeric | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomLatitude` transformer utilizes the `faker` library to produce random latitude values within the range of -90 to +90 degrees. This transformer can be applied to columns designated to store geographical latitude information, enhancing data sets with randomized latitude coordinates. + +## Example: Populate random latitude for the `locations` table + +This example demonstrates configuring the `RandomLatitude` transformer to populate the `latitude` column in the `locations` table with random latitude values. + +```yaml title="RandomLatitude transformer example" +- schema: "public" + name: "locations" + transformers: + - name: "RandomLatitude" + params: + column: "latitude" + keep_null: false +``` + +With this configuration, the `latitude` column will be filled with random latitude values, replacing any existing non-NULL values. If `keep_null` is set to `true`, existing NULL values will be preserved. 
diff --git a/docs/built_in_transformers/standard_transformers/random_longitude.md b/docs/built_in_transformers/standard_transformers/random_longitude.md new file mode 100644 index 00000000..9a80dc39 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_longitude.md @@ -0,0 +1,28 @@ +The `RandomLongitude` transformer is designed to generate random longitude values for specified database columns, enhancing datasets with realistic geographic coordinates suitable for a wide range of applications, from testing location-based services to anonymizing real geographic data. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | float4, float8, numeric | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomLongitude` transformer leverages the `faker` library to produce random longitude values within the globally accepted range of -180 to +180 degrees. This flexibility allows the transformer to be applied to any column intended for storing longitude data, providing a simple yet powerful tool for introducing randomized longitude coordinates into a database. + +## Example: Populate random longitude for the `locations` table + +This example shows how to use the `RandomLongitude` transformer to fill the `longitude` column in the `locations` table with random longitude values. + +```yaml title="RandomLongitude transformer example" +- schema: "public" + name: "locations" + transformers: + - name: "RandomLongitude" + params: + column: "longitude" + keep_null: false +``` + +This setup ensures that all entries in the `longitude` column receive a random longitude value, replacing any existing non-NULL values. 
If `keep_null` is set to `true`, then existing NULL values in the column will remain unchanged.
diff --git a/docs/built_in_transformers/standard_transformers/random_mac_address.md b/docs/built_in_transformers/standard_transformers/random_mac_address.md
new file mode 100644
index 00000000..7982800e
--- /dev/null
+++ b/docs/built_in_transformers/standard_transformers/random_mac_address.md
@@ -0,0 +1,29 @@
+The `RandomMacAddress` transformer is developed to populate specified database columns with random MAC (Media Access Control) addresses. This transformer is particularly useful for simulating network hardware data, testing applications that process MAC addresses, or anonymizing real network device identifiers in datasets.
+
+## Parameters
+
+| Name      | Description                                        | Default | Required | Supported DB types               |
+|-----------|----------------------------------------------------|---------|----------|----------------------------------|
+| column    | The name of the column to be affected              |         | Yes      | text, varchar, macaddr, macaddr8 |
+| keep_null | Indicates whether NULL values should be preserved  | `false` | No       | -                                |
+
+## Description
+
+Utilizing a sophisticated algorithm or library for generating MAC address strings, the `RandomMacAddress` transformer injects random MAC addresses into the designated database column. Each generated MAC address follows the standard format of six groups of two hexadecimal digits, separated by colons (e.g., "01:23:45:67:89:ab"), ensuring plausible values for network device simulations.
+
+## Example: Populate random MAC addresses for the `network_devices` table
+
+This example shows how to configure the `RandomMacAddress` transformer to populate the `mac_address` column in
+a `network_devices` table with random MAC addresses, enhancing the realism of simulated network device data.
+ +```yaml title="RandomMacAddress transformer example" +- schema: "public" + name: "network_devices" + transformers: + - name: "RandomMacAddress" + params: + column: "mac_address" + keep_null: false +``` + +With this configuration, every entry in the `mac_address` column will be assigned a random MAC address, replacing any existing non-NULL values. Setting the `keep_null` parameter to `true` allows the preservation of existing NULL values within the column, accommodating scenarios where MAC address data may be intentionally absent. diff --git a/docs/built_in_transformers/standard_transformers/random_month_name.md b/docs/built_in_transformers/standard_transformers/random_month_name.md new file mode 100644 index 00000000..deccd673 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_month_name.md @@ -0,0 +1,28 @@ +The `RandomMonthName` transformer is crafted to populate specified database columns with random month names. This transformer is especially useful for scenarios requiring the simulation of time-related data, such as user birth months or event months, without relying on specific date values. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomMonthName` transformer utilizes the `faker` library to generate the names of months at random. It can be applied to any textual column in a database to introduce variety and realism into data sets that require representations of months without the need for specific calendar dates. 
+ +## Example: Populate random month names for the `user_profiles` table + +This example demonstrates how to configure the `RandomMonthName` transformer to fill the `birth_month` column in the `user_profiles` table with random month names, adding a layer of diversity to user data without using actual birthdates. + +```yaml title="RandomMonthName transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomMonthName" + params: + column: "birth_month" + keep_null: false +``` + +With this setup, the `birth_month` column will be updated with random month names, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, then existing NULL values within the column will remain untouched. diff --git a/docs/built_in_transformers/standard_transformers/random_name.md b/docs/built_in_transformers/standard_transformers/random_name.md new file mode 100644 index 00000000..e2e86fbd --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_name.md @@ -0,0 +1,28 @@ +The `RandomName` transformer is designed to populate specified database columns with random full names, including both first names and last names. This tool is indispensable for applications requiring the simulation of user profiles, testing user registration systems, or anonymizing user data in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomName` transformer utilizes a comprehensive list of first names and last names to inject random full names into the designated database column. 
This feature allows for the creation of diverse and realistic user profiles by simulating a variety of full names without using real user data. + +## Example: Populate random full names for the `user_profiles` table + +This example demonstrates configuring the `RandomName` transformer to populate the name column in the `user_profiles` table with random full names. It is an effective method for simulating a variety of user profiles with diverse full names. + +```yaml title="RandomName transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomName" + params: + column: "name" + keep_null: false +``` + +In this configuration, the `name` column will be updated with random full names for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where full name information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_paragraph.md b/docs/built_in_transformers/standard_transformers/random_paragraph.md new file mode 100644 index 00000000..3050378b --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_paragraph.md @@ -0,0 +1,28 @@ +The `RandomParagraph` transformer is crafted to populate specified database columns with random paragraphs. This utility is indispensable for applications that require the generation of extensive textual content, such as simulating articles, enhancing textual datasets for NLP systems, or anonymizing textual content in databases. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Employing sophisticated text generation algorithms or libraries, the `RandomParagraph` transformer generates random paragraphs, injecting them into the designated database column. This transformer is designed to create varied and plausible paragraphs that simulate real-world textual content, providing a valuable tool for database enrichment, testing, and anonymization. + +## Example: Populate random paragraphs for the `articles` table + +This example illustrates configuring the `RandomParagraph` transformer to populate the `body` column in an `articles` table with random paragraphs. It is an effective way to simulate diverse article content for development, testing, or demonstration purposes. + +```yaml title="RandomParagraph transformer example" +- schema: "public" + name: "articles" + transformers: + - name: "RandomParagraph" + params: + column: "body" + keep_null: false +``` + +With this setup, the `body` column will receive random paragraphs for each entry, replacing any existing non-NULL values. Setting the `keep_null` parameter to `true` allows for the preservation of existing NULL values within the column, maintaining the integrity of records where article content is not applicable or provided. 
diff --git a/docs/built_in_transformers/standard_transformers/random_password.md b/docs/built_in_transformers/standard_transformers/random_password.md new file mode 100644 index 00000000..9fd6b483 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_password.md @@ -0,0 +1,28 @@ +The `RandomPassword` transformer is designed to populate specified database columns with random passwords. This utility is vital for applications that require the simulation of secure user data, testing systems with authentication mechanisms, or anonymizing real passwords in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Employing sophisticated password generation algorithms or libraries, the `RandomPassword` transformer injects random passwords into the designated database column. This feature is particularly useful for creating realistic and secure user password datasets for development, testing, or demonstration purposes. + +## Example: Populate random passwords for the `user_accounts` table + +This example demonstrates how to configure the `RandomPassword` transformer to populate the `password` column in the `user_accounts` table with random passwords. + +```yaml title="RandomPassword transformer example" +- schema: "public" + name: "user_accounts" + transformers: + - name: "RandomPassword" + params: + column: "password" + keep_null: false +``` + +In this configuration, every entry in the `password` column will be updated with a random password. Setting the `keep_null` parameter to `true` will preserve existing NULL values in the column, accommodating scenarios where password data may not be applicable. 
diff --git a/docs/built_in_transformers/standard_transformers/random_phone_number.md b/docs/built_in_transformers/standard_transformers/random_phone_number.md new file mode 100644 index 00000000..3b4c659c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_phone_number.md @@ -0,0 +1,28 @@ +The `RandomPhoneNumber` transformer is developed to populate specified database columns with random phone numbers. This tool is essential for applications requiring the simulation of contact information, testing phone number validation systems, or anonymizing phone number data in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomPhoneNumber` transformer utilizes algorithms capable of generating random phone numbers with various formats and injects them into the designated database column. This feature allows for the creation of diverse and realistic contact information in datasets for development, testing, or data anonymization purposes. + +## Example: Populate random phone numbers for the `contact_information` table + +This example demonstrates configuring the `RandomPhoneNumber` transformer to populate the `phone_number` column in the `contact_information` table with random phone numbers. It is an effective method for simulating a variety of contact information entries. 
+ +```yaml title="RandomPhoneNumber transformer example" +- schema: "public" + name: "contact_information" + transformers: + - name: "RandomPhoneNumber" + params: + column: "phone_number" + keep_null: false +``` + +In this configuration, the `phone_number` column will be updated with random phone numbers for each contact information entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where phone number information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_sentence.md b/docs/built_in_transformers/standard_transformers/random_sentence.md new file mode 100644 index 00000000..56e8f68c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_sentence.md @@ -0,0 +1,28 @@ +The `RandomSentence` transformer is designed to populate specified database columns with random sentences. Ideal for simulating natural language text for user comments, testing NLP systems, or anonymizing textual data in databases. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomSentence` transformer employs complex text generation algorithms or libraries to generate random sentences, injecting them into a designated database column without the need for specifying sentence length. This flexibility ensures the creation of varied and plausible text for a wide range of applications. 
+ +## Example: Populate random sentences for the `comments` table + +This example shows how to configure the `RandomSentence` transformer to populate the `comment` column in the `comments` table with random sentences. It is a straightforward method for simulating diverse user-generated content. + +```yaml title="RandomSentence transformer example" +- schema: "public" + name: "comments" + transformers: + - name: "RandomSentence" + params: + column: "comment" + keep_null: false +``` + +In this configuration, the `comment` column will be updated with random sentences for each entry, replacing any existing non-NULL values. If `keep_null` is set to `true`, existing NULL values in the column will be preserved, maintaining the integrity of records where comments are not applicable. diff --git a/docs/built_in_transformers/standard_transformers/random_string.md b/docs/built_in_transformers/standard_transformers/random_string.md new file mode 100644 index 00000000..8384c7e3 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_string.md @@ -0,0 +1,40 @@ +Generate a random string using the provided characters within the specified length range. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------------------------------|--------------------------------------------------------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| min_length | The minimum length of the generated string | | Yes | - | +| max_length | The maximum length of the generated string | | Yes | - | +| symbols | The range of characters that can be used in the random string | `abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ` | No | - | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | + +## Description + +The `RandomString` transformer generates a random string with a length between `min_length` and `max_length` using the +characters specified in the symbols string as the possible set of characters. The behaviour for NULL values can be configured using the `keep_null` parameter. + +## Example: Generate a random string for `accountnumber` + +In the following example, a random string is generated for the `accountnumber` column with a length range from `9` to `12`. The +character set used for generation includes `1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ`. 
+
+``` yaml title="RandomString transformer example"
+- schema: "purchasing"
+  name: "vendor"
+  transformers:
+    - name: "RandomString"
+      params:
+        column: "accountnumber"
+        min_length: 9
+        max_length: 12
+        symbols: "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+```
+
+```bash title="Expected result"
+
+| column name   | original value | transformed |
+|---------------|----------------|-------------|
+| accountnumber | AUSTRALI0001   | 96B82A65548 |
+```
diff --git a/docs/built_in_transformers/standard_transformers/random_timezone.md b/docs/built_in_transformers/standard_transformers/random_timezone.md
new file mode 100644
index 00000000..e5456c37
--- /dev/null
+++ b/docs/built_in_transformers/standard_transformers/random_timezone.md
@@ -0,0 +1,28 @@
+The `RandomTimezone` transformer is designed to populate specified database columns with random timezone strings. This transformer is particularly useful for applications that require the simulation of global user data, testing of timezone-related functionalities, or anonymizing real user timezone information in datasets.
+
+## Parameters
+
+| Name      | Description                                          | Default | Required | Supported DB types |
+|-----------|------------------------------------------------------|---------|----------|--------------------|
+| column    | The name of the column to be affected                |         | Yes      | text, varchar      |
+| keep_null | Indicates whether NULL values should be preserved    | `false` | No       | -                  |
+
+## Description
+
+Utilizing a comprehensive library or algorithm for generating timezone data, the `RandomTimezone` transformer provides random timezone strings (e.g., "America/New_York", "Europe/London") for database columns. This feature enables the creation of diverse and realistic datasets by simulating timezone information for user profiles, event timings, or any other data requiring timezone context.
+
+## Example: Populate random timezone strings for the `user_accounts` table
+
+This example demonstrates how to configure the `RandomTimezone` transformer to populate the `timezone` column in the `user_accounts` table with random timezone strings, enhancing the dataset with varied global user representations.
+
+```yaml title="RandomTimezone transformer example"
+- schema: "public"
+  name: "user_accounts"
+  transformers:
+    - name: "RandomTimezone"
+      params:
+        column: "timezone"
+        keep_null: false
+```
+
+With this configuration, every entry in the `timezone` column will be updated with a random timezone string, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values within the column will remain unchanged, preserving the integrity of rows without specified timezone data.
diff --git a/docs/built_in_transformers/standard_transformers/random_title_female.md b/docs/built_in_transformers/standard_transformers/random_title_female.md
new file mode 100644
index 00000000..cea669c7
--- /dev/null
+++ b/docs/built_in_transformers/standard_transformers/random_title_female.md
@@ -0,0 +1,28 @@
+The `RandomTitleFemale` transformer is designed to populate specified database columns with random female titles. This tool is crucial for applications that require the simulation of user profiles, testing gender-specific features, or anonymizing user data in datasets.
+
+## Parameters
+
+| Name       | Description                                           | Default | Required | Supported DB types |
+|------------|-------------------------------------------------------|---------|----------|--------------------|
+| column     | The name of the column to be affected                 |         | Yes      | text, varchar      |
+| keep_null  | Indicates whether NULL values should be preserved     | `false` | No       | -                  |
+
+## Description
+
+The `RandomTitleFemale` transformer utilizes a predefined list of female titles (e.g., Mrs., Dr., Prof.) to inject random female titles into the designated database column.
This feature allows for the creation of diverse and realistic user profiles by simulating a variety of female titles without using real user data. + +## Example: Populate random female titles for the `user_profiles` table + +This example demonstrates configuring the `RandomTitleFemale` transformer to populate the `title` column in the `user_profiles` table with random female titles. It is an effective method for simulating a variety of user profiles with female titles. + +```yaml title="RandomTitleFemale transformer example" +- schema: "public" + name: "user_profiles" + transformers: + - name: "RandomTitleFemale" + params: + column: "title" + keep_null: false +``` + +In this configuration, the `title` column will be updated with random female titles for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where title information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_title_male.md b/docs/built_in_transformers/standard_transformers/random_title_male.md new file mode 100644 index 00000000..59e668b3 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_title_male.md @@ -0,0 +1,28 @@ +The `RandomTitleMale` transformer is developed to populate specified database columns with random male titles. This tool is essential for applications that require the simulation of user profiles, testing gender-specific features, or anonymizing user data in datasets. 
+
+## Parameters
+
+| Name       | Description                                           | Default | Required | Supported DB types |
+|------------|-------------------------------------------------------|---------|----------|--------------------|
+| column     | The name of the column to be affected                 |         | Yes      | text, varchar      |
+| keep_null  | Indicates whether NULL values should be preserved     | `false` | No       | -                  |
+
+## Description
+
+The `RandomTitleMale` transformer utilizes a predefined list of male titles (e.g., Mr., Dr., Prof.) to inject random male titles into the designated database column. This feature allows for the creation of diverse and realistic user profiles by simulating a variety of male titles without using real user data.
+
+## Example: Populate random male titles for the `user_profiles` table
+
+This example outlines configuring the `RandomTitleMale` transformer to populate the `title` column in a `user_profiles` table with random male titles. It is a straightforward method for simulating a variety of user profiles with male titles.
+
+```yaml title="RandomTitleMale transformer example"
+- schema: "public"
+  name: "user_profiles"
+  transformers:
+    - name: "RandomTitleMale"
+      params:
+        column: "title"
+        keep_null: false
+```
+
+In this configuration, the `title` column will be updated with random male titles for each user profile entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where title information is not applicable or provided.
diff --git a/docs/built_in_transformers/standard_transformers/random_toll_free_phone_number.md b/docs/built_in_transformers/standard_transformers/random_toll_free_phone_number.md new file mode 100644 index 00000000..832a7a2c --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_toll_free_phone_number.md @@ -0,0 +1,28 @@ +The `RandomTollFreePhoneNumber` transformer is designed to populate specified database columns with random toll-free phone numbers. This tool is essential for applications requiring the simulation of contact information, testing phone number validation systems, or anonymizing phone number data in datasets while focusing on toll-free numbers. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomTollFreePhoneNumber` transformer utilizes algorithms capable of generating random toll-free phone numbers with various formats and injects them into the designated database column. This feature allows for the creation of diverse and realistic toll-free contact information in datasets for development, testing, or data anonymization purposes. + +## Example: Populate random toll-free phone numbers for the `contact_information` table + +This example demonstrates configuring the `RandomTollFreePhoneNumber` transformer to populate the `phone_number` column in the `contact_information` table with random toll-free phone numbers. It is an effective method for simulating a variety of contact information entries with toll-free numbers. 
+ +```yaml title="RandomTollFreePhoneNumber transformer example" +- schema: "public" + name: "contact_information" + transformers: + - name: "RandomTollFreePhoneNumber" + params: + column: "phone_number" + keep_null: false +``` + +In this configuration, the `phone_number` column will be updated with random toll-free phone numbers for each contact information entry, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, existing NULL values in the column will be preserved, ensuring the integrity of records where toll-free phone number information is not applicable or provided. diff --git a/docs/built_in_transformers/standard_transformers/random_unix_time.md b/docs/built_in_transformers/standard_transformers/random_unix_time.md new file mode 100644 index 00000000..cdf2c4fa --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_unix_time.md @@ -0,0 +1,28 @@ +The `RandomUnixTime` transformer generates random Unix time values (timestamps) for specified database columns. It is particularly useful for populating columns with timestamp data, simulating time-related data, or anonymizing actual timestamps in a dataset. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | int4, int8, numeric | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomUnixTime` transformer uses the `faker` library to generate random Unix timestamps. Unix time, also known as POSIX time or Epoch time, is a system for describing points in time, defined as the number of seconds elapsed since midnight Coordinated Universal Time (UTC) of January 1, 1970, not counting leap seconds. 
This transformer allows for the generation of timestamps that can represent any moment from the Epoch to the present or even into the future, depending on the range of the `faker` library's implementation. + +## Example: Populate random timestamps for the `registration_dates` table + +This example configures the `RandomUnixTime` transformer to apply random Unix timestamps to the `registration_date` column in a `users` table, simulating user registration times. + +```yaml title="RandomUnixTime transformer example" +- schema: "public" + name: "users" + transformers: + - name: "RandomUnixTime" + params: + column: "registration_date" + keep_null: false +``` + +In this configuration, every entry in the `registration_date` column is assigned a random Unix timestamp, replacing any existing non-NULL values. Setting `keep_null` to `true` would ensure that NULL values in the column are left unchanged. diff --git a/docs/built_in_transformers/standard_transformers/random_url.md b/docs/built_in_transformers/standard_transformers/random_url.md new file mode 100644 index 00000000..a48d38b7 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_url.md @@ -0,0 +1,28 @@ +The `RandomURL` transformer is designed to populate specified database columns with random URL (Uniform Resource Locator) addresses. This tool is highly beneficial for simulating web content, testing applications that require URL input, or anonymizing real web addresses in datasets. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +Utilizing advanced algorithms or libraries for generating URL strings, the `RandomURL` transformer injects random, plausible URLs into the designated database column. Each generated URL is structured to include the protocol (e. g., "http://", "https://"), domain name, and path, offering a realistic range of web addresses for various applications. + +## Example: Populate random URLs for the `webpages` table + +This example illustrates how to configure the `RandomURL` transformer to populate the `page_url` column in a `webpages` table with random URLs, providing a broad spectrum of web addresses for testing or data simulation purposes. + +```yaml title="RandomURL transformer example" +- schema: "public" + name: "webpages" + transformers: + - name: "RandomURL" + params: + column: "page_url" + keep_null: false +``` + +With this configuration, the `page_url` column will be filled with random URLs for each entry, replacing any existing non-NULL values. Setting the `keep_null` parameter to `true` allows for the preservation of existing NULL values within the column, accommodating scenarios where URL data may be intentionally omitted. diff --git a/docs/built_in_transformers/standard_transformers/random_username.md b/docs/built_in_transformers/standard_transformers/random_username.md new file mode 100644 index 00000000..73e367fb --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_username.md @@ -0,0 +1,28 @@ +The `RandomUsername` transformer is crafted to populate specified database columns with random usernames. 
This utility is crucial for applications that require the simulation of user data, testing systems with user login functionality, or anonymizing real usernames in datasets. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +By employing sophisticated algorithms or libraries capable of generating believable usernames, the `RandomUsername` transformer introduces random usernames into the specified database column. Each generated username is designed to be unique and plausible, incorporating a mix of letters, numbers, and possibly special characters, depending on the generation logic used. + +## Example: Populate random usernames for the `user_accounts` table + +This example demonstrates configuring the `RandomUsername` transformer to populate the `username` column in a `user_accounts` table with random usernames. This setup is ideal for creating a diverse and realistic user base for development, testing, or demonstration purposes. + +```yaml title="RandomUsername transformer example" +- schema: "public" + name: "user_accounts" + transformers: + - name: "RandomUsername" + params: + column: "username" + keep_null: false +``` + +In this configuration, every entry in the `username` column will be updated with a random username, replacing any existing non-NULL values. If the `keep_null` parameter is set to `true`, then the transformer will preserve existing NULL values within the column, maintaining data integrity where usernames are not applicable or available. 
diff --git a/docs/built_in_transformers/standard_transformers/random_uuid.md b/docs/built_in_transformers/standard_transformers/random_uuid.md new file mode 100644 index 00000000..f3baa8d2 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_uuid.md @@ -0,0 +1,33 @@ +Generate random unique user ID using version 4. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------------------------------|---------|----------|---------------------| +| column | The name of the column to be affected | | Yes | text, varchar, uuid | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | + +## Description + +The `RandomUuid` transformer generates a random UUID. The behaviour for NULL values can be configured using the `keep_null` parameter. + +## Example: Updating the `rowguid` column + +The following example replaces original UUID values of the `rowguid` column to randomly generated ones. + +``` yaml title="RandomUuid transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "RandomUuid" + params: + column: "rowguid" + keep_null: false +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|--------------------------------------|--------------------------------------| +| rowguid | f01251e5-96a3-448d-981e-0f99d789110d | 0211629f-d197-4187-8a87-095ec4f51977 | +``` diff --git a/docs/built_in_transformers/standard_transformers/random_word.md b/docs/built_in_transformers/standard_transformers/random_word.md new file mode 100644 index 00000000..78be7024 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_word.md @@ -0,0 +1,28 @@ +The `RandomWord` transformer populates specified database columns with random words. 
Ideal for simulating textual content, enhancing linguistic datasets, or anonymizing text in databases. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|------------|-------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomWord` transformer employs a mechanism to inject random words into a designated database column, supporting the generation of linguistically plausible and contextually diverse text. This transformer is particularly beneficial for creating rich text datasets for development, testing, or educational purposes without specifying the language, focusing on versatility and ease of use. + +## Example: Populate random words for the `content` table + +This example demonstrates configuring the `RandomWord` transformer to populate the `tag` column in the `content` table with random words. It is a straightforward approach to adding varied textual data for tagging or content categorization. + +```yaml title="RandomWord transformer example" +- schema: "public" + name: "content" + transformers: + - name: "RandomWord" + params: + column: "tag" + keep_null: false +``` + +In this setup, the `tag` column will be updated with random words for each entry, replacing any existing non-NULL values. If `keep_null` is set to `true`, existing NULL values in the column will remain unchanged, maintaining data integrity for records where textual data is not applicable. 
diff --git a/docs/built_in_transformers/standard_transformers/random_year_string.md b/docs/built_in_transformers/standard_transformers/random_year_string.md new file mode 100644 index 00000000..9d95bbcd --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/random_year_string.md @@ -0,0 +1,28 @@ +The `RandomYearString` transformer is designed to populate specified database columns with random year strings. It is ideal for scenarios that require the representation of years without specific dates, such as manufacturing years of products, birth years of users, or any other context where only the year is relevant. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar, int2, int4, int8, numeric | +| keep_null | Indicates whether NULL values should be preserved | `false` | No | - | + +## Description + +The `RandomYearString` transformer leverages the `faker` library to generate strings representing random years. This allows for the easy generation of year data in a string format, adding versatility and realism to datasets that need to simulate or anonymize year-related information. + +## Example: Populate random year strings for the `products` table + +This example shows how to use the `RandomYearString` transformer to fill the `manufacturing_year` column in the `products` table with random year strings, simulating the diversity of manufacturing dates. + +```yaml title="RandomYearString transformer example" +- schema: "public" + name: "products" + transformers: + - name: "RandomYearString" + params: + column: "manufacturing_year" + keep_null: false +``` + +In this configuration, the `manufacturing_year` column will be populated with random year strings, replacing any existing non-NULL values. 
If `keep_null` is set to `true`, then existing NULL values in the column will be preserved. diff --git a/docs/built_in_transformers/standard_transformers/real_address.md b/docs/built_in_transformers/standard_transformers/real_address.md new file mode 100644 index 00000000..54bb543e --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/real_address.md @@ -0,0 +1,45 @@ +Generates real addresses for specified database columns using the `faker` library. It supports customization of the generated address format through Go templates. + +## Parameters + +| Name | Properties | Description | Default | Required | Supported DB types | +|---------|------------|--------------------------------------------------------------------------------------|---------|----------|--------------------| +| columns | | Specifies the affected column names along with additional properties for each column | | Yes | Various | +| ∟ | name | The name of the column to be affected | | Yes | string | +| ∟ | template | A Go template string for formatting real address attributes | | Yes | string | +| ∟ | keep_null | Indicates whether NULL values should be preserved | | No | bool | + +### Template value descriptions + +The `template` parameter allows for the injection of real address attributes into a customizable template. The following values can be included in your template: + +- `{{.Address}}` — street address or equivalent +- `{{.City}}` — city name +- `{{.State}}` — state, province, or equivalent region name +- `{{.PostalCode}}` — postal or ZIP code +- `{{.Latitude}}` — geographic latitude +- `{{.Longitude}}` — geographic longitude + +These placeholders can be combined and formatted as desired within the template string to generate custom address formats. + +## Description + +The `RealAddress` transformer uses the `faker` library to generate realistic addresses, which can then be formatted according to a specified template and applied to selected columns in a database. 
It allows for the generated addresses to replace existing values or to preserve NULL values, based on the transformer's configuration. + +## Example: Generate Real addresses for the `employee` table + +This example shows how to configure the `RealAddress` transformer to generate real addresses for the `address` column in the `employee` table, using a custom format. + +```yaml title="RealAddress transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "RealAddress" + params: + columns: + - name: "address" + template: "{{.Address}}, {{.City}}, {{.State}} {{.PostalCode}}" + keep_null: false +``` + +This configuration will generate real addresses with the format "Street address, city, state postal code" and apply them to the `address` column, replacing any existing non-NULL values. diff --git a/docs/built_in_transformers/standard_transformers/regexp_replace.md b/docs/built_in_transformers/standard_transformers/regexp_replace.md new file mode 100644 index 00000000..36f16f86 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/regexp_replace.md @@ -0,0 +1,42 @@ +Replace a string using a regular expression. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|---------|-----------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | text, varchar | +| regexp | The regular expression pattern to search for in the column's value | | Yes | - | +| replace | The replacement value. This value may be replaced with a captured group from the `regexp` parameter. | | Yes | - | + +## Description + +The `RegexpReplace` transformer replaces a string according to the applied regular expression. The valid regular expressions syntax is the same as the general syntax used by Perl, Python, and other languages. 
To be precise, it is the syntax accepted by RE2 and described in the [Golang documentation](https://golang.org/s/re2syntax), except for `\C`. + +## Example: Removing leading prefix from `loginid` column value + +In the following example, the original values from `loginid` matching the `adventure-works\{{ id_name }}` format are replaced with `{{ id_name }}`. + +``` yaml title="RegexpReplace transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "RegexpReplace" + params: + column: "loginid" + regexp: "adventure-works\\\\(.*)" + replace: "$1" +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|----------------------|-------------| +| loginid | adventure-works\ken0 | ken0 | +``` + +!!! note + + YAML has control symbols, and using them without escaping may result in an error. In the example above, the prefix + of `id` is separated by the `\` symbol. Since this symbol is a control symbol, we must escape it using `\\`. + However, the '\' symbol is also a control symbol for regular expressions, which is why we need to + double-escape it as `\\\\`. diff --git a/docs/built_in_transformers/standard_transformers/replace.md b/docs/built_in_transformers/standard_transformers/replace.md new file mode 100644 index 00000000..f4579e62 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/replace.md @@ -0,0 +1,39 @@ +Replace an original value by the provided one. 
+ +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|-----------|--------------------------------------------------------------------------------------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | any | +| value | The value to replace with | | Yes | - | +| keep_null | Indicates whether NULL values should be replaced with transformed values or not | `true` | No | - | +| validate | Performs a decoding procedure via the PostgreSQL driver using the column type to ensure that values have correct type | `true` | No | - | + +## Description + +The `Replace` transformer replaces an original value from the specified column with the provided one. It can optionally run a validation check with the `validate` parameter to ensure that the values are of a correct type before starting transformation. The behaviour for NULL values can be configured using the `keep_null` parameter. + +## Example: Updating the `jobtitle` column + +In the following example, the provided `value: "programmer"` is first validated through driver decoding. If the current value of the +`jobtitle` column is not `NULL`, it will be replaced with `programmer`. If the current value is `NULL`, it will +remain `NULL`.
+ +``` yaml title="Replace transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "Replace" + params: + column: "jobtitle" + value: "programmer" + keep_null: false + validate: true +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|-------------------------|-------------| +| jobtitle | Chief Executive Officer | programmer | +``` diff --git a/docs/built_in_transformers/standard_transformers/set_null.md b/docs/built_in_transformers/standard_transformers/set_null.md new file mode 100644 index 00000000..4fdd2314 --- /dev/null +++ b/docs/built_in_transformers/standard_transformers/set_null.md @@ -0,0 +1,45 @@ +Set `NULL` value to a column. + +## Parameters + +| Name | Description | Default | Required | Supported DB types | +|--------|-----------------------------------------------------|---------|----------|--------------------| +| column | The name of the column to be affected | | Yes | any | + +## Description + +The `SetNull` transformer assigns `NULL` value to a column. This transformer generates warning if the affected column has `NOT NULL` constraint. 
+ +```json title="NULL constraint violation warning" +{ + "hash": "5a229ee964a4ba674a41a4d63dab5a8c", + "meta": { + "ColumnName": "jobtitle", + "ConstraintType": "NotNull", + "ParameterName": "column", + "SchemaName": "humanresources", + "TableName": "employee", + "TransformerName": "SetNull" + }, + "msg": "transformer may produce NULL values but column has NOT NULL constraint", + "severity": "warning" +} +``` + +## Example: Set NULL value to `jobtitle` column + +``` yaml title="SetNull transformer example" +- schema: "humanresources" + name: "employee" + transformers: + - name: "SetNull" + params: + column: "jobtitle" +``` + +```bash title="Expected result" + +| column name | original value | transformed | +|-------------|-------------------------|-------------| +| jobtitle | Chief Executive Officer | NULL | +``` diff --git a/docs/commands.md b/docs/commands.md new file mode 100644 index 00000000..58d43f0e --- /dev/null +++ b/docs/commands.md @@ -0,0 +1,773 @@ +# Commands + +## Introduction + +```shell title="Greenmask available commands" +greenmask \ + --log-format=[json|text] \ + --log-level=[debug|info|error] \ + --config=config.yml \ + [dump|list-dumps|delete|list-transformers|show-transformer|restore|show-dump] +``` + +You can use the following commands within Greenmask: + +* `dump` — initiates the data dumping process +* `list-dumps` — lists all available dumps stored in the system +* `delete` — deletes a specific dump from the storage +* `list-transformers` — displays a list of available transformers along with their documentation +* `show-transformer` — displays information about the specified transformer +* `restore` — restores data to the target database either by specifying a `dumpId` or using the latest available dump +* `show-dump` — provides metadata information about a particular dump, offering insights into its structure and attributes + +For any of the commands mentioned above, you can include the following common flags: + +* `--log-format` —
specifies the desired format for log output, which can be either `json` or `text`. This parameter is optional, with the default format set to `text`. +* `--log-level` — sets the desired level for log output, which can be one of `debug`, `info`, or `error`. This parameter is optional, with the default log level being `info`. +* `--config` — requires the specification of a configuration file in YAML format. This configuration file is mandatory for Greenmask to operate correctly. +* `--help` — displays comprehensive help information for Greenmask, providing guidance on its usage and available commands. + +## validate + +The `validate` command allows you to perform a validation procedure and compare data transformations. +Below is a list of all supported flags for the `validate` command: + +```text +Usage: + greenmask validate [flags] + +Flags: + --data perform test dump for --rows-limit rows and print it pretty + --diff find difference between original and transformed data + --format string format of table output. possible values [horizontal|vertical] (default "horizontal") + --rows-limit uint check tables dump only for specific tables (default 10) + --table strings check tables dump only for specific tables +``` + +You can use the `--table` flag multiple times to specify the tables you want to check. Tables can be written with or without schema names (e. g., `public.table_name` or `table_name`). If you specify multiple tables from different schemas, an error will be thrown. 
+ +To start validation, use the following command: + +```shell +greenmask --config=config.yml dump --validate +``` + +```text title="Validation output example" +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"scheduled_departure","ConstraintDef":"CHECK (scheduled_arrival \u003e scheduled_departure)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Check","ParameterName":"column","SchemaName":"bookings","TableName":"flights","TransformerName":"RandomDate"},"msg":"possible constraint violation: column has Check constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"scheduled_departure","ConstraintDef":"UNIQUE (flight_no, scheduled_departure)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Unique","ParameterName":"column","SchemaName":"bookings","TableName":"flights","TransformerName":"RandomDate"},"msg":"possible constraint violation: column is involved into Unique constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"scheduled_arrival","ConstraintDef":"CHECK (scheduled_arrival \u003e scheduled_departure)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Check","ParameterName":"column","SchemaName":"bookings","TableName":"flights","TransformerName":"NoiseDate"},"msg":"possible constraint violation: column has Check constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"departure_airport","ConstraintDef":"FOREIGN KEY (departure_airport) REFERENCES airports_data(airport_code)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"ForeignKey","ParameterName":"column","SchemaName":"bookings","TableName":"flights","TransformerName":"RegexpReplace"},"msg":"possible constraint violation: column is involved into ForeignKey constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN 
ValidationWarning={"meta":{"ColumnName":"status","ConstraintDef":"CHECK (status::text = ANY (ARRAY['On Time'::character varying::text, 'Delayed'::character varying::text, 'Departed'::character varying::text, 'Arrived'::character varying::text, 'Scheduled'::character varying::text, 'Cancelled'::character varying::text]))","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Check","ParameterName":"column","SchemaName":"bookings","TableName":"flights","TransformerName":"RegexpReplace"},"msg":"possible constraint violation: column has Check constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"post_code","ParameterName":"column","SchemaName":"bookings","TableName":"flights","TransformerName":"Replace","TypeName":"column"},"msg":"transformer may produce NULL values but column type has NOT NULL constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"Column":"actual_arrival","ColumnMaxLength":-1,"ConstraintType":"Length","Parameter":"column_b","SchemaName":"bookings","TableName":"flights","TransformerMaxLength":0,"TransformerName":"TwoDatesGen"},"msg":"transformer value might be out of length range: column has a length","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"actual_arrival","ConstraintDef":"CHECK (actual_arrival IS NULL OR actual_departure IS NOT NULL AND actual_arrival IS NOT NULL AND actual_arrival \u003e actual_departure)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Check","ParameterName":"column_b","SchemaName":"bookings","TableName":"flights","TransformerName":"TwoDatesGen"},"msg":"possible constraint violation: column has Check constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN 
ValidationWarning={"meta":{"Column":"scheduled_arrival","ConstraintType":"NotNull","Parameter":"column_a","SchemaName":"bookings","TableName":"flights","TransformerName":"TwoDatesGen"},"msg":"transformer may produce NULL values but column has NOT NULL constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"Column":"scheduled_arrival","ColumnMaxLength":-1,"ConstraintType":"Length","Parameter":"column_a","SchemaName":"bookings","TableName":"flights","TransformerMaxLength":0,"TransformerName":"TwoDatesGen"},"msg":"transformer value might be out of length range: column has a length","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"scheduled_arrival","ConstraintDef":"CHECK (scheduled_arrival \u003e scheduled_departure)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Check","ParameterName":"column_a","SchemaName":"bookings","TableName":"flights","TransformerName":"TwoDatesGen"},"msg":"possible constraint violation: column has Check constraint","severity":"warning"} +2023-10-30T11:19:28+02:00 WRN ValidationWarning={"meta":{"ColumnName":"range","ConstraintDef":"CHECK (range \u003e 0)","ConstraintName":"bookings","ConstraintSchema":"bookings","ConstraintType":"Check","ParameterName":"column","SchemaName":"bookings","TableName":"aircrafts_data","TransformerName":"NoiseInt"},"msg":"possible constraint violation: column has Check constraint","severity":"warning"} +``` + +The validation output will provide detailed information about potential constraint violations and schema issues. Each line contains nested JSON data under the `ValidationWarning` key, offering insights into the affected part of the configuration and potential constraint violations. 
+ +```json title="Pretty formatted validation warning" +{ + "meta": { // (1) + "ColumnName": "status", // (2) + "ConstraintDef": "CHECK (status::text = ANY (ARRAY['On Time'::character varying::text, 'Delayed'::character varying::text, 'Departed'::character varying::text, 'Arrived'::character varying::text, 'Scheduled'::character varying::text, 'Cancelled'::character varying::text]))", // (3) + "ConstraintName": "bookings", // (4) + "ConstraintSchema": "bookings", // (5) + "ConstraintType": "Check", // (6) + "SchemaName": "bookings", // (7) + "TableName": "flights", // (8) + "TransformerName": "RegexpReplace", // (9) + "ParameterName": "column" // (10) + }, + "msg": "possible constraint violation: column has Check constraint", // (11) + "severity": "warning" // (12) +} +``` +{ .annotate } + +1. **Detailed metadata**. The validation output provides comprehensive metadata to pinpoint the source of problems. +2. **Column name** indicates the name of the affected column. +3. **Constraint definition** specifies the definition of the constraint that may be violated. +4. **Constraint name** identifies the name of the constraint that is potentially violated. +5. **Constraint schema name** indicates the schema in which the constraint is defined. +6. **Type of constraint** represents the type of constraint and can be one of the following: + ``` + * ForeignKey + * Check + * NotNull + * PrimaryKey + * PrimaryKeyReferences + * Unique + * Length + * Exclusion + * TriggerConstraint + ``` +7. **Table schema name** specifies the schema name of the affected table. +8. **Table name** identifies the name of the table where the problem occurs. +9. **Transformer name** indicates the name of the transformer responsible for the transformation. +10. **Name of affected parameter** typically, this is the name of the column parameter that is relevant to the validation warning. +11. 
**Validation warning description** provides a detailed description of the validation warning and the reason behind it. +12. **Severity of validation warning** indicates the severity level of the validation warning and can be one of the following: + ``` + * error + * warning + * info + * debug + ``` + +!!! note + + A validation warning with a severity level of `"error"` is considered critical and must be addressed before the dump operation can proceed. Failure to resolve such warnings will prevent the dump operation from being executed. + +Example of validation diff in vertical format (`--format=vertical`): + +![img.png](assets/validate_vertical_diff.png) + +The validation diff is presented in a neatly formatted table. In this table: + +* Columns that are affected by the transformation are highlighted with a red background. +* The pre-transformation values are displayed in green. +* The post-transformation values are shown in red. + +The result can be displayed in either **horizontal** or **vertical** format, making it easy to visualize and understand the differences between the original and transformed data. + +Example of validation diff in horizontal format (`--format=horizontal`): + +![img.png](assets/validate_horizontal_diff.png) + +In horizontal format, the validation diff is displayed with two lines for each entry: **line 1** represents +the **original data**, and **line 2** represents the **transformed data**. The color highlighting behavior +remains consistent with the vertical format, where affected columns are highlighted in red, and original and transformed values are displayed in green and red, respectively. This format allows for a side-by-side comparison of the original and transformed data, making it easy to spot differences. + +## dump + +The `dump` command operates in the following way: + +1. Dumps the data from the source database. +2. Validates the data for potential issues. +3. Applies the defined transformations. +4. 
Stores the transformed data in the specified storage location.
+
+```text title="Supported flags"
+Usage:
+  greenmask dump [flags]
+
+Flags:
+  -b, --blobs                           include large objects in dump
+  -c, --clean                           clean (drop) database objects before recreating
+  -Z, --compress int                    compression level for compressed formats (default -1)
+  -C, --create                          include commands to create database in dump
+  -a, --data-only                       dump only the data, not the schema
+  -d, --dbname string                   database to dump (default "postgres")
+      --disable-dollar-quoting          disable dollar quoting, use SQL standard quoting
+      --disable-triggers                disable triggers during data-only restore
+      --enable-row-security             enable row security (dump only content user has access to)
+  -E, --encoding string                 dump the data in encoding ENCODING
+  -N, --exclude-schema strings          do NOT dump the specified schema(s)
+  -T, --exclude-table strings           do NOT dump the specified table(s)
+      --exclude-table-data strings      do NOT dump data for the specified table(s)
+  -e, --extension strings               dump the specified extension(s) only
+      --extra-float-digits string       override default setting for extra_float_digits
+  -f, --file string                     output file or directory name
+  -h, --host string                     database server host or socket directory (default "/var/run/postgres")
+      --if-exists                       use IF EXISTS when dropping objects
+      --include-foreign-data strings    include data of foreign tables on foreign servers matching the specified pattern(s)
+  -j, --jobs int                        use this many parallel jobs to dump (default 1)
+      --load-via-partition-root         load partitions via the root table
+      --lock-wait-timeout int           fail after waiting TIMEOUT for a table lock (default -1)
+  -B, --no-blobs                        exclude large objects in dump
+      --no-comments                     do not dump comments
+  -O, --no-owner string                 skip restoration of object ownership in plain-text format
+  -X, --no-privileges                   do not dump privileges (grant/revoke)
+      --no-publications                 do not dump publications
+      --no-security-labels              do not dump security label assignments
+      --no-subscriptions                do not dump subscriptions
+      --no-sync                         do not wait for 
changes to be written safely to disk
+      --no-synchronized-snapshots       do not use synchronized snapshots in parallel jobs
+      --no-tablespaces                  do not dump tablespace assignments
+      --no-toast-compression            do not dump TOAST compression methods
+      --no-unlogged-table-data          do not dump unlogged table data
+      --on-conflict-do-nothing          add ON CONFLICT DO NOTHING to INSERT commands
+  -p, --port int                        database server port number (default 5432)
+      --quote-all-identifiers           quote all identifiers, even if not key words
+  -n, --schema strings                  dump the specified schema(s) only
+  -s, --schema-only string              dump only the schema, no data
+      --section string                  dump named section (pre-data, data, or post-data)
+      --serializable-deferrable         wait until the dump can run without anomalies
+      --snapshot string                 use given snapshot for the dump
+      --strict-names                    require table and/or schema include patterns to match at least one entity each
+  -S, --superuser string                superuser user name to use in plain-text format
+  -t, --table strings                   dump the specified table(s) only
+      --test string                     connect as specified database user (default "postgres")
+      --use-set-session-authorization   use SET SESSION AUTHORIZATION commands instead of ALTER OWNER commands to set ownership
+  -U, --username string                 connect as specified database user (default "postgres")
+  -v, --verbose string                  verbose mode
+```
+
+## list-dumps
+
+The `list-dumps` command provides a list of all dumps stored in the storage. 
The list includes the following attributes: + +* `ID` — the unique identifier of the dump, used for operations like `restore`, `delete`, and `show-dump` +* `DATE` — the date when the snapshot was created +* `DATABASE` — the name of the database associated with the dump +* `SIZE` — the original size of the dump +* `COMPRESSED SIZE` — the size of the dump after compression +* `DURATION` — the duration of the dump procedure +* `TRANSFORMED` — indicates whether the dump has been transformed +* `STATUS` — the status of the dump, which can be one of the following: + * `done` — the dump was completed successfully + * `unknown` or `failed` — the dump might be in progress or failed. Failed dumps are not deleted automatically. + +Example of `list-dumps` output: +![list_dumps_screen.png](assets/list_dumps_screen.png) + +## list-transformers + +The `list-transformers` command provides a list of all the allowed transformers, including both standard and advanced transformers. This list can be helpful for searching for an appropriate transformer for your data transformation needs. + +To show a list of available transformers, use the following command: + +```shell +greenmask --config=config.yml list-transformers +``` + +Supported flags: + +* `--format` — allows to select the output format. There are two options available: `text` or `json`. The + default setting is `text`. + +Example of `list-transformers` output: + +![list_transformers_screen.png](assets/list_transformers_screen_2.png) + +When using the `list-transformers` command, you receive a list of available transformers with essential information about each of them. 
Below are the key parameters for each transformer: + +* `NAME` — the name of the transformer +* `DESCRIPTION` — a brief description of what the transformer does +* `COLUMN PARAMETER NAME` — name of a column or columns affected by transformation +* `SUPPORTED TYPES` — list the supported value types + +The JSON call `greenmask --config=config.yml list-transformers --format=json` has the same attributes: + +```json title="JSON format output" +[ + { + "name": "Cmd", + "description": "Transform data via external program using stdin and stdout interaction", + "parameters": [ + { + "name": "columns", + "supported_types": [ + "any" + ] + } + ] + }, + { + "name": "Dict", + "description": "Replace values matched by dictionary keys", + "parameters": [ + { + "name": "column", + "supported_types": [ + "any" + ] + } + ] + } +] +``` + +## show-transformer + +This command prints out detailed information about a transformer by a provided name, including specific attributes to help you understand and configure the transformer effectively. + +To show detailed information about a transformer, use the following command: + +```shell +greenmask --config=config.yml show-transformer TRANSFORMER_NAME +``` + +Supported flags: + +* `--format` — allows to select the output format. There are two options available: `text` or `json`. The + default setting is `text`. + +Example of `show-transformer` output: + +![show_transformer.png](assets/show_transformer.png) + +When using the `show-transformer` command, you receive detailed information about the transformer and its parameters and their possible attributes. Below are the key parameters for each transformer: + +* `Name` — the name of the transformer +* `Description` — a brief description of what the transformer does +* `Parameters` — a list of transformer parameters, each with its own set of attributes. 
Possible attributes include: + + * `description` — a brief description of the parameter's purpose + * `required` — a flag indicating whether the parameter is required when configuring the transformer + * `link_parameter` — specifies whether the value of the parameter will be encoded using a specific parameter type encoder. For example, if a parameter named `column` is linked to another parameter `start`, the `start` parameter's value will be encoded according to the `column` type when the transformer is initialized. + * `cast_db_type` — indicates that the value should be encoded according to the database type. For example, when dealing with the INTERVAL data type, you must provide the interval value in PostgreSQL format. + * `default_value` — the default value assigned to the parameter if it's not provided during configuration. + * `column_properties` — if a parameter represents the name of a column, it may contain additional properties, including: + * `nullable` — indicates whether the transformer may produce NULL values, potentially violating the NOT NULL constraint + * `unique` — specifies whether the transformer guarantees unique values for each call. If set to `true`, it means that the transformer cannot produce duplicate values, ensuring compliance with the UNIQUE constraint. + * `affected` — indicates whether the column is affected during the transformation process. If not affected, the column's value might still be required for transforming another column. + * `allowed_types` — a list of data types that are compatible with this parameter + * `skip_original_data` — specifies whether the original value of the column, before transformation, is relevant for the transformation process + * `skip_on_null` — indicates whether the transformer should skip the transformation when the input column value is NULL. If the column value is NULL, interaction with the transformer is unnecessary. + +!!! warning + + The default value in JSON format is base64 encoded. 
This might be changed in later version of Greenmask. + +```json title="JSON output example" +[ + { + "properties": { + "name": "NoiseFloat", + "description": "Make noise float for int", + "is_custom": false + }, + "parameters": [ + { + "name": "column", + "description": "column name", + "required": true, + "is_column": true, + "is_column_container": false, + "column_properties": { + "max_length": -1, + "affected": true, + "allowed_types": [ + "float4", + "float8", + "numeric" + ], + "skip_on_null": true + } + }, + { + "name": "ratio", + "description": "max random percentage for noise", + "required": false, + "is_column": false, + "is_column_container": false, + "default_value": "MC4x" + }, + { + "name": "precision", + "description": "precision of noised float value (number of digits after coma)", + "required": false, + "is_column": false, + "is_column_container": false, + "default_value": "NA==" + } + ] + } +] +``` + +## restore + +To perform a dump restoration with the provided dump ID, use the following command: + +```shell +greenmask --config=config.yml restore DUMP_ID +``` + +Alternatively, to restore the latest completed dump, use the following command: + +```shell +greenmask --config=config.yml restore latest +``` + +Note that the `restore` command shares the same parameters and environment variables as `pg_restore`, +allowing you to configure the restoration process as needed. 
+
+```text title="Supported flags"
+Flags:
+  -c, --clean                           clean (drop) database objects before recreating
+  -C, --create                          create the target database
+  -a, --data-only                       restore only the data, no schema
+  -d, --dbname string                   connect to database name (default "postgres")
+      --disable-triggers                disable triggers during data-only restore
+      --enable-row-security             enable row security
+  -N, --exclude-schema strings          do not restore objects in this schema
+  -e, --exit-on-error                   exit on error, default is to continue
+  -f, --file string                     output file name (- for stdout)
+  -P, --function strings                restore named function
+  -h, --host string                     database server host or socket directory (default "/var/run/postgres")
+      --if-exists                       use IF EXISTS when dropping objects
+  -i, --index strings                   restore named index
+  -j, --jobs int                        use this many parallel jobs to restore (default 1)
+      --list-format string              use table of contents in format of text, json or yaml (default "text")
+      --no-comments                     do not restore comments
+      --no-data-for-failed-tables       do not restore data of tables that could not be created
+  -O, --no-owner string                 skip restoration of object ownership
+  -X, --no-privileges                   skip restoration of access privileges (grant/revoke)
+      --no-publications                 do not restore publications
+      --no-security-labels              do not restore security labels
+      --no-subscriptions                do not restore subscriptions
+      --no-table-access-method          do not restore table access methods
+      --no-tablespaces                  do not restore tablespace assignments
+  -p, --port int                        database server port number (default 5432)
+  -n, --schema strings                  restore only objects in this schema
+  -s, --schema-only string              restore only the schema, no data
+      --section string                  restore named section (pre-data, data, or post-data)
+  -1, --single-transaction              restore as a single transaction
+      --strict-names                    require table and/or schema include patterns to match at least one entity each
+  -S, --superuser string                superuser user name to use for disabling triggers
+  -t, --table strings                   restore 
named relation (table, view, etc.) + -T, --trigger strings restore named trigger + -L, --use-list string use table of contents from this file for selecting/ordering output + --use-set-session-authorization use SET SESSION AUTHORIZATION commands instead of ALTER OWNER commands to set ownership + -U, --username string connect as specified database user (default "postgres") + -v, --verbose string verbose mode + +``` + +## show-dump + +This command provides details about all objects and data that can be restored, similar to the `pg_restore -l` command in PostgreSQL. It helps you inspect the contents of the dump before performing the actual restoration. + +Parameters: + +* `--format` — format of printing. Can be `text` or `json`. + +To display metadata information about a dump, use the following command: + +```shell +greenmask --config=config.yml show-dump dumpID +``` + + +=== "Text output example" + ```text + ; + ; Archive created at 2023-10-30 12:52:38 UTC + ; dbname: demo + ; TOC Entries: 17 + ; Compression: -1 + ; Dump Version: 15.4 + ; Format: DIRECTORY + ; Integer: 4 bytes + ; Offset: 8 bytes + ; Dumped from database version: 15.4 + ; Dumped by pg_dump version: 15.4 + ; + ; + ; Selected TOC Entries: + ; + 3444; 0 0 ENCODING - ENCODING + 3445; 0 0 STDSTRINGS - STDSTRINGS + 3446; 0 0 SEARCHPATH - SEARCHPATH + 3447; 1262 24970 DATABASE - demo postgres + 3448; 0 0 DATABASE PROPERTIES - demo postgres + 222; 1259 24999 TABLE bookings flights postgres + 223; 1259 25005 SEQUENCE bookings flights_flight_id_seq postgres + 3460; 0 0 SEQUENCE OWNED BY bookings flights_flight_id_seq postgres + 3281; 2604 25030 DEFAULT bookings flights flight_id postgres + 3462; 0 24999 TABLE DATA bookings flights postgres + 3289; 2606 25044 CONSTRAINT bookings flights flights_flight_no_scheduled_departure_key postgres + 3291; 2606 25046 CONSTRAINT bookings flights flights_pkey postgres + 3287; 1259 42848 INDEX bookings flights_aircraft_code_status_idx postgres + 3292; 1259 42847 INDEX bookings 
flights_status_aircraft_code_idx postgres + 3293; 2606 25058 FK CONSTRAINT bookings flights flights_aircraft_code_fkey postgres + 3294; 2606 25063 FK CONSTRAINT bookings flights flights_arrival_airport_fkey postgres + 3295; 2606 25068 FK CONSTRAINT bookings flights flights_departure_airport_fkey postgres + ``` +=== "JSON output example" + + ```json linenums="1" + { + "startedAt": "2023-10-29T20:50:19.948017+02:00", // (1) + "completedAt": "2023-10-29T20:50:22.19333+02:00", // (2) + "originalSize": 4053842, // (3) + "compressedSize": 686557, // (4) + "transformers": [ // (5) + { + "Schema": "bookings", // (6) + "Name": "flights", // (7) + "Query": "", // (8) + "Transformers": [ // (9) + { + "Name": "RandomDate", // (10) + "Params": { // (11) + "column": "c2NoZWR1bGVkX2RlcGFydHVyZQ==", + "max": "MjAyMy0wMS0wMiAwMDowMDowMC4wKzAz", + "min": "MjAyMy0wMS0wMSAwMDowMDowMC4wKzAz" + } + } + ], + "ColumnsTypeOverride": null // (12) + } + ], + "header": { // (13) + "creationDate": "2023-10-29T20:50:20+02:00", + "dbName": "demo", + "tocEntriesCount": 15, + "dumpVersion": "16.0 (Homebrew)", + "format": "TAR", + "integer": 4, + "offset": 8, + "dumpedFrom": "16.0 (Debian 16.0-1.pgdg120+1)", + "dumpedBy": "16.0 (Homebrew)", + "tocFileSize": 8090, + "compression": 0 + }, + "entries": [ // (14) + { + "dumpId": 3416, + "databaseOid": 0, + "objectOid": 0, + "objectType": "ENCODING", + "schema": "", + "name": "ENCODING", + "owner": "", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": null + }, + { + "dumpId": 3417, + "databaseOid": 0, + "objectOid": 0, + "objectType": "STDSTRINGS", + "schema": "", + "name": "STDSTRINGS", + "owner": "", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": null + }, + { + "dumpId": 3418, + "databaseOid": 0, + "objectOid": 0, + "objectType": "SEARCHPATH", + "schema": "", + "name": "SEARCHPATH", + "owner": "", + "section": "PreData", + "originalSize": 
0, + "compressedSize": 0, + "fileName": "", + "dependencies": null + }, + { + "dumpId": 3419, + "databaseOid": 16384, + "objectOid": 1262, + "objectType": "DATABASE", + "schema": "", + "name": "demo", + "owner": "postgres", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": null + }, + { + "dumpId": 3420, + "databaseOid": 0, + "objectOid": 0, + "objectType": "DATABASE PROPERTIES", + "schema": "", + "name": "demo", + "owner": "postgres", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": null + }, + { + "dumpId": 222, + "databaseOid": 16414, + "objectOid": 1259, + "objectType": "TABLE", + "schema": "bookings", + "name": "flights", + "owner": "postgres", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": null + }, + { + "dumpId": 223, + "databaseOid": 16420, + "objectOid": 1259, + "objectType": "SEQUENCE", + "schema": "bookings", + "name": "flights_flight_id_seq", + "owner": "postgres", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 222 + ] + }, + { + "dumpId": 3432, + "databaseOid": 0, + "objectOid": 0, + "objectType": "SEQUENCE OWNED BY", + "schema": "bookings", + "name": "flights_flight_id_seq", + "owner": "postgres", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 223 + ] + }, + { + "dumpId": 3254, + "databaseOid": 16445, + "objectOid": 2604, + "objectType": "DEFAULT", + "schema": "bookings", + "name": "flights flight_id", + "owner": "postgres", + "section": "PreData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 223, + 222 + ] + }, + { + "dumpId": 3434, + "databaseOid": 16414, + "objectOid": 0, + "objectType": "TABLE DATA", + "schema": "\"bookings\"", + "name": "\"flights\"", + "owner": "\"postgres\"", + "section": "Data", + "originalSize": 4045752, 
+ "compressedSize": 678467, + "fileName": "3434.dat.gz", + "dependencies": [] + }, + { + "dumpId": 3261, + "databaseOid": 16461, + "objectOid": 2606, + "objectType": "CONSTRAINT", + "schema": "bookings", + "name": "flights flights_flight_no_scheduled_departure_key", + "owner": "postgres", + "section": "PostData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 222, + 222 + ] + }, + { + "dumpId": 3263, + "databaseOid": 16463, + "objectOid": 2606, + "objectType": "CONSTRAINT", + "schema": "bookings", + "name": "flights flights_pkey", + "owner": "postgres", + "section": "PostData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 222 + ] + }, + { + "dumpId": 3264, + "databaseOid": 16477, + "objectOid": 2606, + "objectType": "FK CONSTRAINT", + "schema": "bookings", + "name": "flights flights_aircraft_code_fkey", + "owner": "postgres", + "section": "PostData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 222 + ] + }, + { + "dumpId": 3265, + "databaseOid": 16482, + "objectOid": 2606, + "objectType": "FK CONSTRAINT", + "schema": "bookings", + "name": "flights flights_arrival_airport_fkey", + "owner": "postgres", + "section": "PostData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 222 + ] + }, + { + "dumpId": 3266, + "databaseOid": 16487, + "objectOid": 2606, + "objectType": "FK CONSTRAINT", + "schema": "bookings", + "name": "flights flights_departure_airport_fkey", + "owner": "postgres", + "section": "PostData", + "originalSize": 0, + "compressedSize": 0, + "fileName": "", + "dependencies": [ + 222 + ] + } + ] + } + ``` + { .annotate } + + 1. The date when the backup has been initiated, also indicating the snapshot date. + 2. The date when the backup process was successfully completed. + 3. The original size of the backup in bytes. + 4. The size of the backup after compression in bytes. + 5. 
A list of tables that underwent transformation during the backup. + 6. The schema name of the table. + 7. The name of the table. + 8. Custom query override, if applicable. + 9. A list of transformers that were applied during the backup. + 10. The name of the transformer. + 11. The parameters provided for the transformer. + 12. A mapping of overridden column types. + 13. The header information in the table of contents file. This provides the same details as the `--format=text` output in the previous snippet. + 14. The list of restoration entries. This offers the same information as the `--format=text` output in the previous snippet. + +!!! note + + The `json` format provides more detailed information compared to the `text` format. The `text` format is primarily used for backward compatibility and for generating a restoration list that can be used with `pg_restore -L listfile`. On the other hand, the `json` format provides comprehensive metadata about the dump, including information about the applied transformers and their parameters. The `json` format is especially useful for detailed dump introspection. diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..8d483ba7 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,287 @@ +# Configuration + +The configuration is organized into six sections: + +* `common` — settings that can be used for both the `dump` and `restore` commands +* `log` — settings for the logging subsystem +* `storage` — settings for the storage locations where dumps are stored +* `dump` — settings for the `dump` command. This section includes `pg_dump` options and transformation parameters. +* `restore` — settings for the `restore` command. It contains `pg_restore` options and additional restoration + scripts. +* `custom_transformers` — definitions of the custom transformers that interact through `stdin` and `stdout`. 
Once a custom transformer is configured, it becomes accessible via the `greenmask list-transformers` command.
+
+## `common` section
+
+In the `common` section of the configuration, you can specify the following settings:
+
+* `pg_bin_path` — path to the PostgreSQL binaries. Note that the PostgreSQL server version must match the provided binaries.
+* `tmp_dir` — temporary directory for storing the table of contents files
+
+!!! note
+
+    Greenmask exclusively manages data dumping and data restoration processes, delegating schema dumping to the `pg_dump` utility and schema restoration to the `pg_restore` utility. Both `pg_dump` and `pg_restore` rely on a `toc.dat` file located in a specific directory, which contains metadata and object definitions. Therefore, the `tmp_dir` parameter is essential for storing the `toc.dat` file during the dumping or restoration procedure. It is important to note that all artifacts in this directory will be automatically deleted once the Greenmask command is completed.
+
+## `log` section
+
+In the `log` section of the configuration, you can specify the following settings:
+
+* `level` — specifies the level of logging, which can be one of the following: `debug`, `info`, or `error`. The default level is `info`.
+* `format` — defines the logging format, which can be either `json` or `text`. The default format is `text`.
+
+## `storage` section
+
+In the `storage` section, you can configure the storage driver for storing the dumped data. Currently,
+two storage options are supported: `directory` and `s3`.
+
+=== "`directory` option"
+
+    The directory storage option refers to a filesystem directory where the dump data will be stored.
+
+    Parameters include `path` which specifies the path to the directory in the filesystem where the dumps will be stored. 
+ + ``` yaml title="directory storage config example" + directory: + path: "/home/user_name/storage_dir" # (1) + ``` + +=== "`s3` option" + + By choosing the `s3` storage option, you can store dump data in an S3-like remote storage service, + such as Amazon S3 or Azure Blob Storage. Here are the parameters you can configure for S3 storage: + + * `endpoint` — overrides the default AWS endpoint to a custom one for making requests + * `bucket` — the name of the bucket where the dump data will be stored + * `prefix` — a prefix for objects in the bucket, specified in path format + * `region` — the S3 service region + * `storage_class` — the storage class for performing object requests + * `disable_ssl` — disable SSL for interactions (default is `false`) + * `access_key_id` — access key for authentication + * `secret_access_key` — secret access key for authentication + * `session_token` — session token for authentication + * `role_arn` — Amazon resource name for role-based authentication + * `session_name` — role session name to uniquely identify a session + * `max_retries` — the number of retries on request failures + * `cert_file` — the path to the SSL certificate for making requests + * `max_part_size` — the maximum part length for one request + * `concurrency` — the number of goroutines to use in parallel for each upload call when sending parts + * `use_list_objects_v1` — use the old v1 `ListObjects` request instead of v2 one + * `force_path_style` — force the request to use path-style addressing (e. g., `http://s3.amazonaws.com/BUCKET/KEY`) instead of virtual hosted bucket addressing (e. 
g., `http://BUCKET.s3.amazonaws.com/KEY`) + * `use_accelerate` — enable S3 Accelerate feature + + ```yaml title="s3 storage config example for Minio running in Docker" + s3: + endpoint: "http://localhost:9000" + bucket: "testbucket" + region: "us-east-1" + access_key_id: "Q3AM3UQ867SPQQA43P2F" + secret_access_key: "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG" + ``` + +## `dump` section + +In the `dump` section of the configuration, you configure the `greenmask dump` command. It includes the following parameters: + +* `pg_dump_options` — a map of `pg_dump` options to configure the behavior of the command itself. You can refer to the list of supported `pg_dump` options in the [Greenmask dump command documentation](commands.md#dump). +* `transformation` — this section contains configuration for applying transformations to table columns during the dump operation. It includes the following sub-parameters: + + * `schema` — the schema name of the table + * `name` — the name of the table + * `query` — an optional parameter for specifying a custom query to be used in the COPY command. By default, the entire table is dumped, but you can use this parameter to set a custom query. + + !!! warning + + Be cautious when using the `query` parameter, as it may lead to constraint violation errors during restoration, and Greenmask currently cannot handle query validation. + + * `columns_type_override` — allows you to override the column types explicitly. You can associate a column with another type that is supported by your transformer. This is useful when the transformer works strictly with specific types of columns. For example, if a column named `post_code` is of the TEXT type, but the `RandomInt` transformer works only with INT family types, you can override it as shown in the example provided. + ``` yaml title="column type overridden example" + columns_type_override: + post_code: "int4" # (1) + ``` + { .annotate } + + 1. 
Change the data type of the post_code column to `INT4` (`INTEGER`) + + * `apply_for_inherited` — an optional parameter to apply the same transformation to all partitions if the table is partitioned. This can save you from defining the transformation for each partition manually. + + !!! warning + + It is recommended to use the `--load-via-partition-root` parameter when dealing with partitioned tables, as the partition key value might change. + + * `transformers` — a list of transformers to apply to the table, along with their parameters. Each transformation item includes the following sub-parameters: + + * `name` — the name of the transformer + * `params` — a map of the provided transformer parameters + + ```yaml title="transformers config example" + transformers: + - name: "RandomDate" + params: + min: "2023-01-01 00:00:00.0+03" + max: "2023-01-02 00:00:00.0+03" + column: "scheduled_departure" + + - name: "NoiseDate" + params: + ratio: "01:00:00" + column: "scheduled_arrival" + ``` + +Here is an example configuration for the `dump` section: + +```yaml title="dump section config example" +dump: + pg_dump_options: + dbname: "host=/run/postgresql user=postgres dbname=demo" + jobs: 10 + exclude-schema: "(\"teSt\"*|test*)" + table: "bookings.flights" + load-via-partition-root: true + + transformation: + - schema: "bookings" + name: "flights" + query: "select * from bookings.flights3 limit 1000000" + columns_type_override: + post_code: "int4" # (1) + transformers: + - name: "RandomDate" + params: + min: "2023-01-01 00:00:00.0+03" + max: "2023-01-02 00:00:00.0+03" + column: "scheduled_departure" + + - name: "NoiseDate" + params: + ratio: "01:00:00" + column: "scheduled_arrival" + + - name: "RegexpReplace" + params: + column: "status" + regexp: "On Time" + replace: "Delayed" + + - name: "RandomInt" # (2) + params: + column: "post_code" + min: "11" + max: "99" + + - schema: "bookings" + name: "aircrafts_data" + transformers: + - name: "Json" + params: + column: "model" + 
operations: + - operation: "set" + path: "en" + value: "Boeing 777-300-2023" + - operation: "set" + path: "crewSize" + value: 10 + + - name: "NoiseInt" + params: + ratio: 0.9 + column: "range" +``` +{ .annotate } + +1. Override the `post_code` column type to `int4` (INTEGER). This is necessary because the `post_code` column + originally has a `TEXT` type, but it contains values that resemble integers. By explicitly overriding the type to `int4`, we ensure compatibility with transformers that work with integer types, such as `RandomInt`. +2. After the type is overridden, we can apply a compatible transformer. + +## `validate` section + +In the `validate` section of the configuration, you can specify parameters for the `greenmask validate` +command. Here is an example of the validate section configuration: + +```yaml title="validate section config example" +validate: + tables: # (1) + - "orders" + - "public.cart" + data: true # (2) + diff: true # (3) + rows_limit: 10 # (4) + resolved_warnings: # (5) + - "8d436fae67b2b82b36bd3afeb0c93f30" + format: "horizontal" # (6) +``` +{ .annotate } + +1. A list of tables to validate. If this list is not empty, the validation operation will only be performed for the specified tables. Tables can be written with or without the schema name (e. g., `"public.cart"` or `"orders"`). +2. Specifies whether to perform data transformation for a limited set of rows. If set to `true`, data transformation will be performed, and the number of rows transformed will be limited to the value specified in the `rows_limit` parameter (default is `10`). +3. Specifies whether to perform diff operations for the transformed data. If set to `true`, the validation process will **find the differences between the original and transformed data**. See more details in the [validate command documentation](commands.md/#validate). +4. Limits the number of rows to be transformed during validation. 
The default limit is `10` rows, but you can change it by modifying this parameter. +5. A hash list of resolved warnings. These warnings have been addressed and resolved in a previous validation run. +6. Specifies the format of the transformation output. Possible values are `[horizontal|vertical]`. The default format is `horizontal`. You can choose the format that suits your needs. See more details in the [validate command documentation](commands.md/#validate). + +## `restore` section + +In the `restore` section of the configuration, you can specify parameters for the `greenmask restore` command. It contains `pg_restore` settings and custom script execution settings. Below you can find the available parameters: + +* `pg_restore_options` — a map of `pg_restore` options that are used to configure the behavior of + the `pg_restore` utility during the restoration process. You can refer to the list of supported `pg_restore` options in the [Greenmask restore command documentation](commands.md#restore). +* `scripts` — a map of custom scripts to be executed during different restoration stages. Each script is associated with a specific restoration stage and includes the following attributes: + * `[pre-data|data|post-data]` — the name of the restoration stage when the script should be executed; has the following parameters: + * `name` — the name of the script + * `when` — specifies when to execute the script, which can be either `"before"` or `"after"` the + specified restoration stage + * `query` — an SQL query string to be executed + * `query_file` — the path to an SQL query file to be executed + * `command` — a command with parameters to be executed. It is provided as a list, where the first item is the command name. + +As mentioned in [the architecture](architecture.md/#backing-up), a backup contains three sections: pre-data, data, and post-data. 
The custom script execution allows you to customize and control the restoration process by executing scripts or commands at specific stages. The available restoration stages and their corresponding execution conditions are as follows:
+
+* `pre-data` — scripts or commands can be executed before or after restoring the pre-data section
+* `data` — scripts or commands can be executed before or after restoring the data section
+* `post-data` — scripts or commands can be executed before or after restoring the post-data section
+
+Each stage can have a `"when"` condition with one of the following possible values:
+
+* `before` — execute the script or SQL command before the mentioned restoration stage
+* `after` — execute the script or SQL command after the mentioned restoration stage
+
+Below you can see one of the possible versions for the `scripts` part of the `restore` section:
+
+``` yaml title="scripts definition example"
+scripts:
+  pre-data: # (1)
+    - name: "pre-data before script [1] with query"
+      when: "before"
+      query: "create table script_test(stage text)"
+    - name: "pre-data before script [2]"
+      when: "before"
+      query: "insert into script_test values('pre-data before')"
+    - name: "pre-data after test script [1]"
+      when: "after"
+      query: "insert into script_test values('pre-data after')"
+    - name: "pre-data after script with query_file [1]"
+      when: "after"
+      query_file: "pre-data-after.sql"
+  data: # (2)
+    - name: "data before script with command [1]"
+      when: "before"
+      command: # (4)
+        - "data-after.sh"
+        - "param1"
+        - "param2"
+    - name: "data after script [1]"
+      when: "after"
+      query_file: "data-after.sql"
+  post-data: # (3)
+    - name: "post-data before script [1]"
+      when: "before"
+      query: "insert into script_test values('post-data before')"
+    - name: "post-data after script with query_file [1]"
+      when: "after"
+      query_file: "post-data-after.sql"
+```
+{ .annotate }
+
+1. **List of pre-data stage scripts**. 
This section contains scripts that are executed before or after the restoration of the pre-data section. The scripts include SQL queries and query files. +2. **List of data stage scripts**. This section contains scripts that are executed before or after the restoration of the data section. The scripts include shell commands with parameters and SQL query files. +3. **List of post-data stage scripts**. This section contains scripts that are executed before or after the restoration of the post-data section. The scripts include SQL queries and query files. +4. **Command in the first argument and the parameters in the rest of the list**. When specifying a command to be executed in the scripts section, you provide the command name as the first item in a list, followed by any parameters or arguments for that command. The command and its parameters are provided as a list within the script configuration. diff --git a/docs/custom-transformers.md b/docs/custom-transformers.md new file mode 100644 index 00000000..a0e3aa52 --- /dev/null +++ b/docs/custom-transformers.md @@ -0,0 +1,3 @@ +# Coming soon + +This article and custom transformation framework are in development. Stay tuned for the latest updates. diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 00000000..050d9ec0 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,408 @@ +# Getting started + +This guide will help you to quickly get familiar with Greenmask by setting up **Greenmask Playground** and trying it in action. Greenmask Playground is a Docker Compose environment that includes the following components: + +* **Original database** — the source database you'll be working with. +* **Empty database for restoration** — an empty database where the restored data will be placed. +* **MinIO storage** — used for storage purposes. +* **Greenmask Utility** — Greenmask itself, ready for use. + +!!! 
warning + + To complete this guide, you must have **Docker** and **docker-compose** installed. + +## Setting up Greenmask Playground + +1. Clone the `greenmask` repository and navigate to its directory by running the following commands: + + ```shell + git clone git@github.com:GreenmaskIO/greenmask.git && cd greenmask + ``` + +2. Once you have cloned the repository, start the environment by running Docker Compose: + + ```shell + docker-compose run greenmask + ``` +!!! Tip + + If you're experiencing problems with pulling images from Docker Hub, you can build the Greenmask image from source by running the following command: + + ```shell + docker-compose run greenmask-from-source + ``` + +Now you have Greenmask Playground up and running with a shell prompt inside the container. All further operations will be carried out within this container's shell. + +## Commands + +Before proceeding to configure Greenmask, let us explore some of the available Greenmask commands: + +```shell +greenmask + --log-format=[json|text] \ + --log-level=[debug|info|error] \ + --config=config.yml \ + [dump | list-dumps | delete | list-transformers | restore | show-dump | validate | completion] +``` + +Below you can find a description for each command: + +* `dump` — performs a logical data dump, transforms the data, and stores it in the designated storage. + +* `list-dumps` — retrieves a list of all stored dumps within the chosen storage. + +* `delete` — removes a dump with a specific ID from the storage. + +* `list-transformers` — displays a list of approved transformers and their documentation. + +* `restore` — restores a dump either by specifying its ID or using the latest available dump to the target database. + +* `show-dump` — presents metadata information about a specific dump (equivalent to `pg_restore -l ./`). + +* `validate` — executes a validation process and generates a data diff for the transformation. + +* `completion` — generates the autocompletion script for the specified shell. 
+ +Note that you can customize the logging format and level using the provided options. Specifying a configuration file (`config.yml`) is mandatory to guide the tool's behavior. + +## Building config.yml + +### The sample database + +Greenmask Playground uses the [Microsoft AdventureWorks sample databases](https://learn.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-ver16&tabs=ssms), +that have been ported to PostgreSQL and sourced from [morenoh149/postgresDBSamples](https://github.com/morenoh149/postgresDBSamples). + +Within Playground, you'll find two predefined databases: + +```text + Name | Owner +-------------+---------- + original | postgres + transformed | postgres +``` + +where: + +* `original` — a database that contains the deployed AdventureWorks sample databases as-is. +* `transformed` — an empty database for restoring transformed dumps. + +Within the Greenmask container, you'll have access to the following commands: + +* `greenmask` — launches the Greenmask obfuscation utility. +* `psql_o` — connects to the `original` database using the psql utility. +* `psql_t` — connects to the `transformed` database using the psql utility. +* `cleanup` — drops and recreates the `transformed` database as an empty container. + +If you are using an external Integrated Development Environment (IDE), you can connect using the following URIs: + +* Original database: `postgresql://postgres:example@localhost:54316/original` +* Transformed database: `postgresql://postgres:example@localhost:54316/transformed` + +### Creating a simple configuration + +The Greenmask utility container is configured with a volume attached to the `./playground` directory located at the root of the repository. Within this directory, there is a pre-defined configuration file called `config.yml`. You have the flexibility to modify this configuration as needed, making adjustments or adding additional transformations. 
+
+Any changes made to this configuration file will be accessible within the container, allowing you to tailor Greenmask's behavior to your specific requirements.
+
+To build a basic configuration for Greenmask, you can follow these steps:
+
+1. Get a list of currently available transformers by running the following command:
+
+    ```shell
+    greenmask --config config.yml list-transformers
+    ```
+
+    ![list-transformers-example.png](assets%2Fgetting_started%2Flist-transformers-example.png)
+
+When building your configuration, ensure that you fill in all the required attributes, including the following sections:
+
+* common
+* storage
+* dump
+* restore
+
+Below is an example of a minimal configuration in YAML format:
+
+```yaml
+common:
+  pg_bin_path: "/usr/lib/postgresql/16/bin"
+  tmp_dir: "/tmp"
+
+storage:
+  s3:
+    endpoint: "http://playground-storage:9000"
+    bucket: "adventureworks"
+    region: "us-east-1"
+    access_key_id: "Q3AM3UQ867SPQQA43P2F"
+    secret_access_key: "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG"
+
+validate:
+#  resolved_warnings:
+#    - "aa808fb574a1359c6606e464833feceb"
+
+dump:
+  pg_dump_options: # pg_dump option that will be provided
+    dbname: "host=playground-db user=postgres password=example dbname=original"
+    jobs: 10
+
+  transformation: # List of tables to transform
+    - schema: "humanresources" # Table schema
+      name: "employee"  # Table name
+      transformers: # List of transformers to apply
+        - name: "NoiseDate" # name of transformers
+          params: # Transformer parameters
+            ratio: "10 year 9 mon 1 day"
+            column: "birthdate" # Column parameter - this transformer affects the birthdate column
+
+restore:
+  pg_restore_options: # pg_restore option (you can use the same options as pg_restore has)
+    jobs: 10
+    dbname: "host=playground-db user=postgres password=example dbname=transformed"
+```
+
+This example demonstrates the essential components of a Greenmask configuration file in YAML format. 
Please ensure that
+you customize it according to your specific needs.
+
+In the config above, only one transformer is applied to the table `humanresources.employee`, called `NoiseDate`, with the
+following parameters:
+
+* ratio - adds noise to the value up to "10 year 9 mon 1 day", either before or after. For example, if the current value is
+  `1976-12-03` and the transformer randomly generated the noise value `1 year 3 mon` and decided to increase the value,
+  the result will be `1978-03-03`
+* column - there is a column name that is going to be affected called `birthdate`
+
+### Run validation procedure
+
+You can utilize the following command to initiate a validation procedure:
+
+```shell
+greenmask --config config.yml validate \
+    --data \
+    --diff \
+    --format=vertical \
+    --rows-limit=2
+```
+
+The validation result will be displayed as follows:
+
+![validate-result.png](assets%2Fgetting_started%2Fvalidate-result.png)
+
+There is one warning; let's investigate it:
+
+```yaml
+{
+  "hash": "aa808fb574a1359c6606e464833feceb",
+  "meta": {
+    "ColumnName": "birthdate",
+    "ConstraintDef": "CHECK (birthdate >= '1930-01-01'::date AND birthdate <= (now() - '18 years'::interval))",
+    "ConstraintName": "humanresources",
+    "ConstraintSchema": "humanresources",
+    "ConstraintType": "Check",
+    "ParameterName": "column",
+    "SchemaName": "humanresources",
+    "TableName": "employee",
+    "TransformerName": "NoiseDate"
+  },
+  "msg": "possible constraint violation: column has Check constraint",
+  "severity": "warning"
+}
+```
+
+The validation warnings include the following details:
+
+* **hash** - A unique identifier for each validation warning, which can be used to exclude the warning from future
+  checks
+  by adding it to the `validate.resolved_warnings` configuration.
+* **meta** - Contains essential information that helps identify the location in the configuration or the potentially
+  violated constraint. 
+* **msg** - A comprehensive message that provides a detailed explanation of the warning's cause +* **severity** - Indicates the severity of the warning, which can be either "warning" or "error." In the case of an + error, Greenmask will exit immediately with a non-zero exit code. + +The next step in the validation procedure is to compare the data before and after the transformation. This comparison +is presented in a table format. Columns with a red background indicate that they have been affected by the +transformation. The green values represent the original data before the transformation, while the red values depict +the data after the transformation. + +To exclude a warning from future runs, you can uncomment the resolved_warning attribute in the configuration file. + +```yaml +validate: + resolved_warnings: + - "aa808fb574a1359c6606e464833feceb" +``` + +By adding the hash of a warning to the `validate.resolved_warnings` configuration in your `config.yml` +file, you can effectively exclude that specific warning from being displayed in subsequent runs of the validation +process using the command: + +```shell +greenmask --config config.yml validate +``` + +### Dumping procedure + +To perform the data dumping procedure, follow these steps: + +1. Execute the following command to initiate the dump using your configured settings: + ```shell + greenmask --config config.yml dump + ``` + +2. Once the dumping process is complete, you will find the dump with an associated ID in the designated storage. + To list all available dumps, use the following command: + ```shell + greenmask --config config.yml list-dumps + ``` + ![list-dumps.png](assets%2Fgetting_started%2Flist-dumps.png) + +3. If you wish to examine the data that is scheduled for restoration, you can use the show dump command. 
+ Provide the `dumpId` in your call to access the details: + + ```shell + greenmask --config config.yml show-dump 1702489882319 + ``` + + In the output below, you can observe the portion of objects that will be restored: + + ```text + ; + ; Archive created at 2023-12-13 17:51:22 UTC + ; dbname: original + ; TOC Entries: 986 + ; Compression: 0 + ; Dump Version: 16.1 (Ubuntu 16.1-1.pgdg22.04+1) + ; Format: DIRECTORY + ; Integer: 4 bytes + ; Offset: 8 bytes + ; Dumped from database version: 16.0 (Debian 16.0-1.pgdg120+1) + ; Dumped by pg_dump version: 16.1 (Ubuntu 16.1-1.pgdg22.04+1) + ; + ; + ; Selected TOC Entries: + ; + 4666; 0 0 ENCODING - ENCODING + 4667; 0 0 STDSTRINGS - STDSTRINGS + 4668; 0 0 SEARCHPATH - SEARCHPATH + 4669; 1262 16384 DATABASE - original postgres + 14; 2615 18396 SCHEMA - hr postgres + 9; 2615 16524 SCHEMA - humanresources postgres + 4670; 0 0 COMMENT - SCHEMA humanresources postgres + 13; 2615 18343 SCHEMA - pe postgres + 8; 2615 16429 SCHEMA - person postgres + 4671; 0 0 COMMENT - SCHEMA person postgres + 15; 2615 18421 SCHEMA - pr postgres + 10; 2615 16586 SCHEMA - production postgres + 4672; 0 0 COMMENT - SCHEMA production postgres + 16; 2615 18523 SCHEMA - pu postgres + 11; 2615 17034 SCHEMA - purchasing postgres + 4673; 0 0 COMMENT - SCHEMA purchasing postgres + + ... + ... + ... + 4427; 2606 18157 FK CONSTRAINT sales shoppingcartitem FK_ShoppingCartItem_Product_ProductID postgres + 4428; 2606 18162 FK CONSTRAINT sales specialofferproduct FK_SpecialOfferProduct_Product_ProductID postgres + 4429; 2606 18167 FK CONSTRAINT sales specialofferproduct FK_SpecialOfferProduct_SpecialOffer_SpecialOfferID postgres + 4430; 2606 18182 FK CONSTRAINT sales store FK_Store_BusinessEntity_BusinessEntityID postgres + 4431; 2606 18187 FK CONSTRAINT sales store FK_Store_SalesPerson_SalesPersonID postgres + + ``` + +### Restoration Procedure + +To restore data to the target database, you can use the following commands: + +1. 
To restore data from a specific dump (identified by its **dumpId**), execute the following command:
+    ```shell
+    greenmask --config config.yml restore [dumpId]
+    ```
+    Replace **[dumpId]** with the appropriate dump identifier.
+
+2. Alternatively, you can restore the latest available dump by using the reserved word **latest** like this:
+    ```shell
+    greenmask --config config.yml restore latest
+    ```
+
+3. After the restoration process is complete, you can verify the restored data by running the following PostgreSQL
+   command:
+    ```shell
+    psql_t -xc 'select * from humanresources.employee limit 2;'
+    ```
+    This command will display the first two rows of the `humanresources.employee` table in the target database, showing the restored
+    data.
+
+    ```
+    -[ RECORD 1 ]----+-------------------------------------
+    businessentityid | 1
+    nationalidnumber | 295847284
+    loginid          | adventure-works\ken0
+    jobtitle         | Chief Executive Officer
+    birthdate        | 1968-12-18
+    maritalstatus    | S
+    gender           | M
+    hiredate         | 2009-01-14
+    salariedflag     | t
+    vacationhours    | 99
+    sickleavehours   | 69
+    currentflag      | t
+    rowguid          | f01251e5-96a3-448d-981e-0f99d789110d
+    modifieddate     | 2014-06-30 00:00:00
+    organizationnode | /
+    -[ RECORD 2 ]----+-------------------------------------
+    businessentityid | 2
+    nationalidnumber | 245797967
+    loginid          | adventure-works\terri0
+    jobtitle         | Vice President of Engineering
+    birthdate        | 1970-05-04
+    maritalstatus    | S
+    gender           | F
+    hiredate         | 2008-01-31
+    salariedflag     | t
+    vacationhours    | 1
+    sickleavehours   | 20
+    currentflag      | t
+    rowguid          | 45e8f437-670d-4409-93cb-f9424a40d6ee
+    modifieddate     | 2014-06-30 00:00:00
+    organizationnode | /1/
+
+    ```
+
+### Deleting a Dump
+
+To remove a specific dump from the storage, use the **delete** command with the appropriate **dumpId**.
+Here's how to do it:
+
+```shell
+greenmask --config config.yml delete 1702489882319
+```
+
+After executing this command, the specified dump will be deleted from the storage. 
+
+To verify the changes, you can list the available dumps using the following command:
+
+```shell
+greenmask --config config.yml list-dumps
+```
+
+The displayed list of dumps will not include the deleted dump with the previously provided dumpId.
+
+```
++----+------+----------+------+-----------------+----------+-------------+--------+
+| ID | DATE | DATABASE | SIZE | COMPRESSED SIZE | DURATION | TRANSFORMED | STATUS |
++----+------+----------+------+-----------------+----------+-------------+--------+
++----+------+----------+------+-----------------+----------+-------------+--------+
+```
+
+## Conclusion
+
+This is a straightforward example of using Greenmask. If you wish to explore more advanced transformation cases,
+delve deeper into the documentation.
+
+Additionally, if you have any questions or require further assistance, don't hesitate to reach out via
+[Discord](https://discord.gg/97AKHdGD), [Telegram](https://t.me/greenmask_community), or by
+emailing us at [support@greenmask.io](mailto:support@greenmask.io). Our team is here to help and
+provide guidance as needed.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..9ed4a425
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,41 @@
+# About Greenmask
+
+**Greenmask** is a powerful open-source utility that is designed for logical database backup dumping,
+obfuscation, and restoration. It offers extensive functionality for backup, anonymization, and data masking.
+
+Greenmask is written in pure Go and includes ported PostgreSQL libraries that allow for platform independence. This tool is stateless and does not require any changes to your database schema. It is designed to be highly customizable and backward-compatible with existing PostgreSQL utilities.
+
+## Purpose
+
+The Greenmask utility plays a central role in the Greenmask ecosystem. Our goal is to develop a comprehensive, UI-based solution for managing obfuscation procedures. 
We recognize the challenges of maintaining obfuscation consistency throughout the software lifecycle. Greenmask is dedicated to providing valuable tools and features that ensure the obfuscation process remains fresh, predictable, and transparent.
+
+## Key features
+
+* **Cross-platform** — can be easily built and executed on any platform, thanks to its Go-based architecture,
+  which eliminates platform dependencies.
+* **Database type safe** — ensures data integrity by validating data and utilizing the database driver for
+  encoding and decoding operations. This approach guarantees the preservation of data formats.
+* **Transformation validation and easy maintenance** — during obfuscation development, Greenmask provides validation warnings and a transformation diff feature, allowing you to monitor and maintain transformations effectively throughout the software lifecycle.
+* **Partitioned tables transformation inheritance** — define transformation configurations once and apply them to all partitions within partitioned tables, simplifying the obfuscation process.
+* **Stateless** — Greenmask operates as a logical dump and does not impact your existing database schema.
+* **Backward compatible** — it fully supports the same features and protocols as existing vanilla PostgreSQL utilities. Dumps created by Greenmask can be successfully restored using the pg_restore utility.
+* **Extensible** — users have the flexibility to implement domain-based transformations in any programming language or use predefined templates.
+* **Declarative** — Greenmask allows you to define configurations in a structured, easily parsed, and recognizable format.
+* **Integrable** — integrate Greenmask seamlessly into your CI/CD system for automated database obfuscation and restoration.
+* **Parallel execution** — take advantage of parallel dumping and restoration, significantly reducing the time required to deliver results. 
+* **Provide variety of storages** — Greenmask offers a variety of storage options for local and remote data storage, including directories and S3-like storage solutions. + +## Use cases + +Greenmask is ideal for various scenarios, including: + +* **Backup and restoration**. Use Greenmask for your daily routines involving logical backup dumping and restoration. It seamlessly handles tasks like table restoration after truncation. Its functionality closely mirrors that of pg_dump and pg_restore, making it a straightforward replacement. +* **Anonymization, transformation, and data masking**. Employ Greenmask for anonymizing, transforming, and masking backups, especially when setting up a staging environment or for analytical purposes. It simplifies the deployment of a pre-production environment with consistently anonymized data, facilitating faster time-to-market in the development lifecycle. + +## Links + +* [Email](mailto:support@greenmask.io) +* [Twitter](https://twitter.com/GreenmaskIO) +* [Telegram](https://t.me/greenmask_community) +* [Discord](https://discord.gg/tAJegUKSTB) +* [DockerHub](https://hub.docker.com/r/greenmask/greenmask) diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..476f7c7d --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,53 @@ +# Installation + +## Prerequisites + +* Ensure that you have PostgreSQL utilities preinstalled, matching the **major version** + of your destination server. + +* If you are building Greenmask from source, make sure you have the `make` utility installed. + +## From GitHub binaries + +The easiest way to install Greenmask is by using the latest release's binary. Follow these steps: + +1. Check the latest [Greenmask release](https://github.com/GreenmaskIO/greenmask/releases). +2. From **Assets**, download the required binary. +3. Execute the downloaded binary to start using Greenmask. 
+ +### Additional instructions for macOS users + +For those downloading `greenmask-macos-amd64` or `greenmask-macos-arm64`, additional steps are required to ensure proper execution. + +1. In your terminal, move to the directory where the Greenmask binary is located. + +2. Change the file permissions to make it executable by using the following command: + + ```bash + chmod 777 greenmask-macos-[version] + ``` + +3. Remove a quarantine attribute, which macOS may have applied, by using the following command: + + ```bash + xattr -d com.apple.quarantine greenmask-macos-[version] + ``` + !!! info + + In both commands above, replace `[version]` with `amd64` or `arm64` according to your download. + +## From source + +1. Clone the Greenmask repository by using the following command: + + ```bash + git clone git@github.com:GreenmaskIO/greenmask.git + ``` + +2. Once the repository is cloned, execute the following command to build Greenmask: + + ```bash + make build + ``` + +After completing the build process, you will find the binary named `greenmask` in the root directory of the repository. Execute the binary to start using Greenmask. 
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 00000000..1b575abe --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,147 @@ +site_name: Greenmask — PostgreSQL masking and obfuscation tool + +# Theme +theme: + name: material + custom_dir: overrides + logo: assets/logo.png + favicon: assets/logo.png + features: + - navigation.indexes + - announce.dismiss + - content.code.annotate + - content.code.copy + - content.tooltips + # - navigation.sections + - navigation.tabs + # - navigation.top + # - navigation.tracking + - search.highlight + - search.share + - search.suggest + # - toc.follow + palette: + scheme: default + primary: teal + accent: indigo + toggle: + icon: material/brightness-7 + + +# Markdown extensions +markdown_extensions: + - attr_list + - toc: + permalink: true + - pymdownx.highlight + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - admonition + - pymdownx.details + - tables + +nav: + - Home: + - Home: index.md + - Documentation: + - Installation: installation.md + - Configuration: configuration.md + - Architecture: architecture.md + - Playground: playground.md + - Commands: commands.md + - Transformers: + - built_in_transformers/index.md + - Standard transformers: + - built_in_transformers/standard_transformers/index.md + - Cmd: built_in_transformers/standard_transformers/cmd.md + - Dict: built_in_transformers/standard_transformers/dict.md + - Hash: built_in_transformers/standard_transformers/hash.md + - Masking: built_in_transformers/standard_transformers/masking.md + - NoiseDate: built_in_transformers/standard_transformers/noise_date.md + - NoiseFloat: built_in_transformers/standard_transformers/noise_float.md + - NoiseInt: built_in_transformers/standard_transformers/noise_int.md + - RandomBool: built_in_transformers/standard_transformers/random_bool.md + - RandomChoice: built_in_transformers/standard_transformers/random_choice.md + - RandomDate: 
built_in_transformers/standard_transformers/random_date.md + - RandomFloat: built_in_transformers/standard_transformers/random_float.md + - RandomInt: built_in_transformers/standard_transformers/random_int.md + - RandomString: built_in_transformers/standard_transformers/random_string.md + - RandomUuid: built_in_transformers/standard_transformers/random_uuid.md + - RandomLatitude: built_in_transformers/standard_transformers/random_latitude.md + - RandomLongitude: built_in_transformers/standard_transformers/random_longitude.md + - RandomUnixTime: built_in_transformers/standard_transformers/random_unix_time.md + - RandomDayOfWeek: built_in_transformers/standard_transformers/random_day_of_week.md + - RandomDayOfMonth: built_in_transformers/standard_transformers/random_day_of_month.md + - RandomMonthName: built_in_transformers/standard_transformers/random_month_name.md + - RandomYearString: built_in_transformers/standard_transformers/random_year_string.md + - RandomCentury: built_in_transformers/standard_transformers/random_century.md + - RandomTimezone: built_in_transformers/standard_transformers/random_timezone.md + - RandomEmail: built_in_transformers/standard_transformers/random_email.md + - RandomUsername: built_in_transformers/standard_transformers/random_username.md + - RandomPassword: built_in_transformers/standard_transformers/random_password.md + - RandomMacAddress: built_in_transformers/standard_transformers/random_mac_address.md + - RandomDomainName: built_in_transformers/standard_transformers/random_domain_name.md + - RandomURL: built_in_transformers/standard_transformers/random_url.md + - RandomIPv4: built_in_transformers/standard_transformers/random_ipv4.md + - RandomIPv6: built_in_transformers/standard_transformers/random_ipv6.md + - RandomWord: built_in_transformers/standard_transformers/random_word.md + - RandomSentence: built_in_transformers/standard_transformers/random_sentence.md + - RandomParagraph: 
built_in_transformers/standard_transformers/random_paragraph.md + - RandomCCType: built_in_transformers/standard_transformers/random_cc_type.md + - RandomCCNumber: built_in_transformers/standard_transformers/random_cc_number.md + - RandomCurrency: built_in_transformers/standard_transformers/random_currency.md + - RandomAmountWithCurrency: built_in_transformers/standard_transformers/random_amount_with_currency.md + - RandomName: built_in_transformers/standard_transformers/random_name.md + - RandomLastName: built_in_transformers/standard_transformers/random_last_name.md + - RandomFirstName: built_in_transformers/standard_transformers/random_first_name.md + - RandomFirstNameMale: built_in_transformers/standard_transformers/random_first_name_male.md + - RandomFirstNameFemale: built_in_transformers/standard_transformers/random_first_name_female.md + - RandomTitleMale: built_in_transformers/standard_transformers/random_title_male.md + - RandomTitleFemale: built_in_transformers/standard_transformers/random_title_female.md + - RandomPhoneNumber: built_in_transformers/standard_transformers/random_phone_number.md + - RandomTollFreePhoneNumber: built_in_transformers/standard_transformers/random_toll_free_phone_number.md + - RandomE164PhoneNumber: built_in_transformers/standard_transformers/random_e164_phone_number.md + - RealAddress: built_in_transformers/standard_transformers/real_address.md + - RegexpReplace: built_in_transformers/standard_transformers/regexp_replace.md + - Replace: built_in_transformers/standard_transformers/replace.md + - SetNull: built_in_transformers/standard_transformers/set_null.md + - Advanced transformers: + - built_in_transformers/advanced_transformers/index.md + - Json: built_in_transformers/advanced_transformers/json.md + - Template: built_in_transformers/advanced_transformers/template.md + - TemplateRecord: built_in_transformers/advanced_transformers/template_record.md + - Custom functions: + - 
built_in_transformers/advanced_transformers/custom_functions/index.md + - Core custom functions: built_in_transformers/advanced_transformers/custom_functions/core_functions.md + - Faker function: built_in_transformers/advanced_transformers/custom_functions/faker_function.md + - Release notes: + - Greenmask 0.1.5: release_notes/greenmask_0_1_5.md + - Greenmask 0.1.4: release_notes/greenmask_0_1_4.md + - Greenmask 0.1.3: release_notes/greenmask_0_1_3.md + - Greenmask 0.1.2: release_notes/greenmask_0_1_2.md + - Greenmask 0.1.1: release_notes/greenmask_0_1_1.md + - Greenmask 0.1.0: release_notes/greenmask_0_1_0.md + - Greenmask 0.1.0 Beta: release_notes/greenmask_0_1_0_beta.md + +repo_url: https://github.com/GreenmaskIO/greenmask +repo_name: GreenmaskIO/greenmask +site_url: https://greenmask.io +copyright: Copyright © 2024 Greenmask + + +extra: + # version: + # provider: mike + social: + - icon: fontawesome/brands/x-twitter + link: https://twitter.com/GreenmaskIO + - icon: fontawesome/brands/discord + link: https://discord.gg/97AKHdGD + - icon: fontawesome/brands/github + link: https://github.com/GreenmaskIO/greenmask + +plugins: + - social + - search diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 00000000..b2dea1c9 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,5 @@ +{% extends "base.html" %} + +{% block announce %} + Version 0.1.5 is released +{% endblock %} diff --git a/docs/playground.md b/docs/playground.md new file mode 100644 index 00000000..23f51a53 --- /dev/null +++ b/docs/playground.md @@ -0,0 +1,63 @@ +# Greenmask Playground + +Greenmask Playground is a sandbox environment in Docker with sample databases included to help you try Greenmask without any additional actions. It includes the following components: + +* **Original database** — the source database you'll be working with. +* **Empty database for restoration** — an empty database where the restored data will be placed. 
+* **MinIO storage** — used for storage purposes. +* **Greenmask Utility** — Greenmask itself, ready for use. + +!!! warning + + To complete this guide, you must have **Docker** and **docker-compose** installed. + +## Setting up Greenmask Playground + +1. Clone the `greenmask` repository and navigate to its directory by running the following commands: + + ```shell + git clone git@github.com:GreenmaskIO/greenmask.git && cd greenmask + ``` + +2. Once you have cloned the repository, start the environment by running Docker Compose: + + ```shell + docker-compose run greenmask + ``` +!!! Tip + + If you're experiencing problems with pulling images from Docker Hub, you can build the Greenmask image from source by running the following command: + + ```shell + docker-compose run greenmask-from-source + ``` + +Now you have Greenmask Playground up and running with a shell prompt inside the container. All further operations will be carried out within this container's shell. + +## Commands + +Below you can see Greenmask commands: + +* `dump` — performs a logical data dump, transforms the data, and stores it in the designated storage. + +* `list-dumps` — retrieves a list of all stored dumps within the chosen storage. + +* `delete` — removes a dump with a specific ID from the storage. + +* `list-transformers` — displays a list of approved transformers and their documentation. + +* `restore` — restores a dump either by specifying its ID or using the latest available dump to the target database. + +* `show-dump` — presents metadata information about a specific dump (equivalent to `pg_restore -l ./`). + +* `validate` — executes a validation process and generates a data diff for the transformation. + +* `completion` — generates the autocompletion script for the specified shell. + +To learn more about them, see [Commands](commands.md). + +## Transformers + +A configuration file is mandatory for Greenmask functioning. 
The pre-defined configuration file is stored in the `playground` directory of the repository (`./playground/config.yml`). It also serves to define transformers which you can update to your liking in order to use Greenmask Playground more effectively and to get a better understanding of the tool itself. To learn how to customize a configuration file, see [Configuration](configuration.md).
+
+The pre-defined configuration file uses the [NoiseDate](built_in_transformers/standard_transformers/noise_date.md) transformer as an example. To learn more about other transformers and how to use them, see [Transformers](built_in_transformers/index.md). diff --git a/docs/release_notes/greenmask_0_1_0.md b/docs/release_notes/greenmask_0_1_0.md new file mode 100644 index 00000000..9a82e249 --- /dev/null +++ b/docs/release_notes/greenmask_0_1_0.md @@ -0,0 +1,75 @@ +# Greenmask 0.1.0
+
+We are excited to announce the release of Greenmask v0.1.0, marking the first production-ready version. This release addresses various bug fixes, introduces improvements, and includes documentation refactoring for enhanced clarity.
+
+## New features
+
+- Added positional arguments for the list-transformers command, allowing specific transformer information retrieval (e.g., `greenmask list-transformers RandomDate`).
+
+- Added a version parameter `--version` that prints Greenmask version.
+
+- Added numeric parameters support for `-Int` and `-Float` transformers.
+
+## Improvements
+
+- Improved verbosity in custom transformer interaction, accumulating `stderr` data and forwarding it in batches instead of writing it one by one.
+
+- Updated dependencies to newer versions.
+
+- Enhanced the stability of the JSON line interaction protocol by utilizing the stdlib JSON encoder/decoder.
+
+- Modified the method for sending table metadata to custom transformers; now, it is sent via `stdin` in the first line in JSON format instead of providing it via command arguments.
+
+- Refactored template functions naming.
+ +- Refactored `NoiseDate` transformer implementation for improved stability and predictability. + +- Changed the default value for the `Dict` transformer: `fail_not_matched parameter: true`. + +- Refactored the `Hash` transformer to provide a salt parameter and receive a base64 encoded salt. If salt is not provided, it generates one randomly. + +- Added validation for the truncate parameter of `NoiseDate` and `RandomDate` transformers that issues a warning if the provided value is invalid. + +- Increased verbosity of parameter validation warnings, now properly forwarding warnings to `stdout`. + +## Fixes + +- Resolved `pgx` driver connection leakage issue. + +- Fixed deletion failure of dumps for S3 storage. + +- Corrected cobra autocompletion for the Greenmask utility. + +- Fixed NOT NULL constraint validation. + +- Addressed JSON API interaction issues that previously caused deadlocks and timeouts. + +- Fixed encode-decoding for binary parameters, ensuring accurate forwarding of values to custom transformers. + +- Fixed the `RandomChoice` transformer to correctly marshal and unmarshal values during validation. + +- Introduced the nullable property for the `SetNull` transformer to enhance NOT NULL constraint validation. + +- Resolved text wrapping issues for the `validate` command. + +- Fixed build failures on Windows due to Linux platform dependencies. + +- Corrected `stdout` readline buffer reading during interaction with custom transformers. + +- Fixed integration tests. + +## Ecosystem changes + +- Implemented CI/CD pipelines for the entire project. + +- Established a user-friendly playground in Docker compose, including: + + - Deployed Minio storage container. + - PostgreSQL container containing both the original database (Adventure Works) and the transformed (empty DB). + - Greenmask container itself. + +- Refactored current readme files. 
+
+## Assets
+
+To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.0). diff --git a/docs/release_notes/greenmask_0_1_0_beta.md b/docs/release_notes/greenmask_0_1_0_beta.md new file mode 100644 index 00000000..1dda166f --- /dev/null +++ b/docs/release_notes/greenmask_0_1_0_beta.md @@ -0,0 +1,19 @@ +# Greenmask 0.1.0 Beta
+
+We are excited to announce the beta release of Greenmask, a versatile and open-source utility for PostgreSQL logical backup dumping, obfuscation, and restoration. Greenmask is perfect for routine backup and restoration tasks. It facilitates anonymization and data masking for staging environments and analytics.
+
+This release introduces a range of features aimed at enhancing database management and security.
+
+## Key features
+
+- Cross-platform support — fully written in Go without platform dependencies.
+- Type-safe database operations — validates and encodes data, maintaining integrity.
+- Transformation validation — ensures data transformations are correct and maintainable.
+- Partitioned table support — simplifies configuration for partitioned tables.
+- Stateless and backward compatible — works alongside standard PostgreSQL utilities.
+- Parallel execution — enhances efficiency in dumping and restoration processes.
+- Multiple storage options — supports both local (directory) and remote (S3-like) storage solutions.
+
+## Download
+
+To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.0-beta).
diff --git a/docs/release_notes/greenmask_0_1_1.md b/docs/release_notes/greenmask_0_1_1.md new file mode 100644 index 00000000..93e0a592 --- /dev/null +++ b/docs/release_notes/greenmask_0_1_1.md @@ -0,0 +1,49 @@ +# Greenmask 0.1.1 + +This release introduces a suite of new transformers, significantly enhancing Greenmask's capabilities for obfuscating PostgreSQL databases. + +## New features + +Added the following new transformers: + +| Transformer | Description | +|----------------------------------------------------------------------------------------|--------------------------------------------------| +| [RandomLatitude](../built_in_transformers/standard_transformers/random_latitude.md) | Generates a random latitude value | +| [RandomLongitude](../built_in_transformers/standard_transformers/random_longitude.md) | Generates a random longitude value | +| [RandomUnixTime](../built_in_transformers/standard_transformers/random_unix_time.md) | Generates a random Unix timestamp | +| [RandomMonthName](../built_in_transformers/standard_transformers/random_month_name.md) | Generates the name of a random month | +| [RandomYearString](../built_in_transformers/standard_transformers/random_year_string.md) | Generates a random year as a string | +| [RandomDayOfWeek](../built_in_transformers/standard_transformers/random_day_of_week.md) | Generates a random day of the week | +| [RandomDayOfMonth](../built_in_transformers/standard_transformers/random_day_of_month.md) | Generates a random day of the month | +| [RandomCentury](../built_in_transformers/standard_transformers/random_century.md) | Generates a random century | +| [RandomTimezone](../built_in_transformers/standard_transformers/random_timezone.md) | Generates a random timezone | +| [RandomEmail](../built_in_transformers/standard_transformers/random_email.md) | Generates a random email address | +| [RandomMacAddress](../built_in_transformers/standard_transformers/random_mac_address.md) | Generates a random MAC address | +| 
[RandomDomainName](../built_in_transformers/standard_transformers/random_domain_name.md) | Generates a random domain name | +| [RandomURL](../built_in_transformers/standard_transformers/random_url.md) | Generates a random URL | +| [RandomUsername](../built_in_transformers/standard_transformers/random_username.md) | Generates a random username | +| [RandomIPv4](../built_in_transformers/standard_transformers/random_ipv4.md) | Generates a random IPv4 address | +| [RandomIPv6](../built_in_transformers/standard_transformers/random_ipv6.md) | Generates a random IPv6 address | +| [RandomPassword](../built_in_transformers/standard_transformers/random_password.md) | Generates a random password | +| [RandomWord](../built_in_transformers/standard_transformers/random_word.md) | Generates a random word | +| [RandomSentence](../built_in_transformers/standard_transformers/random_sentence.md) | Generates a random sentence | +| [RandomParagraph](../built_in_transformers/standard_transformers/random_paragraph.md) | Generates a random paragraph | +| [RandomCCType](../built_in_transformers/standard_transformers/random_cc_type.md) | Generates a random credit card type | +| [RandomCCNumber](../built_in_transformers/standard_transformers/random_cc_number.md) | Generates a random credit card number | +| [RandomCurrency](../built_in_transformers/standard_transformers/random_currency.md) | Generates a random currency code | +| [RandomAmountWithCurrency](../built_in_transformers/standard_transformers/random_amount_with_currency.md) | Generates a random monetary amount with currency | +| [RandomTitleMale](../built_in_transformers/standard_transformers/random_title_male.md) | Generates a random title for males | +| [RandomTitleFemale](../built_in_transformers/standard_transformers/random_title_female.md) | Generates a random title for females | +| [RandomFirstName](../built_in_transformers/standard_transformers/random_first_name.md) | Generates a random first name | +| 
[RandomFirstNameMale](../built_in_transformers/standard_transformers/random_first_name_male.md) | Generates a random male first name | +| [RandomFirstNameFemale](../built_in_transformers/standard_transformers/random_first_name_female.md) | Generates a random female first name | +| [RandomLastName](../built_in_transformers/standard_transformers/random_last_name.md) | Generates a random last name | +| [RandomName](../built_in_transformers/standard_transformers/random_name.md) | Generates a full random name | +| [RandomPhoneNumber](../built_in_transformers/standard_transformers/random_phone_number.md) | Generates a random phone number | +| [RandomTollFreePhoneNumber](../built_in_transformers/standard_transformers/random_toll_free_phone_number.md) | Generates a random toll-free phone number | +| [RandomE164PhoneNumber](../built_in_transformers/standard_transformers/random_e164_phone_number.md) | Generates a random phone number in E.164 format | +| [RealAddress](../built_in_transformers/standard_transformers/real_address.md) | Generates a real address | + +## Assets + +To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.1). diff --git a/docs/release_notes/greenmask_0_1_2.md b/docs/release_notes/greenmask_0_1_2.md new file mode 100644 index 00000000..6e626114 --- /dev/null +++ b/docs/release_notes/greenmask_0_1_2.md @@ -0,0 +1,13 @@ +# Greenmask 0.1.2 + +This release introduces bug fixes. + +## Fixes + +- Fixed bug when raw COPY lines were parsed incorrectly +- Fixed `--version` parameter behavior +- Fixed `--dbname` parameter - now it correctly works with PostgreSQL connection string in URI format `postgresql:///` + +## Assets + +To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.2). 
diff --git a/docs/release_notes/greenmask_0_1_3.md b/docs/release_notes/greenmask_0_1_3.md new file mode 100644 index 00000000..937721a2 --- /dev/null +++ b/docs/release_notes/greenmask_0_1_3.md @@ -0,0 +1,12 @@ +# Greenmask 0.1.3 + +This release introduces bug fixes. + +## Fixes + +- Fixed the JSON transformer's parsing for the `operations` fields +- Fixed database connection string builder in `pg_restore` and `pg_dump` + +## Assets + +To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.3). diff --git a/docs/release_notes/greenmask_0_1_4.md b/docs/release_notes/greenmask_0_1_4.md new file mode 100644 index 00000000..af6fe42f --- /dev/null +++ b/docs/release_notes/greenmask_0_1_4.md @@ -0,0 +1,11 @@ +# Greenmask 0.1.4 + +This release introduces bug fixes. + +## Fixes + +- Fixed database connection string behavior fields + +## Assets + +To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.4). diff --git a/docs/release_notes/greenmask_0_1_5.md b/docs/release_notes/greenmask_0_1_5.md new file mode 100644 index 00000000..bf94a6e1 --- /dev/null +++ b/docs/release_notes/greenmask_0_1_5.md @@ -0,0 +1,25 @@ +# Greenmask 0.1.5 + +This release introduces a new Greenmask command, improvements, bug fixes, and numerous documentation updates. + +## New features + +Added a new Greenmask CLI command—[show-transformer](../commands.md#show-transformer) that shows detailed information about a specified transformer. + +## Improvements + +- The [Hash transformer](../built_in_transformers/standard_transformers/hash.md) has been completely remastered and now has the `function` parameter to choose from several hash algorithm options and the `max_length` parameter to truncate the hash tail. 
+- Split information about transformers between the `list-transformers` and new `show-transformer` CLI commands, which allows for more comprehensible and useful outputs for both commands +- Added error severity for the `Cmd` parameter validator +- Improved UX for the Greenmask release binaries + +## Fixes + +- Fixed metadata enrichment for validation warnings caused by `RawValueValidator` +- Fixed a typo in the `credit_card` value for the `type` parameter of the `Masking` transformer +- Fixed Greenmask Playground environment variables and the `cleanup` command +- Fixed `list-dump`, `list-transformers`, and `restore` commands exit code on error + +## Assets + +To download the Greenmask binary compatible with your system, see the [release's assets list](https://github.com/GreenmaskIO/greenmask/releases/tag/v0.1.5). diff --git a/internal/db/postgres/transformers/cmd.go b/internal/db/postgres/transformers/cmd.go index e978fb48..74f23f9c 100644 --- a/internal/db/postgres/transformers/cmd.go +++ b/internal/db/postgres/transformers/cmd.go @@ -66,7 +66,8 @@ var CmdTransformerDefinition = utils.NewDefinition( `"skip_original_data": "type:bool, required:false, description: is original data required for transformer",`+ `"skip_on_null_input": "type:bool, required:false, description: skip transformation on null input"`+ `}`, - ).SetDefaultValue([]byte("[]")), + ).SetDefaultValue([]byte("[]")). + SetIsColumnContainer(true), toolkit.MustNewParameter( "executable", @@ -455,7 +456,7 @@ func cmdValidateSkipBehaviour(p *toolkit.Parameter, v toolkit.ParamsValue) (tool if value != skipOnAnyName && value != skipOnAllName { return toolkit.ValidationWarnings{ toolkit.NewValidationWarning(). - AddMeta("ParameterName", p.Name). + SetSeverity(toolkit.ErrorValidationSeverity). AddMeta("ParameterValue", value). 
SetMsg(`unsupported skip_on type: must be one of "all" or "any"`), }, nil diff --git a/internal/db/postgres/transformers/hash.go b/internal/db/postgres/transformers/hash.go index ffccb953..f10209d5 100644 --- a/internal/db/postgres/transformers/hash.go +++ b/internal/db/postgres/transformers/hash.go @@ -16,24 +16,19 @@ package transformers import ( "context" - crand "crypto/rand" - "encoding/base64" + "crypto/md5" + "crypto/sha1" + "crypto/sha256" + "crypto/sha512" + "encoding/hex" "fmt" - "slices" - - "golang.org/x/crypto/scrypt" + "hash" + "strconv" "github.com/greenmaskio/greenmask/internal/db/postgres/transformers/utils" "github.com/greenmaskio/greenmask/pkg/toolkit" ) -// TODO: Make length truncation - -const ( - saltLength = 32 - bufLength = 1024 -) - var HashTransformerDefinition = utils.NewDefinition( utils.NewTransformerProperties( "Hash", @@ -51,17 +46,27 @@ var HashTransformerDefinition = utils.NewDefinition( ).SetRequired(true), toolkit.MustNewParameter( - "salt", - "salt for hash function base64 encoded", - ), + "function", + "hash function name. Possible values sha1, sha256, sha512, md5", + ).SetDefaultValue([]byte("sha1")). + SetRawValueValidator(validateHashFunctionsParameter), + + toolkit.MustNewParameter( + "max_length", + "limit length of hash function result", + ).SetDefaultValue([]byte("0")). 
+ SetRawValueValidator(validateMaxLengthParameter), ) type HashTransformer struct { - salt toolkit.ParamsValue - columnName string - affectedColumns map[int]string - columnIdx int - res []byte + columnName string + affectedColumns map[int]string + columnIdx int + h hash.Hash + maxLength int + encodedOutputLength int + hashBuf []byte + resultBuf []byte } func NewHashTransformer( @@ -80,28 +85,46 @@ func NewHashTransformer( affectedColumns := make(map[int]string) affectedColumns[idx] = columnName - var salt toolkit.ParamsValue - p = parameters["salt"] - var err error - if len(p.RawValue()) > 0 { - salt, err = base64.StdEncoding.DecodeString(string(p.RawValue())) - if err != nil { - return nil, nil, fmt.Errorf("error decoding \"salt\" value from base64: %w", err) - } - } else { - b := make(toolkit.ParamsValue, saltLength) - if _, err := crand.Read(b); err != nil { - return nil, nil, err - } - salt = b + p = parameters["function"] + var hashFunctionName string + if _, err := p.Scan(&hashFunctionName); err != nil { + return nil, nil, fmt.Errorf("unable to scan \"function\" parameter: %w", err) + } + + p = parameters["max_length"] + var maxLength int + if _, err := p.Scan(&maxLength); err != nil { + return nil, nil, fmt.Errorf("unable to scan \"max_length\" parameter: %w", err) + } + + var h hash.Hash + var hashFunctionLength int + switch hashFunctionName { + case "md5": + h = md5.New() + hashFunctionLength = 16 + case "sha1": + h = sha1.New() + hashFunctionLength = 20 + case "sha256": + h = sha256.New() + hashFunctionLength = 32 + case "sha512": + h = sha512.New() + hashFunctionLength = 64 + default: + return nil, nil, fmt.Errorf("unknown hash function \"%s\"", hashFunctionName) } return &HashTransformer{ - salt: salt, - columnName: columnName, - affectedColumns: affectedColumns, - columnIdx: idx, - res: make([]byte, 0, bufLength), + columnName: columnName, + affectedColumns: affectedColumns, + columnIdx: idx, + maxLength: maxLength, + hashBuf: make([]byte, 0, 
hashFunctionLength), + resultBuf: make([]byte, hex.EncodedLen(hashFunctionLength)), + encodedOutputLength: hex.EncodedLen(hashFunctionLength), + h: h, }, nil, nil } @@ -126,26 +149,56 @@ func (ht *HashTransformer) Transform(ctx context.Context, r *toolkit.Record) (*t return r, nil } - dk, err := scrypt.Key(val.Data, ht.salt, 32768, 8, 1, 32) + defer ht.h.Reset() + _, err = ht.h.Write(val.Data) if err != nil { - return nil, fmt.Errorf("cannot perform hash calculation: %w", err) + return nil, fmt.Errorf("unable to write raw data into writer: %w", err) } + ht.hashBuf = ht.hashBuf[:0] + ht.hashBuf = ht.h.Sum(ht.hashBuf) + + hex.Encode(ht.resultBuf, ht.hashBuf) - length := base64.StdEncoding.EncodedLen(len(dk)) - if len(ht.res) < length { - slices.Grow(ht.res, length) + maxLength := ht.encodedOutputLength + if ht.maxLength > 0 && ht.encodedOutputLength > ht.maxLength { + maxLength = ht.maxLength } - ht.res = ht.res[0:length] - //base64.StdEncoding.EncodeToString(ht.res) - base64.StdEncoding.Encode(ht.res, dk) - if err := r.SetRawColumnValueByIdx(ht.columnIdx, toolkit.NewRawValue(ht.res, false)); err != nil { + if err := r.SetRawColumnValueByIdx(ht.columnIdx, toolkit.NewRawValue(ht.resultBuf[:maxLength], false)); err != nil { return nil, fmt.Errorf("unable to set new value: %w", err) } return r, nil } +func validateHashFunctionsParameter(p *toolkit.Parameter, v toolkit.ParamsValue) (toolkit.ValidationWarnings, error) { + functionName := string(v) + switch functionName { + case "md5", "sha1", "sha256", "sha512": + return nil, nil + } + return toolkit.ValidationWarnings{ + toolkit.NewValidationWarning(). + SetSeverity(toolkit.ErrorValidationSeverity). + AddMeta("ParameterValue", functionName). 
+ SetMsg(`unknown hash function name`)}, nil +} + +func validateMaxLengthParameter(p *toolkit.Parameter, v toolkit.ParamsValue) (toolkit.ValidationWarnings, error) { + max_length, err := strconv.ParseInt(string(v), 10, 32) + if err != nil { + return nil, fmt.Errorf("error parsing \"max_length\" as integer: %w", err) + } + if max_length >= 0 { + return nil, nil + } + return toolkit.ValidationWarnings{ + toolkit.NewValidationWarning(). + SetSeverity(toolkit.ErrorValidationSeverity). + AddMeta("ParameterValue", string(v)). + SetMsg(`max_length parameter cannot be less than zero`)}, nil +} + func init() { utils.DefaultTransformerRegistry.MustRegister(HashTransformerDefinition) } diff --git a/internal/db/postgres/transformers/hash_test.go b/internal/db/postgres/transformers/hash_test.go index 4b27d7ec..1bc040a8 100644 --- a/internal/db/postgres/transformers/hash_test.go +++ b/internal/db/postgres/transformers/hash_test.go @@ -23,46 +23,212 @@ import ( "github.com/greenmaskio/greenmask/pkg/toolkit" ) -func TestHashTransformer_Transform(t *testing.T) { - var attrName = "data" - var originalValue = "old_value" - var expectedValue = toolkit.NewValue("jzTVGK2UHz3ERhrYiZDoDzcKeMxSsgxHHgWlL9OrkZ4=", false) - driver, record := getDriverAndRecord(attrName, originalValue) +func TestHashTransformer_Transform_all_functions(t *testing.T) { + columnValue := toolkit.ParamsValue("data") + tests := []struct { + name string + params map[string]toolkit.ParamsValue + original string + result string + }{ + { + name: "md5", + params: map[string]toolkit.ParamsValue{ + "column": columnValue, + "function": []byte("md5"), + }, + original: "123", + result: "202cb962ac59075b964b07152d234b70", + }, + { + name: "sha1", + params: map[string]toolkit.ParamsValue{ + "column": columnValue, + "function": []byte("sha1"), + }, + original: "123", + result: "40bd001563085fc35165329ea1ff5c5ecbdbbeef", + }, + { + name: "sha256", + params: map[string]toolkit.ParamsValue{ + "column": columnValue, + "function": 
[]byte("sha256"), + }, + original: "123", + result: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", + }, + { + name: "sha512", + params: map[string]toolkit.ParamsValue{ + "column": columnValue, + "function": []byte("sha512"), + }, + original: "123", + result: "3c9909afec25354d551dae21590bb26e38d53f2173b8d3dc3eee4c047e7ab1c1eb8b85103e3be7ba613b31bb5c9c36214dc9f14a42fd7a2fdb84856bca5c44c2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + driver, record := getDriverAndRecord(string(tt.params["column"]), tt.original) + transformer, warnings, err := HashTransformerDefinition.Instance( + context.Background(), + driver, tt.params, + nil, + ) + require.NoError(t, err) + require.Empty(t, warnings) + r, err := transformer.Transform( + context.Background(), + record, + ) + require.NoError(t, err) + + res, err := r.GetRawColumnValueByName(string(tt.params["column"])) + require.NoError(t, err) + + require.False(t, res.IsNull) + require.Equal(t, tt.result, string(res.Data)) + + }) + } +} + +func Test_validateHashFunctionsParameter(t *testing.T) { + + tests := []struct { + name string + value []byte + }{ + { + name: "md5", + value: []byte("md5"), + }, + { + name: "sha1", + value: []byte("md5"), + }, + { + name: "sha256", + value: []byte("md5"), + }, + { + name: "sha512", + value: []byte("md5"), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + warns, err := validateHashFunctionsParameter(nil, tt.value) + require.NoError(t, err) + require.Empty(t, warns) + }) + } + + t.Run("wrong value", func(t *testing.T) { + warns, err := validateHashFunctionsParameter(nil, []byte("md8")) + require.NoError(t, err) + require.Len(t, warns, 1) + warn := warns[0] + require.Equal(t, toolkit.ErrorValidationSeverity, warn.Severity) + require.Equal(t, "unknown hash function name", warn.Msg) + }) + +} + +func TestHashTransformer_Transform_length_truncation(t *testing.T) { + + params := map[string]toolkit.ParamsValue{ + 
"column": toolkit.ParamsValue("data"), + "max_length": toolkit.ParamsValue("4"), + "function": toolkit.ParamsValue("sha1"), + } + original := "123" + expected := "40bd" + // Check that internal buffers wipes correctly without data lost + driver, record := getDriverAndRecord(string(params["column"]), original) transformer, warnings, err := HashTransformerDefinition.Instance( context.Background(), - driver, map[string]toolkit.ParamsValue{ - "column": toolkit.ParamsValue(attrName), - "salt": toolkit.ParamsValue("MTIzNDU2Nw=="), - }, + driver, params, nil, ) require.NoError(t, err) require.Empty(t, warnings) - r, err := transformer.Transform( context.Background(), record, ) require.NoError(t, err) - res, err := r.GetColumnValueByName(attrName) + + res, err := r.GetRawColumnValueByName(string(params["column"])) require.NoError(t, err) - require.Equal(t, expectedValue.IsNull, res.IsNull) - require.Equal(t, expectedValue.Value, res.Value) + require.False(t, res.IsNull) + require.Equal(t, expected, string(res.Data)) +} + +func TestHashTransformer_Transform_multiple_iterations(t *testing.T) { + columnValue := toolkit.ParamsValue("data") - originalValue = "123asdasdasdaasdlmaklsdmklamsdlkmalksdmlkamsdlkmalkdmlkasds" - expectedValue = toolkit.NewValue("kZsJbWbVoBGMqniHTCzU6fJrxQdlfeqhYIUxOo3JniA=", false) - _, record = getDriverAndRecord(attrName, originalValue) - r, err = transformer.Transform( + params := map[string]toolkit.ParamsValue{ + "column": toolkit.ParamsValue("data"), + "function": toolkit.ParamsValue("sha1"), + } + original := "123" + // Check that internal buffers wipes correctly without data lost + driver, record := getDriverAndRecord(string(params["column"]), original) + transformer, warnings, err := HashTransformerDefinition.Instance( context.Background(), - record, + driver, params, + nil, ) require.NoError(t, err) - res, err = r.GetColumnValueByName(attrName) - require.NoError(t, err) + require.Empty(t, warnings) + + tests := []struct { + name string + 
original string + expected string + }{ + { + name: "run1", + original: "123", + expected: "40bd001563085fc35165329ea1ff5c5ecbdbbeef", + }, + { + name: "run2", + original: "456", + expected: "51eac6b471a284d3341d8c0c63d0f1a286262a18", + }, + { + name: "run3", + original: "789", + expected: "fc1200c7a7aa52109d762a9f005b149abef01479", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + defer record.Row.Encode() + + err = record.Row.Decode([]byte(tt.original)) + require.NoError(t, err) + + _, err = transformer.Transform( + context.Background(), + record, + ) + require.NoError(t, err) + + res, err := record.GetRawColumnValueByName(string(columnValue)) + require.NoError(t, err) - require.Equal(t, expectedValue.IsNull, res.IsNull) - require.Equal(t, expectedValue.Value, res.Value) + require.False(t, res.IsNull) + require.Equal(t, tt.expected, string(res.Data)) + }) + } } diff --git a/internal/db/postgres/transformers/masking.go b/internal/db/postgres/transformers/masking.go index 5ed43734..9f98c0d7 100644 --- a/internal/db/postgres/transformers/masking.go +++ b/internal/db/postgres/transformers/masking.go @@ -34,7 +34,7 @@ const ( MMobile string = "mobile" MTelephone string = "tel" MID string = "id" - MCreditCard string = "credit_cart" + MCreditCard string = "credit_card" MURL string = "url" MDefault string = "default" ) @@ -50,14 +50,15 @@ var MaskingTransformerDefinition = utils.NewDefinition( toolkit.MustNewParameter( "column", "column name", - ).SetIsColumn(toolkit.NewColumnProperties(). - SetAffected(true). - SetAllowedColumnTypes("text", "varchar"), + ).SetIsColumn( + toolkit.NewColumnProperties(). + SetAffected(true). 
+ SetAllowedColumnTypes("text", "varchar"), ).SetRequired(true), toolkit.MustNewParameter( "type", - "logical type of attribute (default, password, name, addr, email, mobile, tel, id, credit, url)", + "logical type of attribute (default, password, name, addr, email, mobile, tel, id, credit_card, url)", ).SetRawValueValidator(maskerTypeValidator). SetDefaultValue(toolkit.ParamsValue(MDefault)), ) diff --git a/internal/db/postgres/transformers/real_address.go b/internal/db/postgres/transformers/real_address.go index d8108730..96d262c9 100644 --- a/internal/db/postgres/transformers/real_address.go +++ b/internal/db/postgres/transformers/real_address.go @@ -43,7 +43,8 @@ var RealAddressTransformerDefinition = utils.NewDefinition( `"template": "type:string, required:true, description: gotemplate with real address attributes injections",`+ `"keep_null": "type:bool, required:false, description: keep null values",`+ `}`, - ).SetRequired(true), + ).SetRequired(true). + SetIsColumnContainer(true), ) type RealAddressTransformer struct { diff --git a/internal/db/postgres/transformers/utils/template/functions.go b/internal/db/postgres/transformers/utils/template/functions.go index 541f0974..525eae09 100644 --- a/internal/db/postgres/transformers/utils/template/functions.go +++ b/internal/db/postgres/transformers/utils/template/functions.go @@ -47,7 +47,7 @@ const ( MMobile string = "mobile" MTelephone string = "tel" MID string = "id" - MCreditCard string = "credit_cart" + MCreditCard string = "credit_card" MURL string = "url" MDefault string = "default" ) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..55505bd2 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,147 @@ +site_name: Greenmask — PostgreSQL masking and obfuscation tool + +# Theme +theme: + name: material + custom_dir: docs/overrides + logo: assets/logo.png + favicon: assets/logo.png + features: + - navigation.indexes + - announce.dismiss + - content.code.annotate + - content.code.copy + - 
content.tooltips + # - navigation.sections + - navigation.tabs + # - navigation.top + # - navigation.tracking + - search.highlight + - search.share + - search.suggest + # - toc.follow + palette: + scheme: default + primary: teal + accent: indigo + toggle: + icon: material/brightness-7 + + +# Markdown extensions +markdown_extensions: + - attr_list + - toc: + permalink: true + - pymdownx.highlight + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - admonition + - pymdownx.details + - tables + +nav: + - Home: + - Home: index.md + - Documentation: + - Installation: installation.md + - Configuration: configuration.md + - Architecture: architecture.md + - Playground: playground.md + - Commands: commands.md + - Transformers: + - built_in_transformers/index.md + - Standard transformers: + - built_in_transformers/standard_transformers/index.md + - Cmd: built_in_transformers/standard_transformers/cmd.md + - Dict: built_in_transformers/standard_transformers/dict.md + - Hash: built_in_transformers/standard_transformers/hash.md + - Masking: built_in_transformers/standard_transformers/masking.md + - NoiseDate: built_in_transformers/standard_transformers/noise_date.md + - NoiseFloat: built_in_transformers/standard_transformers/noise_float.md + - NoiseInt: built_in_transformers/standard_transformers/noise_int.md + - RandomBool: built_in_transformers/standard_transformers/random_bool.md + - RandomChoice: built_in_transformers/standard_transformers/random_choice.md + - RandomDate: built_in_transformers/standard_transformers/random_date.md + - RandomFloat: built_in_transformers/standard_transformers/random_float.md + - RandomInt: built_in_transformers/standard_transformers/random_int.md + - RandomString: built_in_transformers/standard_transformers/random_string.md + - RandomUuid: built_in_transformers/standard_transformers/random_uuid.md + - RandomLatitude: built_in_transformers/standard_transformers/random_latitude.md + - RandomLongitude: 
built_in_transformers/standard_transformers/random_longitude.md + - RandomUnixTime: built_in_transformers/standard_transformers/random_unix_time.md + - RandomDayOfWeek: built_in_transformers/standard_transformers/random_day_of_week.md + - RandomDayOfMonth: built_in_transformers/standard_transformers/random_day_of_month.md + - RandomMonthName: built_in_transformers/standard_transformers/random_month_name.md + - RandomYearString: built_in_transformers/standard_transformers/random_year_string.md + - RandomCentury: built_in_transformers/standard_transformers/random_century.md + - RandomTimezone: built_in_transformers/standard_transformers/random_timezone.md + - RandomEmail: built_in_transformers/standard_transformers/random_email.md + - RandomUsername: built_in_transformers/standard_transformers/random_username.md + - RandomPassword: built_in_transformers/standard_transformers/random_password.md + - RandomMacAddress: built_in_transformers/standard_transformers/random_mac_address.md + - RandomDomainName: built_in_transformers/standard_transformers/random_domain_name.md + - RandomURL: built_in_transformers/standard_transformers/random_url.md + - RandomIPv4: built_in_transformers/standard_transformers/random_ipv4.md + - RandomIPv6: built_in_transformers/standard_transformers/random_ipv6.md + - RandomWord: built_in_transformers/standard_transformers/random_word.md + - RandomSentence: built_in_transformers/standard_transformers/random_sentence.md + - RandomParagraph: built_in_transformers/standard_transformers/random_paragraph.md + - RandomCCType: built_in_transformers/standard_transformers/random_cc_type.md + - RandomCCNumber: built_in_transformers/standard_transformers/random_cc_number.md + - RandomCurrency: built_in_transformers/standard_transformers/random_currency.md + - RandomAmountWithCurrency: built_in_transformers/standard_transformers/random_amount_with_currency.md + - RandomName: built_in_transformers/standard_transformers/random_name.md + - RandomLastName: 
built_in_transformers/standard_transformers/random_last_name.md + - RandomFirstName: built_in_transformers/standard_transformers/random_first_name.md + - RandomFirstNameMale: built_in_transformers/standard_transformers/random_first_name_male.md + - RandomFirstNameFemale: built_in_transformers/standard_transformers/random_first_name_female.md + - RandomTitleMale: built_in_transformers/standard_transformers/random_title_male.md + - RandomTitleFemale: built_in_transformers/standard_transformers/random_title_female.md + - RandomPhoneNumber: built_in_transformers/standard_transformers/random_phone_number.md + - RandomTollFreePhoneNumber: built_in_transformers/standard_transformers/random_toll_free_phone_number.md + - RandomE164PhoneNumber: built_in_transformers/standard_transformers/random_e164_phone_number.md + - RealAddress: built_in_transformers/standard_transformers/real_address.md + - RegexpReplace: built_in_transformers/standard_transformers/regexp_replace.md + - Replace: built_in_transformers/standard_transformers/replace.md + - SetNull: built_in_transformers/standard_transformers/set_null.md + - Advanced transformers: + - built_in_transformers/advanced_transformers/index.md + - Json: built_in_transformers/advanced_transformers/json.md + - Template: built_in_transformers/advanced_transformers/template.md + - TemplateRecord: built_in_transformers/advanced_transformers/template_record.md + - Custom functions: + - built_in_transformers/advanced_transformers/custom_functions/index.md + - Core custom functions: built_in_transformers/advanced_transformers/custom_functions/core_functions.md + - Faker function: built_in_transformers/advanced_transformers/custom_functions/faker_function.md + - Release notes: + - Greenmask 0.1.5: release_notes/greenmask_0_1_5.md + - Greenmask 0.1.4: release_notes/greenmask_0_1_4.md + - Greenmask 0.1.3: release_notes/greenmask_0_1_3.md + - Greenmask 0.1.2: release_notes/greenmask_0_1_2.md + - Greenmask 0.1.1: 
release_notes/greenmask_0_1_1.md + - Greenmask 0.1.0: release_notes/greenmask_0_1_0.md + - Greenmask 0.1.0 Beta: release_notes/greenmask_0_1_0_beta.md + +repo_url: https://github.com/GreenmaskIO/greenmask +repo_name: GreenmaskIO/greenmask +site_url: https://greenmask.io +copyright: Copyright © 2024 Greenmask + + +extra: + # version: + # provider: mike + social: + - icon: fontawesome/brands/x-twitter + link: https://twitter.com/GreenmaskIO + - icon: fontawesome/brands/discord + link: https://discord.gg/97AKHdGD + - icon: fontawesome/brands/github + link: https://github.com/GreenmaskIO/greenmask + +plugins: + - social + - search diff --git a/pkg/toolkit/parameter.go b/pkg/toolkit/parameter.go index 81b98bd9..bb70191a 100644 --- a/pkg/toolkit/parameter.go +++ b/pkg/toolkit/parameter.go @@ -113,6 +113,8 @@ type Parameter struct { // IsColumn - shows is this parameter column related. If so ColumnProperties must be defined and assigned // otherwise it may cause an unhandled behaviour IsColumn bool `mapstructure:"is_column" json:"is_column"` + // IsColumnContainer - describe is parameter container map or list with multiple columns inside. It allows us to + IsColumnContainer bool `mapstructure:"is_column_container" json:"is_column_container"` // LinkParameter - link with parameter with provided name. 
This is required if performing raw value encoding // depends on the provided column type and/or relies on the database Driver LinkParameter string `mapstructure:"link_parameter" json:"link_parameter,omitempty"` @@ -297,6 +299,11 @@ func (p *Parameter) SetIsColumn(columnProperties *ColumnProperties) *Parameter { return p } +func (p *Parameter) SetIsColumnContainer(v bool) *Parameter { + p.IsColumnContainer = v + return p +} + func (p *Parameter) SetUnmarshaller(unmarshaller Unmarshaller) *Parameter { p.Unmarshaller = unmarshaller return p @@ -351,15 +358,15 @@ func (p *Parameter) Init(driver *Driver, types []*Type, params []*Parameter, raw } if p.RawValueValidator != nil { - w, err := p.RawValueValidator(p, p.rawValue) + rawValueValidatorWarns, err := p.RawValueValidator(p, p.rawValue) if err != nil { return nil, fmt.Errorf("error performing parameter raw value validation: %w", err) } - for _, w := range warnings { + for _, w := range rawValueValidatorWarns { w.AddMeta("ParameterName", p.Name) } - warnings = append(warnings, w...) - if w.IsFatal() { + warnings = append(warnings, rawValueValidatorWarns...) 
+ if rawValueValidatorWarns.IsFatal() { return warnings, nil } } diff --git a/playground/cleanup.sh b/playground/cleanup.sh index 94061098..072244bb 100755 --- a/playground/cleanup.sh +++ b/playground/cleanup.sh @@ -1,6 +1,6 @@ #!/bin/bash -if psql -lqt -p 5432 -h db -U postgres | cut -d \| -f 1 | grep -qw $TRANSFORMED_DB_NAME; then - psql -p 5432 -h db -U postgres -c "DROP DATABASE $TRANSFORMED_DB_NAME;" - psql -p 5432 -h db -U postgres -c "CREATE DATABASE $TRANSFORMED_DB_NAME;" +if psql -lqt -p 5432 -h $DATABASE_HOST -U postgres | cut -d \| -f 1 | grep -qw $TRANSFORMED_DB_NAME; then + psql -p 5432 -h $DATABASE_HOST -U postgres -c "DROP DATABASE $TRANSFORMED_DB_NAME;" + psql -p 5432 -h $DATABASE_HOST -U postgres -c "CREATE DATABASE $TRANSFORMED_DB_NAME;" fi diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..8d9fdc6a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,47 @@ +Babel==2.13.0 +beautifulsoup4==4.12.2 +cairocffi==1.6.1 +CairoSVG==2.7.1 +certifi==2023.7.22 +cffi==1.16.0 +charset-normalizer==3.3.0 +click==8.1.7 +colorama==0.4.6 +cssselect2==0.7.0 +defusedxml==0.7.1 +ghp-import==2.1.0 +gitdb==4.0.11 +GitPython==3.1.41 +idna==3.4 +Jinja2==3.1.3 +lxml==4.9.3 +Markdown==3.5 +MarkupSafe==2.1.3 +mergedeep==1.3.4 +mkdocs==1.5.3 +mkdocs-git-authors-plugin==0.7.2 +mkdocs-git-committers-plugin-2==1.2.0 +mkdocs-git-revision-date-localized-plugin==1.2.1 +mkdocs-material==9.4.6 +mkdocs-material-extensions==1.3 +packaging==23.2 +paginate==0.5.6 +pathspec==0.11.2 +Pillow==10.2.0 +platformdirs==3.11.0 +pycparser==2.21 +Pygments==2.16.1 +pymdown-extensions==10.3.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +pyyaml_env_tag==0.1 +regex==2023.10.3 +requests==2.31.0 +six==1.16.0 +smmap==5.0.1 +soupsieve==2.5 +tinycss2==1.2.1 +urllib3==2.0.7 +watchdog==3.0.0 +webencodings==0.5.1 diff --git a/tests/integration/toc/backward_compatibility_test.go b/tests/integration/toc/backward_compatibility_test.go index 69e49881..9db86668 100644 
--- a/tests/integration/toc/backward_compatibility_test.go +++ b/tests/integration/toc/backward_compatibility_test.go @@ -24,13 +24,14 @@ import ( "path" "time" + "github.com/jackc/pgx/v5" + "github.com/rs/zerolog/log" + "github.com/stretchr/testify/suite" + "github.com/greenmaskio/greenmask/internal/db/postgres/pgdump" "github.com/greenmaskio/greenmask/internal/domains" "github.com/greenmaskio/greenmask/internal/storages/directory" "github.com/greenmaskio/greenmask/pkg/toolkit" - "github.com/jackc/pgx/v5" - "github.com/rs/zerolog/log" - "github.com/stretchr/testify/suite" ) var config = &domains.Config{