From 261167de9420ba37a8be6cb50a7d090b2637f303 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 25 Jun 2024 14:07:55 -0500 Subject: [PATCH 01/51] Allow cloud archiving to run next ot the primary --- internal/flybarman/node.go | 4 +-- internal/flypg/node.go | 1 + internal/flypg/pg.go | 41 ++++++++++++++++++++++ internal/flypg/pg_test.go | 71 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/internal/flybarman/node.go b/internal/flybarman/node.go index 455d9cd5..2c1fe0d4 100644 --- a/internal/flybarman/node.go +++ b/internal/flybarman/node.go @@ -139,8 +139,8 @@ wal_retention_policy = main return fmt.Errorf("failed to write file %s: %s", n.RootPasswordConfigPath, err) } - barmanCronFileContent := `* * * * * /usr/local/bin/barman_cron -` + barmanCronFileContent := `* * * * * /usr/local/bin/barman_cron` + if err := os.WriteFile(n.BarmanCronFile, []byte(barmanCronFileContent), 0644); err != nil { return fmt.Errorf("failed write %s: %s", n.BarmanCronFile, err) } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 5a9364b2..9785b944 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -98,6 +98,7 @@ func NewNode() (*Node, error) { node.RepMgr.Witness = present node.PGConfig = PGConfig{ + AppName: node.AppName, DataDir: node.DataDir, Port: node.Port, ConfigFilePath: fmt.Sprintf("%s/postgresql.conf", node.DataDir), diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index f67f9f33..574ee8a1 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -19,6 +19,7 @@ import ( ) type PGConfig struct { + AppName string ConfigFilePath string InternalConfigFilePath string UserConfigFilePath string @@ -171,6 +172,26 @@ func (c *PGConfig) SetDefaults() error { "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), } + // Works to configure archiving to object storage if enabled + switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { + case "true": + if err := validateCloudArchiver(); err != nil { + return fmt.Errorf("failed to validate s3 archiver: %s", err) + } + + bucket := strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")) + endpoint := strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL")) + + archiveCommand := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --gzip --endpoint-url %s s3://%s %s %%p'", endpoint, bucket, c.AppName) + c.internalConfig["archive_mode"] = "on" + c.internalConfig["archive_command"] = archiveCommand + case "false": + c.internalConfig["archive_mode"] = "off" + default: + // Noop to remain backwards compatible with existing setups that may have + // manually setup archiving. 
+ } + return nil } @@ -537,3 +558,23 @@ func diskSizeInBytes(dir string) (uint64, error) { } return stat.Blocks * uint64(stat.Bsize), nil } + +func validateCloudArchiver() error { + if os.Getenv("AWS_ACCESS_KEY_ID") == "" { + return fmt.Errorf("AWS_ACCESS_KEY_ID secret must be set") + } + + if os.Getenv("AWS_SECRET_ACCESS_KEY") == "" { + return fmt.Errorf("AWS_SECRET_ACCESS_KEY secret must be set") + } + + if os.Getenv("AWS_BUCKET_NAME") == "" { + return fmt.Errorf("AWS_BUCKET_NAME envvar must be set") + } + + if os.Getenv("AWS_ENDPOINT_URL") == "" { + return fmt.Errorf("AWS_ENDPOINT_URL envvar must be set") + } + + return nil +} diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index bee86622..91d6a4ba 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -112,7 +112,76 @@ func TestPGConfigInitialization(t *testing.T) { if cfg["shared_preload_libraries"] != expected { t.Fatalf("expected %s, got %s", expected, cfg["shared_preload_libraries"]) } + }) + + t.Run("cloud-archiving", func(t *testing.T) { + t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") + t.Setenv("AWS_ACCESS_KEY_ID", "my-key") + t.Setenv("AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") + t.Setenv("AWS_BUCKET_NAME", "my-bucket") + + store, _ := state.NewStore() + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["archive_mode"] != "on" { + t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) + } + + expected := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --endpoint-url %s s3://%s %s %%p'", + os.Getenv("AWS_ENDPOINT_URL"), + os.Getenv("AWS_BUCKET_NAME"), + pgConf.AppName) + + if cfg["archive_command"] != expected { + t.Fatalf("expected %s, got %s", expected, cfg["archive_command"]) + } + }) + + t.Run("cloud-archiving-disabled", func(t *testing.T) { + t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") + t.Setenv("AWS_ACCESS_KEY_ID", "my-key") + t.Setenv("AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") + t.Setenv("AWS_BUCKET_NAME", "my-bucket") + + store, _ := state.NewStore() + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["archive_mode"] != "on" { + t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) + } + + t.Setenv("CLOUD_ARCHIVING_ENABLED", "false") + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err = pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["archive_mode"] != "off" { + t.Fatalf("expected archive_mode to be off, got %v", cfg["archive_mode"]) + } }) } @@ -208,6 +277,7 @@ func TestPGDefaultPassword(t *testing.T) { defer cleanup() pgConf := &PGConfig{ + AppName: "my-app", DataDir: pgTestDirectory, Port: 5433, ConfigFilePath: pgConfigFilePath, @@ -262,6 +332,7 @@ func TestValidateCompatibility(t *testing.T) { defer cleanup() pgConf := &PGConfig{ + AppName: "my-app", DataDir: pgTestDirectory, Port: 5433, ConfigFilePath: pgConfigFilePath, From c7e764aa12fb2f5911512479aaa2dca00392cc3b Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 25 Jun 2024 14:08:45 -0500 Subject: [PATCH 02/51] fix test --- internal/flypg/pg_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index 91d6a4ba..a329bf43 
100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -136,7 +136,7 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - expected := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --endpoint-url %s s3://%s %s %%p'", + expected := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --gzip --endpoint-url %s s3://%s %s %%p'", os.Getenv("AWS_ENDPOINT_URL"), os.Getenv("AWS_BUCKET_NAME"), pgConf.AppName) From 479dd0f79665334b1421cf9ae8927b46dccebfb0 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 11:13:25 -0500 Subject: [PATCH 03/51] Retention work --- cmd/monitor/main.go | 15 +++ cmd/monitor/monitor_backup_retention.go | 24 +++++ cmd/monitor/monitor_dead_members.go | 2 - cmd/start/main.go | 1 - internal/flypg/barman.go | 125 ++++++++++++++++++++++++ internal/flypg/pg.go | 34 +------ 6 files changed, 169 insertions(+), 32 deletions(-) create mode 100644 cmd/monitor/monitor_backup_retention.go create mode 100644 internal/flypg/barman.go diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 402e8717..9eb185aa 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "os" "time" "github.com/fly-apps/postgres-flex/internal/flypg" @@ -16,6 +17,8 @@ var ( defaultDeadMemberRemovalThreshold = time.Hour * 12 defaultInactiveSlotRemovalThreshold = time.Hour * 12 + + defaultBackupRetentionEvaluationThreshold = time.Hour * 1 ) func main() { @@ -36,6 +39,18 @@ func main() { } }() + if os.Getenv("CLOUD_ARCHIVING_ENABLED") == "true" { + barman, err := flypg.NewBarman() + if err != nil { + panic(err) + } + + log.Println("Monitoring backup retention") + barman.PrintRetentionPolicy() + + go monitorBackupRetention(ctx, barman) + } + // Readonly monitor log.Println("Monitoring cluster state") go monitorClusterState(ctx, node) diff --git a/cmd/monitor/monitor_backup_retention.go b/cmd/monitor/monitor_backup_retention.go new file mode 100644 index 00000000..06e8818a --- /dev/null +++ b/cmd/monitor/monitor_backup_retention.go @@ -0,0 +1,24 @@ +package main + +import ( + "context" + "log" + "time" + + "github.com/fly-apps/postgres-flex/internal/flypg" +) + +func monitorBackupRetention(ctx context.Context, barman *flypg.Barman) { + ticker := time.NewTicker(defaultBackupRetentionEvaluationThreshold) + defer ticker.Stop() + for range ticker.C { + result, err := barman.WALArchiveDelete(ctx) + if err != nil { + log.Printf("Backup retention failed with: %s", err) + } + + if len(result) > 0 { + log.Printf("Backup retention response: %s", result) + } + } +} diff --git a/cmd/monitor/monitor_dead_members.go b/cmd/monitor/monitor_dead_members.go index 78370ff9..5602cc06 100644 --- a/cmd/monitor/monitor_dead_members.go +++ b/cmd/monitor/monitor_dead_members.go @@ -37,8 +37,6 @@ func monitorDeadMembers(ctx context.Context, node *flypg.Node) error { ticker := time.NewTicker(deadMemberMonitorFrequency) defer ticker.Stop() - log.Printf("Pruning every %s...\n", removalThreshold) - for range ticker.C { err := deadMemberMonitorTick(ctx, node, seenAt, removalThreshold) if err != nil { diff --git a/cmd/start/main.go b/cmd/start/main.go index abe8a8c3..382ebbae 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -38,7 +38,6 @@ func main() { } svisor := supervisor.New("flybarman", 1*time.Minute) - svisor.AddProcess("cron", "/usr/sbin/cron -f", supervisor.WithRestart(0, 5*time.Second)) svisor.AddProcess("barman", fmt.Sprintf("tail -f %s", 
node.LogFile)) svisor.AddProcess("admin", "/usr/local/bin/start_admin_server", supervisor.WithRestart(0, 5*time.Second), diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go new file mode 100644 index 00000000..c23500c6 --- /dev/null +++ b/internal/flypg/barman.go @@ -0,0 +1,125 @@ +package flypg + +import ( + "context" + "fmt" + "log" + "os" + "os/exec" + "strings" +) + +const ( + cronDirectory = "/data/cron" + archivePrunerCronFile = "/data/cron/barman.cron" + archivePrunerBinary = "/usr/local/bin/archive_pruner" +) + +// # TODO - make this configurable +// retention="RECOVERY WINDOW OF 7 DAYS" +// # TODO - make this configurable +// minimum_redundancy=3 + +// provider="aws-s3" +// endpoint=$AWS_ENDPOINT_URL +// bucket=$AWS_BUCKET_NAME +// name=$FLY_APP_NAME + +// barman-cloud-backup-delete \ +// --cloud-provider "$provider" \ +// --endpoint-url "$endpoint" \ +// --retention "$retention" \ +// --minimum-redundancy "$minimum_redundancy" \ +// "s3://$bucket" "$name" + +type Barman struct { + appName string + provider string + endpoint string + bucket string + minimumRedundancy string + retentionDays string +} + +func NewBarman() (*Barman, error) { + if err := validateBarmanRequirements(); err != nil { + return nil, err + } + + // TODO - Validate minimum and retention day values + + return &Barman{ + appName: os.Getenv("FLY_APP_NAME"), + provider: "aws-s3", + endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL")), + bucket: strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")), + minimumRedundancy: getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3"), + retentionDays: getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7"), + }, nil +} + +func (b *Barman) RetentionPolicy() string { + return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) +} + +func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { + cmd := exec.CommandContext(ctx, b.walArchiveDeleteCommandString()) + return cmd.CombinedOutput() +} + +func (b *Barman) PrintRetentionPolicy() { + str := `Retention Policy + ----------------- + RECOVERY WINDOW OF %d days + MINIMUM BACKUP REDUNDANCY: %d +` + log.Printf(str, b.retentionDays, b.minimumRedundancy) +} + +func (b *Barman) walArchiveDeleteCommandString() string { + return fmt.Sprintf("barman-cloud-backup-delete --cloud-provider %s --endpoint-url %s --retention %s --minimum-redundancy %s s3://%s %s", + b.provider, + b.endpoint, + b.RetentionPolicy(), + b.minimumRedundancy, + b.bucket, + b.appName, + ) +} + +func (b *Barman) walArchiveCommandString() string { + return fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p'", + b.provider, + b.endpoint, + b.bucket, + b.appName, + ) +} + +func validateBarmanRequirements() error { + if os.Getenv("AWS_ACCESS_KEY_ID") == "" { + return fmt.Errorf("AWS_ACCESS_KEY_ID secret must be set") + } + + if os.Getenv("AWS_SECRET_ACCESS_KEY") == "" { + return fmt.Errorf("AWS_SECRET_ACCESS_KEY secret must be set") + } + + if os.Getenv("AWS_BUCKET_NAME") == "" { + return fmt.Errorf("AWS_BUCKET_NAME envvar must be set") + } + + if os.Getenv("AWS_ENDPOINT_URL") == "" { + return fmt.Errorf("AWS_ENDPOINT_URL envvar must be set") + } + + return nil +} + +func getenv(key, fallback string) string { + value := os.Getenv(key) + if len(value) == 0 { + return fallback + } + return value +} diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 574ee8a1..a0dd5d69 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -175,21 +175,17 @@ func (c *PGConfig) SetDefaults() error { // 
Works to configure archiving to object storage if enabled switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { case "true": - if err := validateCloudArchiver(); err != nil { - return fmt.Errorf("failed to validate s3 archiver: %s", err) + barman, err := NewBarman() + if err != nil { + return err } - bucket := strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")) - endpoint := strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL")) - - archiveCommand := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --gzip --endpoint-url %s s3://%s %s %%p'", endpoint, bucket, c.AppName) c.internalConfig["archive_mode"] = "on" - c.internalConfig["archive_command"] = archiveCommand + c.internalConfig["archive_command"] = barman.walArchiveCommandString() case "false": c.internalConfig["archive_mode"] = "off" default: - // Noop to remain backwards compatible with existing setups that may have - // manually setup archiving. + // Noop } return nil @@ -558,23 +554,3 @@ func diskSizeInBytes(dir string) (uint64, error) { } return stat.Blocks * uint64(stat.Bsize), nil } - -func validateCloudArchiver() error { - if os.Getenv("AWS_ACCESS_KEY_ID") == "" { - return fmt.Errorf("AWS_ACCESS_KEY_ID secret must be set") - } - - if os.Getenv("AWS_SECRET_ACCESS_KEY") == "" { - return fmt.Errorf("AWS_SECRET_ACCESS_KEY secret must be set") - } - - if os.Getenv("AWS_BUCKET_NAME") == "" { - return fmt.Errorf("AWS_BUCKET_NAME envvar must be set") - } - - if os.Getenv("AWS_ENDPOINT_URL") == "" { - return fmt.Errorf("AWS_ENDPOINT_URL envvar must be set") - } - - return nil -} From 8d497ea0a9b28946c1d081f0a5cfda9be7e2feaf Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 11:15:48 -0500 Subject: [PATCH 04/51] Cleanup --- internal/flypg/barman.go | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index c23500c6..ddf4e93a 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -9,29 +9,6 @@ import ( "strings" ) -const ( - cronDirectory = "/data/cron" - archivePrunerCronFile = "/data/cron/barman.cron" - archivePrunerBinary = "/usr/local/bin/archive_pruner" -) - -// # TODO - make this configurable -// retention="RECOVERY WINDOW OF 7 DAYS" -// # TODO - make this configurable -// minimum_redundancy=3 - -// provider="aws-s3" -// endpoint=$AWS_ENDPOINT_URL -// bucket=$AWS_BUCKET_NAME -// name=$FLY_APP_NAME - -// barman-cloud-backup-delete \ -// --cloud-provider "$provider" \ -// --endpoint-url "$endpoint" \ -// --retention "$retention" \ -// --minimum-redundancy "$minimum_redundancy" \ -// "s3://$bucket" "$name" - type Barman struct { appName string provider string From 2b695199f432ac229a30ea8089e4cd5ac7bc470d Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 11:18:24 -0500 Subject: [PATCH 05/51] fix log --- internal/flypg/barman.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index ddf4e93a..b486ffca 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -45,10 +45,11 @@ func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { } func (b *Barman) PrintRetentionPolicy() { - str := `Retention Policy + str := ` + Retention Policy ----------------- - RECOVERY WINDOW OF %d days - MINIMUM BACKUP REDUNDANCY: %d + RECOVERY WINDOW OF %s days + MINIMUM BACKUP REDUNDANCY: %s ` log.Printf(str, b.retentionDays, b.minimumRedundancy) } From 2cc0b452a3ef1a6a794be21fedcfa941bf670efc Mon Sep 17 00:00:00 2001 From: 
Shaun Davis Date: Wed, 26 Jun 2024 11:24:42 -0500 Subject: [PATCH 06/51] Fix deletion call --- internal/flypg/barman.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index b486ffca..6b442755 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -5,8 +5,9 @@ import ( "fmt" "log" "os" - "os/exec" "strings" + + "github.com/fly-apps/postgres-flex/internal/utils" ) type Barman struct { @@ -39,9 +40,9 @@ func (b *Barman) RetentionPolicy() string { return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) } +// WALArchiveDelete deletes backups/WAL based on the specified retention policy. func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { - cmd := exec.CommandContext(ctx, b.walArchiveDeleteCommandString()) - return cmd.CombinedOutput() + return utils.RunCommand(b.walArchiveDeleteCommandString(), "postgres") } func (b *Barman) PrintRetentionPolicy() { From ba33e59fb0153f6ed7549e586f0ddfc6e5cc3e5d Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 11:40:50 -0500 Subject: [PATCH 07/51] Move the log under the debug flag --- internal/utils/shell.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/utils/shell.go b/internal/utils/shell.go index ca79c1e9..bd9c1727 100644 --- a/internal/utils/shell.go +++ b/internal/utils/shell.go @@ -12,18 +12,22 @@ import ( "syscall" ) +// TODO - RunCommand needs a context + func RunCommand(cmdStr, usr string) ([]byte, error) { uid, gid, err := SystemUserIDs(usr) if err != nil { return nil, err } - log.Printf("> Running command as %s: %s\n", usr, cmdStr) - cmd := exec.Command("sh", "-c", cmdStr) cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.SysProcAttr.Credential = &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)} + if os.Getenv("DEBUG") != "" { + log.Printf("> Running command as %s: %s\n", usr, cmdStr) + } + var stdoutBuf, stderrBuf bytes.Buffer cmd.Stdout = io.MultiWriter(os.Stdout, &stdoutBuf) cmd.Stderr = io.MultiWriter(os.Stderr, &stderrBuf) @@ -36,6 +40,7 @@ func RunCommand(cmdStr, usr string) ([]byte, error) { } return stdoutBuf.Bytes(), err + } func SetFileOwnership(pathToFile, owner string) error { From 39bc76ad2304e219c2c734c02f2e7d5fd59ea47a Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 12:03:52 -0500 Subject: [PATCH 08/51] Adding tests --- internal/flypg/barman.go | 3 +- internal/flypg/barman_test.go | 143 ++++++++++++++++++++++++++++++++++ internal/flypg/pg.go | 2 +- 3 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 internal/flypg/barman_test.go diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 6b442755..5293c71b 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -67,7 +67,8 @@ func (b *Barman) walArchiveDeleteCommandString() string { } func (b *Barman) walArchiveCommandString() string { - return fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p'", + // TODO - Make compression configurable + return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p", b.provider, b.endpoint, b.bucket, diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go new file mode 100644 index 00000000..742ed717 --- /dev/null +++ b/internal/flypg/barman_test.go @@ -0,0 +1,143 @@ +package flypg + +import ( + "testing" +) + +func TestNewBarman(t *testing.T) { + + t.Run("defaults", func(t *testing.T) { + 
setDefaultEnv(t) + + barman, err := NewBarman() + if err != nil { + t.Fatal(err) + } + + if barman.provider != "aws-s3" { + t.Fatalf("expected provider to be aws, but got %s", barman.provider) + } + + if barman.endpoint != "https://fly.storage.tigris.dev" { + t.Fatalf("expected endpoint to be https://fly.storage.tigris.dev, but got %s", barman.endpoint) + } + + if barman.bucket != "my-bucket" { + t.Fatalf("expected bucket to be my-bucket, but got %s", barman.bucket) + } + + if barman.appName != "postgres-flex" { + t.Fatalf("expected appName to be postgres-flex, but got %s", barman.appName) + } + + // Defaults + if barman.minimumRedundancy != "3" { + t.Fatalf("expected minimumRedundancy to be 3, but got %s", barman.minimumRedundancy) + } + + if barman.retentionDays != "7" { + t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) + } + + }) + + t.Run("custom-retention", func(t *testing.T) { + setDefaultEnv(t) + + t.Setenv("CLOUD_ARCHIVING_RETENTION_DAYS", "30") + + barman, err := NewBarman() + if err != nil { + t.Fatal(err) + } + + if barman.retentionDays != "30" { + t.Fatalf("expected retentionDays to be 30, but got %s", barman.retentionDays) + } + }) + + t.Run("custom-min-redundancy", func(t *testing.T) { + setDefaultEnv(t) + t.Setenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "7") + + barman, err := NewBarman() + if err != nil { + t.Fatal(err) + } + + if barman.minimumRedundancy != "7" { + t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) + } + }) + + t.Run("test-failure", func(t *testing.T) { + _, err := NewBarman() + if err == nil { + t.Fatal("expected error, but got nil") + } + }) +} + +func TestValidateBarmanRequirements(t *testing.T) { + + t.Run("missing-aws-access-key", func(t *testing.T) { + err := validateBarmanRequirements() + + if err.Error() != "AWS_ACCESS_KEY_ID secret must be set" { + t.Fatalf("expected error to be 'AWS_ACCESS_KEY_ID secret must be set', but got %s", err.Error()) + } + }) + + t.Run("missing-aws-secret-access-key", func(t *testing.T) { + setDefaultEnv(t) + t.Setenv("AWS_SECRET_ACCESS_KEY", "") + err := validateBarmanRequirements() + + if err.Error() != "AWS_SECRET_ACCESS_KEY secret must be set" { + t.Fatalf("expected error to be 'AWS_SECRET_ACCESS_KEY secret must be set', but got %s", err.Error()) + } + }) + + t.Run("missing-aws-bucket-name", func(t *testing.T) { + setDefaultEnv(t) + t.Setenv("AWS_BUCKET_NAME", "") + err := validateBarmanRequirements() + + if err.Error() != "AWS_BUCKET_NAME envvar must be set" { + t.Fatalf("expected error to be 'AWS_BUCKET_NAME envvar must be set', but got %s", err.Error()) + } + }) + + t.Run("missing-aws-endpoint-url", func(t *testing.T) { + setDefaultEnv(t) + t.Setenv("AWS_ENDPOINT_URL", "") + err := validateBarmanRequirements() + + if err.Error() != "AWS_ENDPOINT_URL envvar must be set" { + t.Fatalf("expected error to be 'AWS_ENDPOINT_URL envvar must be set', but got %s", err.Error()) + } + }) +} + +func TestBarmanRetentionPolicy(t *testing.T) { + setDefaultEnv(t) + + barman, err := NewBarman() + if err != nil { + t.Fatal(err) + } + + if barman.RetentionPolicy() != "'RECOVERY WINDOW OF 7 days'" { + t.Fatalf("expected retention policy to be 'RECOVERY WINDOW OF 7 days', but got %s", barman.RetentionPolicy()) + } + +} + +func setDefaultEnv(t *testing.T) { + t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") + t.Setenv("FLY_APP_NAME", "postgres-flex") + t.Setenv("AWS_ACCESS_KEY_ID", "my-key") + t.Setenv("AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + 
t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") + t.Setenv("AWS_BUCKET_NAME", "my-bucket") +} diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index a0dd5d69..cdeebf56 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -181,7 +181,7 @@ func (c *PGConfig) SetDefaults() error { } c.internalConfig["archive_mode"] = "on" - c.internalConfig["archive_command"] = barman.walArchiveCommandString() + c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommandString()) case "false": c.internalConfig["archive_mode"] = "off" default: From 78c3ac095205c3aafa4e54934c63c6d94698c278 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 22:22:20 -0500 Subject: [PATCH 09/51] Checkin --- internal/flypg/barman.go | 41 ++-- internal/flypg/barman_restore.go | 208 ++++++++++++++++ internal/flypg/barman_restore_test.go | 223 ++++++++++++++++++ internal/flypg/barman_test.go | 9 +- internal/flypg/node.go | 2 +- internal/flypg/pg.go | 28 ++- .../flypg/{restore.go => snapshot_restore.go} | 0 7 files changed, 487 insertions(+), 24 deletions(-) create mode 100644 internal/flypg/barman_restore.go create mode 100644 internal/flypg/barman_restore_test.go rename internal/flypg/{restore.go => snapshot_restore.go} (100%) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 5293c71b..fd2a08e3 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -10,29 +10,39 @@ import ( "github.com/fly-apps/postgres-flex/internal/utils" ) +const ( + barmanRecoveryDirectory = "/data/postgresql" +) + type Barman struct { - appName string - provider string - endpoint string - bucket string + appName string + provider string + endpoint string + bucket string + + // fullBackupFrequency string minimumRedundancy string retentionDays string } func NewBarman() (*Barman, error) { - if err := validateBarmanRequirements(); err != nil { + if err := validateBarman(); err != nil { return nil, err } // TODO - Validate minimum and retention day values + minRedundancy := getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3") + retentionDays := getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7") return &Barman{ - appName: os.Getenv("FLY_APP_NAME"), - provider: "aws-s3", - endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL")), - bucket: strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")), - minimumRedundancy: getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3"), - retentionDays: getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7"), + appName: os.Getenv("FLY_APP_NAME"), + provider: "aws-s3", + endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL")), + bucket: strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")), + + // fullBackupFrequency: getenv("CLOUD_ARCHIVING_FULL_BACKUP_FREQUENCY", "1"), + minimumRedundancy: minRedundancy, + retentionDays: retentionDays, }, nil } @@ -42,7 +52,7 @@ func (b *Barman) RetentionPolicy() string { // WALArchiveDelete deletes backups/WAL based on the specified retention policy. 
func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { - return utils.RunCommand(b.walArchiveDeleteCommandString(), "postgres") + return utils.RunCommand(b.walArchiveDeleteCommand(), "postgres") } func (b *Barman) PrintRetentionPolicy() { @@ -51,11 +61,12 @@ func (b *Barman) PrintRetentionPolicy() { ----------------- RECOVERY WINDOW OF %s days MINIMUM BACKUP REDUNDANCY: %s + FULL BACKUP FREQUENCY: %s day(s) ` log.Printf(str, b.retentionDays, b.minimumRedundancy) } -func (b *Barman) walArchiveDeleteCommandString() string { +func (b *Barman) walArchiveDeleteCommand() string { return fmt.Sprintf("barman-cloud-backup-delete --cloud-provider %s --endpoint-url %s --retention %s --minimum-redundancy %s s3://%s %s", b.provider, b.endpoint, @@ -66,7 +77,7 @@ func (b *Barman) walArchiveDeleteCommandString() string { ) } -func (b *Barman) walArchiveCommandString() string { +func (b *Barman) walArchiveCommand() string { // TODO - Make compression configurable return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p", b.provider, @@ -76,7 +87,7 @@ func (b *Barman) walArchiveCommandString() string { ) } -func validateBarmanRequirements() error { +func validateBarman() error { if os.Getenv("AWS_ACCESS_KEY_ID") == "" { return fmt.Errorf("AWS_ACCESS_KEY_ID secret must be set") } diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go new file mode 100644 index 00000000..158c41a1 --- /dev/null +++ b/internal/flypg/barman_restore.go @@ -0,0 +1,208 @@ +package flypg + +import ( + "encoding/json" + "fmt" + "os" + "strings" + "time" + + "github.com/fly-apps/postgres-flex/internal/utils" +) + +type BarmanRestore struct { + appName string + provider string + endpoint string + bucket string + + recoveryTarget string + recoveryTargetTimeline string + recoveryTargetAction string +} + +type BarmanBackupList struct { + Backups []BarmanBackup `json:"backups_list"` +} + +type BarmanBackup struct { + BackupID string `json:"backup_id"` + StartTime string `json:"begin_time"` + EndTime string `json:"end_time"` + BeginWal string `json:"begin_wal"` + EndWal string `json:"end_wal"` +} + +func NewBarmanRestore() (*BarmanRestore, error) { + if err := validateBarmanRestore(); err != nil { + return nil, err + } + + return &BarmanRestore{ + appName: os.Getenv("FLY_APP_NAME"), + provider: "aws-s3", + endpoint: strings.TrimSpace(os.Getenv("SOURCE_AWS_ENDPOINT_URL")), + bucket: strings.TrimSpace(os.Getenv("SOURCE_AWS_BUCKET_NAME")), + + recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), + recoveryTargetTimeline: getenv("WAL_RECOVERY_TARGET_TIMELINE", "latest"), + recoveryTargetAction: getenv("WAL_RECOVERY_TARGET_ACTION", "promote"), + }, nil +} + +func (b *BarmanRestore) RestoreFromPITBackup(restoreStr string) error { + // Query available backups from object storage + backupsBytes, err := utils.RunCommand(b.backupListCommand(), "postgres") + if err != nil { + return fmt.Errorf("failed to list backups: %s", err) + } + + // Parse the backups + backupList, err := b.parseBackups(backupsBytes) + if err != nil { + return fmt.Errorf("failed to parse backups: %s", err) + } + + if len(backupList.Backups) == 0 { + return fmt.Errorf("no backups found") + } + + // Resolve the base backup to restore + backupID, err := b.resolveBackupTarget(backupList, restoreStr) + if err != nil { + return fmt.Errorf("failed to resolve backup target: %s", err) + } + + // Download and restore the base backup + _, err = 
utils.RunCommand(b.backupRestoreCommand(backupID), "postgres") + if err != nil { + return fmt.Errorf("failed to restore base backup: %s", err) + } + + return nil +} + +// restoreCommand returns the command string used to restore a base backup. +func (b *BarmanRestore) backupRestoreCommand(backupID string) string { + return fmt.Sprintf("barman-cloud-restore --cloud-provider %s --endpoint-url %s s3://%s %s %s %s", + b.provider, + b.endpoint, + b.bucket, + b.appName, + backupID, + barmanRecoveryDirectory, + ) +} + +// walRestoreCommand returns the command string used to restore WAL files. +// The %f and %p placeholders are replaced with the file path and file name respectively. +func (b *BarmanRestore) walRestoreCommand() string { + return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s s3://%s %s %%f %%p", + b.provider, + b.endpoint, + b.bucket, + b.appName, + ) +} + +func (b *BarmanRestore) backupListCommand() string { + return fmt.Sprintf("barman-cloud-backup-list --cloud-provider %s --endpoint-url %s s3://%s %s --format json", + b.provider, + b.endpoint, + b.bucket, + b.appName, + ) +} + +func (b *BarmanRestore) parseBackups(backupBytes []byte) (BarmanBackupList, error) { + var backupList BarmanBackupList + + if err := json.Unmarshal(backupBytes, &backupList); err != nil { + return BarmanBackupList{}, fmt.Errorf("failed to parse backups: %s", err) + } + + return backupList, nil +} + +func (b *BarmanRestore) resolveBackupTarget(backupList BarmanBackupList, restoreStr string) (string, error) { + if len(backupList.Backups) == 0 { + return "", fmt.Errorf("no backups found") + } + + var restoreTime time.Time + + // Parse the restore string + if restoreStr == "latest" { + restoreTime = time.Now() + } else { + var err error + restoreTime, err = time.Parse(time.RFC3339, restoreStr) + if err != nil { + return "", fmt.Errorf("failed to parse restore time: %s", err) + } + } + + latestBackupID := "" + latestStartTime := time.Time{} + + earliestBackupID := "" + earliestEndTime := time.Time{} + + layout := "Mon Jan 2 15:04:05 2006" + + for _, backup := range backupList.Backups { + // Parse the backup start time + startTime, err := time.Parse(layout, backup.StartTime) + if err != nil { + return "", fmt.Errorf("failed to parse backup start time: %s", err) + } + // Parse the backup start time + endTime, err := time.Parse(layout, backup.EndTime) + if err != nil { + return "", fmt.Errorf("failed to parse backup end time: %s", err) + } + // Check if the restore time falls within the backup window + if restoreTime.After(startTime) && restoreTime.Before(endTime) { + return backup.BackupID, nil + } + + // Track the latest and earliest backups in case the restore time is outside + // the available backup windows + if latestBackupID == "" || startTime.After(latestStartTime) { + latestBackupID = backup.BackupID + latestStartTime = startTime + } + + if earliestBackupID == "" || endTime.Before(earliestEndTime) { + earliestBackupID = backup.BackupID + earliestEndTime = endTime + } + } + + // if the restore time is before the earliest backup, restore the earliest backup + if restoreTime.Before(earliestEndTime) { + return earliestBackupID, nil + } + + return latestBackupID, nil +} + +func validateBarmanRestore() error { + if os.Getenv("SOURCE_AWS_ACCESS_KEY_ID") == "" { + return fmt.Errorf("SOURCE_AWS_ACCESS_KEY_ID secret must be set") + } + + if os.Getenv("SOURCE_AWS_SECRET_ACCESS_KEY") == "" { + return fmt.Errorf("SOURCE_AWS_SECRET_ACCESS_KEY secret must be set") + } + + if 
os.Getenv("SOURCE_AWS_BUCKET_NAME") == "" { + return fmt.Errorf("SOURCE_AWS_BUCKET_NAME envvar must be set") + } + + if os.Getenv("SOURCE_AWS_ENDPOINT_URL") == "" { + return fmt.Errorf("SOURCE_AWS_ENDPOINT_URL envvar must be set") + } + + return nil +} diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go new file mode 100644 index 00000000..942c8619 --- /dev/null +++ b/internal/flypg/barman_restore_test.go @@ -0,0 +1,223 @@ +package flypg + +import ( + "testing" +) + +const backupsResponse = `{ + "backups_list": [ + { + "backup_label": "'START WAL LOCATION: 0/8000028 (file 000000010000000000000008)\\nCHECKPOINT LOCATION: 0/8000098\\nBACKUP METHOD: streamed\\nBACKUP FROM: primary\\nSTART TIME: 2024-06-25 19:44:13 UTC\\nLABEL: Barman backup cloud 20240625T194412\\nSTART TIMELINE: 1\\n'", + "backup_name": "test-backup-2", + "begin_offset": 40, + "begin_time": "Tue Jun 25 19:44:12 2024", + "begin_wal": "000000010000000000000008", + "begin_xlog": "0/8000028", + "compression": null, + "config_file": "/data/postgresql/postgresql.conf", + "copy_stats": { + "total_time": 8.527192, + "number_of_workers": 2, + "analysis_time": 0, + "analysis_time_per_item": { + "data": 0 + }, + "copy_time_per_item": { + "data": 0.624873 + }, + "serialized_copy_time_per_item": { + "data": 0.430501 + }, + "copy_time": 0.624873, + "serialized_copy_time": 0.430501 + }, + "deduplicated_size": null, + "end_offset": 312, + "end_time": "Tue Jun 25 19:44:18 2024", + "end_wal": "000000010000000000000008", + "end_xlog": "0/8000138", + "error": null, + "hba_file": "/data/postgresql/pg_hba.conf", + "ident_file": "/data/postgresql/pg_ident.conf", + "included_files": [ + "/data/postgresql/postgresql.internal.conf" + ], + "mode": null, + "pgdata": "/data/postgresql", + "server_name": "cloud", + "size": null, + "status": "DONE", + "systemid": "7384497274230341974", + "tablespaces": null, + "timeline": 1, + "version": 150006, + "xlog_segment_size": 16777216, + "backup_id": "20240625T194412" + }, + { + "backup_label": "'START WAL LOCATION: 0/11000238 (file 000000010000000000000011)\\nCHECKPOINT LOCATION: 0/11000270\\nBACKUP METHOD: streamed\\nBACKUP FROM: primary\\nSTART TIME: 2024-06-26 17:26:53 UTC\\nLABEL: Barman backup cloud 20240626T172443\\nSTART TIMELINE: 1\\n'", + "begin_offset": 568, + "begin_time": "Wed Jun 26 17:24:43 2024", + "begin_wal": "000000010000000000000011", + "begin_xlog": "0/11000238", + "compression": null, + "config_file": "/data/postgresql/postgresql.conf", + "copy_stats": { + "total_time": 142.572774, + "number_of_workers": 2, + "analysis_time": 0, + "analysis_time_per_item": { + "data": 0 + }, + "copy_time_per_item": { + "data": 0.845993 + }, + "serialized_copy_time_per_item": { + "data": 0.545768 + }, + "copy_time": 0.845993, + "serialized_copy_time": 0.545768 + }, + "deduplicated_size": null, + "end_offset": 840, + "end_time": "Wed Jun 26 17:27:02 2024", + "end_wal": "000000010000000000000011", + "end_xlog": "0/11000348", + "error": null, + "hba_file": "/data/postgresql/pg_hba.conf", + "ident_file": "/data/postgresql/pg_ident.conf", + "included_files": [ + "/data/postgresql/postgresql.internal.conf" + ], + "mode": null, + "pgdata": "/data/postgresql", + "server_name": "cloud", + "size": null, + "status": "DONE", + "systemid": "7384497274230341974", + "tablespaces": null, + "timeline": 1, + "version": 150006, + "xlog_segment_size": 16777216, + "backup_id": "20240626T172443" + } + ] +}` + +func TestParseBackups(t *testing.T) { + t.Run("parseBackups", func(t *testing.T) { 
+ setRestoreDefaultEnv(t) + + restore, err := NewBarmanRestore() + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } + + list, err := restore.parseBackups([]byte(backupsResponse)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(list.Backups) != 2 { + t.Fatalf("expected 2 backups, got %d", len(list.Backups)) + } + + firstBackup := list.Backups[0] + if firstBackup.BackupID != "20240625T194412" { + t.Fatalf("expected backup ID to be 20240625T194412, got %s", firstBackup.BackupID) + } + + if firstBackup.StartTime != "Tue Jun 25 19:44:12 2024" { + t.Fatalf("expected start time to be Tue Jun 25 19:44:12 2024, got %s", firstBackup.StartTime) + } + + if firstBackup.EndTime != "Tue Jun 25 19:44:18 2024" { + t.Fatalf("expected end time to be Tue Jun 25 19:44:18 2024, got %s", firstBackup.EndTime) + } + + secondBackup := list.Backups[1] + + if secondBackup.BackupID != "20240626T172443" { + t.Fatalf("expected backup ID to be 20240626T172443, got %s", secondBackup.BackupID) + } + + if secondBackup.StartTime != "Wed Jun 26 17:24:43 2024" { + t.Fatalf("expected start time to be Wed Jun 26 17:24:43 2024, got %s", secondBackup.StartTime) + } + + if secondBackup.EndTime != "Wed Jun 26 17:27:02 2024" { + t.Fatalf("expected end time to be Wed Jun 26 17:27:02 2024, got %s", secondBackup.EndTime) + } + }) +} + +func TestResolveBackupTarget(t *testing.T) { + setRestoreDefaultEnv(t) + + restore, err := NewBarmanRestore() + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } + + list, err := restore.parseBackups([]byte(backupsResponse)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + t.Run("resolve-latest-backup-target", func(t *testing.T) { + backupID, err := restore.resolveBackupTarget(list, "latest") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if backupID != "20240626T172443" { + t.Fatalf("expected backup ID to be 20240626T172443, got %s", backupID) + } + }) + + t.Run("resolve-earliest-backup-target", func(t *testing.T) { + backupID, err := restore.resolveBackupTarget(list, "2024-06-25T19:44:12Z") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if backupID != "20240625T194412" { + t.Fatalf("expected backup ID to be 20240625T194412, got %s", backupID) + } + }) + + // "begin_time": "Tue Jun 25 19:44:12 2024" + // "end_time": "Tue Jun 25 19:44:18 2024", + t.Run("resolve-backup-within-first-window", func(t *testing.T) { + backupID, err := restore.resolveBackupTarget(list, "2024-06-25T19:44:15Z") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if backupID != "20240625T194412" { + t.Fatalf("expected backup ID to be 20240625T194412, got %s", backupID) + } + }) + + // "begin_time": "Wed Jun 26 17:24:43 2024", + // "end_time": "Wed Jun 26 17:27:02 2024", + t.Run("resolve-backup-within-second-window", func(t *testing.T) { + backupID, err := restore.resolveBackupTarget(list, "2024-06-26T17:25:15Z") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if backupID != "20240626T172443" { + t.Fatalf("expected backup ID to be 20240626T172443, got %s", backupID) + } + }) +} + +func setRestoreDefaultEnv(t *testing.T) { + t.Setenv("CLOUD_ARCHIVING_WAL_RESTORE", "true") + t.Setenv("FLY_APP_NAME", "postgres-flex") + t.Setenv("SOURCE_AWS_ACCESS_KEY_ID", "my-key") + t.Setenv("SOURCE_AWS_SECRET_ACCESS_KEY", "my-secret") + t.Setenv("SOURCE_AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("SOURCE_AWS_BUCKET_NAME", "my-bucket") +} diff --git 
a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index 742ed717..695207e0 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -5,7 +5,6 @@ import ( ) func TestNewBarman(t *testing.T) { - t.Run("defaults", func(t *testing.T) { setDefaultEnv(t) @@ -81,7 +80,7 @@ func TestNewBarman(t *testing.T) { func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-access-key", func(t *testing.T) { - err := validateBarmanRequirements() + err := validateBarman() if err.Error() != "AWS_ACCESS_KEY_ID secret must be set" { t.Fatalf("expected error to be 'AWS_ACCESS_KEY_ID secret must be set', but got %s", err.Error()) @@ -91,7 +90,7 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-secret-access-key", func(t *testing.T) { setDefaultEnv(t) t.Setenv("AWS_SECRET_ACCESS_KEY", "") - err := validateBarmanRequirements() + err := validateBarman() if err.Error() != "AWS_SECRET_ACCESS_KEY secret must be set" { t.Fatalf("expected error to be 'AWS_SECRET_ACCESS_KEY secret must be set', but got %s", err.Error()) @@ -101,7 +100,7 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-bucket-name", func(t *testing.T) { setDefaultEnv(t) t.Setenv("AWS_BUCKET_NAME", "") - err := validateBarmanRequirements() + err := validateBarman() if err.Error() != "AWS_BUCKET_NAME envvar must be set" { t.Fatalf("expected error to be 'AWS_BUCKET_NAME envvar must be set', but got %s", err.Error()) @@ -111,7 +110,7 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-endpoint-url", func(t *testing.T) { setDefaultEnv(t) t.Setenv("AWS_ENDPOINT_URL", "") - err := validateBarmanRequirements() + err := validateBarman() if err.Error() != "AWS_ENDPOINT_URL envvar must be set" { t.Fatalf("expected error to be 'AWS_ENDPOINT_URL envvar must be set', but got %s", err.Error()) diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 9785b944..ad12e7d2 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -124,7 +124,7 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to set directory ownership: %s", err) } - // Check to see if we were just restored + // Snapshot restore if os.Getenv("FLY_RESTORED_FROM") != "" { active, err := isRestoreActive() if err != nil { diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index cdeebf56..72db522b 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -158,6 +158,7 @@ func (c *PGConfig) SetDefaults() error { } c.internalConfig = ConfigMap{ + "cluster_name": c.AppName, "random_page_cost": "1.1", "port": c.Port, "shared_buffers": fmt.Sprintf("%dMB", sharedBuffersMb), @@ -170,24 +171,45 @@ func (c *PGConfig) SetDefaults() error { "wal_log_hints": true, "hot_standby": true, "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), + "restore_command": "", + "recovery_target": "", + "recovery_target_timeline": "", + "recovery_target_action": "", } - // Works to configure archiving to object storage if enabled switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { case "true": barman, err := NewBarman() if err != nil { return err } - c.internalConfig["archive_mode"] = "on" - c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommandString()) + c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) + case "false": c.internalConfig["archive_mode"] = "off" + c.internalConfig["archive_command"] = "" default: // Noop } + if 
os.Getenv("CLOUD_ARCHIVING_WAL_RESTORE") != "" { + barmanRestore, err := NewBarmanRestore() + if err != nil { + return err + } + + c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) + c.internalConfig["recovery_target"] = barmanRestore.recoveryTarget + c.internalConfig["recovery_target_timeline"] = barmanRestore.recoveryTargetTimeline + c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction + + // Write the recovery.signal file + if err := os.WriteFile("/data/postgresql/recovery.signal", []byte(""), 0600); err != nil { + return fmt.Errorf("failed to write recovery.signal: %s", err) + } + } + return nil } diff --git a/internal/flypg/restore.go b/internal/flypg/snapshot_restore.go similarity index 100% rename from internal/flypg/restore.go rename to internal/flypg/snapshot_restore.go From 47a6f21163e68e0b4f93bc706ea64a9431b79e2e Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 22:33:21 -0500 Subject: [PATCH 10/51] Checkin --- internal/flypg/barman_restore.go | 7 ++++--- internal/flypg/node.go | 12 ++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 158c41a1..ec5a7e49 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -1,6 +1,7 @@ package flypg import ( + "context" "encoding/json" "fmt" "os" @@ -45,12 +46,12 @@ func NewBarmanRestore() (*BarmanRestore, error) { bucket: strings.TrimSpace(os.Getenv("SOURCE_AWS_BUCKET_NAME")), recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), - recoveryTargetTimeline: getenv("WAL_RECOVERY_TARGET_TIMELINE", "latest"), + recoveryTargetTimeline: getenv("WAL_RECOVERY_TARGET_TIME", "latest"), recoveryTargetAction: getenv("WAL_RECOVERY_TARGET_ACTION", "promote"), }, nil } -func (b *BarmanRestore) RestoreFromPITBackup(restoreStr string) error { +func (b *BarmanRestore) RestoreFromPIT(ctx context.Context) error { // Query available backups from object storage backupsBytes, err := utils.RunCommand(b.backupListCommand(), "postgres") if err != nil { @@ -68,7 +69,7 @@ func (b *BarmanRestore) RestoreFromPITBackup(restoreStr string) error { } // Resolve the base backup to restore - backupID, err := b.resolveBackupTarget(backupList, restoreStr) + backupID, err := b.resolveBackupTarget(backupList, b.recoveryTargetTimeline) if err != nil { return fmt.Errorf("failed to resolve backup target: %s", err) } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index ad12e7d2..2e88122d 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -138,6 +138,18 @@ func (n *Node) Init(ctx context.Context) error { } } + // Point-in-time restore + if os.Getenv("CLOUD_ARCHIVING_REMOTE_RESTORE") != "" { + restore, err := NewBarmanRestore() + if err != nil { + return fmt.Errorf("failed to initialize barman restore: %s", err) + } + + if err := restore.RestoreFromPIT(ctx); err != nil { + return fmt.Errorf("failed to restore from PIT: %s", err) + } + } + // Verify whether we are a booting zombie. 
if ZombieLockExists() { if err := handleZombieLock(ctx, n); err != nil { From d91ea1a8e3e10f8d916b66ddbf18b324b2a955d7 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 26 Jun 2024 22:34:27 -0500 Subject: [PATCH 11/51] Add todo --- internal/flypg/barman_restore.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index ec5a7e49..4be6f2c3 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -45,8 +45,9 @@ func NewBarmanRestore() (*BarmanRestore, error) { endpoint: strings.TrimSpace(os.Getenv("SOURCE_AWS_ENDPOINT_URL")), bucket: strings.TrimSpace(os.Getenv("SOURCE_AWS_BUCKET_NAME")), - recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), - recoveryTargetTimeline: getenv("WAL_RECOVERY_TARGET_TIME", "latest"), + recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), + // TODO - Use recovery target time instead. This is a temporary solution. + recoveryTargetTimeline: getenv("WAL_RECOVERY_TARGET_TIMELINE", "latest"), recoveryTargetAction: getenv("WAL_RECOVERY_TARGET_ACTION", "promote"), }, nil } From 96190c4cf1cc278daff78df9d9049544b19c4adc Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Thu, 27 Jun 2024 09:13:20 -0500 Subject: [PATCH 12/51] Update AWS_ENDPOINT_URL -> AWS_ENDPOINT_URL_S3 --- internal/flypg/barman.go | 6 +++--- internal/flypg/barman_restore.go | 6 +++--- internal/flypg/barman_restore_test.go | 2 +- internal/flypg/barman_test.go | 8 ++++---- internal/flypg/pg.go | 4 ---- internal/flypg/pg_test.go | 6 +++--- 6 files changed, 14 insertions(+), 18 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index fd2a08e3..8900dfa9 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -37,7 +37,7 @@ func NewBarman() (*Barman, error) { return &Barman{ appName: os.Getenv("FLY_APP_NAME"), provider: "aws-s3", - endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL")), + endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL_S3")), bucket: strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")), // fullBackupFrequency: getenv("CLOUD_ARCHIVING_FULL_BACKUP_FREQUENCY", "1"), @@ -100,8 +100,8 @@ func validateBarman() error { return fmt.Errorf("AWS_BUCKET_NAME envvar must be set") } - if os.Getenv("AWS_ENDPOINT_URL") == "" { - return fmt.Errorf("AWS_ENDPOINT_URL envvar must be set") + if os.Getenv("AWS_ENDPOINT_URL_S3") == "" { + return fmt.Errorf("AWS_ENDPOINT_URL_S3 envvar must be set") } return nil diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 4be6f2c3..cb7e7ad7 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -42,7 +42,7 @@ func NewBarmanRestore() (*BarmanRestore, error) { return &BarmanRestore{ appName: os.Getenv("FLY_APP_NAME"), provider: "aws-s3", - endpoint: strings.TrimSpace(os.Getenv("SOURCE_AWS_ENDPOINT_URL")), + endpoint: strings.TrimSpace(os.Getenv("SOURCE_AWS_ENDPOINT_URL_S3")), bucket: strings.TrimSpace(os.Getenv("SOURCE_AWS_BUCKET_NAME")), recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), @@ -202,8 +202,8 @@ func validateBarmanRestore() error { return fmt.Errorf("SOURCE_AWS_BUCKET_NAME envvar must be set") } - if os.Getenv("SOURCE_AWS_ENDPOINT_URL") == "" { - return fmt.Errorf("SOURCE_AWS_ENDPOINT_URL envvar must be set") + if os.Getenv("SOURCE_AWS_ENDPOINT_URL_S3") == "" { + return fmt.Errorf("SOURCE_AWS_ENDPOINT_URL_S3 envvar must be set") } return nil diff --git a/internal/flypg/barman_restore_test.go 
b/internal/flypg/barman_restore_test.go index 942c8619..ebe7514f 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -218,6 +218,6 @@ func setRestoreDefaultEnv(t *testing.T) { t.Setenv("FLY_APP_NAME", "postgres-flex") t.Setenv("SOURCE_AWS_ACCESS_KEY_ID", "my-key") t.Setenv("SOURCE_AWS_SECRET_ACCESS_KEY", "my-secret") - t.Setenv("SOURCE_AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("SOURCE_AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") t.Setenv("SOURCE_AWS_BUCKET_NAME", "my-bucket") } diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index 695207e0..e09c2a55 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -109,11 +109,11 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-endpoint-url", func(t *testing.T) { setDefaultEnv(t) - t.Setenv("AWS_ENDPOINT_URL", "") + t.Setenv("AWS_ENDPOINT_URL_S3", "") err := validateBarman() - if err.Error() != "AWS_ENDPOINT_URL envvar must be set" { - t.Fatalf("expected error to be 'AWS_ENDPOINT_URL envvar must be set', but got %s", err.Error()) + if err.Error() != "AWS_ENDPOINT_URL_S3 envvar must be set" { + t.Fatalf("expected error to be 'AWS_ENDPOINT_URL_S3 envvar must be set', but got %s", err.Error()) } }) } @@ -136,7 +136,7 @@ func setDefaultEnv(t *testing.T) { t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") t.Setenv("FLY_APP_NAME", "postgres-flex") t.Setenv("AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") t.Setenv("AWS_BUCKET_NAME", "my-bucket") } diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 72db522b..1d078005 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -171,10 +171,6 @@ func (c *PGConfig) SetDefaults() error { "wal_log_hints": true, "hot_standby": true, "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), - "restore_command": "", - "recovery_target": "", - "recovery_target_timeline": "", - "recovery_target_action": "", } switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index a329bf43..cf0c3d1f 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -117,7 +117,7 @@ func TestPGConfigInitialization(t *testing.T) { t.Run("cloud-archiving", func(t *testing.T) { t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") t.Setenv("AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") t.Setenv("AWS_BUCKET_NAME", "my-bucket") @@ -137,7 +137,7 @@ func TestPGConfigInitialization(t *testing.T) { } expected := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --gzip --endpoint-url %s s3://%s %s %%p'", - os.Getenv("AWS_ENDPOINT_URL"), + os.Getenv("AWS_ENDPOINT_URL_S3"), os.Getenv("AWS_BUCKET_NAME"), pgConf.AppName) @@ -149,7 +149,7 @@ func TestPGConfigInitialization(t *testing.T) { t.Run("cloud-archiving-disabled", func(t *testing.T) { t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") t.Setenv("AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("AWS_ENDPOINT_URL", "https://fly.storage.tigris.dev") + t.Setenv("AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") t.Setenv("AWS_BUCKET_NAME", "my-bucket") 
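Before the refactor that follows, a quick recap of how patches 01-12 are meant to be used. Archiving next to the primary is driven entirely by environment; a minimal sketch, using the same placeholder values as the tests above (variable names come from barman.go and pg.go, the values are illustrative only):

    # enable WAL archiving to object storage alongside the primary
    CLOUD_ARCHIVING_ENABLED=true
    AWS_ACCESS_KEY_ID=my-key
    AWS_SECRET_ACCESS_KEY=my-secret
    AWS_BUCKET_NAME=my-bucket
    AWS_ENDPOINT_URL_S3=https://fly.storage.tigris.dev
    # optional retention knobs (defaults: 7-day recovery window, minimum redundancy 3)
    CLOUD_ARCHIVING_RETENTION_DAYS=7
    CLOUD_ARCHIVING_MINIMUM_REDUNDANCY=3

With those set, SetDefaults() renders roughly the following into the internal PostgreSQL configuration (the exact file layout is an assumption; the archive_command value mirrors the format the pg_test.go assertions above expect, with FLY_APP_NAME supplying the trailing server name):

    archive_mode = on
    archive_command = 'barman-cloud-wal-archive --cloud-provider aws-s3 --gzip --endpoint-url https://fly.storage.tigris.dev s3://my-bucket postgres-flex %p'
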
From 1033d55aed30235da0ef62b1839086e9f36699c1 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Thu, 27 Jun 2024 11:10:51 -0500 Subject: [PATCH 13/51] Refactor --- cmd/monitor/monitor_backup_retention.go | 4 + internal/flypg/barman.go | 139 ++++++++++++------ internal/flypg/barman_restore.go | 98 ++---------- internal/flypg/barman_test.go | 36 ++--- internal/flypg/node.go | 12 +- internal/flypg/pg.go | 3 +- ...{snapshot_restore.go => remote_restore.go} | 10 +- 7 files changed, 149 insertions(+), 153 deletions(-) rename internal/flypg/{snapshot_restore.go => remote_restore.go} (91%) diff --git a/cmd/monitor/monitor_backup_retention.go b/cmd/monitor/monitor_backup_retention.go index 06e8818a..ba8b6520 100644 --- a/cmd/monitor/monitor_backup_retention.go +++ b/cmd/monitor/monitor_backup_retention.go @@ -8,6 +8,10 @@ import ( "github.com/fly-apps/postgres-flex/internal/flypg" ) +const ( + defaultBackupRetentionEvaluationThreshold = time.Hour * 6 +) + func monitorBackupRetention(ctx context.Context, barman *flypg.Barman) { ticker := time.NewTicker(defaultBackupRetentionEvaluationThreshold) defer ticker.Stop() diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 8900dfa9..52afb08b 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -2,8 +2,8 @@ package flypg import ( "context" + "encoding/json" "fmt" - "log" "os" "strings" @@ -11,7 +11,7 @@ import ( ) const ( - barmanRecoveryDirectory = "/data/postgresql" + providerDefault = "aws-s3" ) type Barman struct { @@ -20,74 +20,100 @@ type Barman struct { endpoint string bucket string - // fullBackupFrequency string - minimumRedundancy string - retentionDays string + fullBackupFrequency string // TODO - Implement + minimumRedundancy string + retentionDays string } -func NewBarman() (*Barman, error) { - if err := validateBarman(); err != nil { - return nil, err - } +type Backup struct { + BackupID string `json:"backup_id"` + StartTime string `json:"begin_time"` + EndTime string `json:"end_time"` + BeginWal string `json:"begin_wal"` + EndWal string `json:"end_wal"` +} - // TODO - Validate minimum and retention day values - minRedundancy := getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3") - retentionDays := getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7") +type BackupList struct { + Backups []Backup `json:"backups_list"` +} - return &Barman{ +func NewBarman(validate bool) (*Barman, error) { + b := &Barman{ appName: os.Getenv("FLY_APP_NAME"), - provider: "aws-s3", + provider: providerDefault, endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL_S3")), bucket: strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")), - // fullBackupFrequency: getenv("CLOUD_ARCHIVING_FULL_BACKUP_FREQUENCY", "1"), - minimumRedundancy: minRedundancy, - retentionDays: retentionDays, - }, nil -} + retentionDays: getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7"), + minimumRedundancy: getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3"), + } -func (b *Barman) RetentionPolicy() string { - return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) + if validate { + return b, b.ValidateRequiredEnv() + } + + return b, nil } -// WALArchiveDelete deletes backups/WAL based on the specified retention policy. 
-func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { - return utils.RunCommand(b.walArchiveDeleteCommand(), "postgres") +func (b *Barman) Backup(ctx context.Context) ([]byte, error) { + backupCmd := fmt.Sprintf("barman-cloud-backup --cloud-provider %s --endpoint-url %s s3://%s %s", + b.provider, + b.endpoint, + b.bucket, + b.appName, + ) + + return utils.RunCommand(backupCmd, "postgres") } -func (b *Barman) PrintRetentionPolicy() { - str := ` - Retention Policy - ----------------- - RECOVERY WINDOW OF %s days - MINIMUM BACKUP REDUNDANCY: %s - FULL BACKUP FREQUENCY: %s day(s) -` - log.Printf(str, b.retentionDays, b.minimumRedundancy) +// RestoreBackup returns the command string used to restore a base backup. +func (b *Barman) RestoreBackup(ctx context.Context, backupID, recoveryDir string) ([]byte, error) { + restoreCmd := fmt.Sprintf("barman-cloud-restore --cloud-provider %s --endpoint-url %s s3://%s %s %s %s", + b.provider, + b.endpoint, + b.bucket, + b.appName, + backupID, + recoveryDir, + ) + + return utils.RunCommand(restoreCmd, "postgres") } -func (b *Barman) walArchiveDeleteCommand() string { - return fmt.Sprintf("barman-cloud-backup-delete --cloud-provider %s --endpoint-url %s --retention %s --minimum-redundancy %s s3://%s %s", +func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { + listBackupCmd := fmt.Sprintf("barman-cloud-backup-list --cloud-provider %s --endpoint-url %s s3://%s %s --format json", b.provider, b.endpoint, - b.RetentionPolicy(), - b.minimumRedundancy, b.bucket, b.appName, ) + + backupsBytes, err := utils.RunCommand(listBackupCmd, "postgres") + if err != nil { + return BackupList{}, fmt.Errorf("failed to list backups: %s", err) + } + + return b.parseBackups(backupsBytes) } -func (b *Barman) walArchiveCommand() string { - // TODO - Make compression configurable - return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p", +func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { + deleteCmd := fmt.Sprintf("barman-cloud-backup-delete --cloud-provider %s --endpoint-url %s --retention %s --minimum-redundancy %s s3://%s %s", b.provider, b.endpoint, + b.RetentionPolicy(), + b.minimumRedundancy, b.bucket, b.appName, ) + + return utils.RunCommand(deleteCmd, "postgres") +} + +func (b *Barman) RetentionPolicy() string { + return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) } -func validateBarman() error { +func (b *Barman) ValidateRequiredEnv() error { if os.Getenv("AWS_ACCESS_KEY_ID") == "" { return fmt.Errorf("AWS_ACCESS_KEY_ID secret must be set") } @@ -107,6 +133,37 @@ func validateBarman() error { return nil } +func (b *Barman) parseBackups(backupBytes []byte) (BackupList, error) { + var backupList BackupList + + if err := json.Unmarshal(backupBytes, &backupList); err != nil { + return BackupList{}, fmt.Errorf("failed to parse backups: %s", err) + } + + return backupList, nil +} + +func (b *Barman) walArchiveCommand() string { + // TODO - Make compression configurable + return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p", + b.provider, + b.endpoint, + b.bucket, + b.appName, + ) +} + +// walRestoreCommand returns the command string used to restore WAL files. +// The %f and %p placeholders are replaced with the file path and file name respectively. 
+func (b *Barman) walRestoreCommand() string { + return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s s3://%s %s %%f %%p", + b.provider, + b.endpoint, + b.bucket, + b.appName, + ) +} + func getenv(key, fallback string) string { value := os.Getenv(key) if len(value) == 0 { diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index cb7e7ad7..45b12c89 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -2,48 +2,31 @@ package flypg import ( "context" - "encoding/json" "fmt" "os" - "strings" "time" - - "github.com/fly-apps/postgres-flex/internal/utils" ) type BarmanRestore struct { - appName string - provider string - endpoint string - bucket string - + *Barman recoveryTarget string recoveryTargetTimeline string recoveryTargetAction string } -type BarmanBackupList struct { - Backups []BarmanBackup `json:"backups_list"` -} - -type BarmanBackup struct { - BackupID string `json:"backup_id"` - StartTime string `json:"begin_time"` - EndTime string `json:"end_time"` - BeginWal string `json:"begin_wal"` - EndWal string `json:"end_wal"` -} +const ( + defaultRestoreDir = "/data/postgresql" +) func NewBarmanRestore() (*BarmanRestore, error) { - if err := validateBarmanRestore(); err != nil { + if err := validateRestoreEnv(); err != nil { return nil, err } + barman, _ := NewBarman(false) + return &BarmanRestore{ - appName: os.Getenv("FLY_APP_NAME"), - provider: "aws-s3", - endpoint: strings.TrimSpace(os.Getenv("SOURCE_AWS_ENDPOINT_URL_S3")), - bucket: strings.TrimSpace(os.Getenv("SOURCE_AWS_BUCKET_NAME")), + Barman: barman, recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), // TODO - Use recovery target time instead. This is a temporary solution. @@ -52,81 +35,32 @@ func NewBarmanRestore() (*BarmanRestore, error) { }, nil } -func (b *BarmanRestore) RestoreFromPIT(ctx context.Context) error { +func (b *BarmanRestore) Restore(ctx context.Context) error { // Query available backups from object storage - backupsBytes, err := utils.RunCommand(b.backupListCommand(), "postgres") + backups, err := b.ListBackups(ctx) if err != nil { return fmt.Errorf("failed to list backups: %s", err) } - // Parse the backups - backupList, err := b.parseBackups(backupsBytes) - if err != nil { - return fmt.Errorf("failed to parse backups: %s", err) - } - - if len(backupList.Backups) == 0 { + if len(backups.Backups) == 0 { return fmt.Errorf("no backups found") } // Resolve the base backup to restore - backupID, err := b.resolveBackupTarget(backupList, b.recoveryTargetTimeline) + backupID, err := b.resolveBackupTarget(backups, b.recoveryTargetTimeline) if err != nil { return fmt.Errorf("failed to resolve backup target: %s", err) } // Download and restore the base backup - _, err = utils.RunCommand(b.backupRestoreCommand(backupID), "postgres") - if err != nil { - return fmt.Errorf("failed to restore base backup: %s", err) + if _, err := b.RestoreBackup(ctx, backupID, defaultRestoreDir); err != nil { + return fmt.Errorf("failed to restore backup: %s", err) } return nil } -// restoreCommand returns the command string used to restore a base backup. -func (b *BarmanRestore) backupRestoreCommand(backupID string) string { - return fmt.Sprintf("barman-cloud-restore --cloud-provider %s --endpoint-url %s s3://%s %s %s %s", - b.provider, - b.endpoint, - b.bucket, - b.appName, - backupID, - barmanRecoveryDirectory, - ) -} - -// walRestoreCommand returns the command string used to restore WAL files. 
-// The %f and %p placeholders are replaced with the file path and file name respectively. -func (b *BarmanRestore) walRestoreCommand() string { - return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s s3://%s %s %%f %%p", - b.provider, - b.endpoint, - b.bucket, - b.appName, - ) -} - -func (b *BarmanRestore) backupListCommand() string { - return fmt.Sprintf("barman-cloud-backup-list --cloud-provider %s --endpoint-url %s s3://%s %s --format json", - b.provider, - b.endpoint, - b.bucket, - b.appName, - ) -} - -func (b *BarmanRestore) parseBackups(backupBytes []byte) (BarmanBackupList, error) { - var backupList BarmanBackupList - - if err := json.Unmarshal(backupBytes, &backupList); err != nil { - return BarmanBackupList{}, fmt.Errorf("failed to parse backups: %s", err) - } - - return backupList, nil -} - -func (b *BarmanRestore) resolveBackupTarget(backupList BarmanBackupList, restoreStr string) (string, error) { +func (b *BarmanRestore) resolveBackupTarget(backupList BackupList, restoreStr string) (string, error) { if len(backupList.Backups) == 0 { return "", fmt.Errorf("no backups found") } @@ -189,7 +123,7 @@ func (b *BarmanRestore) resolveBackupTarget(backupList BarmanBackupList, restore return latestBackupID, nil } -func validateBarmanRestore() error { +func validateRestoreEnv() error { if os.Getenv("SOURCE_AWS_ACCESS_KEY_ID") == "" { return fmt.Errorf("SOURCE_AWS_ACCESS_KEY_ID secret must be set") } diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index e09c2a55..d8388ed1 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -8,9 +8,9 @@ func TestNewBarman(t *testing.T) { t.Run("defaults", func(t *testing.T) { setDefaultEnv(t) - barman, err := NewBarman() + barman, err := NewBarman(true) if err != nil { - t.Fatal(err) + t.Fatalf("unexpected error: %v", err) } if barman.provider != "aws-s3" { @@ -42,14 +42,12 @@ func TestNewBarman(t *testing.T) { t.Run("custom-retention", func(t *testing.T) { setDefaultEnv(t) - t.Setenv("CLOUD_ARCHIVING_RETENTION_DAYS", "30") - barman, err := NewBarman() + barman, err := NewBarman(true) if err != nil { - t.Fatal(err) + t.Fatalf("unexpected error: %v", err) } - if barman.retentionDays != "30" { t.Fatalf("expected retentionDays to be 30, but got %s", barman.retentionDays) } @@ -59,28 +57,21 @@ func TestNewBarman(t *testing.T) { setDefaultEnv(t) t.Setenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "7") - barman, err := NewBarman() + barman, err := NewBarman(true) if err != nil { - t.Fatal(err) + t.Fatalf("unexpected error: %v", err) } - if barman.minimumRedundancy != "7" { t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) } }) - - t.Run("test-failure", func(t *testing.T) { - _, err := NewBarman() - if err == nil { - t.Fatal("expected error, but got nil") - } - }) } func TestValidateBarmanRequirements(t *testing.T) { + b, _ := NewBarman(false) t.Run("missing-aws-access-key", func(t *testing.T) { - err := validateBarman() + err := b.ValidateRequiredEnv() if err.Error() != "AWS_ACCESS_KEY_ID secret must be set" { t.Fatalf("expected error to be 'AWS_ACCESS_KEY_ID secret must be set', but got %s", err.Error()) @@ -90,7 +81,7 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-secret-access-key", func(t *testing.T) { setDefaultEnv(t) t.Setenv("AWS_SECRET_ACCESS_KEY", "") - err := validateBarman() + err := b.ValidateRequiredEnv() if err.Error() != "AWS_SECRET_ACCESS_KEY secret must be set" { t.Fatalf("expected error to be 
'AWS_SECRET_ACCESS_KEY secret must be set', but got %s", err.Error()) @@ -100,7 +91,7 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-bucket-name", func(t *testing.T) { setDefaultEnv(t) t.Setenv("AWS_BUCKET_NAME", "") - err := validateBarman() + err := b.ValidateRequiredEnv() if err.Error() != "AWS_BUCKET_NAME envvar must be set" { t.Fatalf("expected error to be 'AWS_BUCKET_NAME envvar must be set', but got %s", err.Error()) @@ -110,7 +101,7 @@ func TestValidateBarmanRequirements(t *testing.T) { t.Run("missing-aws-endpoint-url", func(t *testing.T) { setDefaultEnv(t) t.Setenv("AWS_ENDPOINT_URL_S3", "") - err := validateBarman() + err := b.ValidateRequiredEnv() if err.Error() != "AWS_ENDPOINT_URL_S3 envvar must be set" { t.Fatalf("expected error to be 'AWS_ENDPOINT_URL_S3 envvar must be set', but got %s", err.Error()) @@ -121,15 +112,14 @@ func TestValidateBarmanRequirements(t *testing.T) { func TestBarmanRetentionPolicy(t *testing.T) { setDefaultEnv(t) - barman, err := NewBarman() + barman, err := NewBarman(true) if err != nil { - t.Fatal(err) + t.Fatalf("unexpected error: %v", err) } if barman.RetentionPolicy() != "'RECOVERY WINDOW OF 7 days'" { t.Fatalf("expected retention policy to be 'RECOVERY WINDOW OF 7 days', but got %s", barman.RetentionPolicy()) } - } func setDefaultEnv(t *testing.T) { diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 2e88122d..8e7ebaac 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -132,22 +132,28 @@ func (n *Node) Init(ctx context.Context) error { } if active { - if err := Restore(ctx, n); err != nil { + if err := prepareRemoteRestore(ctx, n); err != nil { return fmt.Errorf("failed to issue restore: %s", err) } } } - // Point-in-time restore + // Remote point-in-time restore. if os.Getenv("CLOUD_ARCHIVING_REMOTE_RESTORE") != "" { + // TODO - We may need to check for the restore lock her + restore, err := NewBarmanRestore() if err != nil { return fmt.Errorf("failed to initialize barman restore: %s", err) } - if err := restore.RestoreFromPIT(ctx); err != nil { + if err := restore.Restore(ctx); err != nil { return fmt.Errorf("failed to restore from PIT: %s", err) } + + if err := prepareRemoteRestore(ctx, n); err != nil { + return fmt.Errorf("failed to issue restore: %s", err) + } } // Verify whether we are a booting zombie. diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 1d078005..2910ff33 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -175,10 +175,11 @@ func (c *PGConfig) SetDefaults() error { switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { case "true": - barman, err := NewBarman() + barman, err := NewBarman(true) if err != nil { return err } + c.internalConfig["archive_mode"] = "on" c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) diff --git a/internal/flypg/snapshot_restore.go b/internal/flypg/remote_restore.go similarity index 91% rename from internal/flypg/snapshot_restore.go rename to internal/flypg/remote_restore.go index 70dc5160..43d76c13 100644 --- a/internal/flypg/snapshot_restore.go +++ b/internal/flypg/remote_restore.go @@ -17,7 +17,12 @@ const ( restoreLockFile = "/data/restore.lock" ) -func Restore(ctx context.Context, node *Node) error { +// prepareRemoteRestore will reset the environment to a state where it can be restored +// from a remote backup. This process includes: +// * Clearing any locks that may have been set on the original cluster. 
+// * Dropping the repmgr database to clear any metadata that belonged to the old cluster. +// * Ensuring the internal user credentials match the environment. +func prepareRemoteRestore(ctx context.Context, node *Node) error { // Clear any locks that may have been set on the original cluster if err := clearLocks(); err != nil { return fmt.Errorf("failed to clear locks: %s", err) @@ -63,8 +68,7 @@ func Restore(ctx context.Context, node *Node) error { defer func() { _ = conn.Close(ctx) }() // Drop repmgr database to clear any metadata that belonged to the old cluster. - sql := "DROP DATABASE repmgr;" - _, err = conn.Exec(ctx, sql) + _, err = conn.Exec(ctx, "DROP DATABASE repmgr;") if err != nil { return fmt.Errorf("failed to drop repmgr database: %s", err) } From 645e07d5b8a53835f120e610610bd254d5437e12 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Thu, 27 Jun 2024 13:40:17 -0500 Subject: [PATCH 14/51] Checkin --- cmd/monitor/main.go | 13 ++- cmd/monitor/monitor_backup_retention.go | 27 +++-- cmd/monitor/monitor_backup_schedule.go | 110 ++++++++++++++++++ internal/flypg/barman.go | 99 +++++++++++----- internal/flypg/barman_test.go | 2 +- internal/flypg/pg.go | 12 +- internal/flypg/pg_test.go | 7 +- .../flypg/{remote_restore.go => restore.go} | 0 internal/utils/shell.go | 30 +++++ 9 files changed, 247 insertions(+), 53 deletions(-) create mode 100644 cmd/monitor/monitor_backup_schedule.go rename internal/flypg/{remote_restore.go => restore.go} (100%) diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 9eb185aa..1057f0cd 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -18,9 +18,12 @@ var ( defaultDeadMemberRemovalThreshold = time.Hour * 12 defaultInactiveSlotRemovalThreshold = time.Hour * 12 - defaultBackupRetentionEvaluationThreshold = time.Hour * 1 + defaultBackupRetentionEvalFrequency = time.Hour * 12 + defaultFullBackupSchedule = time.Hour * 24 ) +// TODO - Harden this so one failure doesn't take down the whole monitor + func main() { ctx := context.Background() @@ -40,14 +43,18 @@ func main() { }() if os.Getenv("CLOUD_ARCHIVING_ENABLED") == "true" { - barman, err := flypg.NewBarman() + barman, err := flypg.NewBarman(true) if err != nil { panic(err) } + // Backup scheduler + log.Println("Monitoring backup schedule") + go monitorBackupSchedule(ctx, barman) + + // Backup retention monitor log.Println("Monitoring backup retention") barman.PrintRetentionPolicy() - go monitorBackupRetention(ctx, barman) } diff --git a/cmd/monitor/monitor_backup_retention.go b/cmd/monitor/monitor_backup_retention.go index ba8b6520..c2b5915c 100644 --- a/cmd/monitor/monitor_backup_retention.go +++ b/cmd/monitor/monitor_backup_retention.go @@ -8,21 +8,24 @@ import ( "github.com/fly-apps/postgres-flex/internal/flypg" ) -const ( - defaultBackupRetentionEvaluationThreshold = time.Hour * 6 -) - func monitorBackupRetention(ctx context.Context, barman *flypg.Barman) { - ticker := time.NewTicker(defaultBackupRetentionEvaluationThreshold) + ticker := time.NewTicker(defaultBackupRetentionEvalFrequency) defer ticker.Stop() - for range ticker.C { - result, err := barman.WALArchiveDelete(ctx) - if err != nil { - log.Printf("Backup retention failed with: %s", err) - } - if len(result) > 0 { - log.Printf("Backup retention response: %s", result) + for { + select { + case <-ctx.Done(): + log.Println("Shutting down backup retention monitor") + return + case <-ticker.C: + result, err := barman.WALArchiveDelete(ctx) + if err != nil { + log.Printf("Backup retention failed with: %s", err) + } + + if 
len(result) > 0 { + log.Printf("Backup retention response: %s", result) + } } } } diff --git a/cmd/monitor/monitor_backup_schedule.go b/cmd/monitor/monitor_backup_schedule.go new file mode 100644 index 00000000..d7afb79d --- /dev/null +++ b/cmd/monitor/monitor_backup_schedule.go @@ -0,0 +1,110 @@ +package main + +import ( + "context" + "fmt" + "log" + "time" + + "github.com/fly-apps/postgres-flex/internal/flypg" +) + +func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { + // Determine when the last backup was taken. + lastBackupTime, err := barman.LastBackupTaken(ctx) + if err != nil { + log.Printf("Failed to resolve the last backup taken: %s", err) + } + + // Ensure we have a least one backup before proceeding. + if lastBackupTime.IsZero() { + log.Println("No backups found! Performing the initial base backup.") + + if err := performInitialBaseBackup(ctx, barman); err != nil { + log.Printf("Failed to perform the initial full backup: %s", err) + log.Printf("Backup scheduler will reattempt in 24 hours.") + } + + lastBackupTime = time.Now() + } + + log.Printf("Last backup taken at: %s", lastBackupTime) + + // Calculate the time until the next backup is due. + timeUntilNextBackup := lastBackupTime.Add(defaultFullBackupSchedule).Sub(time.Now()) + + // Perform backup immediately if the time until the next backup is negative. + if timeUntilNextBackup < 0 { + log.Println("Performing full backup now") + _, err := barman.Backup(ctx, false) + if err != nil { + log.Printf("Full backup failed with: %s", err) + } + + timeUntilNextBackup = defaultFullBackupSchedule + } + + log.Printf("Next backup due in: %s", timeUntilNextBackup) + + ticker := time.NewTicker(timeUntilNextBackup) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Println("Shutting down backup scheduler") + return + case <-ticker.C: + // Perform a backup while passively waiting for the checkpoint process to complete. + // This could actually take a while, so we should be prepared to wait. + log.Println("Performing full backup") + _, err := barman.Backup(ctx, false) + if err != nil { + // TODO - Implement a backup-off strategy. + timeUntilNextBackup = time.Hour * 1 + ticker.Reset(timeUntilNextBackup) + + log.Printf("Backup retention failed with: %s.", err) + log.Printf("Backup will be re-attempted in %s.", timeUntilNextBackup) + + continue + } + + log.Printf("Full backup completed successfully") + ticker.Reset(defaultFullBackupSchedule) + } + } +} + +func performInitialBaseBackup(ctx context.Context, barman *flypg.Barman) error { + maxRetries := 10 + retryCount := 0 + for { + select { + case <-ctx.Done(): + return nil + default: + _, err := barman.Backup(ctx, true) + if err != nil { + log.Printf("Failed to perform the initial full backup: %s. Retrying in 30 seconds.", err) + + // If we've exceeded the maximum number of retries, we should return an error. 
+ if retryCount >= maxRetries { + return fmt.Errorf("failed to perform the initial full backup after %d retries", maxRetries) + } + + retryCount++ + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(time.Second * 30): + continue + } + } + + log.Println("Initial full backup completed successfully") + return nil + } + } +} diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 52afb08b..95543e8a 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -4,8 +4,10 @@ import ( "context" "encoding/json" "fmt" + "log" "os" "strings" + "time" "github.com/fly-apps/postgres-flex/internal/utils" ) @@ -38,11 +40,14 @@ type BackupList struct { } func NewBarman(validate bool) (*Barman, error) { + bucket := strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")) + bucket = fmt.Sprintf("s3://%s", bucket) + b := &Barman{ appName: os.Getenv("FLY_APP_NAME"), provider: providerDefault, endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL_S3")), - bucket: strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")), + bucket: bucket, retentionDays: getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7"), minimumRedundancy: getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3"), @@ -55,40 +60,45 @@ func NewBarman(validate bool) (*Barman, error) { return b, nil } -func (b *Barman) Backup(ctx context.Context) ([]byte, error) { - backupCmd := fmt.Sprintf("barman-cloud-backup --cloud-provider %s --endpoint-url %s s3://%s %s", - b.provider, - b.endpoint, +// Backup performs a base backup of the database. +// forceCheckpoint - forces the initial checkpoint to be done as quickly as possible. +func (b *Barman) Backup(ctx context.Context, forceCheckpoint bool) ([]byte, error) { + args := []string{ + "--cloud-provider", providerDefault, + "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--host", fmt.Sprintf("%s.internal", b.appName), + "--user", "repmgr", b.bucket, b.appName, - ) + } - return utils.RunCommand(backupCmd, "postgres") + return utils.RunCmd(ctx, "postgres", "barman-cloud-backup", args...) } // RestoreBackup returns the command string used to restore a base backup. func (b *Barman) RestoreBackup(ctx context.Context, backupID, recoveryDir string) ([]byte, error) { - restoreCmd := fmt.Sprintf("barman-cloud-restore --cloud-provider %s --endpoint-url %s s3://%s %s %s %s", - b.provider, - b.endpoint, + args := []string{ + "--cloud-provider", providerDefault, + "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), b.bucket, b.appName, backupID, recoveryDir, - ) + } - return utils.RunCommand(restoreCmd, "postgres") + return utils.RunCmd(ctx, "postgres", "barman-cloud-restore", args...) } func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { - listBackupCmd := fmt.Sprintf("barman-cloud-backup-list --cloud-provider %s --endpoint-url %s s3://%s %s --format json", - b.provider, - b.endpoint, + args := []string{ + "--cloud-provider", providerDefault, + "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--format", "json", b.bucket, b.appName, - ) + } - backupsBytes, err := utils.RunCommand(listBackupCmd, "postgres") + backupsBytes, err := utils.RunCmd(ctx, "postgres", "barman-cloud-backup-list", args...) 
if err != nil { return BackupList{}, fmt.Errorf("failed to list backups: %s", err) } @@ -97,16 +107,53 @@ func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { } func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { - deleteCmd := fmt.Sprintf("barman-cloud-backup-delete --cloud-provider %s --endpoint-url %s --retention %s --minimum-redundancy %s s3://%s %s", - b.provider, - b.endpoint, - b.RetentionPolicy(), - b.minimumRedundancy, + args := []string{ + "--cloud-provider", providerDefault, + "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--retention", b.RetentionPolicy(), + "--minimum-redundancy", b.minimumRedundancy, b.bucket, b.appName, - ) + } + + return utils.RunCmd(ctx, "postgres", "barman-cloud-backup-delete", args...) +} + +func (b *Barman) LastBackupTaken(ctx context.Context) (time.Time, error) { + backups, err := b.ListBackups(ctx) + if err != nil { + return time.Time{}, fmt.Errorf("failed to list backups: %s", err) + } + + if len(backups.Backups) == 0 { + return time.Time{}, nil + } + + layout := "Mon Jan 2 15:04:05 2006" + + var latestBackupTime time.Time + + // Sort the backups start time + for _, backup := range backups.Backups { + startTime, err := time.Parse(layout, backup.StartTime) + if err != nil { + return time.Time{}, fmt.Errorf("failed to parse backup start time: %s", err) + } + + if latestBackupTime.IsZero() || startTime.After(latestBackupTime) { + latestBackupTime = startTime + } + } + + return latestBackupTime, nil +} - return utils.RunCommand(deleteCmd, "postgres") +func (b *Barman) PrintRetentionPolicy() { + log.Printf(` +Retention Policy +----------------- +RECOVERY WINDOW: %s DAYS +MINIMUM BACKUP REDUNDANCY: %s`, b.retentionDays, b.minimumRedundancy) } func (b *Barman) RetentionPolicy() string { @@ -145,7 +192,7 @@ func (b *Barman) parseBackups(backupBytes []byte) (BackupList, error) { func (b *Barman) walArchiveCommand() string { // TODO - Make compression configurable - return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s s3://%s %s %%p", + return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s %s %s %%p", b.provider, b.endpoint, b.bucket, @@ -156,7 +203,7 @@ func (b *Barman) walArchiveCommand() string { // walRestoreCommand returns the command string used to restore WAL files. // The %f and %p placeholders are replaced with the file path and file name respectively. 
func (b *Barman) walRestoreCommand() string { - return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s s3://%s %s %%f %%p", + return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s %s %s %%f %%p", b.provider, b.endpoint, b.bucket, diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index d8388ed1..fc697ed6 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -21,7 +21,7 @@ func TestNewBarman(t *testing.T) { t.Fatalf("expected endpoint to be https://fly.storage.tigris.dev, but got %s", barman.endpoint) } - if barman.bucket != "my-bucket" { + if barman.bucket != "s3://my-bucket" { t.Fatalf("expected bucket to be my-bucket, but got %s", barman.bucket) } diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 2910ff33..8660ea40 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -176,13 +176,13 @@ func (c *PGConfig) SetDefaults() error { switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { case "true": barman, err := NewBarman(true) - if err != nil { - return err + switch { + case err != nil: + log.Printf("[WARN] Failed to initialize barman: %s", err) + default: + c.internalConfig["archive_mode"] = "on" + c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) } - - c.internalConfig["archive_mode"] = "on" - c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) - case "false": c.internalConfig["archive_mode"] = "off" c.internalConfig["archive_command"] = "" diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index cf0c3d1f..4306bfa7 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -136,11 +136,8 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - expected := fmt.Sprintf("'barman-cloud-wal-archive --cloud-provider aws-s3 --gzip --endpoint-url %s s3://%s %s %%p'", - os.Getenv("AWS_ENDPOINT_URL_S3"), - os.Getenv("AWS_BUCKET_NAME"), - pgConf.AppName) - + barman, _ := NewBarman(false) + expected := fmt.Sprintf("'%s'", barman.walArchiveCommand()) if cfg["archive_command"] != expected { t.Fatalf("expected %s, got %s", expected, cfg["archive_command"]) } diff --git a/internal/flypg/remote_restore.go b/internal/flypg/restore.go similarity index 100% rename from internal/flypg/remote_restore.go rename to internal/flypg/restore.go diff --git a/internal/utils/shell.go b/internal/utils/shell.go index bd9c1727..253d20d4 100644 --- a/internal/utils/shell.go +++ b/internal/utils/shell.go @@ -2,6 +2,7 @@ package utils import ( "bytes" + "context" "fmt" "io" "log" @@ -14,6 +15,35 @@ import ( // TODO - RunCommand needs a context +func RunCmd(ctx context.Context, usr string, name string, args ...string) ([]byte, error) { + uid, gid, err := SystemUserIDs(usr) + if err != nil { + return nil, err + } + + cmd := exec.CommandContext(ctx, name, args...) 
+ cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)} + + if os.Getenv("DEBUG") != "" { + log.Printf("> Running command as %s: %s\n", usr, cmd.String()) + } + + var stdoutBuf, stderrBuf bytes.Buffer + cmd.Stdout = io.MultiWriter(os.Stdout, &stdoutBuf) + cmd.Stderr = io.MultiWriter(os.Stderr, &stderrBuf) + + err = cmd.Run() + if err != nil { + if ee, ok := err.(*exec.ExitError); ok { + ee.Stderr = stderrBuf.Bytes() + } + } + + return stdoutBuf.Bytes(), err +} + +// Deprecated, use RunCmd instead func RunCommand(cmdStr, usr string) ([]byte, error) { uid, gid, err := SystemUserIDs(usr) if err != nil { From 49784fe534245497e8867f335e626bb40eee1d35 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Fri, 28 Jun 2024 19:48:47 -0500 Subject: [PATCH 15/51] Working PITR, but there's quite a bit of cleanup to do --- cmd/monitor/main.go | 4 +- internal/flypg/barman.go | 147 ++++++++++++++++-------- internal/flypg/barman_restore.go | 110 +++++++++++------- internal/flypg/barman_restore_test.go | 57 ++++++++-- internal/flypg/barman_test.go | 158 ++++++++++++++++---------- internal/flypg/node.go | 52 +++++++-- internal/flypg/pg.go | 40 +++++-- internal/flypg/pg_test.go | 20 ++-- internal/flypg/restore.go | 4 +- 9 files changed, 396 insertions(+), 196 deletions(-) diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 1057f0cd..dd3eead8 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -42,8 +42,8 @@ func main() { } }() - if os.Getenv("CLOUD_ARCHIVING_ENABLED") == "true" { - barman, err := flypg.NewBarman(true) + if os.Getenv("BARMAN_ENABLED") == "true" { + barman, err := flypg.NewBarman(os.Getenv("BARMAN_ENABLED")) if err != nil { panic(err) } diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 95543e8a..527aedce 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -5,7 +5,9 @@ import ( "encoding/json" "fmt" "log" + "net/url" "os" + "path/filepath" "strings" "time" @@ -13,7 +15,8 @@ import ( ) const ( - providerDefault = "aws-s3" + providerDefault = "aws-s3" + awsCredentialsPath = "/data/.aws/credentials" ) type Barman struct { @@ -22,9 +25,10 @@ type Barman struct { endpoint string bucket string - fullBackupFrequency string // TODO - Implement - minimumRedundancy string - retentionDays string + configURL string + + retentionDays string + minimumRedundancy string } type Backup struct { @@ -39,39 +43,73 @@ type BackupList struct { Backups []Backup `json:"backups_list"` } -func NewBarman(validate bool) (*Barman, error) { - bucket := strings.TrimSpace(os.Getenv("AWS_BUCKET_NAME")) - bucket = fmt.Sprintf("s3://%s", bucket) +// NewBarman creates a new Barman instance. 
+// The configURL is expected to be in the format: +// barman://access-key:secret-key@endpoint/bucket +func NewBarman(configURL string) (*Barman, error) { + parsedURL, err := url.Parse(configURL) + if err != nil { + return nil, fmt.Errorf("invalid credential url: %w", err) + } + + endpoint := parsedURL.Host + if endpoint == "" { + return nil, fmt.Errorf("object storage endpoint missing") + } + + bucket := strings.TrimLeft(parsedURL.Path, "/") + if bucket == "" { + return nil, fmt.Errorf("bucket name is missing") + } - b := &Barman{ - appName: os.Getenv("FLY_APP_NAME"), - provider: providerDefault, - endpoint: strings.TrimSpace(os.Getenv("AWS_ENDPOINT_URL_S3")), - bucket: bucket, + // Extract user information for credentials (not used here but necessary for the complete parsing) + username := parsedURL.User.Username() + password, _ := parsedURL.User.Password() - retentionDays: getenv("CLOUD_ARCHIVING_RETENTION_DAYS", "7"), - minimumRedundancy: getenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "3"), + // Ensure the credentials are not empty + if username == "" || password == "" { + return nil, fmt.Errorf("access key or secret key is missing") } - if validate { - return b, b.ValidateRequiredEnv() + // Set the environment variable within the Go process + if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { + fmt.Printf("Error setting environment variable: %s\n", err) + return nil, err } - return b, nil + return &Barman{ + appName: os.Getenv("FLY_APP_NAME"), + provider: providerDefault, + endpoint: fmt.Sprintf("https://%s", endpoint), + bucket: bucket, + configURL: configURL, + + retentionDays: "7", + minimumRedundancy: "3", + }, nil + +} + +func (b *Barman) Bucket() string { + return fmt.Sprintf("s3://%s", b.bucket) } // Backup performs a base backup of the database. -// forceCheckpoint - forces the initial checkpoint to be done as quickly as possible. -func (b *Barman) Backup(ctx context.Context, forceCheckpoint bool) ([]byte, error) { +// immediateCheckpoint - forces the initial checkpoint to be done as quickly as possible. +func (b *Barman) Backup(ctx context.Context, immediateCheckpoint bool) ([]byte, error) { args := []string{ "--cloud-provider", providerDefault, - "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--endpoint-url", b.endpoint, "--host", fmt.Sprintf("%s.internal", b.appName), "--user", "repmgr", - b.bucket, + b.Bucket(), b.appName, } + if immediateCheckpoint { + args = append(args, "--immediate-checkpoint") + } + return utils.RunCmd(ctx, "postgres", "barman-cloud-backup", args...) } @@ -79,9 +117,9 @@ func (b *Barman) Backup(ctx context.Context, forceCheckpoint bool) ([]byte, erro func (b *Barman) RestoreBackup(ctx context.Context, backupID, recoveryDir string) ([]byte, error) { args := []string{ "--cloud-provider", providerDefault, - "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--endpoint-url", b.endpoint, + b.Bucket(), b.bucket, - b.appName, backupID, recoveryDir, } @@ -92,10 +130,10 @@ func (b *Barman) RestoreBackup(ctx context.Context, backupID, recoveryDir string func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { args := []string{ "--cloud-provider", providerDefault, - "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--endpoint-url", b.endpoint, "--format", "json", + b.Bucket(), b.bucket, - b.appName, } backupsBytes, err := utils.RunCmd(ctx, "postgres", "barman-cloud-backup-list", args...) 
@@ -109,10 +147,10 @@ func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { args := []string{ "--cloud-provider", providerDefault, - "--endpoint-url", os.Getenv("AWS_ENDPOINT_URL_S3"), + "--endpoint-url", b.endpoint, "--retention", b.RetentionPolicy(), "--minimum-redundancy", b.minimumRedundancy, - b.bucket, + b.Bucket(), b.appName, } @@ -160,26 +198,6 @@ func (b *Barman) RetentionPolicy() string { return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) } -func (b *Barman) ValidateRequiredEnv() error { - if os.Getenv("AWS_ACCESS_KEY_ID") == "" { - return fmt.Errorf("AWS_ACCESS_KEY_ID secret must be set") - } - - if os.Getenv("AWS_SECRET_ACCESS_KEY") == "" { - return fmt.Errorf("AWS_SECRET_ACCESS_KEY secret must be set") - } - - if os.Getenv("AWS_BUCKET_NAME") == "" { - return fmt.Errorf("AWS_BUCKET_NAME envvar must be set") - } - - if os.Getenv("AWS_ENDPOINT_URL_S3") == "" { - return fmt.Errorf("AWS_ENDPOINT_URL_S3 envvar must be set") - } - - return nil -} - func (b *Barman) parseBackups(backupBytes []byte) (BackupList, error) { var backupList BackupList @@ -195,8 +213,8 @@ func (b *Barman) walArchiveCommand() string { return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s %s %s %%p", b.provider, b.endpoint, + b.Bucket(), b.bucket, - b.appName, ) } @@ -206,11 +224,42 @@ func (b *Barman) walRestoreCommand() string { return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s %s %s %%f %%p", b.provider, b.endpoint, + b.Bucket(), b.bucket, - b.appName, ) } +func (b *Barman) writeAWSCredentials(profile string, credentialsPath string) error { + barmanURL, err := url.Parse(b.configURL) + if err != nil { + return fmt.Errorf("invalid configURL: %w", err) + } + + accessKey := barmanURL.User.Username() + if accessKey == "" { + return fmt.Errorf("AWS ACCESS KEY is missing") + } + + secretAccessKey, _ := barmanURL.User.Password() + if secretAccessKey == "" { + return fmt.Errorf("AWS SECRET KEY is missing") + } + + credentials := fmt.Sprintf("[%s]\naws_access_key_id=%s\naws_secret_access_key=%s", + profile, accessKey, secretAccessKey) + + // Ensure the directory exists + if err := os.MkdirAll(filepath.Dir(credentialsPath), 0700); err != nil { + return fmt.Errorf("failed to create AWS credentials directory: %w", err) + } + + if err := os.WriteFile(credentialsPath, []byte(credentials), 0644); err != nil { + return fmt.Errorf("failed to write AWS credentials file: %w", err) + } + + return nil +} + func getenv(key, fallback string) string { value := os.Getenv(key) if len(value) == 0 { diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 45b12c89..e261e17f 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -3,36 +3,53 @@ package flypg import ( "context" "fmt" - "os" + "net/url" "time" ) type BarmanRestore struct { *Barman - recoveryTarget string - recoveryTargetTimeline string - recoveryTargetAction string + + recoveryTargetName string + recoveryTargetTime string } const ( defaultRestoreDir = "/data/postgresql" ) -func NewBarmanRestore() (*BarmanRestore, error) { - if err := validateRestoreEnv(); err != nil { - return nil, err +func NewBarmanRestore(configURL string) (*BarmanRestore, error) { + barman, err := NewBarman(configURL) + if err != nil { + return nil, fmt.Errorf("failed to create barman client: %s", err) } - barman, _ := NewBarman(false) + url, err := 
url.Parse(configURL) + if err != nil { + return nil, fmt.Errorf("invalid restore config url: %w", err) + } - return &BarmanRestore{ + restore := &BarmanRestore{ Barman: barman, + } - recoveryTarget: getenv("WAL_RECOVERY_TARGET", "immediate"), - // TODO - Use recovery target time instead. This is a temporary solution. - recoveryTargetTimeline: getenv("WAL_RECOVERY_TARGET_TIMELINE", "latest"), - recoveryTargetAction: getenv("WAL_RECOVERY_TARGET_ACTION", "promote"), - }, nil + // evaluate the query parameters + for key, value := range url.Query() { + switch key { + case "targetName": + restore.recoveryTargetName = value[0] + case "targetTime": + restore.recoveryTargetTime = value[0] + default: + return nil, fmt.Errorf("unknown query parameter: %s", key) + } + } + + if restore.recoveryTargetName == "" && restore.recoveryTargetTime == "" { + return nil, fmt.Errorf("restore target not specified") + } + + return restore, nil } func (b *BarmanRestore) Restore(ctx context.Context) error { @@ -46,10 +63,30 @@ func (b *BarmanRestore) Restore(ctx context.Context) error { return fmt.Errorf("no backups found") } - // Resolve the base backup to restore - backupID, err := b.resolveBackupTarget(backups, b.recoveryTargetTimeline) - if err != nil { - return fmt.Errorf("failed to resolve backup target: %s", err) + var backupID string + + switch { + case b.recoveryTargetName != "": + // Resolve the target base backup + + // TODO - the id needs to be prefixed with barman_ + backupID, err = b.resolveBackupFromID(backups, b.recoveryTargetName) + if err != nil { + return fmt.Errorf("failed to resolve backup target by id: %s", err) + } + case b.recoveryTargetTime != "": + // Resolve the target base backup + backupID, err = b.resolveBackupFromTime(backups, b.recoveryTargetTime) + if err != nil { + return fmt.Errorf("failed to resolve backup target by time: %s", err) + } + default: + return fmt.Errorf("restore target not specified") + } + + // TODO - Consider just using the last available backup if the target is not found + if backupID == "" { + return fmt.Errorf("no backup found") } // Download and restore the base backup @@ -60,7 +97,21 @@ func (b *BarmanRestore) Restore(ctx context.Context) error { return nil } -func (b *BarmanRestore) resolveBackupTarget(backupList BackupList, restoreStr string) (string, error) { +func (b *BarmanRestore) resolveBackupFromID(backupList BackupList, id string) (string, error) { + if len(backupList.Backups) == 0 { + return "", fmt.Errorf("no backups found") + } + + for _, backup := range backupList.Backups { + if backup.BackupID == id { + return backup.BackupID, nil + } + } + + return "", fmt.Errorf("no backup found with id %s", id) +} + +func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr string) (string, error) { if len(backupList.Backups) == 0 { return "", fmt.Errorf("no backups found") } @@ -71,6 +122,7 @@ func (b *BarmanRestore) resolveBackupTarget(backupList BackupList, restoreStr st if restoreStr == "latest" { restoreTime = time.Now() } else { + var err error restoreTime, err = time.Parse(time.RFC3339, restoreStr) if err != nil { @@ -122,23 +174,3 @@ func (b *BarmanRestore) resolveBackupTarget(backupList BackupList, restoreStr st return latestBackupID, nil } - -func validateRestoreEnv() error { - if os.Getenv("SOURCE_AWS_ACCESS_KEY_ID") == "" { - return fmt.Errorf("SOURCE_AWS_ACCESS_KEY_ID secret must be set") - } - - if os.Getenv("SOURCE_AWS_SECRET_ACCESS_KEY") == "" { - return fmt.Errorf("SOURCE_AWS_SECRET_ACCESS_KEY secret must be set") - } - - 
if os.Getenv("SOURCE_AWS_BUCKET_NAME") == "" { - return fmt.Errorf("SOURCE_AWS_BUCKET_NAME envvar must be set") - } - - if os.Getenv("SOURCE_AWS_ENDPOINT_URL_S3") == "" { - return fmt.Errorf("SOURCE_AWS_ENDPOINT_URL_S3 envvar must be set") - } - - return nil -} diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index ebe7514f..9dd18b24 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -1,6 +1,7 @@ package flypg import ( + "os" "testing" ) @@ -104,11 +105,49 @@ const backupsResponse = `{ ] }` +func TestNewBarmanRestore(t *testing.T) { + setRestoreDefaultEnv(t) + + restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } + + if restore.bucket != "my-bucket" { + t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) + } + + if restore.Bucket() != "s3://my-bucket" { + t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) + } + + if restore.appName != "postgres-flex" { + t.Fatalf("expected app name to be postgres-flex, got %s", restore.appName) + } + + if restore.provider != "aws-s3" { + t.Fatalf("expected provider to be aws-s3, got %s", restore.provider) + } + + if restore.endpoint != "https://fly.storage.tigris.dev" { + t.Fatalf("expected endpoint to be https://fly.storage.tigris.dev, got %s", restore.endpoint) + } + + if restore.recoveryTargetName != "" { + t.Fatalf("expected recovery target name to be empty, got %s", restore.recoveryTargetName) + } + + if restore.recoveryTargetTime != "1212121212" { + t.Fatalf("expected recovery target time to be 1212121212, got %s", restore.recoveryTargetTime) + } + +} + func TestParseBackups(t *testing.T) { t.Run("parseBackups", func(t *testing.T) { setRestoreDefaultEnv(t) - restore, err := NewBarmanRestore() + restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) if err != nil { t.Fatalf("NewBarmanRestore failed with: %v", err) } @@ -154,7 +193,7 @@ func TestParseBackups(t *testing.T) { func TestResolveBackupTarget(t *testing.T) { setRestoreDefaultEnv(t) - restore, err := NewBarmanRestore() + restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) if err != nil { t.Fatalf("NewBarmanRestore failed with: %v", err) } @@ -165,7 +204,7 @@ func TestResolveBackupTarget(t *testing.T) { } t.Run("resolve-latest-backup-target", func(t *testing.T) { - backupID, err := restore.resolveBackupTarget(list, "latest") + backupID, err := restore.resolveBackupFromTime(list, "latest") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -176,7 +215,7 @@ func TestResolveBackupTarget(t *testing.T) { }) t.Run("resolve-earliest-backup-target", func(t *testing.T) { - backupID, err := restore.resolveBackupTarget(list, "2024-06-25T19:44:12Z") + backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:12Z") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -189,7 +228,7 @@ func TestResolveBackupTarget(t *testing.T) { // "begin_time": "Tue Jun 25 19:44:12 2024" // "end_time": "Tue Jun 25 19:44:18 2024", t.Run("resolve-backup-within-first-window", func(t *testing.T) { - backupID, err := restore.resolveBackupTarget(list, "2024-06-25T19:44:15Z") + backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:15Z") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -202,7 +241,7 @@ func TestResolveBackupTarget(t *testing.T) { // "begin_time": "Wed Jun 26 17:24:43 2024", // "end_time": "Wed Jun 26 17:27:02 2024", 
t.Run("resolve-backup-within-second-window", func(t *testing.T) { - backupID, err := restore.resolveBackupTarget(list, "2024-06-26T17:25:15Z") + backupID, err := restore.resolveBackupFromTime(list, "2024-06-26T17:25:15Z") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -214,10 +253,6 @@ func TestResolveBackupTarget(t *testing.T) { } func setRestoreDefaultEnv(t *testing.T) { - t.Setenv("CLOUD_ARCHIVING_WAL_RESTORE", "true") + t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket?targetTime=1212121212") t.Setenv("FLY_APP_NAME", "postgres-flex") - t.Setenv("SOURCE_AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("SOURCE_AWS_SECRET_ACCESS_KEY", "my-secret") - t.Setenv("SOURCE_AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") - t.Setenv("SOURCE_AWS_BUCKET_NAME", "my-bucket") } diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index fc697ed6..641fe0c8 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -1,14 +1,17 @@ package flypg import ( + "os" "testing" + + "github.com/fly-apps/postgres-flex/internal/utils" ) func TestNewBarman(t *testing.T) { t.Run("defaults", func(t *testing.T) { setDefaultEnv(t) - barman, err := NewBarman(true) + barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -21,10 +24,14 @@ func TestNewBarman(t *testing.T) { t.Fatalf("expected endpoint to be https://fly.storage.tigris.dev, but got %s", barman.endpoint) } - if barman.bucket != "s3://my-bucket" { + if barman.bucket != "my-bucket" { t.Fatalf("expected bucket to be my-bucket, but got %s", barman.bucket) } + if barman.Bucket() != "s3://my-bucket" { + t.Fatalf("expected bucket to be s3://my-bucket, but got %s", barman.bucket) + } + if barman.appName != "postgres-flex" { t.Fatalf("expected appName to be postgres-flex, but got %s", barman.appName) } @@ -37,82 +44,118 @@ func TestNewBarman(t *testing.T) { if barman.retentionDays != "7" { t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) } - }) +} - t.Run("custom-retention", func(t *testing.T) { - setDefaultEnv(t) - t.Setenv("CLOUD_ARCHIVING_RETENTION_DAYS", "30") +func TestWriteAWSCredentials(t *testing.T) { + setup(t) + defer cleanup() - barman, err := NewBarman(true) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if barman.retentionDays != "30" { - t.Fatalf("expected retentionDays to be 30, but got %s", barman.retentionDays) - } - }) - - t.Run("custom-min-redundancy", func(t *testing.T) { + t.Run("write-aws-credentials", func(t *testing.T) { setDefaultEnv(t) - t.Setenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "7") - barman, err := NewBarman(true) + barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) if err != nil { t.Fatalf("unexpected error: %v", err) } - if barman.minimumRedundancy != "7" { - t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) - } - }) -} - -func TestValidateBarmanRequirements(t *testing.T) { - b, _ := NewBarman(false) - t.Run("missing-aws-access-key", func(t *testing.T) { - err := b.ValidateRequiredEnv() + credFile := "./test_results/credentials" - if err.Error() != "AWS_ACCESS_KEY_ID secret must be set" { - t.Fatalf("expected error to be 'AWS_ACCESS_KEY_ID secret must be set', but got %s", err.Error()) + if err := barman.writeAWSCredentials("default", credFile); err != nil { + t.Fatalf("unexpected error: %v", err) } - }) - t.Run("missing-aws-secret-access-key", func(t *testing.T) { - setDefaultEnv(t) - 
t.Setenv("AWS_SECRET_ACCESS_KEY", "") - err := b.ValidateRequiredEnv() - - if err.Error() != "AWS_SECRET_ACCESS_KEY secret must be set" { - t.Fatalf("expected error to be 'AWS_SECRET_ACCESS_KEY secret must be set', but got %s", err.Error()) + if !utils.FileExists(credFile) { + t.Fatalf("expected %s to exist, but doesn't", credFile) } - }) - t.Run("missing-aws-bucket-name", func(t *testing.T) { - setDefaultEnv(t) - t.Setenv("AWS_BUCKET_NAME", "") - err := b.ValidateRequiredEnv() - - if err.Error() != "AWS_BUCKET_NAME envvar must be set" { - t.Fatalf("expected error to be 'AWS_BUCKET_NAME envvar must be set', but got %s", err.Error()) + // Check contents + contents, err := os.ReadFile(credFile) + if err != nil { + t.Fatalf("unexpected error: %v", err) } - }) - t.Run("missing-aws-endpoint-url", func(t *testing.T) { - setDefaultEnv(t) - t.Setenv("AWS_ENDPOINT_URL_S3", "") - err := b.ValidateRequiredEnv() + expected := "[default]\naws_access_key_id=my-key\naws_secret_access_key=my-secret" - if err.Error() != "AWS_ENDPOINT_URL_S3 envvar must be set" { - t.Fatalf("expected error to be 'AWS_ENDPOINT_URL_S3 envvar must be set', but got %s", err.Error()) + if string(contents) != expected { + t.Fatalf("expected contents to be %s, but got %s", expected, string(contents)) } }) } +// t.Run("custom-retention", func(t *testing.T) { +// setDefaultEnv(t) +// t.Setenv("CLOUD_ARCHIVING_RETENTION_DAYS", "30") + +// barman, err := NewBarman(true) +// if err != nil { +// t.Fatalf("unexpected error: %v", err) +// } +// if barman.retentionDays != "30" { +// t.Fatalf("expected retentionDays to be 30, but got %s", barman.retentionDays) +// } +// }) + +// t.Run("custom-min-redundancy", func(t *testing.T) { +// setDefaultEnv(t) +// t.Setenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "7") + +// barman, err := NewBarman(true) +// if err != nil { +// t.Fatalf("unexpected error: %v", err) +// } +// if barman.minimumRedundancy != "7" { +// t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) +// } +// }) +// } + +// func TestValidateBarmanRequirements(t *testing.T) { +// b, _ := NewBarman(false) + +// t.Run("missing-aws-access-key", func(t *testing.T) { +// err := b.ValidateRequiredEnv() + +// if err.Error() != "AWS_ACCESS_KEY_ID secret must be set" { +// t.Fatalf("expected error to be 'AWS_ACCESS_KEY_ID secret must be set', but got %s", err.Error()) +// } +// }) + +// t.Run("missing-aws-secret-access-key", func(t *testing.T) { +// setDefaultEnv(t) +// t.Setenv("AWS_SECRET_ACCESS_KEY", "") +// err := b.ValidateRequiredEnv() + +// if err.Error() != "AWS_SECRET_ACCESS_KEY secret must be set" { +// t.Fatalf("expected error to be 'AWS_SECRET_ACCESS_KEY secret must be set', but got %s", err.Error()) +// } +// }) + +// t.Run("missing-aws-bucket-name", func(t *testing.T) { +// setDefaultEnv(t) +// t.Setenv("AWS_BUCKET_NAME", "") +// err := b.ValidateRequiredEnv() + +// if err.Error() != "AWS_BUCKET_NAME envvar must be set" { +// t.Fatalf("expected error to be 'AWS_BUCKET_NAME envvar must be set', but got %s", err.Error()) +// } +// }) + +// t.Run("missing-aws-endpoint-url", func(t *testing.T) { +// setDefaultEnv(t) +// t.Setenv("AWS_ENDPOINT_URL_S3", "") +// err := b.ValidateRequiredEnv() + +// if err.Error() != "AWS_ENDPOINT_URL_S3 envvar must be set" { +// t.Fatalf("expected error to be 'AWS_ENDPOINT_URL_S3 envvar must be set', but got %s", err.Error()) +// } +// }) +// } + func TestBarmanRetentionPolicy(t *testing.T) { setDefaultEnv(t) - barman, err := NewBarman(true) + barman, err := 
NewBarman(os.Getenv("BARMAN_ENABLED")) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -123,10 +166,7 @@ func TestBarmanRetentionPolicy(t *testing.T) { } func setDefaultEnv(t *testing.T) { - t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket") t.Setenv("FLY_APP_NAME", "postgres-flex") - t.Setenv("AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") - t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") - t.Setenv("AWS_BUCKET_NAME", "my-bucket") + } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 8e7ebaac..0f2252e7 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -15,6 +15,7 @@ import ( "github.com/fly-apps/postgres-flex/internal/flypg/admin" "github.com/fly-apps/postgres-flex/internal/flypg/state" "github.com/fly-apps/postgres-flex/internal/privnet" + "github.com/fly-apps/postgres-flex/internal/supervisor" "github.com/fly-apps/postgres-flex/internal/utils" "github.com/jackc/pgx/v5" ) @@ -138,19 +139,58 @@ func (n *Node) Init(ctx context.Context) error { } } + store, err := state.NewStore() + if err != nil { + return fmt.Errorf("failed initialize cluster state store: %s", err) + } + // Remote point-in-time restore. - if os.Getenv("CLOUD_ARCHIVING_REMOTE_RESTORE") != "" { + if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { // TODO - We may need to check for the restore lock her - restore, err := NewBarmanRestore() + restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) if err != nil { return fmt.Errorf("failed to initialize barman restore: %s", err) } + // Write the source AWS credentials + if err := restore.writeAWSCredentials("default", awsCredentialsPath); err != nil { + return fmt.Errorf("failed to write aws credentials: %s", err) + } + + log.Println("Restoring base backup") if err := restore.Restore(ctx); err != nil { return fmt.Errorf("failed to restore from PIT: %s", err) } + // Ensure PG is properly configured before we boot. + if err := n.PGConfig.initialize(store); err != nil { + return fmt.Errorf("failed to initialize pg config: %s", err) + } + + log.Printf("Starting supervisor process to replay WAL\n") + // Start the postgres process and wait for it to exit. + svisor := supervisor.New("flypg", 5*time.Minute) + + args := []string{"-D", n.DataDir} + + svisor.AddProcess("postgres", fmt.Sprintf("gosu postgres postgres %s", strings.Join(args, " "))) + + if err := svisor.Run(); err != nil { + return fmt.Errorf("postgres replay process failed with: %s", err) + } + + log.Printf("Supervisor finished!\n") + + if err := os.Remove("/data/postgresql/recovery.signal"); err != nil { + return fmt.Errorf("failed to remove recovery.signal: %s", err) + } + + os.Unsetenv("BARMAN_REMOTE_RESTORE") + + // utils.RunCmd(ctx, "postgres", "postgres", args...) 
+ + log.Println("Resetting environment for restore") if err := prepareRemoteRestore(ctx, n); err != nil { return fmt.Errorf("failed to issue restore: %s", err) } @@ -163,16 +203,10 @@ func (n *Node) Init(ctx context.Context) error { } } - err := WriteSSHKey() - if err != nil { + if err = WriteSSHKey(); err != nil { return fmt.Errorf("failed write ssh keys: %s", err) } - store, err := state.NewStore() - if err != nil { - return fmt.Errorf("failed initialize cluster state store: %s", err) - } - if err := n.RepMgr.initialize(); err != nil { return fmt.Errorf("failed to initialize repmgr: %s", err) } diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 8660ea40..71101097 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -173,33 +173,49 @@ func (c *PGConfig) SetDefaults() error { "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), } - switch strings.ToLower(os.Getenv("CLOUD_ARCHIVING_ENABLED")) { - case "true": - barman, err := NewBarman(true) + if os.Getenv("BARMAN_ENABLED") != "" { + barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) switch { case err != nil: log.Printf("[WARN] Failed to initialize barman: %s", err) default: + if err := barman.writeAWSCredentials("default", awsCredentialsPath); err != nil { + log.Printf("[WARN] Failed to write AWS credentials: %s", err) + } + c.internalConfig["archive_mode"] = "on" c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) } - case "false": + } else { c.internalConfig["archive_mode"] = "off" - c.internalConfig["archive_command"] = "" - default: - // Noop } - if os.Getenv("CLOUD_ARCHIVING_WAL_RESTORE") != "" { - barmanRestore, err := NewBarmanRestore() + if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + barmanRestore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) if err != nil { return err } + // At most one of: + // recovery_target, + // recovery_target_lsn, + // recovery_target_name, + // recovery_target_time, + // recovery_target_xid can be used; + // if more than one of these is specified in the configuration file, an error will be raised + c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) - c.internalConfig["recovery_target"] = barmanRestore.recoveryTarget - c.internalConfig["recovery_target_timeline"] = barmanRestore.recoveryTargetTimeline - c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction + c.internalConfig["recovery_target_action"] = "shutdown" + // c.internalConfig["recovery_target_timeline"] = "latest" + + switch { + case barmanRestore.recoveryTargetName != "": + c.internalConfig["recovery_target_name"] = fmt.Sprintf("barman_%s", barmanRestore.recoveryTargetName) + case barmanRestore.recoveryTargetTime != "": + c.internalConfig["recovery_target_time"] = "2024-06-27 20:24:34 UTC" + default: + return errors.New("recovery target name or time must be specified") + } // Write the recovery.signal file if err := os.WriteFile("/data/postgresql/recovery.signal", []byte(""), 0600); err != nil { diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index 4306bfa7..1149e303 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -115,11 +115,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("cloud-archiving", func(t *testing.T) { - t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") - t.Setenv("AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") - t.Setenv("AWS_SECRET_ACCESS_KEY", 
"my-secret") - t.Setenv("AWS_BUCKET_NAME", "my-bucket") + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket") store, _ := state.NewStore() @@ -136,7 +132,10 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - barman, _ := NewBarman(false) + barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) + if err != nil { + t.Fatal(err) + } expected := fmt.Sprintf("'%s'", barman.walArchiveCommand()) if cfg["archive_command"] != expected { t.Fatalf("expected %s, got %s", expected, cfg["archive_command"]) @@ -144,12 +143,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("cloud-archiving-disabled", func(t *testing.T) { - t.Setenv("CLOUD_ARCHIVING_ENABLED", "true") - t.Setenv("AWS_ACCESS_KEY_ID", "my-key") - t.Setenv("AWS_ENDPOINT_URL_S3", "https://fly.storage.tigris.dev") - t.Setenv("AWS_SECRET_ACCESS_KEY", "my-secret") - t.Setenv("AWS_BUCKET_NAME", "my-bucket") - + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { @@ -165,7 +159,7 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - t.Setenv("CLOUD_ARCHIVING_ENABLED", "false") + t.Setenv("BARMAN_ENABLED", "") if err := pgConf.initialize(store); err != nil { t.Fatal(err) diff --git a/internal/flypg/restore.go b/internal/flypg/restore.go index 43d76c13..19c928f9 100644 --- a/internal/flypg/restore.go +++ b/internal/flypg/restore.go @@ -195,8 +195,8 @@ func openConn(ctx context.Context, n *Node) (*pgx.Conn, error) { } conf.User = "postgres" - // Allow up to 30 seconds for PG to boot and accept connections. - timeout := time.After(30 * time.Second) + // Allow up to 60 seconds for PG to boot and accept connections. + timeout := time.After(60 * time.Second) tick := time.NewTicker(1 * time.Second) defer tick.Stop() for { From f5e13539d9557b74529c5a2f500b9130063acfe7 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Fri, 28 Jun 2024 21:57:57 -0500 Subject: [PATCH 16/51] cleanup --- internal/flypg/barman_restore.go | 101 ++++++++++++++++++++++++++++++- internal/flypg/node.go | 65 +++++++++----------- internal/flypg/pg.go | 11 +--- internal/flypg/restore.go | 10 ++- 4 files changed, 136 insertions(+), 51 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index e261e17f..60f5f432 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -3,8 +3,12 @@ package flypg import ( "context" "fmt" + "log" "net/url" + "os" "time" + + "github.com/fly-apps/postgres-flex/internal/supervisor" ) type BarmanRestore struct { @@ -52,7 +56,73 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { return restore, nil } -func (b *BarmanRestore) Restore(ctx context.Context) error { +func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error { + + // create a copy of the pg_hba.conf file so we can revert back to it when needed. + if err := backupHBAFile(); err != nil { + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("failed backing up pg_hba.conf: %s", err) + } + + // Grant local access so we can update internal credentials + // to match the environment. 
+ if err := grantLocalAccess(); err != nil { + return fmt.Errorf("failed to grant local access: %s", err) + } + + // Boot postgres and wait for WAL replay to complete + svisor := supervisor.New("flypg", 5*time.Minute) + svisor.AddProcess("postgres", fmt.Sprintf("gosu postgres postgres -D /data/postgresql -p 5433 -h %s", node.PrivateIP)) + + // Start the postgres process in the background. + go func() { + if err := svisor.Run(); err != nil { + log.Printf("[ERROR] failed to boot postgres in the background: %s", err) + } + }() + + // Wait for the WAL replay to complete. + if err := b.waitOnRecoveryMode(ctx, node.PrivateIP); err != nil { + return fmt.Errorf("failed to monitor recovery mode: %s", err) + } + + // Open read/write connection + conn, err := openConn(ctx, node.PrivateIP, false) + if err != nil { + return fmt.Errorf("failed to establish connection to local node: %s", err) + } + defer func() { _ = conn.Close(ctx) }() + + // Drop repmgr database to clear any metadata that belonged to the old cluster. + _, err = conn.Exec(ctx, "DROP DATABASE repmgr;") + if err != nil { + return fmt.Errorf("failed to drop repmgr database: %s", err) + } + + // Ensure auth is configured to match the environment. + if err := node.setupCredentials(ctx, conn); err != nil { + return fmt.Errorf("failed creating required users: %s", err) + } + + // Stop the postgres process + svisor.Stop() + + // Revert back to the original config file + if err := restoreHBAFile(); err != nil { + return fmt.Errorf("failed to restore original pg_hba.conf: %s", err) + } + + if err := conn.Close(ctx); err != nil { + return fmt.Errorf("failed to close connection: %s", err) + } + + return nil + +} + +func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { // Query available backups from object storage backups, err := b.ListBackups(ctx) if err != nil { @@ -174,3 +244,32 @@ func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr return latestBackupID, nil } + +func (b *BarmanRestore) waitOnRecoveryMode(ctx context.Context, privateIP string) error { + conn, err := openConn(ctx, privateIP, true) + if err != nil { + return fmt.Errorf("failed to establish connection to local node: %s", err) + } + defer func() { _ = conn.Close(ctx) }() + + timeout := time.After(10 * time.Minute) + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timeout: + return fmt.Errorf("timed out waiting for PG to exit recovery mode") + case <-ticker.C: + var inRecovery bool + if err := conn.QueryRow(ctx, "SELECT pg_is_in_recovery();").Scan(&inRecovery); err != nil { + return fmt.Errorf("failed to check recovery status: %w", err) + } + if !inRecovery { + return nil + } + } + } +} diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 0f2252e7..71587d10 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -15,7 +15,6 @@ import ( "github.com/fly-apps/postgres-flex/internal/flypg/admin" "github.com/fly-apps/postgres-flex/internal/flypg/state" "github.com/fly-apps/postgres-flex/internal/privnet" - "github.com/fly-apps/postgres-flex/internal/supervisor" "github.com/fly-apps/postgres-flex/internal/utils" "github.com/jackc/pgx/v5" ) @@ -146,54 +145,46 @@ func (n *Node) Init(ctx context.Context) error { // Remote point-in-time restore. 
if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { - // TODO - We may need to check for the restore lock her - - restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + active, err := isRestoreActive() if err != nil { - return fmt.Errorf("failed to initialize barman restore: %s", err) - } - - // Write the source AWS credentials - if err := restore.writeAWSCredentials("default", awsCredentialsPath); err != nil { - return fmt.Errorf("failed to write aws credentials: %s", err) + return fmt.Errorf("failed to verify active restore: %s", err) } - log.Println("Restoring base backup") - if err := restore.Restore(ctx); err != nil { - return fmt.Errorf("failed to restore from PIT: %s", err) - } + if active { + restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + if err != nil { + return fmt.Errorf("failed to initialize barman restore: %s", err) + } - // Ensure PG is properly configured before we boot. - if err := n.PGConfig.initialize(store); err != nil { - return fmt.Errorf("failed to initialize pg config: %s", err) - } + // Write the source AWS credentials + if err := restore.writeAWSCredentials("default", awsCredentialsPath); err != nil { + return fmt.Errorf("failed to write aws credentials: %s", err) + } - log.Printf("Starting supervisor process to replay WAL\n") - // Start the postgres process and wait for it to exit. - svisor := supervisor.New("flypg", 5*time.Minute) + log.Println("Restoring base backup") + if err := restore.RestoreFromBackup(ctx); err != nil { + return fmt.Errorf("failed to restore base backup: %s", err) + } - args := []string{"-D", n.DataDir} + // Set restore configuration + if err := n.PGConfig.initialize(store); err != nil { + return fmt.Errorf("failed to initialize pg config: %s", err) + } - svisor.AddProcess("postgres", fmt.Sprintf("gosu postgres postgres %s", strings.Join(args, " "))) + if err := restore.WALReplayAndReset(ctx, n); err != nil { + return fmt.Errorf("failed to replay WAL: %s", err) + } - if err := svisor.Run(); err != nil { - return fmt.Errorf("postgres replay process failed with: %s", err) - } + os.Remove("/data/postgresql/recovery.signal") - log.Printf("Supervisor finished!\n") + os.Unsetenv("BARMAN_REMOTE_RESTORE") - if err := os.Remove("/data/postgresql/recovery.signal"); err != nil { - return fmt.Errorf("failed to remove recovery.signal: %s", err) + // Set the lock file so the init process knows not to restart the restore process. + if err := setRestoreLock(); err != nil { + return fmt.Errorf("failed to set restore lock: %s", err) + } } - os.Unsetenv("BARMAN_REMOTE_RESTORE") - - // utils.RunCmd(ctx, "postgres", "postgres", args...) - - log.Println("Resetting environment for restore") - if err := prepareRemoteRestore(ctx, n); err != nil { - return fmt.Errorf("failed to issue restore: %s", err) - } } // Verify whether we are a booting zombie. 
diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 71101097..e576dea5 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -196,17 +196,8 @@ func (c *PGConfig) SetDefaults() error { return err } - // At most one of: - // recovery_target, - // recovery_target_lsn, - // recovery_target_name, - // recovery_target_time, - // recovery_target_xid can be used; - // if more than one of these is specified in the configuration file, an error will be raised - c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) - c.internalConfig["recovery_target_action"] = "shutdown" - // c.internalConfig["recovery_target_timeline"] = "latest" + c.internalConfig["recovery_target_action"] = "promote" switch { case barmanRestore.recoveryTargetName != "": diff --git a/internal/flypg/restore.go b/internal/flypg/restore.go index 19c928f9..d27c4d49 100644 --- a/internal/flypg/restore.go +++ b/internal/flypg/restore.go @@ -61,7 +61,7 @@ func prepareRemoteRestore(ctx context.Context, node *Node) error { } }() - conn, err := openConn(ctx, node) + conn, err := openConn(ctx, node.PrivateIP, false) if err != nil { return fmt.Errorf("failed to establish connection to local node: %s", err) } @@ -187,8 +187,12 @@ func setRestoreLock() error { return file.Sync() } -func openConn(ctx context.Context, n *Node) (*pgx.Conn, error) { - url := fmt.Sprintf("postgres://[%s]:5433?target_session_attrs=any", n.PrivateIP) +func openConn(ctx context.Context, privateIP string, readOnly bool) (*pgx.Conn, error) { + url := fmt.Sprintf("postgres://[%s]:5433?target_session_attrs=any", privateIP) + if readOnly { + url += "&default_transaction_read_only=on" + } + conf, err := pgx.ParseConfig(url) if err != nil { return nil, err From c1b27e50598b0dabf32e40569184dbc409707dca Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Sat, 29 Jun 2024 21:04:14 -0500 Subject: [PATCH 17/51] cleanup --- internal/flypg/barman.go | 4 ++-- internal/flypg/barman_restore.go | 40 ++++++++++++++++++++++---------- internal/flypg/node.go | 3 --- internal/flypg/pg.go | 7 +++--- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 527aedce..fd611ed2 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -114,14 +114,14 @@ func (b *Barman) Backup(ctx context.Context, immediateCheckpoint bool) ([]byte, } // RestoreBackup returns the command string used to restore a base backup. -func (b *Barman) RestoreBackup(ctx context.Context, backupID, recoveryDir string) ([]byte, error) { +func (b *Barman) RestoreBackup(ctx context.Context, backupID string) ([]byte, error) { args := []string{ "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, b.Bucket(), b.bucket, backupID, - recoveryDir, + defaultRestoreDir, } return utils.RunCmd(ctx, "postgres", "barman-cloud-restore", args...) 
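With this change RestoreBackup always recovers into the default data directory rather than a caller-supplied path. For orientation, the args slice assembled above corresponds to a command of roughly the following shape; the endpoint, bucket, and backup ID are the sample fixtures used by this test suite, not real resources:

// Illustration only: mirrors the argument order assembled in RestoreBackup above.
const exampleRestoreCmd = "barman-cloud-restore" +
	" --cloud-provider aws-s3" +
	" --endpoint-url https://fly.storage.tigris.dev" +
	" s3://my-bucket" + // source bucket URL
	" my-bucket" + // server name (the bucket name, at this point in the series)
	" 20240626T172443" + // resolved backup ID
	" /data/postgresql" // defaultRestoreDir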
diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 60f5f432..692d1d06 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -14,8 +14,11 @@ import ( type BarmanRestore struct { *Barman - recoveryTargetName string - recoveryTargetTime string + recoveryTarget string + recoveryTargetName string + recoveryTargetTime string + recoveryTargetAction string + recoveryTargetInclusive bool } const ( @@ -40,24 +43,33 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { // evaluate the query parameters for key, value := range url.Query() { switch key { + case "target": + restore.recoveryTarget = value[0] case "targetName": restore.recoveryTargetName = value[0] case "targetTime": restore.recoveryTargetTime = value[0] + case "targetInclusive": + restore.recoveryTargetInclusive = value[0] == "true" + case "targetAction": + restore.recoveryTargetAction = value[0] default: return nil, fmt.Errorf("unknown query parameter: %s", key) } } - if restore.recoveryTargetName == "" && restore.recoveryTargetTime == "" { - return nil, fmt.Errorf("restore target not specified") + if restore.recoveryTargetAction == "" { + restore.recoveryTargetAction = "promote" + } + + if restore.recoveryTargetName == "" && restore.recoveryTargetTime == "" && restore.recoveryTarget == "" { + return nil, fmt.Errorf("no restore target not specified") } return restore, nil } func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error { - // create a copy of the pg_hba.conf file so we can revert back to it when needed. if err := backupHBAFile(); err != nil { if os.IsNotExist(err) { @@ -84,10 +96,12 @@ func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error }() // Wait for the WAL replay to complete. 
- if err := b.waitOnRecoveryMode(ctx, node.PrivateIP); err != nil { + if err := b.waitOnRecovery(ctx, node.PrivateIP); err != nil { return fmt.Errorf("failed to monitor recovery mode: %s", err) } + // os.Remove("/data/postgresql/recovery.signal") + // Open read/write connection conn, err := openConn(ctx, node.PrivateIP, false) if err != nil { @@ -138,18 +152,20 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { switch { case b.recoveryTargetName != "": // Resolve the target base backup - - // TODO - the id needs to be prefixed with barman_ backupID, err = b.resolveBackupFromID(backups, b.recoveryTargetName) if err != nil { return fmt.Errorf("failed to resolve backup target by id: %s", err) } case b.recoveryTargetTime != "": - // Resolve the target base backup backupID, err = b.resolveBackupFromTime(backups, b.recoveryTargetTime) if err != nil { return fmt.Errorf("failed to resolve backup target by time: %s", err) } + case b.recoveryTarget != "": + backupID, err = b.resolveBackupFromTime(backups, time.Now().String()) + if err != nil { + return fmt.Errorf("failed to resolve backup target by time: %s", err) + } default: return fmt.Errorf("restore target not specified") } @@ -160,7 +176,7 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { } // Download and restore the base backup - if _, err := b.RestoreBackup(ctx, backupID, defaultRestoreDir); err != nil { + if _, err := b.RestoreBackup(ctx, backupID); err != nil { return fmt.Errorf("failed to restore backup: %s", err) } @@ -245,8 +261,8 @@ func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr return latestBackupID, nil } -func (b *BarmanRestore) waitOnRecoveryMode(ctx context.Context, privateIP string) error { - conn, err := openConn(ctx, privateIP, true) +func (b *BarmanRestore) waitOnRecovery(ctx context.Context, privateIP string) error { + conn, err := openConn(ctx, privateIP, false) if err != nil { return fmt.Errorf("failed to establish connection to local node: %s", err) } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 71587d10..f3876e61 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -175,8 +175,6 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to replay WAL: %s", err) } - os.Remove("/data/postgresql/recovery.signal") - os.Unsetenv("BARMAN_REMOTE_RESTORE") // Set the lock file so the init process knows not to restart the restore process. @@ -184,7 +182,6 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to set restore lock: %s", err) } } - } // Verify whether we are a booting zombie. 
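The new query parameters parsed above (target, targetName, targetTime, targetInclusive, targetAction) feed directly into the recovery settings written by SetDefaults in the pg.go hunk that follows. A minimal sketch with a hypothetical restore URL, reusing the test fixture host and bucket:

// Hypothetical example; the target name matches the backup ID fixtures used in the tests.
_ = os.Setenv("BARMAN_REMOTE_RESTORE",
	"https://my-key:my-secret@fly.storage.tigris.dev/my-bucket?targetName=20240626T172443")

// After PGConfig.SetDefaults() the internal config would then carry, roughly:
//   restore_command        = 'barman-cloud-wal-restore ... s3://my-bucket my-bucket %f %p'
//   recovery_target_name   = barman_20240626T172443   (flypg adds the barman_ prefix)
//   recovery_target_action = promote                  (the default applied above)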
diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index e576dea5..7732df78 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -182,7 +182,6 @@ func (c *PGConfig) SetDefaults() error { if err := barman.writeAWSCredentials("default", awsCredentialsPath); err != nil { log.Printf("[WARN] Failed to write AWS credentials: %s", err) } - c.internalConfig["archive_mode"] = "on" c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) } @@ -197,13 +196,15 @@ func (c *PGConfig) SetDefaults() error { } c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) - c.internalConfig["recovery_target_action"] = "promote" + c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction switch { + case barmanRestore.recoveryTarget != "": + c.internalConfig["recovery_target"] = barmanRestore.recoveryTarget case barmanRestore.recoveryTargetName != "": c.internalConfig["recovery_target_name"] = fmt.Sprintf("barman_%s", barmanRestore.recoveryTargetName) case barmanRestore.recoveryTargetTime != "": - c.internalConfig["recovery_target_time"] = "2024-06-27 20:24:34 UTC" + c.internalConfig["recovery_target_time"] = barmanRestore.recoveryTargetTime default: return errors.New("recovery target name or time must be specified") } From a3e3ceda7fb93a0dcd1ec82bd1edb65a8736ce92 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Sat, 29 Jun 2024 21:06:49 -0500 Subject: [PATCH 18/51] Didn't mean to remove this --- cmd/start/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/start/main.go b/cmd/start/main.go index 382ebbae..abe8a8c3 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -38,6 +38,7 @@ func main() { } svisor := supervisor.New("flybarman", 1*time.Minute) + svisor.AddProcess("cron", "/usr/sbin/cron -f", supervisor.WithRestart(0, 5*time.Second)) svisor.AddProcess("barman", fmt.Sprintf("tail -f %s", node.LogFile)) svisor.AddProcess("admin", "/usr/local/bin/start_admin_server", supervisor.WithRestart(0, 5*time.Second), From f431edbfd8969c694481dac301dd8e2f4117ee6e Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Sat, 29 Jun 2024 21:25:51 -0500 Subject: [PATCH 19/51] Set archive_timeout --- internal/flypg/pg.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 7732df78..e88f476c 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -184,6 +184,10 @@ func (c *PGConfig) SetDefaults() error { } c.internalConfig["archive_mode"] = "on" c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) + // TODO - Make this configurable + // This controls the minimum frequency WAL files are archived to barman + // in the event there is database activity. 
+ c.internalConfig["archive_timeout"] = "60s" } } else { c.internalConfig["archive_mode"] = "off" From 383da388a122a6794c14cde04c45ad8ab5f32e50 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Sun, 30 Jun 2024 19:44:01 -0500 Subject: [PATCH 20/51] More cleanup --- internal/flypg/barman.go | 61 +++++++++++++++------------ internal/flypg/barman_restore.go | 11 ++--- internal/flypg/barman_restore_test.go | 8 +++- internal/flypg/barman_test.go | 24 ++++++++++- internal/flypg/pg_test.go | 4 +- 5 files changed, 71 insertions(+), 37 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index fd611ed2..bcc428fc 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -20,11 +20,13 @@ const ( ) type Barman struct { - appName string - provider string - endpoint string - bucket string + appName string + provider string + endpoint string + bucket string + bucketDirectory string + // TODO - This was for convenience, but should probably be removed. configURL string retentionDays string @@ -45,7 +47,7 @@ type BackupList struct { // NewBarman creates a new Barman instance. // The configURL is expected to be in the format: -// barman://access-key:secret-key@endpoint/bucket +// https://s3-access-key:s3-secret-key@s3-endpoint/bucket/directory func NewBarman(configURL string) (*Barman, error) { parsedURL, err := url.Parse(configURL) if err != nil { @@ -57,9 +59,14 @@ func NewBarman(configURL string) (*Barman, error) { return nil, fmt.Errorf("object storage endpoint missing") } - bucket := strings.TrimLeft(parsedURL.Path, "/") - if bucket == "" { - return nil, fmt.Errorf("bucket name is missing") + path := strings.TrimLeft(parsedURL.Path, "/") + if path == "" { + return nil, fmt.Errorf("bucket and directory missing") + } + + pathSlice := strings.Split(path, "/") + if len(pathSlice) != 2 { + return nil, fmt.Errorf("invalid bucket and directory format") } // Extract user information for credentials (not used here but necessary for the complete parsing) @@ -72,17 +79,19 @@ func NewBarman(configURL string) (*Barman, error) { } // Set the environment variable within the Go process + // TODO - We need to find a better way to do this. 
if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { fmt.Printf("Error setting environment variable: %s\n", err) return nil, err } return &Barman{ - appName: os.Getenv("FLY_APP_NAME"), - provider: providerDefault, - endpoint: fmt.Sprintf("https://%s", endpoint), - bucket: bucket, - configURL: configURL, + appName: os.Getenv("FLY_APP_NAME"), + provider: providerDefault, + endpoint: fmt.Sprintf("https://%s", endpoint), + bucket: pathSlice[0], + bucketDirectory: pathSlice[1], + configURL: configURL, retentionDays: "7", minimumRedundancy: "3", @@ -90,7 +99,7 @@ func NewBarman(configURL string) (*Barman, error) { } -func (b *Barman) Bucket() string { +func (b *Barman) BucketURL() string { return fmt.Sprintf("s3://%s", b.bucket) } @@ -102,8 +111,8 @@ func (b *Barman) Backup(ctx context.Context, immediateCheckpoint bool) ([]byte, "--endpoint-url", b.endpoint, "--host", fmt.Sprintf("%s.internal", b.appName), "--user", "repmgr", - b.Bucket(), - b.appName, + b.BucketURL(), + b.bucketDirectory, } if immediateCheckpoint { @@ -118,8 +127,8 @@ func (b *Barman) RestoreBackup(ctx context.Context, backupID string) ([]byte, er args := []string{ "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, - b.Bucket(), - b.bucket, + b.BucketURL(), + b.bucketDirectory, backupID, defaultRestoreDir, } @@ -132,8 +141,8 @@ func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, "--format", "json", - b.Bucket(), - b.bucket, + b.BucketURL(), + b.bucketDirectory, } backupsBytes, err := utils.RunCmd(ctx, "postgres", "barman-cloud-backup-list", args...) @@ -150,8 +159,8 @@ func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { "--endpoint-url", b.endpoint, "--retention", b.RetentionPolicy(), "--minimum-redundancy", b.minimumRedundancy, - b.Bucket(), - b.appName, + b.BucketURL(), + b.bucketDirectory, } return utils.RunCmd(ctx, "postgres", "barman-cloud-backup-delete", args...) 
@@ -213,8 +222,8 @@ func (b *Barman) walArchiveCommand() string { return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s %s %s %%p", b.provider, b.endpoint, - b.Bucket(), - b.bucket, + b.BucketURL(), + b.bucketDirectory, ) } @@ -224,8 +233,8 @@ func (b *Barman) walRestoreCommand() string { return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s %s %s %%f %%p", b.provider, b.endpoint, - b.Bucket(), - b.bucket, + b.BucketURL(), + b.bucketDirectory, ) } diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 692d1d06..3d738ee6 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -150,6 +150,11 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { var backupID string switch { + case b.recoveryTarget != "": + backupID, err = b.resolveBackupFromTime(backups, time.Now().String()) + if err != nil { + return fmt.Errorf("failed to resolve backup target by time: %s", err) + } case b.recoveryTargetName != "": // Resolve the target base backup backupID, err = b.resolveBackupFromID(backups, b.recoveryTargetName) @@ -161,11 +166,6 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { if err != nil { return fmt.Errorf("failed to resolve backup target by time: %s", err) } - case b.recoveryTarget != "": - backupID, err = b.resolveBackupFromTime(backups, time.Now().String()) - if err != nil { - return fmt.Errorf("failed to resolve backup target by time: %s", err) - } default: return fmt.Errorf("restore target not specified") } @@ -268,6 +268,7 @@ func (b *BarmanRestore) waitOnRecovery(ctx context.Context, privateIP string) er } defer func() { _ = conn.Close(ctx) }() + // TODO - Figure out a more reasonable timeout to use here. 
timeout := time.After(10 * time.Minute) ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index 9dd18b24..f96f0b33 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -117,10 +117,14 @@ func TestNewBarmanRestore(t *testing.T) { t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) } - if restore.Bucket() != "s3://my-bucket" { + if restore.BucketURL() != "s3://my-bucket" { t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) } + if restore.bucketDirectory != "my-directory" { + t.Fatalf("expected bucket directory to be my-directory, got %s", restore.bucketDirectory) + } + if restore.appName != "postgres-flex" { t.Fatalf("expected app name to be postgres-flex, got %s", restore.appName) } @@ -253,6 +257,6 @@ func TestResolveBackupTarget(t *testing.T) { } func setRestoreDefaultEnv(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket?targetTime=1212121212") + t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=1212121212") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index 641fe0c8..70208d1f 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -1,6 +1,7 @@ package flypg import ( + "fmt" "os" "testing" @@ -28,10 +29,14 @@ func TestNewBarman(t *testing.T) { t.Fatalf("expected bucket to be my-bucket, but got %s", barman.bucket) } - if barman.Bucket() != "s3://my-bucket" { + if barman.BucketURL() != "s3://my-bucket" { t.Fatalf("expected bucket to be s3://my-bucket, but got %s", barman.bucket) } + if barman.bucketDirectory != "my-directory" { + t.Fatalf("expected directory to be my-directory, but got %s", barman.bucketDirectory) + } + if barman.appName != "postgres-flex" { t.Fatalf("expected appName to be postgres-flex, but got %s", barman.appName) } @@ -47,6 +52,21 @@ func TestNewBarman(t *testing.T) { }) } +func TestWALRestoreCommand(t *testing.T) { + setDefaultEnv(t) + + barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + expected := fmt.Sprintf("barman-cloud-wal-restore --cloud-provider aws-s3 --endpoint-url https://fly.storage.tigris.dev s3://my-bucket my-directory %%f %%p") + + if barman.walRestoreCommand() != expected { + t.Fatalf("expected WALRestoreCommand to be %s, but got %s", expected, barman.walRestoreCommand()) + } +} + func TestWriteAWSCredentials(t *testing.T) { setup(t) defer cleanup() @@ -166,7 +186,7 @@ func TestBarmanRetentionPolicy(t *testing.T) { } func setDefaultEnv(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket") + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index 1149e303..63f7a823 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -115,7 +115,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("cloud-archiving", func(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket") + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() @@ 
-143,7 +143,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("cloud-archiving-disabled", func(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket") + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { From b17193fe00d6ae3cff8ff30dd54d838f50da95f0 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Sun, 30 Jun 2024 20:32:57 -0500 Subject: [PATCH 21/51] fixes and cleanup --- cmd/start/main.go | 6 ++++++ internal/flypg/barman.go | 7 ------- internal/flypg/barman_restore.go | 8 ++++---- internal/flypg/pg.go | 6 +++++- internal/flypg/restore.go | 7 ++----- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cmd/start/main.go b/cmd/start/main.go index abe8a8c3..9f4f746a 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -54,6 +54,12 @@ func main() { return } + // Set the environment variable within the Go process + // TODO - We need to find a better way to do this. + if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { + panicHandler(err) + } + node, err := flypg.NewNode() if err != nil { panicHandler(err) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index bcc428fc..c9602cbf 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -78,13 +78,6 @@ func NewBarman(configURL string) (*Barman, error) { return nil, fmt.Errorf("access key or secret key is missing") } - // Set the environment variable within the Go process - // TODO - We need to find a better way to do this. - if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { - fmt.Printf("Error setting environment variable: %s\n", err) - return nil, err - } - return &Barman{ appName: os.Getenv("FLY_APP_NAME"), provider: providerDefault, diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 3d738ee6..ba15bb47 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -86,7 +86,7 @@ func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error // Boot postgres and wait for WAL replay to complete svisor := supervisor.New("flypg", 5*time.Minute) - svisor.AddProcess("postgres", fmt.Sprintf("gosu postgres postgres -D /data/postgresql -p 5433 -h %s", node.PrivateIP)) + svisor.AddProcess("restore", fmt.Sprintf("gosu postgres postgres -D /data/postgresql -p 5433 -h %s", node.PrivateIP)) // Start the postgres process in the background. 
go func() { @@ -103,7 +103,7 @@ func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error // os.Remove("/data/postgresql/recovery.signal") // Open read/write connection - conn, err := openConn(ctx, node.PrivateIP, false) + conn, err := openConn(ctx, node.PrivateIP) if err != nil { return fmt.Errorf("failed to establish connection to local node: %s", err) } @@ -151,7 +151,7 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { switch { case b.recoveryTarget != "": - backupID, err = b.resolveBackupFromTime(backups, time.Now().String()) + backupID, err = b.resolveBackupFromTime(backups, time.Now().Format(time.RFC3339)) if err != nil { return fmt.Errorf("failed to resolve backup target by time: %s", err) } @@ -262,7 +262,7 @@ func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr } func (b *BarmanRestore) waitOnRecovery(ctx context.Context, privateIP string) error { - conn, err := openConn(ctx, privateIP, false) + conn, err := openConn(ctx, privateIP) if err != nil { return fmt.Errorf("failed to establish connection to local node: %s", err) } diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index e88f476c..81c4ce1e 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -184,9 +184,9 @@ func (c *PGConfig) SetDefaults() error { } c.internalConfig["archive_mode"] = "on" c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) - // TODO - Make this configurable // This controls the minimum frequency WAL files are archived to barman // in the event there is database activity. + // TODO - Make this configurable c.internalConfig["archive_timeout"] = "60s" } } else { @@ -202,6 +202,10 @@ func (c *PGConfig) SetDefaults() error { c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction + if barmanRestore.recoveryTargetInclusive { + c.internalConfig["recovery_target_inclusive"] = barmanRestore.recoveryTargetInclusive + } + switch { case barmanRestore.recoveryTarget != "": c.internalConfig["recovery_target"] = barmanRestore.recoveryTarget diff --git a/internal/flypg/restore.go b/internal/flypg/restore.go index d27c4d49..f8e5481c 100644 --- a/internal/flypg/restore.go +++ b/internal/flypg/restore.go @@ -61,7 +61,7 @@ func prepareRemoteRestore(ctx context.Context, node *Node) error { } }() - conn, err := openConn(ctx, node.PrivateIP, false) + conn, err := openConn(ctx, node.PrivateIP) if err != nil { return fmt.Errorf("failed to establish connection to local node: %s", err) } @@ -187,11 +187,8 @@ func setRestoreLock() error { return file.Sync() } -func openConn(ctx context.Context, privateIP string, readOnly bool) (*pgx.Conn, error) { +func openConn(ctx context.Context, privateIP string) (*pgx.Conn, error) { url := fmt.Sprintf("postgres://[%s]:5433?target_session_attrs=any", privateIP) - if readOnly { - url += "&default_transaction_read_only=on" - } conf, err := pgx.ParseConfig(url) if err != nil { From 4d6c4121c0a490c543fcb9aa57164602b43519a7 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Sun, 30 Jun 2024 22:45:03 -0500 Subject: [PATCH 22/51] Fixing up recovery time logic --- internal/flypg/barman_restore.go | 22 ++++++-- internal/flypg/barman_restore_test.go | 12 ++-- internal/flypg/pg.go | 9 +-- internal/flypg/pg_test.go | 80 ++++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 21 deletions(-) diff --git a/internal/flypg/barman_restore.go 
b/internal/flypg/barman_restore.go index ba15bb47..5eae9a9c 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -18,11 +18,12 @@ type BarmanRestore struct { recoveryTargetName string recoveryTargetTime string recoveryTargetAction string - recoveryTargetInclusive bool + recoveryTargetInclusive string } const ( defaultRestoreDir = "/data/postgresql" + ISO8601 = "2006-01-02T15:04:05-07:00" ) func NewBarmanRestore(configURL string) (*BarmanRestore, error) { @@ -48,9 +49,13 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { case "targetName": restore.recoveryTargetName = value[0] case "targetTime": - restore.recoveryTargetTime = value[0] + ts, err := time.Parse(ISO8601, value[0]) + if err != nil { + return nil, fmt.Errorf("failed to parse target time: %s", err) + } + restore.recoveryTargetTime = ts.Format(ISO8601) case "targetInclusive": - restore.recoveryTargetInclusive = value[0] == "true" + restore.recoveryTargetInclusive = value[0] case "targetAction": restore.recoveryTargetAction = value[0] default: @@ -151,7 +156,7 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { switch { case b.recoveryTarget != "": - backupID, err = b.resolveBackupFromTime(backups, time.Now().Format(time.RFC3339)) + backupID, err = b.resolveBackupFromTime(backups, time.Now().Format(ISO8601)) if err != nil { return fmt.Errorf("failed to resolve backup target by time: %s", err) } @@ -180,6 +185,11 @@ func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { return fmt.Errorf("failed to restore backup: %s", err) } + // Write the recovery.signal file + if err := os.WriteFile("/data/postgresql/recovery.signal", []byte(""), 0600); err != nil { + return fmt.Errorf("failed to write recovery.signal: %s", err) + } + return nil } @@ -208,9 +218,8 @@ func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr if restoreStr == "latest" { restoreTime = time.Now() } else { - var err error - restoreTime, err = time.Parse(time.RFC3339, restoreStr) + restoreTime, err = time.Parse(ISO8601, restoreStr) if err != nil { return "", fmt.Errorf("failed to parse restore time: %s", err) } @@ -222,6 +231,7 @@ func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr earliestBackupID := "" earliestEndTime := time.Time{} + // This is the layout presented by barman layout := "Mon Jan 2 15:04:05 2006" for _, backup := range backupList.Backups { diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index f96f0b33..bb27b8c4 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -141,8 +141,8 @@ func TestNewBarmanRestore(t *testing.T) { t.Fatalf("expected recovery target name to be empty, got %s", restore.recoveryTargetName) } - if restore.recoveryTargetTime != "1212121212" { - t.Fatalf("expected recovery target time to be 1212121212, got %s", restore.recoveryTargetTime) + if restore.recoveryTargetTime != "2024-06-30T11:15:00-06:00" { + t.Fatalf("expected recovery target time to be 2024-06-30T11:15:00-06:00, got %s", restore.recoveryTargetTime) } } @@ -219,7 +219,7 @@ func TestResolveBackupTarget(t *testing.T) { }) t.Run("resolve-earliest-backup-target", func(t *testing.T) { - backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:12Z") + backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:12-00:00") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -232,7 +232,7 @@ func TestResolveBackupTarget(t 
*testing.T) { // "begin_time": "Tue Jun 25 19:44:12 2024" // "end_time": "Tue Jun 25 19:44:18 2024", t.Run("resolve-backup-within-first-window", func(t *testing.T) { - backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:15Z") + backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:15-00:00") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -245,7 +245,7 @@ func TestResolveBackupTarget(t *testing.T) { // "begin_time": "Wed Jun 26 17:24:43 2024", // "end_time": "Wed Jun 26 17:27:02 2024", t.Run("resolve-backup-within-second-window", func(t *testing.T) { - backupID, err := restore.resolveBackupFromTime(list, "2024-06-26T17:25:15Z") + backupID, err := restore.resolveBackupFromTime(list, "2024-06-26T17:25:15-00:00") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -257,6 +257,6 @@ func TestResolveBackupTarget(t *testing.T) { } func setRestoreDefaultEnv(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=1212121212") + t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 81c4ce1e..3701b0f2 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -202,7 +202,7 @@ func (c *PGConfig) SetDefaults() error { c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction - if barmanRestore.recoveryTargetInclusive { + if barmanRestore.recoveryTargetInclusive != "" { c.internalConfig["recovery_target_inclusive"] = barmanRestore.recoveryTargetInclusive } @@ -212,15 +212,10 @@ func (c *PGConfig) SetDefaults() error { case barmanRestore.recoveryTargetName != "": c.internalConfig["recovery_target_name"] = fmt.Sprintf("barman_%s", barmanRestore.recoveryTargetName) case barmanRestore.recoveryTargetTime != "": - c.internalConfig["recovery_target_time"] = barmanRestore.recoveryTargetTime + c.internalConfig["recovery_target_time"] = fmt.Sprintf("'%s'", barmanRestore.recoveryTargetTime) default: return errors.New("recovery target name or time must be specified") } - - // Write the recovery.signal file - if err := os.WriteFile("/data/postgresql/recovery.signal", []byte(""), 0600); err != nil { - return fmt.Errorf("failed to write recovery.signal: %s", err) - } } return nil diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index 63f7a823..0b813237 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -114,7 +114,7 @@ func TestPGConfigInitialization(t *testing.T) { } }) - t.Run("cloud-archiving", func(t *testing.T) { + t.Run("barman-enabled", func(t *testing.T) { t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() @@ -142,7 +142,7 @@ func TestPGConfigInitialization(t *testing.T) { } }) - t.Run("cloud-archiving-disabled", func(t *testing.T) { + t.Run("barman-disabled", func(t *testing.T) { t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() @@ -174,6 +174,82 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be off, got %v", cfg["archive_mode"]) } }) + + t.Run("barman-restore-from-time", func(t *testing.T) { + t.Setenv("BARMAN_REMOTE_RESTORE", 
"https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") + store, _ := state.NewStore() + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["recovery_target_time"] != "'2024-06-30T11:15:00-06:00'" { + t.Fatalf("expected recovery_target_time to be 2024-06-30T11:15:00-06:00, got %v", cfg["recovery_target_time"]) + } + }) + + t.Run("barman-restore-from-name", func(t *testing.T) { + t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetName=20240626T172443") + store, _ := state.NewStore() + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["recovery_target_name"] != "barman_20240626T172443" { + t.Fatalf("expected recovery_target_name to be barman_20240626T172443, got %v", cfg["recovery_target_name"]) + } + }) + + t.Run("barman-restore-from-target", func(t *testing.T) { + t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?target=immediate") + store, _ := state.NewStore() + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["recovery_target"] != "immediate" { + t.Fatalf("expected recovery_target to be immediate, got %v", cfg["recovery_target_name"]) + } + }) + + t.Run("barman-restore-from-target-time-non-inclusive", func(t *testing.T) { + t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetInclusive=false") + store, _ := state.NewStore() + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["recovery_target_time"] != "'2024-06-30T11:15:00-06:00'" { + t.Fatalf("expected recovery_target_time to be 2024-06-30T11:15:00-06:00, got %v", cfg["recovery_target_time"]) + } + + if cfg["recovery_target_inclusive"] != "false" { + t.Fatalf("expected recovery_target_inclusive to be false, got %v", cfg["recovery_target_inclusive"]) + } + }) } func TestPGUserConfigOverride(t *testing.T) { From 900b8e63c1953ebc44e13197eed23efc8f183816 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Mon, 1 Jul 2024 09:56:31 -0500 Subject: [PATCH 23/51] Adding todo's and removing unused code --- internal/flypg/barman.go | 3 +- internal/flypg/barman_test.go | 69 ----------------------------------- internal/flypg/node.go | 3 +- 3 files changed, 3 insertions(+), 72 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index c9602cbf..d4e64bba 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -47,7 +47,7 @@ type BackupList struct { // NewBarman creates a new Barman instance. // The configURL is expected to be in the format: -// https://s3-access-key:s3-secret-key@s3-endpoint/bucket/directory +// https://s3-access-key:s3-secret-key@s3-endpoint/bucket/bucket-directory func NewBarman(configURL string) (*Barman, error) { parsedURL, err := url.Parse(configURL) if err != nil { @@ -169,6 +169,7 @@ func (b *Barman) LastBackupTaken(ctx context.Context) (time.Time, error) { return time.Time{}, nil } + // Layout used by barman. 
layout := "Mon Jan 2 15:04:05 2006" var latestBackupTime time.Time diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index 70208d1f..159f31bc 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -103,75 +103,6 @@ func TestWriteAWSCredentials(t *testing.T) { }) } -// t.Run("custom-retention", func(t *testing.T) { -// setDefaultEnv(t) -// t.Setenv("CLOUD_ARCHIVING_RETENTION_DAYS", "30") - -// barman, err := NewBarman(true) -// if err != nil { -// t.Fatalf("unexpected error: %v", err) -// } -// if barman.retentionDays != "30" { -// t.Fatalf("expected retentionDays to be 30, but got %s", barman.retentionDays) -// } -// }) - -// t.Run("custom-min-redundancy", func(t *testing.T) { -// setDefaultEnv(t) -// t.Setenv("CLOUD_ARCHIVING_MINIMUM_REDUNDANCY", "7") - -// barman, err := NewBarman(true) -// if err != nil { -// t.Fatalf("unexpected error: %v", err) -// } -// if barman.minimumRedundancy != "7" { -// t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) -// } -// }) -// } - -// func TestValidateBarmanRequirements(t *testing.T) { -// b, _ := NewBarman(false) - -// t.Run("missing-aws-access-key", func(t *testing.T) { -// err := b.ValidateRequiredEnv() - -// if err.Error() != "AWS_ACCESS_KEY_ID secret must be set" { -// t.Fatalf("expected error to be 'AWS_ACCESS_KEY_ID secret must be set', but got %s", err.Error()) -// } -// }) - -// t.Run("missing-aws-secret-access-key", func(t *testing.T) { -// setDefaultEnv(t) -// t.Setenv("AWS_SECRET_ACCESS_KEY", "") -// err := b.ValidateRequiredEnv() - -// if err.Error() != "AWS_SECRET_ACCESS_KEY secret must be set" { -// t.Fatalf("expected error to be 'AWS_SECRET_ACCESS_KEY secret must be set', but got %s", err.Error()) -// } -// }) - -// t.Run("missing-aws-bucket-name", func(t *testing.T) { -// setDefaultEnv(t) -// t.Setenv("AWS_BUCKET_NAME", "") -// err := b.ValidateRequiredEnv() - -// if err.Error() != "AWS_BUCKET_NAME envvar must be set" { -// t.Fatalf("expected error to be 'AWS_BUCKET_NAME envvar must be set', but got %s", err.Error()) -// } -// }) - -// t.Run("missing-aws-endpoint-url", func(t *testing.T) { -// setDefaultEnv(t) -// t.Setenv("AWS_ENDPOINT_URL_S3", "") -// err := b.ValidateRequiredEnv() - -// if err.Error() != "AWS_ENDPOINT_URL_S3 envvar must be set" { -// t.Fatalf("expected error to be 'AWS_ENDPOINT_URL_S3 envvar must be set', but got %s", err.Error()) -// } -// }) -// } - func TestBarmanRetentionPolicy(t *testing.T) { setDefaultEnv(t) diff --git a/internal/flypg/node.go b/internal/flypg/node.go index f3876e61..d421b0d8 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -145,6 +145,7 @@ func (n *Node) Init(ctx context.Context) error { // Remote point-in-time restore. if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + // TODO - Probably not safe to use the same lock as the snapshot restore. active, err := isRestoreActive() if err != nil { return fmt.Errorf("failed to verify active restore: %s", err) @@ -175,8 +176,6 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to replay WAL: %s", err) } - os.Unsetenv("BARMAN_REMOTE_RESTORE") - // Set the lock file so the init process knows not to restart the restore process. 
if err := setRestoreLock(); err != nil { return fmt.Errorf("failed to set restore lock: %s", err) From 8fa510c0c47b2d4c4f9189d769cb98ee315d1dec Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Mon, 1 Jul 2024 10:03:22 -0500 Subject: [PATCH 24/51] Unexport functions --- internal/flypg/barman_restore.go | 6 +++--- internal/flypg/node.go | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 5eae9a9c..25143e98 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -68,13 +68,13 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { } if restore.recoveryTargetName == "" && restore.recoveryTargetTime == "" && restore.recoveryTarget == "" { - return nil, fmt.Errorf("no restore target not specified") + return nil, fmt.Errorf("no restore target specified") } return restore, nil } -func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error { +func (b *BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error { // create a copy of the pg_hba.conf file so we can revert back to it when needed. if err := backupHBAFile(); err != nil { if os.IsNotExist(err) { @@ -141,7 +141,7 @@ func (b *BarmanRestore) WALReplayAndReset(ctx context.Context, node *Node) error } -func (b *BarmanRestore) RestoreFromBackup(ctx context.Context) error { +func (b *BarmanRestore) restoreFromBackup(ctx context.Context) error { // Query available backups from object storage backups, err := b.ListBackups(ctx) if err != nil { diff --git a/internal/flypg/node.go b/internal/flypg/node.go index d421b0d8..87fcf56a 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -163,7 +163,7 @@ func (n *Node) Init(ctx context.Context) error { } log.Println("Restoring base backup") - if err := restore.RestoreFromBackup(ctx); err != nil { + if err := restore.restoreFromBackup(ctx); err != nil { return fmt.Errorf("failed to restore base backup: %s", err) } @@ -172,7 +172,7 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to initialize pg config: %s", err) } - if err := restore.WALReplayAndReset(ctx, n); err != nil { + if err := restore.walReplayAndReset(ctx, n); err != nil { return fmt.Errorf("failed to replay WAL: %s", err) } From 4ddcecc09bd69132b5f2bfb2e25f704e7084fbb2 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Mon, 1 Jul 2024 12:34:09 -0500 Subject: [PATCH 25/51] Separate credentials into separate profiles --- cmd/monitor/main.go | 4 +- cmd/start/main.go | 2 +- internal/flypg/barman.go | 58 ++++---------- internal/flypg/barman_restore.go | 11 +-- internal/flypg/barman_restore_test.go | 14 ++++ internal/flypg/barman_test.go | 61 +------------- internal/flypg/node.go | 34 ++++---- internal/flypg/pg.go | 7 +- internal/flypg/pg_test.go | 2 +- internal/flypg/s3_credentials.go | 110 ++++++++++++++++++++++++++ internal/flypg/s3_credentials_test.go | 92 +++++++++++++++++++++ 11 files changed, 267 insertions(+), 128 deletions(-) create mode 100644 internal/flypg/s3_credentials.go create mode 100644 internal/flypg/s3_credentials_test.go diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index dd3eead8..239d730b 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -42,8 +42,8 @@ func main() { } }() - if os.Getenv("BARMAN_ENABLED") == "true" { - barman, err := flypg.NewBarman(os.Getenv("BARMAN_ENABLED")) + if os.Getenv("BARMAN_ENABLED") != "" { + barman, err := flypg.NewBarman(os.Getenv("BARMAN_ENABLED"), 
flypg.DefaultAuthProfile) if err != nil { panic(err) } diff --git a/cmd/start/main.go b/cmd/start/main.go index 9f4f746a..7a438e3f 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -23,6 +23,7 @@ func main() { } } + // Deprecated - We are moving away from having a separate barman Machine if os.Getenv("IS_BARMAN") != "" { node, err := flybarman.NewNode() if err != nil { @@ -54,7 +55,6 @@ func main() { return } - // Set the environment variable within the Go process // TODO - We need to find a better way to do this. if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { panicHandler(err) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index d4e64bba..ebc68bf1 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -7,7 +7,6 @@ import ( "log" "net/url" "os" - "path/filepath" "strings" "time" @@ -15,8 +14,11 @@ import ( ) const ( - providerDefault = "aws-s3" - awsCredentialsPath = "/data/.aws/credentials" + providerDefault = "aws-s3" + // DefaultAuthProfile is the default AWS profile used for barman operations. + DefaultAuthProfile = "barman" + // RestoreAuthProfile is the AWS profile used for barman restore operations. + RestoreAuthProfile = "restore" ) type Barman struct { @@ -25,10 +27,9 @@ type Barman struct { endpoint string bucket string bucketDirectory string + authProfile string - // TODO - This was for convenience, but should probably be removed. - configURL string - + // TODO - Make these configurable retentionDays string minimumRedundancy string } @@ -48,7 +49,7 @@ type BackupList struct { // NewBarman creates a new Barman instance. // The configURL is expected to be in the format: // https://s3-access-key:s3-secret-key@s3-endpoint/bucket/bucket-directory -func NewBarman(configURL string) (*Barman, error) { +func NewBarman(configURL, authProfile string) (*Barman, error) { parsedURL, err := url.Parse(configURL) if err != nil { return nil, fmt.Errorf("invalid credential url: %w", err) @@ -84,7 +85,7 @@ func NewBarman(configURL string) (*Barman, error) { endpoint: fmt.Sprintf("https://%s", endpoint), bucket: pathSlice[0], bucketDirectory: pathSlice[1], - configURL: configURL, + authProfile: authProfile, retentionDays: "7", minimumRedundancy: "3", @@ -102,6 +103,7 @@ func (b *Barman) Backup(ctx context.Context, immediateCheckpoint bool) ([]byte, args := []string{ "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, + "--profile", b.authProfile, "--host", fmt.Sprintf("%s.internal", b.appName), "--user", "repmgr", b.BucketURL(), @@ -120,6 +122,7 @@ func (b *Barman) RestoreBackup(ctx context.Context, backupID string) ([]byte, er args := []string{ "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, + "--profile", b.authProfile, b.BucketURL(), b.bucketDirectory, backupID, @@ -133,6 +136,7 @@ func (b *Barman) ListBackups(ctx context.Context) (BackupList, error) { args := []string{ "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, + "--profile", b.authProfile, "--format", "json", b.BucketURL(), b.bucketDirectory, @@ -150,6 +154,7 @@ func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { args := []string{ "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, + "--profile", b.authProfile, "--retention", b.RetentionPolicy(), "--minimum-redundancy", b.minimumRedundancy, b.BucketURL(), @@ -213,9 +218,10 @@ func (b *Barman) parseBackups(backupBytes []byte) (BackupList, error) { func (b *Barman) walArchiveCommand() string { // TODO - Make 
compression configurable - return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s %s %s %%p", + return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s --profile %s %s %s %%p", b.provider, b.endpoint, + b.authProfile, b.BucketURL(), b.bucketDirectory, ) @@ -224,45 +230,15 @@ func (b *Barman) walArchiveCommand() string { // walRestoreCommand returns the command string used to restore WAL files. // The %f and %p placeholders are replaced with the file path and file name respectively. func (b *Barman) walRestoreCommand() string { - return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s %s %s %%f %%p", + return fmt.Sprintf("barman-cloud-wal-restore --cloud-provider %s --endpoint-url %s --profile %s %s %s %%f %%p", b.provider, b.endpoint, + b.authProfile, b.BucketURL(), b.bucketDirectory, ) } -func (b *Barman) writeAWSCredentials(profile string, credentialsPath string) error { - barmanURL, err := url.Parse(b.configURL) - if err != nil { - return fmt.Errorf("invalid configURL: %w", err) - } - - accessKey := barmanURL.User.Username() - if accessKey == "" { - return fmt.Errorf("AWS ACCESS KEY is missing") - } - - secretAccessKey, _ := barmanURL.User.Password() - if secretAccessKey == "" { - return fmt.Errorf("AWS SECRET KEY is missing") - } - - credentials := fmt.Sprintf("[%s]\naws_access_key_id=%s\naws_secret_access_key=%s", - profile, accessKey, secretAccessKey) - - // Ensure the directory exists - if err := os.MkdirAll(filepath.Dir(credentialsPath), 0700); err != nil { - return fmt.Errorf("failed to create AWS credentials directory: %w", err) - } - - if err := os.WriteFile(credentialsPath, []byte(credentials), 0644); err != nil { - return fmt.Errorf("failed to write AWS credentials file: %w", err) - } - - return nil -} - func getenv(key, fallback string) string { value := os.Getenv(key) if len(value) == 0 { diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 25143e98..6282ca81 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -23,11 +23,12 @@ type BarmanRestore struct { const ( defaultRestoreDir = "/data/postgresql" - ISO8601 = "2006-01-02T15:04:05-07:00" + // ISO8601 is the format required for the barman restore targets. + ISO8601 = "2006-01-02T15:04:05-07:00" ) func NewBarmanRestore(configURL string) (*BarmanRestore, error) { - barman, err := NewBarman(configURL) + barman, err := NewBarman(configURL, RestoreAuthProfile) if err != nil { return nil, fmt.Errorf("failed to create barman client: %s", err) } @@ -41,7 +42,6 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { Barman: barman, } - // evaluate the query parameters for key, value := range url.Query() { switch key { case "target": @@ -83,8 +83,7 @@ func (b *BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error return fmt.Errorf("failed backing up pg_hba.conf: %s", err) } - // Grant local access so we can update internal credentials - // to match the environment. + // Grant local access so we can update internal credentials to match the environment. 
if err := grantLocalAccess(); err != nil { return fmt.Errorf("failed to grant local access: %s", err) } @@ -105,8 +104,6 @@ func (b *BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error return fmt.Errorf("failed to monitor recovery mode: %s", err) } - // os.Remove("/data/postgresql/recovery.signal") - // Open read/write connection conn, err := openConn(ctx, node.PrivateIP) if err != nil { diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index bb27b8c4..ca8d51df 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -1,6 +1,7 @@ package flypg import ( + "fmt" "os" "testing" ) @@ -144,7 +145,20 @@ func TestNewBarmanRestore(t *testing.T) { if restore.recoveryTargetTime != "2024-06-30T11:15:00-06:00" { t.Fatalf("expected recovery target time to be 2024-06-30T11:15:00-06:00, got %s", restore.recoveryTargetTime) } +} +func TestWALRestoreCommand(t *testing.T) { + setRestoreDefaultEnv(t) + restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + expected := fmt.Sprintf("barman-cloud-wal-restore --cloud-provider aws-s3 --endpoint-url https://fly.storage.tigris.dev --profile restore s3://my-bucket my-directory %%f %%p") + + if restore.walRestoreCommand() != expected { + t.Fatalf("expected WALRestoreCommand to be %s, but got %s", expected, restore.walRestoreCommand()) + } } func TestParseBackups(t *testing.T) { diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index 159f31bc..cc4d6920 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -1,18 +1,16 @@ package flypg import ( - "fmt" "os" "testing" - - "github.com/fly-apps/postgres-flex/internal/utils" ) func TestNewBarman(t *testing.T) { t.Run("defaults", func(t *testing.T) { setDefaultEnv(t) - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) + configURL := os.Getenv("BARMAN_ENABLED") + barman, err := NewBarman(configURL, "barman") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -52,61 +50,10 @@ func TestNewBarman(t *testing.T) { }) } -func TestWALRestoreCommand(t *testing.T) { - setDefaultEnv(t) - - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - expected := fmt.Sprintf("barman-cloud-wal-restore --cloud-provider aws-s3 --endpoint-url https://fly.storage.tigris.dev s3://my-bucket my-directory %%f %%p") - - if barman.walRestoreCommand() != expected { - t.Fatalf("expected WALRestoreCommand to be %s, but got %s", expected, barman.walRestoreCommand()) - } -} - -func TestWriteAWSCredentials(t *testing.T) { - setup(t) - defer cleanup() - - t.Run("write-aws-credentials", func(t *testing.T) { - setDefaultEnv(t) - - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - credFile := "./test_results/credentials" - - if err := barman.writeAWSCredentials("default", credFile); err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if !utils.FileExists(credFile) { - t.Fatalf("expected %s to exist, but doesn't", credFile) - } - - // Check contents - contents, err := os.ReadFile(credFile) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - expected := "[default]\naws_access_key_id=my-key\naws_secret_access_key=my-secret" - - if string(contents) != expected { - t.Fatalf("expected contents to be %s, but got %s", expected, string(contents)) - } - }) -} - func 
TestBarmanRetentionPolicy(t *testing.T) { setDefaultEnv(t) - - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) + configURL := os.Getenv("BARMAN_ENABLED") + barman, err := NewBarman(configURL, DefaultAuthProfile) if err != nil { t.Fatalf("unexpected error: %v", err) } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 87fcf56a..3f77f189 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -7,7 +7,6 @@ import ( "log" "net" "os" - "os/exec" "strconv" "strings" "time" @@ -120,7 +119,7 @@ func NewNode() (*Node, error) { func (n *Node) Init(ctx context.Context) error { // Ensure directory and files have proper permissions - if err := setDirOwnership(); err != nil { + if err := setDirOwnership(ctx, "/data"); err != nil { return fmt.Errorf("failed to set directory ownership: %s", err) } @@ -143,6 +142,12 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed initialize cluster state store: %s", err) } + if os.Getenv("BARMAN_ENABLED") != "" || os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + if err := writeS3Credentials(ctx, s3AuthDir); err != nil { + return fmt.Errorf("failed to write s3 credentials: %s", err) + } + } + // Remote point-in-time restore. if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { // TODO - Probably not safe to use the same lock as the snapshot restore. @@ -152,17 +157,12 @@ func (n *Node) Init(ctx context.Context) error { } if active { - restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + configURL := os.Getenv("BARMAN_REMOTE_RESTORE") + restore, err := NewBarmanRestore(configURL) if err != nil { return fmt.Errorf("failed to initialize barman restore: %s", err) } - // Write the source AWS credentials - if err := restore.writeAWSCredentials("default", awsCredentialsPath); err != nil { - return fmt.Errorf("failed to write aws credentials: %s", err) - } - - log.Println("Restoring base backup") if err := restore.restoreFromBackup(ctx); err != nil { return fmt.Errorf("failed to restore base backup: %s", err) } @@ -251,7 +251,7 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to initialize pg config: %s", err) } - if err := setDirOwnership(); err != nil { + if err := setDirOwnership(ctx, "/data"); err != nil { return fmt.Errorf("failed to set directory ownership: %s", err) } @@ -497,16 +497,20 @@ func openConnection(parentCtx context.Context, host string, database string, cre return pgx.ConnectConfig(ctx, conf) } -func setDirOwnership() error { +func setDirOwnership(ctx context.Context, pathToDir string) error { + if os.Getenv("UNIT_TESTING") == "true" { + return nil + } + pgUID, pgGID, err := utils.SystemUserIDs("postgres") if err != nil { return fmt.Errorf("failed to find postgres user ids: %s", err) } - cmdStr := fmt.Sprintf("chown -R %d:%d %s", pgUID, pgGID, "/data") - cmd := exec.Command("sh", "-c", cmdStr) - if _, err = cmd.Output(); err != nil { - return err + args := []string{"-R", fmt.Sprintf("%d:%d", pgUID, pgGID), pathToDir} + + if _, err := utils.RunCmd(ctx, "root", "chown", args...); err != nil { + return fmt.Errorf("failed to set directory ownership: %s", err) } return nil diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 3701b0f2..aca41af0 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -174,14 +174,13 @@ func (c *PGConfig) SetDefaults() error { } if os.Getenv("BARMAN_ENABLED") != "" { - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) + configURL := os.Getenv("BARMAN_ENABLED") + + barman, err := NewBarman(configURL, "barman") switch { 
case err != nil: log.Printf("[WARN] Failed to initialize barman: %s", err) default: - if err := barman.writeAWSCredentials("default", awsCredentialsPath); err != nil { - log.Printf("[WARN] Failed to write AWS credentials: %s", err) - } c.internalConfig["archive_mode"] = "on" c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) // This controls the minimum frequency WAL files are archived to barman diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index 0b813237..d9a33150 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -132,7 +132,7 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED")) + barman, err := NewBarman(os.Getenv("BARMAN_ENABLED"), "barman") if err != nil { t.Fatal(err) } diff --git a/internal/flypg/s3_credentials.go b/internal/flypg/s3_credentials.go new file mode 100644 index 00000000..10c2baad --- /dev/null +++ b/internal/flypg/s3_credentials.go @@ -0,0 +1,110 @@ +package flypg + +import ( + "context" + "fmt" + "net/url" + "os" + "path/filepath" +) + +const ( + s3AuthDir = "/data/.aws" + s3AuthFileName = "credentials" +) + +type s3Credentials struct { + profile string + accessKeyID string + secretAccessKey string +} + +func writeS3Credentials(ctx context.Context, s3AuthDir string) error { + var creds []*s3Credentials + + if os.Getenv("BARMAN_ENABLED") != "" { + cred, err := parseCredentialsFromConfigURL(os.Getenv("BARMAN_ENABLED"), DefaultAuthProfile) + if err != nil { + return fmt.Errorf("failed to parse credentials from barman configURL: %v", err) + } + creds = append(creds, cred) + } + + if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + cred, err := parseCredentialsFromConfigURL(os.Getenv("BARMAN_REMOTE_RESTORE"), RestoreAuthProfile) + if err != nil { + return fmt.Errorf("failed to parse credentials from barman restore configURL: %v", err) + } + creds = append(creds, cred) + } + + if len(creds) == 0 { + return nil + } + + s3AuthFilePath := filepath.Join(s3AuthDir, s3AuthFileName) + + // Ensure the directory exists + if err := os.MkdirAll(s3AuthDir, 0700); err != nil { + return fmt.Errorf("failed to create AWS credentials directory: %w", err) + } + + // Write the credentials to disk + if err := writeCredentialsToFile(creds, s3AuthFilePath); err != nil { + return fmt.Errorf("failed to write credentials to file: %w", err) + } + + // Set file permissions + if err := os.Chmod(s3AuthFilePath, 0644); err != nil { + return fmt.Errorf("failed to set file permissions: %w", err) + } + + // Ensure the directory has the correct ownership + if err := setDirOwnership(ctx, s3AuthDir); err != nil { + return fmt.Errorf("failed to set directory ownership: %w", err) + } + + return nil +} + +func writeCredentialsToFile(credentials []*s3Credentials, pathToCredentialFile string) error { + file, err := os.Create(pathToCredentialFile) + if err != nil { + return fmt.Errorf("failed to create file: %w", err) + } + defer file.Close() + + // Write the credentials to disk + for _, cred := range credentials { + _, err := file.WriteString(fmt.Sprintf("[%s]\naws_access_key_id=%s\naws_secret_access_key=%s\n\n", + cred.profile, cred.accessKeyID, cred.secretAccessKey)) + if err != nil { + return fmt.Errorf("failed to write s3 profile %s: %w", cred.profile, err) + } + } + + return nil +} + +func parseCredentialsFromConfigURL(configURL, assignedProfile string) (*s3Credentials, error) { + barmanURL, err := 
url.Parse(configURL) + if err != nil { + return nil, fmt.Errorf("invalid configURL: %w", err) + } + + accessKey := barmanURL.User.Username() + if accessKey == "" { + return nil, fmt.Errorf("AWS ACCESS KEY is missing") + } + + secretAccessKey, _ := barmanURL.User.Password() + if secretAccessKey == "" { + return nil, fmt.Errorf("AWS SECRET KEY is missing") + } + + return &s3Credentials{ + profile: assignedProfile, + accessKeyID: accessKey, + secretAccessKey: secretAccessKey, + }, nil +} diff --git a/internal/flypg/s3_credentials_test.go b/internal/flypg/s3_credentials_test.go new file mode 100644 index 00000000..c8541f5f --- /dev/null +++ b/internal/flypg/s3_credentials_test.go @@ -0,0 +1,92 @@ +package flypg + +import ( + "context" + "fmt" + "os" + "testing" + + "github.com/fly-apps/postgres-flex/internal/utils" +) + +func TestWriteAWSCredentials(t *testing.T) { + setup(t) + defer cleanup() + + authDir := "./test_results/.aws" + pathToCredentials := fmt.Sprintf("%s/credentials", authDir) + + t.Run("write-barman-credentials", func(t *testing.T) { + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + + if err := writeS3Credentials(context.TODO(), authDir); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if !utils.FileExists(pathToCredentials) { + t.Fatalf("expected %s to exist, but doesn't", pathToCredentials) + } + + // Check contents + contents, err := os.ReadFile(pathToCredentials) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + expected := "[barman]\naws_access_key_id=my-key\naws_secret_access_key=my-secret\n\n" + + if string(contents) != expected { + t.Fatalf("expected contents to be %s, but got %s", expected, string(contents)) + } + }) + + t.Run("write-restore-credentials", func(t *testing.T) { + t.Setenv("BARMAN_REMOTE_RESTORE", "https://source-key:source-secret@fly.storage.tigris.dev/my-bucket/my-directory") + + if err := writeS3Credentials(context.TODO(), authDir); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if !utils.FileExists(pathToCredentials) { + t.Fatalf("expected %s to exist, but doesn't", pathToCredentials) + } + + // Check contents + contents, err := os.ReadFile(pathToCredentials) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + expected := "[restore]\naws_access_key_id=source-key\naws_secret_access_key=source-secret\n\n" + + if string(contents) != expected { + t.Fatalf("expected contents to be %s, but got %s", expected, string(contents)) + } + }) + + t.Run("write-barman-and-restore-credentials", func(t *testing.T) { + t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("BARMAN_REMOTE_RESTORE", "https://source-key:source-secret@fly.storage.tigris.dev/source-bucket/source-directory") + + if err := writeS3Credentials(context.TODO(), authDir); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if !utils.FileExists(pathToCredentials) { + t.Fatalf("expected %s to exist, but doesn't", pathToCredentials) + } + + // Check contents + contents, err := os.ReadFile(pathToCredentials) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + expected := "[barman]\naws_access_key_id=my-key\naws_secret_access_key=my-secret\n\n[restore]\naws_access_key_id=source-key\naws_secret_access_key=source-secret\n\n" + + if string(contents) != expected { + t.Fatalf("expected contents to be %s, but got %s", expected, string(contents)) + } + }) + +} From 61eedd0484191b614a6061f16d04c10b7b4d9030 Mon Sep 17 
00:00:00 2001 From: Shaun Davis Date: Mon, 1 Jul 2024 12:37:09 -0500 Subject: [PATCH 26/51] Fix errcheck --- internal/flypg/s3_credentials.go | 4 ++-- internal/flypg/s3_credentials_test.go | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/internal/flypg/s3_credentials.go b/internal/flypg/s3_credentials.go index 10c2baad..a61e8492 100644 --- a/internal/flypg/s3_credentials.go +++ b/internal/flypg/s3_credentials.go @@ -72,7 +72,7 @@ func writeCredentialsToFile(credentials []*s3Credentials, pathToCredentialFile s if err != nil { return fmt.Errorf("failed to create file: %w", err) } - defer file.Close() + defer func() { _ = file.Close() }() // Write the credentials to disk for _, cred := range credentials { @@ -83,7 +83,7 @@ func writeCredentialsToFile(credentials []*s3Credentials, pathToCredentialFile s } } - return nil + return file.Sync() } func parseCredentialsFromConfigURL(configURL, assignedProfile string) (*s3Credentials, error) { diff --git a/internal/flypg/s3_credentials_test.go b/internal/flypg/s3_credentials_test.go index c8541f5f..2718ece0 100644 --- a/internal/flypg/s3_credentials_test.go +++ b/internal/flypg/s3_credentials_test.go @@ -10,7 +10,9 @@ import ( ) func TestWriteAWSCredentials(t *testing.T) { - setup(t) + if err := setup(t); err != nil { + t.Fatal(err) + } defer cleanup() authDir := "./test_results/.aws" From ea0c59e1cd24c4cc426659ad6f351d06a9ec4843 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Mon, 1 Jul 2024 12:43:21 -0500 Subject: [PATCH 27/51] Add todo --- cmd/monitor/monitor_backup_schedule.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/monitor/monitor_backup_schedule.go b/cmd/monitor/monitor_backup_schedule.go index d7afb79d..88a61f5f 100644 --- a/cmd/monitor/monitor_backup_schedule.go +++ b/cmd/monitor/monitor_backup_schedule.go @@ -16,6 +16,8 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { log.Printf("Failed to resolve the last backup taken: %s", err) } + // TODO - Wait for Postgres to be ready before proceeding. + // Ensure we have a least one backup before proceeding. if lastBackupTime.IsZero() { log.Println("No backups found! Performing the initial base backup.") From fd001e575658fdc7d9808e22dc06f6ba3c524212 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Mon, 1 Jul 2024 12:53:54 -0500 Subject: [PATCH 28/51] static check fixes --- cmd/monitor/monitor_backup_schedule.go | 2 +- internal/flypg/barman.go | 22 +++++++--------------- internal/flypg/barman_restore.go | 8 ++++---- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/cmd/monitor/monitor_backup_schedule.go b/cmd/monitor/monitor_backup_schedule.go index 88a61f5f..6a0bce09 100644 --- a/cmd/monitor/monitor_backup_schedule.go +++ b/cmd/monitor/monitor_backup_schedule.go @@ -33,7 +33,7 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { log.Printf("Last backup taken at: %s", lastBackupTime) // Calculate the time until the next backup is due. - timeUntilNextBackup := lastBackupTime.Add(defaultFullBackupSchedule).Sub(time.Now()) + timeUntilNextBackup := time.Until(lastBackupTime.Add(defaultFullBackupSchedule)) // Perform backup immediately if the time until the next backup is negative. 
if timeUntilNextBackup < 0 { diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index ebc68bf1..06a92182 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -206,16 +206,6 @@ func (b *Barman) RetentionPolicy() string { return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) } -func (b *Barman) parseBackups(backupBytes []byte) (BackupList, error) { - var backupList BackupList - - if err := json.Unmarshal(backupBytes, &backupList); err != nil { - return BackupList{}, fmt.Errorf("failed to parse backups: %s", err) - } - - return backupList, nil -} - func (b *Barman) walArchiveCommand() string { // TODO - Make compression configurable return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s --profile %s %s %s %%p", @@ -239,10 +229,12 @@ func (b *Barman) walRestoreCommand() string { ) } -func getenv(key, fallback string) string { - value := os.Getenv(key) - if len(value) == 0 { - return fallback +func (*Barman) parseBackups(backupBytes []byte) (BackupList, error) { + var backupList BackupList + + if err := json.Unmarshal(backupBytes, &backupList); err != nil { + return BackupList{}, fmt.Errorf("failed to parse backups: %s", err) } - return value + + return backupList, nil } diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 6282ca81..29b2700f 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -100,7 +100,7 @@ func (b *BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error }() // Wait for the WAL replay to complete. - if err := b.waitOnRecovery(ctx, node.PrivateIP); err != nil { + if err := waitOnRecovery(ctx, node.PrivateIP); err != nil { return fmt.Errorf("failed to monitor recovery mode: %s", err) } @@ -190,7 +190,7 @@ func (b *BarmanRestore) restoreFromBackup(ctx context.Context) error { return nil } -func (b *BarmanRestore) resolveBackupFromID(backupList BackupList, id string) (string, error) { +func (*BarmanRestore) resolveBackupFromID(backupList BackupList, id string) (string, error) { if len(backupList.Backups) == 0 { return "", fmt.Errorf("no backups found") } @@ -204,7 +204,7 @@ func (b *BarmanRestore) resolveBackupFromID(backupList BackupList, id string) (s return "", fmt.Errorf("no backup found with id %s", id) } -func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr string) (string, error) { +func (*BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr string) (string, error) { if len(backupList.Backups) == 0 { return "", fmt.Errorf("no backups found") } @@ -268,7 +268,7 @@ func (b *BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr return latestBackupID, nil } -func (b *BarmanRestore) waitOnRecovery(ctx context.Context, privateIP string) error { +func waitOnRecovery(ctx context.Context, privateIP string) error { conn, err := openConn(ctx, privateIP) if err != nil { return fmt.Errorf("failed to establish connection to local node: %s", err) From dc6c13c550c1c6e4948673661f03e6dfab87ba4f Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 09:30:43 -0500 Subject: [PATCH 29/51] Make archive settings configurable --- cmd/monitor/main.go | 13 +- internal/api/handle_admin.go | 98 +++++++++++++ internal/api/handler.go | 4 + internal/flypg/barman.go | 39 +++-- internal/flypg/barman_config.go | 202 ++++++++++++++++++++++++++ internal/flypg/barman_restore.go | 3 +- internal/flypg/barman_restore_test.go | 15 ++ internal/flypg/barman_test.go | 46 +++--- 
internal/flypg/config.go | 11 +- internal/flypg/node.go | 6 + internal/flypg/pg.go | 30 ++-- internal/flypg/pg_test.go | 10 +- 12 files changed, 418 insertions(+), 59 deletions(-) create mode 100644 internal/flypg/barman_config.go diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 239d730b..e368ed02 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -8,6 +8,7 @@ import ( "time" "github.com/fly-apps/postgres-flex/internal/flypg" + "github.com/fly-apps/postgres-flex/internal/flypg/state" ) var ( @@ -43,8 +44,17 @@ func main() { }() if os.Getenv("BARMAN_ENABLED") != "" { - barman, err := flypg.NewBarman(os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + store, err := state.NewStore() if err != nil { + panic(fmt.Errorf("failed initialize cluster state store: %s", err)) + } + + barman, err := flypg.NewBarman(store, os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + if err != nil { + panic(err) + } + + if err := barman.LoadConfig(flypg.DefaultBarmanConfigDir); err != nil { panic(err) } @@ -54,7 +64,6 @@ func main() { // Backup retention monitor log.Println("Monitoring backup retention") - barman.PrintRetentionPolicy() go monitorBackupRetention(ctx, barman) } diff --git a/internal/api/handle_admin.go b/internal/api/handle_admin.go index 89ff78be..ab8c5e31 100644 --- a/internal/api/handle_admin.go +++ b/internal/api/handle_admin.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "os" "strings" "github.com/fly-apps/postgres-flex/internal/flypg" @@ -298,3 +299,100 @@ func handleViewRepmgrSettings(w http.ResponseWriter, r *http.Request) { resp := &Response{Result: out} renderJSON(w, resp, http.StatusOK) } + +func handleViewBarmanSettings(w http.ResponseWriter, r *http.Request) { + if os.Getenv("BARMAN_ENABLED") == "" { + renderErr(w, fmt.Errorf("barman is not enabled")) + return + } + + store, err := state.NewStore() + if err != nil { + renderErr(w, err) + return + } + + barman, err := flypg.NewBarman(store, os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + if err != nil { + renderErr(w, err) + return + } + + if err := barman.LoadConfig(flypg.DefaultBarmanConfigDir); err != nil { + renderErr(w, err) + return + } + + all, err := barman.CurrentConfig() + if err != nil { + renderErr(w, err) + return + } + + resp := &Response{Result: all} + renderJSON(w, resp, http.StatusOK) +} + +func handleUpdateBarmanSettings(w http.ResponseWriter, r *http.Request) { + if os.Getenv("BARMAN_ENABLED") == "" { + renderErr(w, fmt.Errorf("barman is not enabled")) + return + } + + store, err := state.NewStore() + if err != nil { + renderErr(w, err) + return + } + + barman, err := flypg.NewBarman(store, os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + if err != nil { + renderErr(w, err) + return + } + + if err := barman.LoadConfig(flypg.DefaultBarmanConfigDir); err != nil { + renderErr(w, err) + return + } + + cfg, err := flypg.ReadFromFile(barman.UserConfigFile()) + if err != nil { + renderErr(w, err) + return + } + + var requestedChanges map[string]interface{} + if err := json.NewDecoder(r.Body).Decode(&requestedChanges); err != nil { + renderErr(w, err) + return + } + + if err := barman.Validate(requestedChanges); err != nil { + renderErr(w, err) + return + } + + for k, v := range requestedChanges { + cfg[k] = v + } + + barman.SetUserConfig(cfg) + + if err := flypg.PushUserConfig(barman, store); err != nil { + renderErr(w, err) + return + } + + if err := flypg.SyncUserConfig(barman, store); err != nil { + renderErr(w, err) + return + } + + res := 
&Response{Result: SettingsUpdate{ + Message: "Updated", + RestartRequired: true, + }} + + renderJSON(w, res, http.StatusOK) +} diff --git a/internal/api/handler.go b/internal/api/handler.go index 204147b0..ee37b77a 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -60,7 +60,11 @@ func Handler() http.Handler { r.Get("/role", handleRole) r.Get("/settings/view/postgres", handleViewPostgresSettings) r.Get("/settings/view/repmgr", handleViewRepmgrSettings) + r.Get("/settings/view/barman", handleViewBarmanSettings) + r.Post("/settings/update/postgres", handleUpdatePostgresSettings) + r.Post("/settings/update/barman", handleUpdateBarmanSettings) + r.Post("/settings/apply", handleApplyConfig) }) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 06a92182..fc5601e3 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -4,12 +4,12 @@ import ( "context" "encoding/json" "fmt" - "log" "net/url" "os" "strings" "time" + "github.com/fly-apps/postgres-flex/internal/flypg/state" "github.com/fly-apps/postgres-flex/internal/utils" ) @@ -28,10 +28,9 @@ type Barman struct { bucket string bucketDirectory string authProfile string + store *state.Store - // TODO - Make these configurable - retentionDays string - minimumRedundancy string + *BarmanConfig } type Backup struct { @@ -49,7 +48,7 @@ type BackupList struct { // NewBarman creates a new Barman instance. // The configURL is expected to be in the format: // https://s3-access-key:s3-secret-key@s3-endpoint/bucket/bucket-directory -func NewBarman(configURL, authProfile string) (*Barman, error) { +func NewBarman(store *state.Store, configURL, authProfile string) (*Barman, error) { parsedURL, err := url.Parse(configURL) if err != nil { return nil, fmt.Errorf("invalid credential url: %w", err) @@ -86,11 +85,19 @@ func NewBarman(configURL, authProfile string) (*Barman, error) { bucket: pathSlice[0], bucketDirectory: pathSlice[1], authProfile: authProfile, - - retentionDays: "7", - minimumRedundancy: "3", + store: store, }, nil +} + +func (b *Barman) LoadConfig(configDir string) error { + barCfg, err := NewBarmanConfig(b.store, configDir) + if err != nil { + return err + } + b.BarmanConfig = barCfg + + return nil } func (b *Barman) BucketURL() string { @@ -155,8 +162,8 @@ func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { "--cloud-provider", providerDefault, "--endpoint-url", b.endpoint, "--profile", b.authProfile, - "--retention", b.RetentionPolicy(), - "--minimum-redundancy", b.minimumRedundancy, + "--retention", b.Settings.RecoveryWindow, + "--minimum-redundancy", b.Settings.MinimumRedundancy, b.BucketURL(), b.bucketDirectory, } @@ -194,18 +201,6 @@ func (b *Barman) LastBackupTaken(ctx context.Context) (time.Time, error) { return latestBackupTime, nil } -func (b *Barman) PrintRetentionPolicy() { - log.Printf(` -Retention Policy ------------------ -RECOVERY WINDOW: %s DAYS -MINIMUM BACKUP REDUNDANCY: %s`, b.retentionDays, b.minimumRedundancy) -} - -func (b *Barman) RetentionPolicy() string { - return fmt.Sprintf("'RECOVERY WINDOW OF %s days'", b.retentionDays) -} - func (b *Barman) walArchiveCommand() string { // TODO - Make compression configurable return fmt.Sprintf("barman-cloud-wal-archive --cloud-provider %s --gzip --endpoint-url %s --profile %s %s %s %%p", diff --git a/internal/flypg/barman_config.go b/internal/flypg/barman_config.go new file mode 100644 index 00000000..a5310ced --- /dev/null +++ b/internal/flypg/barman_config.go @@ -0,0 +1,202 @@ +package flypg + +import ( + 
"fmt" + "log" + "os" + "strconv" + "strings" + "time" + + "github.com/fly-apps/postgres-flex/internal/flypg/state" +) + +type BarmanSettings struct { + ArchiveTimeout string + RecoveryWindow string + FullBackupFrequency string + MinimumRedundancy string +} + +type BarmanConfig struct { + internalConfigFilePath string + userConfigFilePath string + internalConfig ConfigMap + userConfig ConfigMap + + Settings BarmanSettings +} + +const ( + barmanConsulKey = "BarmanConfig" + DefaultBarmanConfigDir = "/data/barman/" +) + +// type assertion +var _ Config = &BarmanConfig{} + +func (c *BarmanConfig) InternalConfig() ConfigMap { return c.internalConfig } +func (c *BarmanConfig) UserConfig() ConfigMap { return c.userConfig } +func (*BarmanConfig) ConsulKey() string { return barmanConsulKey } +func (c *BarmanConfig) SetUserConfig(newConfig ConfigMap) { c.userConfig = newConfig } +func (c *BarmanConfig) InternalConfigFile() string { return c.internalConfigFilePath } +func (c *BarmanConfig) UserConfigFile() string { return c.userConfigFilePath } + +func NewBarmanConfig(store *state.Store, configDir string) (*BarmanConfig, error) { + cfg := &BarmanConfig{ + internalConfigFilePath: configDir + "barman.internal.conf", + userConfigFilePath: configDir + "barman.user.conf", + } + + if err := cfg.initialize(store, configDir); err != nil { + return nil, err + } + + return cfg, nil +} + +func (c *BarmanConfig) SetDefaults() { + c.internalConfig = ConfigMap{ + "archive_timeout": "60s", + "recovery_window": "7d", + "full_backup_frequency": "24h", + "minimum_redundancy": "3", + } +} + +func (c *BarmanConfig) CurrentConfig() (ConfigMap, error) { + internal, err := ReadFromFile(c.InternalConfigFile()) + if err != nil { + return nil, err + } + + user, err := ReadFromFile(c.UserConfigFile()) + if err != nil { + return nil, err + } + + all := ConfigMap{} + + for k, v := range internal { + all[k] = v + } + for k, v := range user { + all[k] = v + } + + return all, nil +} + +// ParseSettings reads the current config and returns the settings in a structured format. 
+func (c *BarmanConfig) ParseSettings() (BarmanSettings, error) { + cfg, err := c.CurrentConfig() + if err != nil { + return BarmanSettings{}, fmt.Errorf("failed to read current config: %s", err) + } + + backupFrequency, err := time.ParseDuration(cfg["full_backup_frequency"].(string)) + if err != nil { + return BarmanSettings{}, fmt.Errorf("failed to parse full backup frequency: %s", err) + } + + recoveryWindow := fmt.Sprintf("RECOVERY WINDOW OF %s", + convertRecoveryWindowDuration(cfg["recovery_window"].(string))) + + return BarmanSettings{ + ArchiveTimeout: cfg["archive_timeout"].(string), + RecoveryWindow: recoveryWindow, + FullBackupFrequency: fmt.Sprint(backupFrequency.Hours()), + MinimumRedundancy: cfg["minimum_redundancy"].(string), + }, nil +} + +func (c *BarmanConfig) Validate(requestedChanges map[string]interface{}) error { + // Verify that the keys provided are valid + for k := range requestedChanges { + if _, ok := c.internalConfig[k]; !ok { + return fmt.Errorf("invalid key: %s", k) + } + } + + // Verify that the values provided are valid + for k, v := range requestedChanges { + switch k { + case "archive_timeout": + if _, ok := v.(string); !ok { + return fmt.Errorf("invalid value for archive_timeout: %v", v) + } + // TODO - validate + + case "recovery_window": + if _, ok := v.(string); !ok { + return fmt.Errorf("invalid value for recovery_window: %v", v) + } + // TODO - validate + + case "full_backup_frequency": + if _, ok := v.(string); !ok { + return fmt.Errorf("invalid value for full_backup_frequency: %v", v) + } + // TODO - validate + case "minimum_redundancy": + if _, ok := v.(string); !ok { + return fmt.Errorf("invalid value for minimum_redundancy: %v", v) + } + + // Ensure that the value is a non-negative integer + val, err := strconv.Atoi(v.(string)) + if err != nil { + return fmt.Errorf("invalid value for minimum_redundancy: %v", v) + } + + if val < 0 { + return fmt.Errorf("invalid value for minimum_redundancy (expected be >= 0, got %v)", val) + } + } + } + + return nil +} + +func (c *BarmanConfig) initialize(store *state.Store, configDir string) error { + // Ensure directory exists + if err := os.MkdirAll(configDir, 0755); err != nil { + return fmt.Errorf("failed to create barman config directory: %s", err) + } + + c.SetDefaults() + + if err := SyncUserConfig(c, store); err != nil { + log.Printf("[WARN] Failed to sync user config from consul for barman: %s\n", err.Error()) + if err := writeInternalConfigFile(c); err != nil { + return fmt.Errorf("failed to write barman config files: %s", err) + } + } else { + if err := WriteConfigFiles(c); err != nil { + return fmt.Errorf("failed to write barman config files: %s", err) + } + } + + settings, err := c.ParseSettings() + if err != nil { + return fmt.Errorf("failed to parse barman config: %w", err) + } + + c.Settings = settings + + return nil +} + +func convertRecoveryWindowDuration(durationStr string) string { + unitMap := map[string]string{ + "m": "MONTHS", + "w": "WEEKS", + "d": "DAYS", + } + for unit, text := range unitMap { + if strings.HasSuffix(durationStr, unit) { + return strings.TrimSuffix(durationStr, unit) + " " + text + } + } + return durationStr +} diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 29b2700f..22b8977d 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -28,7 +28,8 @@ const ( ) func NewBarmanRestore(configURL string) (*BarmanRestore, error) { - barman, err := NewBarman(configURL, RestoreAuthProfile) + // We only need access 
to the barman endpoints + barman, err := NewBarman(nil, configURL, RestoreAuthProfile) if err != nil { return nil, fmt.Errorf("failed to create barman client: %s", err) } diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index ca8d51df..4f6faa00 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -148,6 +148,11 @@ func TestNewBarmanRestore(t *testing.T) { } func TestWALRestoreCommand(t *testing.T) { + if err := setup(t); err != nil { + t.Fatal(err) + } + defer cleanup() + setRestoreDefaultEnv(t) restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) if err != nil { @@ -162,6 +167,11 @@ func TestWALRestoreCommand(t *testing.T) { } func TestParseBackups(t *testing.T) { + if err := setup(t); err != nil { + t.Fatal(err) + } + defer cleanup() + t.Run("parseBackups", func(t *testing.T) { setRestoreDefaultEnv(t) @@ -209,6 +219,11 @@ func TestParseBackups(t *testing.T) { } func TestResolveBackupTarget(t *testing.T) { + if err := setup(t); err != nil { + t.Fatal(err) + } + defer cleanup() + setRestoreDefaultEnv(t) restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index cc4d6920..a107d651 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -3,18 +3,35 @@ package flypg import ( "os" "testing" + + "github.com/fly-apps/postgres-flex/internal/flypg/state" +) + +const ( + testBarmanConfigDir = "./test_results/barman" ) func TestNewBarman(t *testing.T) { + if err := setup(t); err != nil { + t.Fatal(err) + } + // defer cleanup() + + store, _ := state.NewStore() + t.Run("defaults", func(t *testing.T) { setDefaultEnv(t) configURL := os.Getenv("BARMAN_ENABLED") - barman, err := NewBarman(configURL, "barman") + barman, err := NewBarman(store, configURL, DefaultAuthProfile) if err != nil { t.Fatalf("unexpected error: %v", err) } + if err := barman.LoadConfig(testBarmanConfigDir); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if barman.provider != "aws-s3" { t.Fatalf("expected provider to be aws, but got %s", barman.provider) } @@ -40,27 +57,22 @@ func TestNewBarman(t *testing.T) { } // Defaults - if barman.minimumRedundancy != "3" { - t.Fatalf("expected minimumRedundancy to be 3, but got %s", barman.minimumRedundancy) + if barman.Settings.MinimumRedundancy != "3" { + t.Fatalf("expected minimumRedundancy to be 3, but got %s", barman.Settings.MinimumRedundancy) } - if barman.retentionDays != "7" { - t.Fatalf("expected retentionDays to be 7, but got %s", barman.retentionDays) + if barman.Settings.RecoveryWindow != "RECOVERY WINDOW OF 7 DAYS" { + t.Fatalf("expected recovery_window to be 'RECOVERY WINDOW OF 7 DAYS', but got %s", barman.Settings.RecoveryWindow) } - }) -} -func TestBarmanRetentionPolicy(t *testing.T) { - setDefaultEnv(t) - configURL := os.Getenv("BARMAN_ENABLED") - barman, err := NewBarman(configURL, DefaultAuthProfile) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } + if barman.Settings.FullBackupFrequency != "24" { + t.Fatalf("expected fullBackupFrequency to be 24, but got %s", barman.Settings.FullBackupFrequency) + } - if barman.RetentionPolicy() != "'RECOVERY WINDOW OF 7 days'" { - t.Fatalf("expected retention policy to be 'RECOVERY WINDOW OF 7 days', but got %s", barman.RetentionPolicy()) - } + if barman.Settings.ArchiveTimeout != "60s" { + t.Fatalf("expected archiveTimeout to be 60s, but got %s", barman.Settings.ArchiveTimeout) + } + }) } func 
setDefaultEnv(t *testing.T) { diff --git a/internal/flypg/config.go b/internal/flypg/config.go index 2bd10bcd..5eac91d2 100644 --- a/internal/flypg/config.go +++ b/internal/flypg/config.go @@ -63,6 +63,7 @@ func SyncUserConfig(c Config, consul *state.Store) error { if cfg == nil { return nil } + c.SetUserConfig(cfg) if err := writeUserConfigFile(c); err != nil { @@ -134,7 +135,7 @@ func ReadFromFile(path string) (ConfigMap, error) { conf[key] = value } - return conf, file.Sync() + return conf, nil } func writeInternalConfigFile(c Config) error { @@ -159,6 +160,10 @@ func writeInternalConfigFile(c Config) error { return fmt.Errorf("failed to close file: %s", err) } + if os.Getenv("UNIT_TESTING") != "" { + return nil + } + if err := utils.SetFileOwnership(c.InternalConfigFile(), "postgres"); err != nil { return fmt.Errorf("failed to set file ownership on %s: %s", c.InternalConfigFile(), err) } @@ -186,6 +191,10 @@ func writeUserConfigFile(c Config) error { return fmt.Errorf("failed to close file: %s", err) } + if os.Getenv("UNIT_TESTING") != "" { + return nil + } + if err := utils.SetFileOwnership(c.UserConfigFile(), "postgres"); err != nil { return fmt.Errorf("failed to set file ownership on %s: %s", c.UserConfigFile(), err) } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 3f77f189..f137ba11 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -32,6 +32,7 @@ type Node struct { PGConfig PGConfig RepMgr RepMgr FlyConfig FlyPGConfig + Barman Barman } func NewNode() (*Node, error) { @@ -109,6 +110,11 @@ func NewNode() (*Node, error) { repmgrDatabase: node.RepMgr.DatabaseName, } + if os.Getenv("BARMAN_ENABLED") != "" { + // Specific PG configuration is injected when Barman is enabled. + node.PGConfig.barmanConfigPath = DefaultBarmanConfigDir + } + node.FlyConfig = FlyPGConfig{ internalConfigFilePath: "/data/flypg.internal.conf", userConfigFilePath: "/data/flypg.user.conf", diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index aca41af0..817b5ce1 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -26,6 +26,8 @@ type PGConfig struct { Port int DataDir string + barmanConfigPath string + passwordFilePath string repmgrUsername string repmgrDatabase string @@ -111,7 +113,7 @@ func (c *PGConfig) Print(w io.Writer) error { return e.Encode(cfg) } -func (c *PGConfig) SetDefaults() error { +func (c *PGConfig) SetDefaults(store *state.Store) error { // The default wal_segment_size in mb const walSegmentSize = 16 @@ -174,20 +176,19 @@ func (c *PGConfig) SetDefaults() error { } if os.Getenv("BARMAN_ENABLED") != "" { - configURL := os.Getenv("BARMAN_ENABLED") + barman, err := NewBarman(store, os.Getenv("BARMAN_ENABLED"), DefaultAuthProfile) + if err != nil { + return fmt.Errorf("failed to initialize barman instance: %s", err) + } - barman, err := NewBarman(configURL, "barman") - switch { - case err != nil: - log.Printf("[WARN] Failed to initialize barman: %s", err) - default: - c.internalConfig["archive_mode"] = "on" - c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) - // This controls the minimum frequency WAL files are archived to barman - // in the event there is database activity. 
- // TODO - Make this configurable - c.internalConfig["archive_timeout"] = "60s" + if err := barman.LoadConfig(c.barmanConfigPath); err != nil { + return err } + + c.internalConfig["archive_mode"] = "on" + c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) + c.internalConfig["archive_timeout"] = barman.Settings.ArchiveTimeout + } else { c.internalConfig["archive_mode"] = "off" } @@ -198,6 +199,7 @@ func (c *PGConfig) SetDefaults() error { return err } + // Set restore command and associated recovery target settings c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction @@ -271,7 +273,7 @@ func (c *PGConfig) initialize(store *state.Store) error { } } - if err := c.SetDefaults(); err != nil { + if err := c.SetDefaults(store); err != nil { return fmt.Errorf("failed to set pg defaults: %s", err) } diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index d9a33150..b710ab46 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -16,6 +16,7 @@ const ( pgInternalConfigFilePath = "./test_results/postgresql.internal.conf" pgUserConfigFilePath = "./test_results/postgresql.user.conf" pgPasswordFilePath = "./test_results/default_password" + barmanConfigDir = "./test_results/barman" pgHBAFilePath = "./test_results/pg_hba.conf" ) @@ -33,6 +34,7 @@ func TestPGConfigInitialization(t *testing.T) { InternalConfigFilePath: pgInternalConfigFilePath, UserConfigFilePath: pgUserConfigFilePath, passwordFilePath: pgPasswordFilePath, + barmanConfigPath: barmanConfigDir, } if err := stubPGConfigFile(); err != nil { @@ -118,7 +120,6 @@ func TestPGConfigInitialization(t *testing.T) { t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() - if err := pgConf.initialize(store); err != nil { t.Fatal(err) } @@ -132,10 +133,15 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - barman, err := NewBarman(os.Getenv("BARMAN_ENABLED"), "barman") + barman, err := NewBarman(store, os.Getenv("BARMAN_ENABLED"), DefaultAuthProfile) if err != nil { t.Fatal(err) } + + if err := barman.LoadConfig(testBarmanConfigDir); err != nil { + t.Fatal(err) + } + expected := fmt.Sprintf("'%s'", barman.walArchiveCommand()) if cfg["archive_command"] != expected { t.Fatalf("expected %s, got %s", expected, cfg["archive_command"]) From eba8aa78bb2d7617db67992670198ac81fd61b75 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 09:34:42 -0500 Subject: [PATCH 30/51] Removing unnecessary change --- internal/flypg/node.go | 12 +++++------- internal/flypg/pg.go | 2 -- internal/flypg/pg_test.go | 2 -- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/internal/flypg/node.go b/internal/flypg/node.go index f137ba11..62722848 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -32,7 +32,6 @@ type Node struct { PGConfig PGConfig RepMgr RepMgr FlyConfig FlyPGConfig - Barman Barman } func NewNode() (*Node, error) { @@ -98,7 +97,6 @@ func NewNode() (*Node, error) { node.RepMgr.Witness = present node.PGConfig = PGConfig{ - AppName: node.AppName, DataDir: node.DataDir, Port: node.Port, ConfigFilePath: fmt.Sprintf("%s/postgresql.conf", node.DataDir), @@ -143,17 +141,17 @@ func (n *Node) Init(ctx context.Context) error { } } - store, err := state.NewStore() - if err != nil { - return 
fmt.Errorf("failed initialize cluster state store: %s", err) - } - if os.Getenv("BARMAN_ENABLED") != "" || os.Getenv("BARMAN_REMOTE_RESTORE") != "" { if err := writeS3Credentials(ctx, s3AuthDir); err != nil { return fmt.Errorf("failed to write s3 credentials: %s", err) } } + store, err := state.NewStore() + if err != nil { + return fmt.Errorf("failed initialize cluster state store: %s", err) + } + // Remote point-in-time restore. if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { // TODO - Probably not safe to use the same lock as the snapshot restore. diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 817b5ce1..d846cd65 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -19,7 +19,6 @@ import ( ) type PGConfig struct { - AppName string ConfigFilePath string InternalConfigFilePath string UserConfigFilePath string @@ -160,7 +159,6 @@ func (c *PGConfig) SetDefaults(store *state.Store) error { } c.internalConfig = ConfigMap{ - "cluster_name": c.AppName, "random_page_cost": "1.1", "port": c.Port, "shared_buffers": fmt.Sprintf("%dMB", sharedBuffersMb), diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index b710ab46..e56457f0 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -350,7 +350,6 @@ func TestPGDefaultPassword(t *testing.T) { defer cleanup() pgConf := &PGConfig{ - AppName: "my-app", DataDir: pgTestDirectory, Port: 5433, ConfigFilePath: pgConfigFilePath, @@ -405,7 +404,6 @@ func TestValidateCompatibility(t *testing.T) { defer cleanup() pgConf := &PGConfig{ - AppName: "my-app", DataDir: pgTestDirectory, Port: 5433, ConfigFilePath: pgConfigFilePath, From 644c0d122ff8eaf9383756c66a716cf236fd583d Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 09:41:52 -0500 Subject: [PATCH 31/51] Cleanup --- cmd/start/main.go | 5 ----- internal/flypg/node.go | 4 ++++ internal/flypg/pg.go | 2 +- internal/utils/shell.go | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/cmd/start/main.go b/cmd/start/main.go index 7a438e3f..b3cb3fbf 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -55,11 +55,6 @@ func main() { return } - // TODO - We need to find a better way to do this. 
- if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { - panicHandler(err) - } - node, err := flypg.NewNode() if err != nil { panicHandler(err) diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 62722848..3e40fb17 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -145,6 +145,10 @@ func (n *Node) Init(ctx context.Context) error { if err := writeS3Credentials(ctx, s3AuthDir); err != nil { return fmt.Errorf("failed to write s3 credentials: %s", err) } + + if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { + return fmt.Errorf("failed to set aws credentials path: %s", err) + } } store, err := state.NewStore() diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index d846cd65..eea01d03 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -180,7 +180,7 @@ func (c *PGConfig) SetDefaults(store *state.Store) error { } if err := barman.LoadConfig(c.barmanConfigPath); err != nil { - return err + return fmt.Errorf("failed to load barman config: %s", err) } c.internalConfig["archive_mode"] = "on" diff --git a/internal/utils/shell.go b/internal/utils/shell.go index 253d20d4..29f2cac2 100644 --- a/internal/utils/shell.go +++ b/internal/utils/shell.go @@ -70,7 +70,6 @@ func RunCommand(cmdStr, usr string) ([]byte, error) { } return stdoutBuf.Bytes(), err - } func SetFileOwnership(pathToFile, owner string) error { From d19644cdbff779e562fb1f824098a98735fc0f07 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 11:01:11 -0500 Subject: [PATCH 32/51] Adding tests --- internal/api/handle_admin.go | 2 +- internal/flypg/barman_config.go | 29 +++++----- internal/flypg/barman_config_test.go | 80 ++++++++++++++++++++++++++++ internal/flypg/barman_restore.go | 2 +- 4 files changed, 98 insertions(+), 15 deletions(-) create mode 100644 internal/flypg/barman_config_test.go diff --git a/internal/api/handle_admin.go b/internal/api/handle_admin.go index ab8c5e31..08e504cc 100644 --- a/internal/api/handle_admin.go +++ b/internal/api/handle_admin.go @@ -300,7 +300,7 @@ func handleViewRepmgrSettings(w http.ResponseWriter, r *http.Request) { renderJSON(w, resp, http.StatusOK) } -func handleViewBarmanSettings(w http.ResponseWriter, r *http.Request) { +func handleViewBarmanSettings(w http.ResponseWriter, _ *http.Request) { if os.Getenv("BARMAN_ENABLED") == "" { renderErr(w, fmt.Errorf("barman is not enabled")) return diff --git a/internal/flypg/barman_config.go b/internal/flypg/barman_config.go index a5310ced..7e7c6433 100644 --- a/internal/flypg/barman_config.go +++ b/internal/flypg/barman_config.go @@ -4,6 +4,7 @@ import ( "fmt" "log" "os" + "regexp" "strconv" "strings" "time" @@ -118,32 +119,34 @@ func (c *BarmanConfig) Validate(requestedChanges map[string]interface{}) error { } } - // Verify that the values provided are valid for k, v := range requestedChanges { switch k { case "archive_timeout": - if _, ok := v.(string); !ok { + // Ensure that the value is a valid duration + if _, err := time.ParseDuration(v.(string)); err != nil { return fmt.Errorf("invalid value for archive_timeout: %v", v) } - // TODO - validate - case "recovery_window": - if _, ok := v.(string); !ok { + // Ensure that the value is a valid duration + re := regexp.MustCompile(`^(\d+)([dwy])$`) + matches := re.FindStringSubmatch(v.(string)) + if len(matches) != 3 { return fmt.Errorf("invalid value for recovery_window: %v", v) } - // TODO - validate + + fmt.Println(matches) + log.Printf("matches: %v\n", matches) + + _, 
err := strconv.Atoi(matches[1]) + if err != nil { + return fmt.Errorf("failed to parse recovery_window: %w", err) + } case "full_backup_frequency": - if _, ok := v.(string); !ok { + if _, err := time.ParseDuration(v.(string)); err != nil { return fmt.Errorf("invalid value for full_backup_frequency: %v", v) } - // TODO - validate case "minimum_redundancy": - if _, ok := v.(string); !ok { - return fmt.Errorf("invalid value for minimum_redundancy: %v", v) - } - - // Ensure that the value is a non-negative integer val, err := strconv.Atoi(v.(string)) if err != nil { return fmt.Errorf("invalid value for minimum_redundancy: %v", v) diff --git a/internal/flypg/barman_config_test.go b/internal/flypg/barman_config_test.go new file mode 100644 index 00000000..1b2aa862 --- /dev/null +++ b/internal/flypg/barman_config_test.go @@ -0,0 +1,80 @@ +package flypg + +import ( + "testing" + + "github.com/fly-apps/postgres-flex/internal/flypg/state" +) + +const ( + barmanConfigTestDir = "./test_results/barman" +) + +func TestValidateBarmanConfig(t *testing.T) { + if err := setup(t); err != nil { + t.Fatal(err) + } + defer cleanup() + + store, _ := state.NewStore() + + b, err := NewBarmanConfig(store, barmanConfigTestDir) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + t.Run("valid-config", func(t *testing.T) { + + conf := ConfigMap{ + "archive_timeout": "120s", + "recovery_window": "7d", + "full_backup_frequency": "24h", + "minimum_redundancy": "3", + } + + if err := b.Validate(conf); err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + + t.Run("invalid-archive-timeout", func(t *testing.T) { + conf := ConfigMap{ + "archive_timeout": "120seconds", + } + + if err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } + }) + + t.Run("invalid-recovery-window", func(t *testing.T) { + conf := ConfigMap{ + "recovery_window": "10seconds", + } + + if err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } + }) + + t.Run("invalid-full-backup-frequency", func(t *testing.T) { + conf := ConfigMap{ + "full_backup_frequency": "10seconds", + } + + if err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } + }) + + t.Run("invalid-minimum-redundancy", func(t *testing.T) { + conf := ConfigMap{ + "minimum_redundancy": "-1", + } + + if err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } + }) + +} diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 22b8977d..3cf01b81 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -75,7 +75,7 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { return restore, nil } -func (b *BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error { +func (*BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error { // create a copy of the pg_hba.conf file so we can revert back to it when needed. 
if err := backupHBAFile(); err != nil { if os.IsNotExist(err) { From 2ac43272a746664a89e829d775c425a77a8bc646 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 11:25:22 -0500 Subject: [PATCH 33/51] Bug fix --- cmd/start/main.go | 8 ++++++++ internal/flypg/node.go | 4 ---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cmd/start/main.go b/cmd/start/main.go index b3cb3fbf..9dda20d3 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -55,6 +55,14 @@ func main() { return } + // TODO - Find a better way to handle this + if os.Getenv("BARMAN_ENABLED") != "" || os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { + panicHandler(err) + return + } + } + node, err := flypg.NewNode() if err != nil { panicHandler(err) diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 3e40fb17..62722848 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -145,10 +145,6 @@ func (n *Node) Init(ctx context.Context) error { if err := writeS3Credentials(ctx, s3AuthDir); err != nil { return fmt.Errorf("failed to write s3 credentials: %s", err) } - - if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { - return fmt.Errorf("failed to set aws credentials path: %s", err) - } } store, err := state.NewStore() From 6284d767cac84a94808f8e51de2b8f28c908563a Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 11:56:09 -0500 Subject: [PATCH 34/51] Wait for PG to be up before monitoring. Also removing some logging that is unnecessary --- cmd/monitor/main.go | 37 +++++++++++++++++++++----- cmd/monitor/monitor_backup_schedule.go | 6 +---- internal/utils/shell.go | 22 +++++++-------- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index e368ed02..98ee6d36 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -35,8 +35,11 @@ func main() { panic(fmt.Sprintf("failed to reference node: %s\n", err)) } + // Wait for postgres to boot and become accessible. 
+ log.Println("Waiting for Postgres to be ready...") + waitOnPostgres(ctx, node) + // Dead member monitor - log.Println("Monitoring dead members") go func() { if err := monitorDeadMembers(ctx, node); err != nil { panic(err) @@ -59,19 +62,41 @@ func main() { } // Backup scheduler - log.Println("Monitoring backup schedule") - go monitorBackupSchedule(ctx, barman) +\ go monitorBackupSchedule(ctx, barman) // Backup retention monitor - log.Println("Monitoring backup retention") go monitorBackupRetention(ctx, barman) } // Readonly monitor - log.Println("Monitoring cluster state") go monitorClusterState(ctx, node) // Replication slot monitor - log.Println("Monitoring replication slots") monitorReplicationSlots(ctx, node) } + +func waitOnPostgres(ctx context.Context, node *flypg.Node) { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + conn, err := node.NewLocalConnection(ctx, "postgres", node.SUCredentials) + if err != nil { + log.Printf("failed to open local connection: %s", err) + continue + } + defer func() { _ = conn.Close(ctx) }() + + if err := conn.Ping(ctx); err != nil { + log.Printf("failed to ping local connection: %s", err) + continue + } + + return + } + } +} diff --git a/cmd/monitor/monitor_backup_schedule.go b/cmd/monitor/monitor_backup_schedule.go index 6a0bce09..e2b2d56f 100644 --- a/cmd/monitor/monitor_backup_schedule.go +++ b/cmd/monitor/monitor_backup_schedule.go @@ -16,8 +16,6 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { log.Printf("Failed to resolve the last backup taken: %s", err) } - // TODO - Wait for Postgres to be ready before proceeding. - // Ensure we have a least one backup before proceeding. if lastBackupTime.IsZero() { log.Println("No backups found! Performing the initial base backup.") @@ -30,8 +28,6 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { lastBackupTime = time.Now() } - log.Printf("Last backup taken at: %s", lastBackupTime) - // Calculate the time until the next backup is due. 
timeUntilNextBackup := time.Until(lastBackupTime.Add(defaultFullBackupSchedule)) @@ -46,7 +42,7 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { timeUntilNextBackup = defaultFullBackupSchedule } - log.Printf("Next backup due in: %s", timeUntilNextBackup) + log.Printf("Next full backup due in: %s", timeUntilNextBackup) ticker := time.NewTicker(timeUntilNextBackup) defer ticker.Stop() diff --git a/internal/utils/shell.go b/internal/utils/shell.go index 29f2cac2..0241f58e 100644 --- a/internal/utils/shell.go +++ b/internal/utils/shell.go @@ -13,8 +13,6 @@ import ( "syscall" ) -// TODO - RunCommand needs a context - func RunCmd(ctx context.Context, usr string, name string, args ...string) ([]byte, error) { uid, gid, err := SystemUserIDs(usr) if err != nil { @@ -27,20 +25,22 @@ func RunCmd(ctx context.Context, usr string, name string, args ...string) ([]byt if os.Getenv("DEBUG") != "" { log.Printf("> Running command as %s: %s\n", usr, cmd.String()) - } - var stdoutBuf, stderrBuf bytes.Buffer - cmd.Stdout = io.MultiWriter(os.Stdout, &stdoutBuf) - cmd.Stderr = io.MultiWriter(os.Stderr, &stderrBuf) + var stdoutBuf, stderrBuf bytes.Buffer + cmd.Stdout = io.MultiWriter(os.Stdout, &stdoutBuf) + cmd.Stderr = io.MultiWriter(os.Stderr, &stderrBuf) - err = cmd.Run() - if err != nil { - if ee, ok := err.(*exec.ExitError); ok { - ee.Stderr = stderrBuf.Bytes() + err = cmd.Run() + if err != nil { + if ee, ok := err.(*exec.ExitError); ok { + ee.Stderr = stderrBuf.Bytes() + } } + + return stdoutBuf.Bytes(), err } - return stdoutBuf.Bytes(), err + return cmd.Output() } // Deprecated, use RunCmd instead From 6a76b7b2cd89e5566f30b9fc8243190e43d4cef5 Mon Sep 17 00:00:00 2001 From: Ben Iofel Date: Tue, 2 Jul 2024 13:00:19 -0400 Subject: [PATCH 35/51] Fix compile error --- cmd/monitor/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 98ee6d36..6993cc27 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -62,7 +62,7 @@ func main() { } // Backup scheduler -\ go monitorBackupSchedule(ctx, barman) + go monitorBackupSchedule(ctx, barman) // Backup retention monitor go monitorBackupRetention(ctx, barman) From c37df405b96efb41082d0895443ba3f547bfd8b3 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 12:04:29 -0500 Subject: [PATCH 36/51] Adding log indicating the monitor is starting --- cmd/monitor/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 6993cc27..67afc927 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -38,6 +38,7 @@ func main() { // Wait for postgres to boot and become accessible. log.Println("Waiting for Postgres to be ready...") waitOnPostgres(ctx, node) + log.Println("Postgres is ready to accept connections. 
Starting monitor...") // Dead member monitor go func() { From b7802e3f989bc693b24ff8a8300b7fe098005813 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 12:29:42 -0500 Subject: [PATCH 37/51] Rename the env vars to something more appropriate --- cmd/monitor/main.go | 4 ++-- cmd/start/main.go | 2 +- internal/api/handle_admin.go | 8 ++++---- internal/flypg/barman_restore_test.go | 10 +++++----- internal/flypg/barman_test.go | 6 +++--- internal/flypg/node.go | 8 ++++---- internal/flypg/pg.go | 8 ++++---- internal/flypg/pg_test.go | 16 ++++++++-------- internal/flypg/s3_credentials.go | 8 ++++---- internal/flypg/s3_credentials_test.go | 8 ++++---- 10 files changed, 39 insertions(+), 39 deletions(-) diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go index 67afc927..a11f48b7 100644 --- a/cmd/monitor/main.go +++ b/cmd/monitor/main.go @@ -47,13 +47,13 @@ func main() { } }() - if os.Getenv("BARMAN_ENABLED") != "" { + if os.Getenv("S3_ARCHIVE_CONFIG") != "" { store, err := state.NewStore() if err != nil { panic(fmt.Errorf("failed initialize cluster state store: %s", err)) } - barman, err := flypg.NewBarman(store, os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + barman, err := flypg.NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), flypg.DefaultAuthProfile) if err != nil { panic(err) } diff --git a/cmd/start/main.go b/cmd/start/main.go index 9dda20d3..e9b7005f 100644 --- a/cmd/start/main.go +++ b/cmd/start/main.go @@ -56,7 +56,7 @@ func main() { } // TODO - Find a better way to handle this - if os.Getenv("BARMAN_ENABLED") != "" || os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + if os.Getenv("S3_ARCHIVE_CONFIG") != "" || os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { if err := os.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/data/.aws/credentials"); err != nil { panicHandler(err) return diff --git a/internal/api/handle_admin.go b/internal/api/handle_admin.go index 08e504cc..7148dcf1 100644 --- a/internal/api/handle_admin.go +++ b/internal/api/handle_admin.go @@ -301,7 +301,7 @@ func handleViewRepmgrSettings(w http.ResponseWriter, r *http.Request) { } func handleViewBarmanSettings(w http.ResponseWriter, _ *http.Request) { - if os.Getenv("BARMAN_ENABLED") == "" { + if os.Getenv("S3_ARCHIVE_CONFIG") == "" { renderErr(w, fmt.Errorf("barman is not enabled")) return } @@ -312,7 +312,7 @@ func handleViewBarmanSettings(w http.ResponseWriter, _ *http.Request) { return } - barman, err := flypg.NewBarman(store, os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + barman, err := flypg.NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), flypg.DefaultAuthProfile) if err != nil { renderErr(w, err) return @@ -334,7 +334,7 @@ func handleViewBarmanSettings(w http.ResponseWriter, _ *http.Request) { } func handleUpdateBarmanSettings(w http.ResponseWriter, r *http.Request) { - if os.Getenv("BARMAN_ENABLED") == "" { + if os.Getenv("S3_ARCHIVE_CONFIG") == "" { renderErr(w, fmt.Errorf("barman is not enabled")) return } @@ -345,7 +345,7 @@ func handleUpdateBarmanSettings(w http.ResponseWriter, r *http.Request) { return } - barman, err := flypg.NewBarman(store, os.Getenv("BARMAN_ENABLED"), flypg.DefaultAuthProfile) + barman, err := flypg.NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), flypg.DefaultAuthProfile) if err != nil { renderErr(w, err) return diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index 4f6faa00..3b83aeff 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -109,7 +109,7 @@ const backupsResponse = 
`{ func TestNewBarmanRestore(t *testing.T) { setRestoreDefaultEnv(t) - restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) if err != nil { t.Fatalf("NewBarmanRestore failed with: %v", err) } @@ -154,7 +154,7 @@ func TestWALRestoreCommand(t *testing.T) { defer cleanup() setRestoreDefaultEnv(t) - restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -175,7 +175,7 @@ func TestParseBackups(t *testing.T) { t.Run("parseBackups", func(t *testing.T) { setRestoreDefaultEnv(t) - restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) if err != nil { t.Fatalf("NewBarmanRestore failed with: %v", err) } @@ -226,7 +226,7 @@ func TestResolveBackupTarget(t *testing.T) { setRestoreDefaultEnv(t) - restore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) if err != nil { t.Fatalf("NewBarmanRestore failed with: %v", err) } @@ -286,6 +286,6 @@ func TestResolveBackupTarget(t *testing.T) { } func setRestoreDefaultEnv(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index a107d651..89a49b02 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -15,14 +15,14 @@ func TestNewBarman(t *testing.T) { if err := setup(t); err != nil { t.Fatal(err) } - // defer cleanup() + defer cleanup() store, _ := state.NewStore() t.Run("defaults", func(t *testing.T) { setDefaultEnv(t) - configURL := os.Getenv("BARMAN_ENABLED") + configURL := os.Getenv("S3_ARCHIVE_CONFIG") barman, err := NewBarman(store, configURL, DefaultAuthProfile) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -76,7 +76,7 @@ func TestNewBarman(t *testing.T) { } func setDefaultEnv(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("S3_ARCHIVE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 62722848..376dced1 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -108,7 +108,7 @@ func NewNode() (*Node, error) { repmgrDatabase: node.RepMgr.DatabaseName, } - if os.Getenv("BARMAN_ENABLED") != "" { + if os.Getenv("S3_ARCHIVE_CONFIG") != "" { // Specific PG configuration is injected when Barman is enabled. 
node.PGConfig.barmanConfigPath = DefaultBarmanConfigDir } @@ -141,7 +141,7 @@ func (n *Node) Init(ctx context.Context) error { } } - if os.Getenv("BARMAN_ENABLED") != "" || os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + if os.Getenv("S3_ARCHIVE_CONFIG") != "" || os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { if err := writeS3Credentials(ctx, s3AuthDir); err != nil { return fmt.Errorf("failed to write s3 credentials: %s", err) } @@ -153,7 +153,7 @@ func (n *Node) Init(ctx context.Context) error { } // Remote point-in-time restore. - if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { + if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { // TODO - Probably not safe to use the same lock as the snapshot restore. active, err := isRestoreActive() if err != nil { @@ -161,7 +161,7 @@ func (n *Node) Init(ctx context.Context) error { } if active { - configURL := os.Getenv("BARMAN_REMOTE_RESTORE") + configURL := os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") restore, err := NewBarmanRestore(configURL) if err != nil { return fmt.Errorf("failed to initialize barman restore: %s", err) diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index eea01d03..601ae289 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -173,8 +173,8 @@ func (c *PGConfig) SetDefaults(store *state.Store) error { "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), } - if os.Getenv("BARMAN_ENABLED") != "" { - barman, err := NewBarman(store, os.Getenv("BARMAN_ENABLED"), DefaultAuthProfile) + if os.Getenv("S3_ARCHIVE_CONFIG") != "" { + barman, err := NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), DefaultAuthProfile) if err != nil { return fmt.Errorf("failed to initialize barman instance: %s", err) } @@ -191,8 +191,8 @@ func (c *PGConfig) SetDefaults(store *state.Store) error { c.internalConfig["archive_mode"] = "off" } - if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { - barmanRestore, err := NewBarmanRestore(os.Getenv("BARMAN_REMOTE_RESTORE")) + if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { + barmanRestore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) if err != nil { return err } diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index e56457f0..17f5d15d 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -117,7 +117,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("barman-enabled", func(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("S3_ARCHIVE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { @@ -133,7 +133,7 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - barman, err := NewBarman(store, os.Getenv("BARMAN_ENABLED"), DefaultAuthProfile) + barman, err := NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), DefaultAuthProfile) if err != nil { t.Fatal(err) } @@ -149,7 +149,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("barman-disabled", func(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("S3_ARCHIVE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { @@ -165,7 +165,7 @@ func TestPGConfigInitialization(t *testing.T) { 
t.Fatalf("expected archive_mode to be on, got %v", cfg["archive_mode"]) } - t.Setenv("BARMAN_ENABLED", "") + t.Setenv("S3_ARCHIVE_CONFIG", "") if err := pgConf.initialize(store); err != nil { t.Fatal(err) @@ -182,7 +182,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("barman-restore-from-time", func(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { @@ -200,7 +200,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("barman-restore-from-name", func(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetName=20240626T172443") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetName=20240626T172443") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { @@ -218,7 +218,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("barman-restore-from-target", func(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?target=immediate") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?target=immediate") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { @@ -236,7 +236,7 @@ func TestPGConfigInitialization(t *testing.T) { }) t.Run("barman-restore-from-target-time-non-inclusive", func(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetInclusive=false") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetInclusive=false") store, _ := state.NewStore() if err := pgConf.initialize(store); err != nil { diff --git a/internal/flypg/s3_credentials.go b/internal/flypg/s3_credentials.go index a61e8492..3b65da24 100644 --- a/internal/flypg/s3_credentials.go +++ b/internal/flypg/s3_credentials.go @@ -22,16 +22,16 @@ type s3Credentials struct { func writeS3Credentials(ctx context.Context, s3AuthDir string) error { var creds []*s3Credentials - if os.Getenv("BARMAN_ENABLED") != "" { - cred, err := parseCredentialsFromConfigURL(os.Getenv("BARMAN_ENABLED"), DefaultAuthProfile) + if os.Getenv("S3_ARCHIVE_CONFIG") != "" { + cred, err := parseCredentialsFromConfigURL(os.Getenv("S3_ARCHIVE_CONFIG"), DefaultAuthProfile) if err != nil { return fmt.Errorf("failed to parse credentials from barman configURL: %v", err) } creds = append(creds, cred) } - if os.Getenv("BARMAN_REMOTE_RESTORE") != "" { - cred, err := parseCredentialsFromConfigURL(os.Getenv("BARMAN_REMOTE_RESTORE"), RestoreAuthProfile) + if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { + cred, err := parseCredentialsFromConfigURL(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG"), RestoreAuthProfile) if err != nil { return fmt.Errorf("failed to parse credentials from barman restore configURL: %v", err) } diff --git a/internal/flypg/s3_credentials_test.go b/internal/flypg/s3_credentials_test.go index 2718ece0..68ea5f7e 100644 --- 
a/internal/flypg/s3_credentials_test.go +++ b/internal/flypg/s3_credentials_test.go @@ -19,7 +19,7 @@ func TestWriteAWSCredentials(t *testing.T) { pathToCredentials := fmt.Sprintf("%s/credentials", authDir) t.Run("write-barman-credentials", func(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("S3_ARCHIVE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") if err := writeS3Credentials(context.TODO(), authDir); err != nil { t.Fatalf("unexpected error: %v", err) @@ -43,7 +43,7 @@ func TestWriteAWSCredentials(t *testing.T) { }) t.Run("write-restore-credentials", func(t *testing.T) { - t.Setenv("BARMAN_REMOTE_RESTORE", "https://source-key:source-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://source-key:source-secret@fly.storage.tigris.dev/my-bucket/my-directory") if err := writeS3Credentials(context.TODO(), authDir); err != nil { t.Fatalf("unexpected error: %v", err) @@ -67,8 +67,8 @@ func TestWriteAWSCredentials(t *testing.T) { }) t.Run("write-barman-and-restore-credentials", func(t *testing.T) { - t.Setenv("BARMAN_ENABLED", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") - t.Setenv("BARMAN_REMOTE_RESTORE", "https://source-key:source-secret@fly.storage.tigris.dev/source-bucket/source-directory") + t.Setenv("S3_ARCHIVE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://source-key:source-secret@fly.storage.tigris.dev/source-bucket/source-directory") if err := writeS3Credentials(context.TODO(), authDir); err != nil { t.Fatalf("unexpected error: %v", err) From 06c4695d7e909a65672742d1f9979d8d87dce5b3 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 13:31:09 -0500 Subject: [PATCH 38/51] Cleanup --- internal/flypg/pg.go | 84 +++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 601ae289..fc8f37bb 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -173,48 +173,66 @@ func (c *PGConfig) SetDefaults(store *state.Store) error { "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), } - if os.Getenv("S3_ARCHIVE_CONFIG") != "" { - barman, err := NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), DefaultAuthProfile) - if err != nil { - return fmt.Errorf("failed to initialize barman instance: %s", err) - } + if err := c.setArchiveConfig(os.Getenv("S3_ARCHIVE_CONFIG"), store); err != nil { + return fmt.Errorf("failed to set archive config: %s", err) + } - if err := barman.LoadConfig(c.barmanConfigPath); err != nil { - return fmt.Errorf("failed to load barman config: %s", err) - } + if err := c.setRecoveryTargetConfig(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")); err != nil { + return fmt.Errorf("failed to set recovery target config: %s", err) + } - c.internalConfig["archive_mode"] = "on" - c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) - c.internalConfig["archive_timeout"] = barman.Settings.ArchiveTimeout + return nil +} - } else { +func (c *PGConfig) setArchiveConfig(configURL string, store *state.Store) error { + if configURL == "" { c.internalConfig["archive_mode"] = "off" + return nil } - if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { - barmanRestore, err := 
NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) - if err != nil { - return err - } + barman, err := NewBarman(store, configURL, DefaultAuthProfile) + if err != nil { + return fmt.Errorf("failed to initialize barman instance: %s", err) + } + + if err := barman.LoadConfig(c.barmanConfigPath); err != nil { + return fmt.Errorf("failed to load barman config: %s", err) + } - // Set restore command and associated recovery target settings - c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) - c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction + c.internalConfig["archive_mode"] = "on" + c.internalConfig["archive_command"] = fmt.Sprintf("'%s'", barman.walArchiveCommand()) + c.internalConfig["archive_timeout"] = barman.Settings.ArchiveTimeout - if barmanRestore.recoveryTargetInclusive != "" { - c.internalConfig["recovery_target_inclusive"] = barmanRestore.recoveryTargetInclusive - } + return nil +} - switch { - case barmanRestore.recoveryTarget != "": - c.internalConfig["recovery_target"] = barmanRestore.recoveryTarget - case barmanRestore.recoveryTargetName != "": - c.internalConfig["recovery_target_name"] = fmt.Sprintf("barman_%s", barmanRestore.recoveryTargetName) - case barmanRestore.recoveryTargetTime != "": - c.internalConfig["recovery_target_time"] = fmt.Sprintf("'%s'", barmanRestore.recoveryTargetTime) - default: - return errors.New("recovery target name or time must be specified") - } +func (c *PGConfig) setRecoveryTargetConfig(configURL string) error { + if configURL == "" { + return nil + } + + barmanRestore, err := NewBarmanRestore(configURL) + if err != nil { + return err + } + + // Set restore command and associated recovery target settings + c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) + c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction + + if barmanRestore.recoveryTargetInclusive != "" { + c.internalConfig["recovery_target_inclusive"] = barmanRestore.recoveryTargetInclusive + } + + switch { + case barmanRestore.recoveryTarget != "": + c.internalConfig["recovery_target"] = barmanRestore.recoveryTarget + case barmanRestore.recoveryTargetName != "": + c.internalConfig["recovery_target_name"] = fmt.Sprintf("barman_%s", barmanRestore.recoveryTargetName) + case barmanRestore.recoveryTargetTime != "": + c.internalConfig["recovery_target_time"] = fmt.Sprintf("'%s'", barmanRestore.recoveryTargetTime) + default: + return fmt.Errorf("recovery target name or time must be specified") } return nil From 2029c2c2ae9c5685a952c111cc0cb50d8f65d02a Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 14:35:10 -0500 Subject: [PATCH 39/51] Fix that ensures the user configured full_backup_frequency is honored --- cmd/monitor/monitor_backup_schedule.go | 23 +++++++++++++++++++---- internal/flypg/barman_config.go | 9 ++------- internal/flypg/barman_config_test.go | 24 ++++++++++++++++++++++++ internal/flypg/barman_test.go | 2 +- 4 files changed, 46 insertions(+), 12 deletions(-) diff --git a/cmd/monitor/monitor_backup_schedule.go b/cmd/monitor/monitor_backup_schedule.go index e2b2d56f..dbb9c832 100644 --- a/cmd/monitor/monitor_backup_schedule.go +++ b/cmd/monitor/monitor_backup_schedule.go @@ -16,20 +16,35 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { log.Printf("Failed to resolve the last backup taken: %s", err) } + fullBackupSchedule := defaultFullBackupSchedule + + // Set the full backup schedule if it 
is defined in the configuration. + if barman.Settings.FullBackupFrequency != "" { + fullBackupDur, err := time.ParseDuration(barman.Settings.FullBackupFrequency) + switch { + case err != nil: + log.Printf("Failed to parse full backup frequency: %s", err) + default: + fullBackupSchedule = fullBackupDur + } + } + // Ensure we have a least one backup before proceeding. if lastBackupTime.IsZero() { log.Println("No backups found! Performing the initial base backup.") if err := performInitialBaseBackup(ctx, barman); err != nil { log.Printf("Failed to perform the initial full backup: %s", err) - log.Printf("Backup scheduler will reattempt in 24 hours.") + log.Printf("Backup scheduler will re-attempt in %s.", fullBackupSchedule) } lastBackupTime = time.Now() } + log.Printf("Full backup schedule set to: %s", fullBackupSchedule) + // Calculate the time until the next backup is due. - timeUntilNextBackup := time.Until(lastBackupTime.Add(defaultFullBackupSchedule)) + timeUntilNextBackup := time.Until(lastBackupTime.Add(fullBackupSchedule)) // Perform backup immediately if the time until the next backup is negative. if timeUntilNextBackup < 0 { @@ -39,7 +54,7 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { log.Printf("Full backup failed with: %s", err) } - timeUntilNextBackup = defaultFullBackupSchedule + timeUntilNextBackup = fullBackupSchedule } log.Printf("Next full backup due in: %s", timeUntilNextBackup) @@ -69,7 +84,7 @@ func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { } log.Printf("Full backup completed successfully") - ticker.Reset(defaultFullBackupSchedule) + ticker.Reset(fullBackupSchedule) } } } diff --git a/internal/flypg/barman_config.go b/internal/flypg/barman_config.go index 7e7c6433..89a36c57 100644 --- a/internal/flypg/barman_config.go +++ b/internal/flypg/barman_config.go @@ -95,18 +95,13 @@ func (c *BarmanConfig) ParseSettings() (BarmanSettings, error) { return BarmanSettings{}, fmt.Errorf("failed to read current config: %s", err) } - backupFrequency, err := time.ParseDuration(cfg["full_backup_frequency"].(string)) - if err != nil { - return BarmanSettings{}, fmt.Errorf("failed to parse full backup frequency: %s", err) - } - recoveryWindow := fmt.Sprintf("RECOVERY WINDOW OF %s", convertRecoveryWindowDuration(cfg["recovery_window"].(string))) return BarmanSettings{ ArchiveTimeout: cfg["archive_timeout"].(string), RecoveryWindow: recoveryWindow, - FullBackupFrequency: fmt.Sprint(backupFrequency.Hours()), + FullBackupFrequency: cfg["full_backup_frequency"].(string), MinimumRedundancy: cfg["minimum_redundancy"].(string), }, nil } @@ -163,7 +158,7 @@ func (c *BarmanConfig) Validate(requestedChanges map[string]interface{}) error { func (c *BarmanConfig) initialize(store *state.Store, configDir string) error { // Ensure directory exists - if err := os.MkdirAll(configDir, 0755); err != nil { + if err := os.MkdirAll(configDir, 0600); err != nil { return fmt.Errorf("failed to create barman config directory: %s", err) } diff --git a/internal/flypg/barman_config_test.go b/internal/flypg/barman_config_test.go index 1b2aa862..c834dde5 100644 --- a/internal/flypg/barman_config_test.go +++ b/internal/flypg/barman_config_test.go @@ -76,5 +76,29 @@ func TestValidateBarmanConfig(t *testing.T) { t.Fatalf("expected error, got nil") } }) +} + +func TestBarmanConfigSettings(t *testing.T) { + if err := setup(t); err != nil { + t.Fatal(err) + } + defer cleanup() + + store, _ := state.NewStore() + t.Run("defaults", func(t *testing.T) { + b, err := 
NewBarmanConfig(store, barmanConfigTestDir) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if b.Settings.MinimumRedundancy != "3" { + t.Fatalf("expected minimumRedundancy to be 3, but got %s", b.Settings.MinimumRedundancy) + } + + if b.Settings.FullBackupFrequency != "24h" { + t.Fatalf("expected fullBackupFrequency to be 24h, but got %s", b.Settings.FullBackupFrequency) + } + + }) } diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index 89a49b02..daf8254e 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -65,7 +65,7 @@ func TestNewBarman(t *testing.T) { t.Fatalf("expected recovery_window to be 'RECOVERY WINDOW OF 7 DAYS', but got %s", barman.Settings.RecoveryWindow) } - if barman.Settings.FullBackupFrequency != "24" { + if barman.Settings.FullBackupFrequency != "24h" { t.Fatalf("expected fullBackupFrequency to be 24, but got %s", barman.Settings.FullBackupFrequency) } From 589866bf2782f0d9e2c91fb641cfa225e512c6d6 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 2 Jul 2024 22:22:49 -0500 Subject: [PATCH 40/51] Cleanup --- internal/flypg/barman_config.go | 7 +------ internal/flypg/barman_restore.go | 19 ++++++++++--------- internal/flypg/restore.go | 6 ++---- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/internal/flypg/barman_config.go b/internal/flypg/barman_config.go index 89a36c57..437ad00a 100644 --- a/internal/flypg/barman_config.go +++ b/internal/flypg/barman_config.go @@ -129,14 +129,9 @@ func (c *BarmanConfig) Validate(requestedChanges map[string]interface{}) error { return fmt.Errorf("invalid value for recovery_window: %v", v) } - fmt.Println(matches) - log.Printf("matches: %v\n", matches) - - _, err := strconv.Atoi(matches[1]) - if err != nil { + if _, err := strconv.Atoi(matches[1]); err != nil { return fmt.Errorf("failed to parse recovery_window: %w", err) } - case "full_backup_frequency": if _, err := time.ParseDuration(v.(string)); err != nil { return fmt.Errorf("invalid value for full_backup_frequency: %v", v) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 3cf01b81..4e091e1c 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -44,21 +44,23 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { } for key, value := range url.Query() { + v := value[0] + switch key { case "target": - restore.recoveryTarget = value[0] + restore.recoveryTarget = v case "targetName": - restore.recoveryTargetName = value[0] + restore.recoveryTargetName = v + case "targetInclusive": + restore.recoveryTargetInclusive = v + case "targetAction": + restore.recoveryTargetAction = v case "targetTime": - ts, err := time.Parse(ISO8601, value[0]) + ts, err := time.Parse(ISO8601, v) if err != nil { return nil, fmt.Errorf("failed to parse target time: %s", err) } restore.recoveryTargetTime = ts.Format(ISO8601) - case "targetInclusive": - restore.recoveryTargetInclusive = value[0] - case "targetAction": - restore.recoveryTargetAction = value[0] default: return nil, fmt.Errorf("unknown query parameter: %s", key) } @@ -113,8 +115,7 @@ func (*BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error { defer func() { _ = conn.Close(ctx) }() // Drop repmgr database to clear any metadata that belonged to the old cluster. 
- _, err = conn.Exec(ctx, "DROP DATABASE repmgr;") - if err != nil { + if _, err = conn.Exec(ctx, "DROP DATABASE repmgr;"); err != nil { return fmt.Errorf("failed to drop repmgr database: %s", err) } diff --git a/internal/flypg/restore.go b/internal/flypg/restore.go index f8e5481c..aa927062 100644 --- a/internal/flypg/restore.go +++ b/internal/flypg/restore.go @@ -159,8 +159,7 @@ func restoreHBAFile() error { defer func() { _ = file.Close() }() // revert back to our original config - _, err = file.Write(data) - if err != nil { + if _, err = file.Write(data); err != nil { return err } @@ -179,8 +178,7 @@ func setRestoreLock() error { } defer func() { _ = file.Close() }() - _, err = file.WriteString(os.Getenv("FLY_APP_NAME")) - if err != nil { + if _, err = file.WriteString(os.Getenv("FLY_APP_NAME")); err != nil { return err } From 5eaa2c204229137f73ee1acb28f21ed58b08cf1c Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 09:58:42 -0500 Subject: [PATCH 41/51] Fixes issue where a failed backup was being selected as a base restore target --- cmd/monitor/monitor_backup_schedule.go | 2 +- internal/flypg/barman.go | 20 +++++++++- internal/flypg/barman_restore.go | 20 ++++++---- internal/flypg/barman_restore_test.go | 55 ++++++++++++++++++++------ 4 files changed, 76 insertions(+), 21 deletions(-) diff --git a/cmd/monitor/monitor_backup_schedule.go b/cmd/monitor/monitor_backup_schedule.go index dbb9c832..f26b7a28 100644 --- a/cmd/monitor/monitor_backup_schedule.go +++ b/cmd/monitor/monitor_backup_schedule.go @@ -11,7 +11,7 @@ import ( func monitorBackupSchedule(ctx context.Context, barman *flypg.Barman) { // Determine when the last backup was taken. - lastBackupTime, err := barman.LastBackupTaken(ctx) + lastBackupTime, err := barman.LastCompletedBackup(ctx) if err != nil { log.Printf("Failed to resolve the last backup taken: %s", err) } diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index fc5601e3..040033b4 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -34,6 +34,7 @@ type Barman struct { } type Backup struct { + Status string `json:"status"` BackupID string `json:"backup_id"` StartTime string `json:"begin_time"` EndTime string `json:"end_time"` @@ -171,8 +172,25 @@ func (b *Barman) WALArchiveDelete(ctx context.Context) ([]byte, error) { return utils.RunCmd(ctx, "postgres", "barman-cloud-backup-delete", args...) 
 }
 
-func (b *Barman) LastBackupTaken(ctx context.Context) (time.Time, error) {
+func (b *Barman) ListCompletedBackups(ctx context.Context) (BackupList, error) {
 	backups, err := b.ListBackups(ctx)
+	if err != nil {
+		return BackupList{}, fmt.Errorf("failed to list backups: %s", err)
+	}
+
+	var completedBackups BackupList
+
+	for _, backup := range backups.Backups {
+		if backup.Status == "DONE" {
+			completedBackups.Backups = append(completedBackups.Backups, backup)
+		}
+	}
+
+	return completedBackups, nil
+}
+
+func (b *Barman) LastCompletedBackup(ctx context.Context) (time.Time, error) {
+	backups, err := b.ListCompletedBackups(ctx)
 	if err != nil {
 		return time.Time{}, fmt.Errorf("failed to list backups: %s", err)
 	}
diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go
index 4e091e1c..4700d2cc 100644
--- a/internal/flypg/barman_restore.go
+++ b/internal/flypg/barman_restore.go
@@ -141,8 +141,7 @@ func (*BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error {
 }
 
 func (b *BarmanRestore) restoreFromBackup(ctx context.Context) error {
-	// Query available backups from object storage
-	backups, err := b.ListBackups(ctx)
+	backups, err := b.ListCompletedBackups(ctx)
 	if err != nil {
 		return fmt.Errorf("failed to list backups: %s", err)
 	}
@@ -159,22 +158,21 @@
 		if err != nil {
 			return fmt.Errorf("failed to resolve backup target by time: %s", err)
 		}
+	case b.recoveryTargetTime != "":
+		backupID, err = b.resolveBackupFromTime(backups, b.recoveryTargetTime)
+		if err != nil {
+			return fmt.Errorf("failed to resolve backup target by time: %s", err)
+		}
 	case b.recoveryTargetName != "":
 		// Resolve the target base backup
 		backupID, err = b.resolveBackupFromID(backups, b.recoveryTargetName)
 		if err != nil {
 			return fmt.Errorf("failed to resolve backup target by id: %s", err)
 		}
-	case b.recoveryTargetTime != "":
-		backupID, err = b.resolveBackupFromTime(backups, b.recoveryTargetTime)
-		if err != nil {
-			return fmt.Errorf("failed to resolve backup target by time: %s", err)
-		}
 	default:
 		return fmt.Errorf("restore target not specified")
 	}
 
-	// TODO - Consider just using the last available backup if the target is not found
 	if backupID == "" {
 		return fmt.Errorf("no backup found")
 	}
@@ -234,6 +232,12 @@ func (*BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr st
 	layout := "Mon Jan 2 15:04:05 2006"
 
 	for _, backup := range backupList.Backups {
+		// Skip backups that failed or are in progress
+		// TODO - This shouldn't be needed, but will keep it around until we can improve tests.
+ if backup.Status != "DONE" { + continue + } + // Parse the backup start time startTime, err := time.Parse(layout, backup.StartTime) if err != nil { diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index 3b83aeff..4333282f 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -8,6 +8,39 @@ import ( const backupsResponse = `{ "backups_list": [ + { + "backup_label": null, + "begin_offset": 216, + "begin_time": "Tue Jun 24 19:44:20 2024", + "begin_wal": "00000005000000000000002A", + "begin_xlog": "0/2A0000D8", + "compression": null, + "config_file": "/data/postgresql/postgresql.conf", + "copy_stats": null, + "deduplicated_size": null, + "end_offset": null, + "end_time": null, + "end_wal": null, + "end_xlog": null, + "error": "failure uploading data (connection already closed)", + "hba_file": "/data/postgresql/pg_hba.conf", + "ident_file": "/data/postgresql/pg_ident.conf", + "included_files": [ + "/data/postgresql/postgresql.auto.conf", + "/data/postgresql/postgresql.internal.conf" + ], + "mode": null, + "pgdata": "/data/postgresql", + "server_name": "cloud", + "size": null, + "status": "FAILED", + "systemid": "7332222271544570189", + "tablespaces": null, + "timeline": 5, + "version": 150006, + "xlog_segment_size": 16777216, + "backup_id": "20240702T210544" + }, { "backup_label": "'START WAL LOCATION: 0/8000028 (file 000000010000000000000008)\\nCHECKPOINT LOCATION: 0/8000098\\nBACKUP METHOD: streamed\\nBACKUP FROM: primary\\nSTART TIME: 2024-06-25 19:44:13 UTC\\nLABEL: Barman backup cloud 20240625T194412\\nSTART TIMELINE: 1\\n'", "backup_name": "test-backup-2", @@ -185,24 +218,28 @@ func TestParseBackups(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(list.Backups) != 2 { + if len(list.Backups) != 3 { t.Fatalf("expected 2 backups, got %d", len(list.Backups)) } firstBackup := list.Backups[0] - if firstBackup.BackupID != "20240625T194412" { + if firstBackup.BackupID != "20240702T210544" { t.Fatalf("expected backup ID to be 20240625T194412, got %s", firstBackup.BackupID) } - if firstBackup.StartTime != "Tue Jun 25 19:44:12 2024" { - t.Fatalf("expected start time to be Tue Jun 25 19:44:12 2024, got %s", firstBackup.StartTime) + if firstBackup.StartTime != "Tue Jun 24 19:44:20 2024" { + t.Fatalf("expected start time to be Tue Jun 24 19:44:20 2024, got %s", firstBackup.StartTime) + } + + if firstBackup.EndTime != "" { + t.Fatalf("expected end time to be empty, but got %s", firstBackup.EndTime) } - if firstBackup.EndTime != "Tue Jun 25 19:44:18 2024" { - t.Fatalf("expected end time to be Tue Jun 25 19:44:18 2024, got %s", firstBackup.EndTime) + if firstBackup.Status != "FAILED" { + t.Fatalf("expected status to be FAILED, got %s", firstBackup.Status) } - secondBackup := list.Backups[1] + secondBackup := list.Backups[2] if secondBackup.BackupID != "20240626T172443" { t.Fatalf("expected backup ID to be 20240626T172443, got %s", secondBackup.BackupID) @@ -258,8 +295,6 @@ func TestResolveBackupTarget(t *testing.T) { } }) - // "begin_time": "Tue Jun 25 19:44:12 2024" - // "end_time": "Tue Jun 25 19:44:18 2024", t.Run("resolve-backup-within-first-window", func(t *testing.T) { backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:15-00:00") if err != nil { @@ -271,8 +306,6 @@ func TestResolveBackupTarget(t *testing.T) { } }) - // "begin_time": "Wed Jun 26 17:24:43 2024", - // "end_time": "Wed Jun 26 17:27:02 2024", t.Run("resolve-backup-within-second-window", func(t *testing.T) { backupID, 
err := restore.resolveBackupFromTime(list, "2024-06-26T17:25:15-00:00") if err != nil { From 5734e7cdee826715f6a5e0e9d39997a9eb2c6ee8 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 10:02:58 -0500 Subject: [PATCH 42/51] Remove support for 'latest' string, since this should be controlled with the restore target --- internal/flypg/barman_restore.go | 11 +++-------- internal/flypg/barman_restore_test.go | 11 ----------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 4700d2cc..6f5e0ad9 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -212,14 +212,9 @@ func (*BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr st var restoreTime time.Time // Parse the restore string - if restoreStr == "latest" { - restoreTime = time.Now() - } else { - var err error - restoreTime, err = time.Parse(ISO8601, restoreStr) - if err != nil { - return "", fmt.Errorf("failed to parse restore time: %s", err) - } + restoreTime, err := time.Parse(ISO8601, restoreStr) + if err != nil { + return "", fmt.Errorf("failed to parse restore time: %s", err) } latestBackupID := "" diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index 4333282f..6cda8993 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -273,17 +273,6 @@ func TestResolveBackupTarget(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - t.Run("resolve-latest-backup-target", func(t *testing.T) { - backupID, err := restore.resolveBackupFromTime(list, "latest") - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if backupID != "20240626T172443" { - t.Fatalf("expected backup ID to be 20240626T172443, got %s", backupID) - } - }) - t.Run("resolve-earliest-backup-target", func(t *testing.T) { backupID, err := restore.resolveBackupFromTime(list, "2024-06-25T19:44:12-00:00") if err != nil { From 010ffb0ec6ebfbb3aa1215a945f61542decf3865 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 10:52:25 -0500 Subject: [PATCH 43/51] Don't allow full_backup_frequencies lower than 1h --- internal/flypg/barman_config.go | 7 ++++++- internal/flypg/barman_config_test.go | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/internal/flypg/barman_config.go b/internal/flypg/barman_config.go index 437ad00a..63f75ebf 100644 --- a/internal/flypg/barman_config.go +++ b/internal/flypg/barman_config.go @@ -133,9 +133,14 @@ func (c *BarmanConfig) Validate(requestedChanges map[string]interface{}) error { return fmt.Errorf("failed to parse recovery_window: %w", err) } case "full_backup_frequency": - if _, err := time.ParseDuration(v.(string)); err != nil { + dur, err := time.ParseDuration(v.(string)) + if err != nil { return fmt.Errorf("invalid value for full_backup_frequency: %v", v) } + + if dur.Hours() < 1 { + return fmt.Errorf("invalid value for full_backup_frequency (expected to be >= 1h, got %v)", dur) + } case "minimum_redundancy": val, err := strconv.Atoi(v.(string)) if err != nil { diff --git a/internal/flypg/barman_config_test.go b/internal/flypg/barman_config_test.go index c834dde5..e3001897 100644 --- a/internal/flypg/barman_config_test.go +++ b/internal/flypg/barman_config_test.go @@ -65,6 +65,14 @@ func TestValidateBarmanConfig(t *testing.T) { if err := b.Validate(conf); err == nil { t.Fatalf("expected error, got nil") } + + conf = ConfigMap{ + "full_backup_frequency": "1m", + } + + if 
err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } }) t.Run("invalid-minimum-redundancy", func(t *testing.T) { From 884a782cd685d61c20a9f6b4e2815701c22d485e Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 10:59:39 -0500 Subject: [PATCH 44/51] Cleaning up the config validations --- internal/flypg/barman_config.go | 8 +++++++- internal/flypg/barman_config_test.go | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/internal/flypg/barman_config.go b/internal/flypg/barman_config.go index 63f75ebf..48fa4c55 100644 --- a/internal/flypg/barman_config.go +++ b/internal/flypg/barman_config.go @@ -129,9 +129,15 @@ func (c *BarmanConfig) Validate(requestedChanges map[string]interface{}) error { return fmt.Errorf("invalid value for recovery_window: %v", v) } - if _, err := strconv.Atoi(matches[1]); err != nil { + num, err := strconv.Atoi(matches[1]) + if err != nil { return fmt.Errorf("failed to parse recovery_window: %w", err) } + + if num < 1 { + return fmt.Errorf("invalid value for recovery_window (expected to be >= 1, got %v)", num) + } + case "full_backup_frequency": dur, err := time.ParseDuration(v.(string)) if err != nil { diff --git a/internal/flypg/barman_config_test.go b/internal/flypg/barman_config_test.go index e3001897..edeaf759 100644 --- a/internal/flypg/barman_config_test.go +++ b/internal/flypg/barman_config_test.go @@ -55,6 +55,21 @@ func TestValidateBarmanConfig(t *testing.T) { if err := b.Validate(conf); err == nil { t.Fatalf("expected error, got nil") } + + conf = ConfigMap{ + "recovery_window": "1m", + } + + if err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } + + conf = ConfigMap{ + "recovery_window": "0w", + } + if err := b.Validate(conf); err == nil { + t.Fatalf("expected error, got nil") + } }) t.Run("invalid-full-backup-frequency", func(t *testing.T) { @@ -73,6 +88,13 @@ func TestValidateBarmanConfig(t *testing.T) { if err := b.Validate(conf); err == nil { t.Fatalf("expected error, got nil") } + + conf = ConfigMap{ + "full_backup_frequency": "0w", + } + if err := b.Validate(conf); err == nil { + t.Fatalf("expected invalid value for recovery_window (expected to be >= 1, got, got 1") + } }) t.Run("invalid-minimum-redundancy", func(t *testing.T) { From 53c1112810ce9526dca94355953779645c2acb04 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 11:06:49 -0500 Subject: [PATCH 45/51] Adding support for targetTimeline --- internal/flypg/barman_restore.go | 3 +++ internal/flypg/barman_restore_test.go | 14 +++++++++++++- internal/flypg/pg.go | 4 ++++ internal/flypg/pg_test.go | 22 ++++++++++++++++++++++ 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 6f5e0ad9..8036a0e7 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -17,6 +17,7 @@ type BarmanRestore struct { recoveryTarget string recoveryTargetName string recoveryTargetTime string + recoveryTargetTimeline string recoveryTargetAction string recoveryTargetInclusive string } @@ -61,6 +62,8 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { return nil, fmt.Errorf("failed to parse target time: %s", err) } restore.recoveryTargetTime = ts.Format(ISO8601) + case "targetTimeline": + restore.recoveryTargetTimeline = v default: return nil, fmt.Errorf("unknown query parameter: %s", key) } diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go 
index 6cda8993..2b635f43 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -178,6 +178,18 @@ func TestNewBarmanRestore(t *testing.T) { if restore.recoveryTargetTime != "2024-06-30T11:15:00-06:00" { t.Fatalf("expected recovery target time to be 2024-06-30T11:15:00-06:00, got %s", restore.recoveryTargetTime) } + + if restore.recoveryTargetTimeline != "2" { + t.Fatalf("expected recovery target timeline to be 2, got %s", restore.recoveryTargetTimeline) + } + + if restore.recoveryTargetInclusive != "false" { + t.Fatalf("expected recovery target inclusive to be false, got %s", restore.recoveryTargetInclusive) + } + + if restore.recoveryTargetAction != "shutdown" { + t.Fatalf("expected recovery target action to be shutdown, got %s", restore.recoveryTargetAction) + } } func TestWALRestoreCommand(t *testing.T) { @@ -308,6 +320,6 @@ func TestResolveBackupTarget(t *testing.T) { } func setRestoreDefaultEnv(t *testing.T) { - t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetTimeline=2&targetInclusive=false&targetAction=shutdown") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index fc8f37bb..350e9bb7 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -220,6 +220,10 @@ func (c *PGConfig) setRecoveryTargetConfig(configURL string) error { c.internalConfig["restore_command"] = fmt.Sprintf("'%s'", barmanRestore.walRestoreCommand()) c.internalConfig["recovery_target_action"] = barmanRestore.recoveryTargetAction + if barmanRestore.recoveryTargetTimeline != "" { + c.internalConfig["recovery_target_timeline"] = barmanRestore.recoveryTargetTimeline + } + if barmanRestore.recoveryTargetInclusive != "" { c.internalConfig["recovery_target_inclusive"] = barmanRestore.recoveryTargetInclusive } diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go index 17f5d15d..1e008fc2 100644 --- a/internal/flypg/pg_test.go +++ b/internal/flypg/pg_test.go @@ -256,6 +256,28 @@ func TestPGConfigInitialization(t *testing.T) { t.Fatalf("expected recovery_target_inclusive to be false, got %v", cfg["recovery_target_inclusive"]) } }) + + t.Run("barman-restore-from-target-time-custom-timeline", func(t *testing.T) { + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetTimeline=2") + store, _ := state.NewStore() + + if err := pgConf.initialize(store); err != nil { + t.Fatal(err) + } + + cfg, err := pgConf.CurrentConfig() + if err != nil { + t.Fatal(err) + } + + if cfg["recovery_target_time"] != "'2024-06-30T11:15:00-06:00'" { + t.Fatalf("expected recovery_target_time to be 2024-06-30T11:15:00-06:00, got %v", cfg["recovery_target_time"]) + } + + if cfg["recovery_target_timeline"] != "2" { + t.Fatalf("expected recovery_target_timeline to be 2, got %v", cfg["recovery_target_timeline"]) + } + }) } func TestPGUserConfigOverride(t *testing.T) { From b8fdd5cc961c6bc0e8d3de3c1ef9dfb26c5dd7c8 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 11:41:10 -0500 Subject: [PATCH 46/51] Do not perform a remote restore if there's an existing postgresql directory --- internal/flypg/node.go | 29 +++++++++++------------------ 1 file changed, 11 
insertions(+), 18 deletions(-) diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 376dced1..d9bdec7e 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -49,6 +49,7 @@ func NewNode() (*Node, error) { if err != nil { return nil, fmt.Errorf("failed getting private ip: %s", err) } + node.PrivateIP = ipv6.String() node.PrimaryRegion = os.Getenv("PRIMARY_REGION") @@ -127,6 +128,11 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed to set directory ownership: %s", err) } + store, err := state.NewStore() + if err != nil { + return fmt.Errorf("failed initialize cluster state store: %s", err) + } + // Snapshot restore if os.Getenv("FLY_RESTORED_FROM") != "" { active, err := isRestoreActive() @@ -141,26 +147,16 @@ func (n *Node) Init(ctx context.Context) error { } } + // Ensure we have the necessary credentials to access S3 when Barman is enabled. if os.Getenv("S3_ARCHIVE_CONFIG") != "" || os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { if err := writeS3Credentials(ctx, s3AuthDir); err != nil { return fmt.Errorf("failed to write s3 credentials: %s", err) } } - store, err := state.NewStore() - if err != nil { - return fmt.Errorf("failed initialize cluster state store: %s", err) - } - - // Remote point-in-time restore. + // Remote PITR if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { - // TODO - Probably not safe to use the same lock as the snapshot restore. - active, err := isRestoreActive() - if err != nil { - return fmt.Errorf("failed to verify active restore: %s", err) - } - - if active { + if !n.PGConfig.isInitialized() { configURL := os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") restore, err := NewBarmanRestore(configURL) if err != nil { @@ -179,11 +175,8 @@ func (n *Node) Init(ctx context.Context) error { if err := restore.walReplayAndReset(ctx, n); err != nil { return fmt.Errorf("failed to replay WAL: %s", err) } - - // Set the lock file so the init process knows not to restart the restore process. - if err := setRestoreLock(); err != nil { - return fmt.Errorf("failed to set restore lock: %s", err) - } + } else { + log.Println("[WARN] Postgres directory present, ignoring `S3_ARCHIVE_REMOTE_RESTORE_CONFIG` ") } } From 096e3ad8487b4dbdd55b10f3d5a33a3b85508934 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 12:51:30 -0500 Subject: [PATCH 47/51] Revert back to RFC3339 --- internal/flypg/barman_restore.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 8036a0e7..e1066a83 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -24,8 +24,6 @@ type BarmanRestore struct { const ( defaultRestoreDir = "/data/postgresql" - // ISO8601 is the format required for the barman restore targets. 
-	ISO8601 = "2006-01-02T15:04:05-07:00"
 )
 
 func NewBarmanRestore(configURL string) (*BarmanRestore, error) {
@@ -57,11 +55,11 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) {
 		case "targetAction":
 			restore.recoveryTargetAction = v
 		case "targetTime":
-			ts, err := time.Parse(ISO8601, v)
+			ts, err := time.Parse(time.RFC3339, v)
 			if err != nil {
 				return nil, fmt.Errorf("failed to parse target time: %s", err)
 			}
-			restore.recoveryTargetTime = ts.Format(ISO8601)
+			restore.recoveryTargetTime = ts.Format(time.RFC3339)
 		case "targetTimeline":
 			restore.recoveryTargetTimeline = v
 		default:
@@ -157,7 +155,7 @@ func (b *BarmanRestore) restoreFromBackup(ctx context.Context) error {
 	switch {
 	case b.recoveryTarget != "":
-		backupID, err = b.resolveBackupFromTime(backups, time.Now().Format(ISO8601))
+		backupID, err = b.resolveBackupFromTime(backups, time.Now().Format(time.RFC3339))
 		if err != nil {
 			return fmt.Errorf("failed to resolve backup target by time: %s", err)
 		}
@@ -215,7 +213,7 @@ func (*BarmanRestore) resolveBackupFromTime(backupList BackupList, restoreStr st
 	var restoreTime time.Time
 
 	// Parse the restore string
-	restoreTime, err := time.Parse(ISO8601, restoreStr)
+	restoreTime, err := time.Parse(time.RFC3339, restoreStr)
 	if err != nil {
 		return "", fmt.Errorf("failed to parse restore time: %s", err)
 	}

From fdbbc1268c26d1581fdeb87294f0db1ce1f47401 Mon Sep 17 00:00:00 2001
From: Shaun Davis
Date: Wed, 3 Jul 2024 13:05:32 -0500
Subject: [PATCH 48/51] Postgres doesn't support 'Z', so trim it and convert to -00:00

---
 internal/flypg/barman_restore.go | 4 ++++
 internal/flypg/barman_restore_test.go | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go
index e1066a83..91da75f1 100644
--- a/internal/flypg/barman_restore.go
+++ b/internal/flypg/barman_restore.go
@@ -6,6 +6,7 @@ import (
 	"log"
 	"net/url"
 	"os"
+	"strings"
 	"time"
 
 	"github.com/fly-apps/postgres-flex/internal/supervisor"
@@ -55,6 +56,9 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) {
 		case "targetAction":
 			restore.recoveryTargetAction = v
 		case "targetTime":
+			if strings.HasSuffix(v, "Z") {
+				v = strings.TrimSuffix(v, "Z") + "-00:00"
+			}
 			ts, err := time.Parse(time.RFC3339, v)
 			if err != nil {
 				return nil, fmt.Errorf("failed to parse target time: %s", err)
diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go
index 2b635f43..a8a6576d 100644
--- a/internal/flypg/barman_restore_test.go
+++ b/internal/flypg/barman_restore_test.go
@@ -175,8 +175,8 @@ func TestNewBarmanRestore(t *testing.T) {
 		t.Fatalf("expected recovery target name to be empty, got %s", restore.recoveryTargetName)
 	}
 
-	if restore.recoveryTargetTime != "2024-06-30T11:15:00-06:00" {
-		t.Fatalf("expected recovery target time to be 2024-06-30T11:15:00-06:00, got %s", restore.recoveryTargetTime)
+	if restore.recoveryTargetTime != "2024-07-03T17:55:22Z" {
+		t.Fatalf("expected recovery target time to be 2024-07-03T17:55:22-00:00, got %s", restore.recoveryTargetTime)
 	}
 
 	if restore.recoveryTargetTimeline != "2" {
@@ -320,6 +320,6 @@ func TestResolveBackupTarget(t *testing.T) {
 }
 
 func setRestoreDefaultEnv(t *testing.T) {
-	t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetTimeline=2&targetInclusive=false&targetAction=shutdown")
+	t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", 
"https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-07-03T17:55:22Z&targetTimeline=2&targetInclusive=false&targetAction=shutdown") t.Setenv("FLY_APP_NAME", "postgres-flex") } From 6e3aa7387a2919e6f8b2f32d48a421914dceecb1 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 3 Jul 2024 13:42:05 -0500 Subject: [PATCH 49/51] Moving back to RFC3339, however, 'Z' needs to be stripped and replaced with +00:00 --- internal/flypg/barman.go | 12 ++++++++++++ internal/flypg/barman_restore.go | 8 ++------ internal/flypg/barman_restore_test.go | 4 ++-- internal/flypg/barman_test.go | 27 +++++++++++++++++++++++++++ internal/flypg/node.go | 2 ++ internal/flypg/pg_test.go | 6 +++--- 6 files changed, 48 insertions(+), 11 deletions(-) diff --git a/internal/flypg/barman.go b/internal/flypg/barman.go index 040033b4..176a3959 100644 --- a/internal/flypg/barman.go +++ b/internal/flypg/barman.go @@ -251,3 +251,15 @@ func (*Barman) parseBackups(backupBytes []byte) (BackupList, error) { return backupList, nil } + +func formatTimestamp(timestamp string) (string, error) { + if strings.HasSuffix(timestamp, "Z") { + timestamp = strings.TrimSuffix(timestamp, "Z") + "+00:00" + } + parsedTime, err := time.Parse(time.RFC3339, timestamp) + if err != nil { + return "", fmt.Errorf("failed to parse timestamp: %s", err) + } + + return parsedTime.Format("2006-01-02T15:04:05-07:00"), nil +} diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 91da75f1..ed64a7c6 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -6,7 +6,6 @@ import ( "log" "net/url" "os" - "strings" "time" "github.com/fly-apps/postgres-flex/internal/supervisor" @@ -56,14 +55,11 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { case "targetAction": restore.recoveryTargetAction = v case "targetTime": - if strings.HasSuffix(v, "Z") { - v = strings.TrimSuffix(v, "Z") + "-00:00" - } - ts, err := time.Parse(time.RFC3339, v) + ts, err := formatTimestamp(v) if err != nil { return nil, fmt.Errorf("failed to parse target time: %s", err) } - restore.recoveryTargetTime = ts.Format(time.RFC3339) + restore.recoveryTargetTime = ts case "targetTimeline": restore.recoveryTargetTimeline = v default: diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index a8a6576d..f8306da0 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -175,8 +175,8 @@ func TestNewBarmanRestore(t *testing.T) { t.Fatalf("expected recovery target name to be empty, got %s", restore.recoveryTargetName) } - if restore.recoveryTargetTime != "2024-07-03T17:55:22Z" { - t.Fatalf("expected recovery target time to be 2024-07-03T17:55:22-00:00, got %s", restore.recoveryTargetTime) + if restore.recoveryTargetTime != "2024-07-03T17:55:22+00:00" { + t.Fatalf("expected recovery target time to be 2024-07-03T17:55:22+00:00, got %s", restore.recoveryTargetTime) } if restore.recoveryTargetTimeline != "2" { diff --git a/internal/flypg/barman_test.go b/internal/flypg/barman_test.go index daf8254e..3e800186 100644 --- a/internal/flypg/barman_test.go +++ b/internal/flypg/barman_test.go @@ -75,6 +75,33 @@ func TestNewBarman(t *testing.T) { }) } +func TestFormatTimestamp(t *testing.T) { + t.Run("valid", func(t *testing.T) { + ts, err := formatTimestamp("2024-07-03T17:55:22Z") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ts != "2024-07-03T17:55:22+00:00" { + t.Fatalf("expected timestamp to be 
2024-07-03T17:55:22+00:00, but got %s", ts)
+		}
+
+		ts, err = formatTimestamp("2024-07-03T17:55:22-07:00")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if ts != "2024-07-03T17:55:22-07:00" {
+			t.Fatalf("expected timestamp to be 2024-07-03T17:55:22-07:00, but got %s", ts)
+		}
+	})
+
+	t.Run("invalid", func(t *testing.T) {
+		_, err := formatTimestamp("2024-07-03T17:55:22Z07:00")
+		if err == nil {
+			t.Fatalf("expected an error, but got nil")
+		}
+	})
+}
+
 func setDefaultEnv(t *testing.T) {
 	t.Setenv("S3_ARCHIVE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory")
 	t.Setenv("FLY_APP_NAME", "postgres-flex")
diff --git a/internal/flypg/node.go b/internal/flypg/node.go
index d9bdec7e..5820153f 100644
--- a/internal/flypg/node.go
+++ b/internal/flypg/node.go
@@ -157,6 +157,8 @@ func (n *Node) Init(ctx context.Context) error {
 	// Remote PITR
 	if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" {
 		if !n.PGConfig.isInitialized() {
+			log.Println("[INFO] Postgres directory not present, proceeding with remote restore.")
+
 			configURL := os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")
 			restore, err := NewBarmanRestore(configURL)
 			if err != nil {
diff --git a/internal/flypg/pg_test.go b/internal/flypg/pg_test.go
index 1e008fc2..a6d603d1 100644
--- a/internal/flypg/pg_test.go
+++ b/internal/flypg/pg_test.go
@@ -236,7 +236,7 @@ func TestPGConfigInitialization(t *testing.T) {
 	})
 
 	t.Run("barman-restore-from-target-time-non-inclusive", func(t *testing.T) {
-		t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00-06:00&targetInclusive=false")
+		t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-06-30T11:15:00Z&targetInclusive=false")
 
 		store, _ := state.NewStore()
 		if err := pgConf.initialize(store); err != nil {
@@ -248,8 +248,8 @@ func TestPGConfigInitialization(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if cfg["recovery_target_time"] != "'2024-06-30T11:15:00-06:00'" {
-		t.Fatalf("expected recovery_target_time to be 2024-06-30T11:15:00-06:00, got %v", cfg["recovery_target_time"])
+	if cfg["recovery_target_time"] != "'2024-06-30T11:15:00+00:00'" {
+		t.Fatalf("expected recovery_target_time to be 2024-06-30T11:15:00+00:00, got %v", cfg["recovery_target_time"])
 	}
 
 	if cfg["recovery_target_inclusive"] != "false" {

From 0f58fd531ce16b6e37b522fb9767811cfa141a77 Mon Sep 17 00:00:00 2001
From: Shaun Davis
Date: Thu, 4 Jul 2024 23:47:48 -0500
Subject: [PATCH 50/51] Allow target to be left out

---
 internal/flypg/barman_restore.go | 9 ++++-----
 internal/flypg/pg.go             | 2 --
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go
index ed64a7c6..316aba44 100644
--- a/internal/flypg/barman_restore.go
+++ b/internal/flypg/barman_restore.go
@@ -71,10 +71,6 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) {
 		restore.recoveryTargetAction = "promote"
 	}
 
-	if restore.recoveryTargetName == "" && restore.recoveryTargetTime == "" && restore.recoveryTarget == "" {
-		return nil, fmt.Errorf("no restore target specified")
-	}
-
 	return restore, nil
 }
 
@@ -171,7 +167,10 @@ func (b *BarmanRestore) restoreFromBackup(ctx context.Context) error {
 			return fmt.Errorf("failed to resolve backup target by id: %s", err)
 		}
 	default:
-		return fmt.Errorf("restore target not specified")
+		backupID, err = b.resolveBackupFromTime(backups, time.Now().Format(time.RFC3339))
+		
if err != nil { + return fmt.Errorf("failed to resolve backup target by time: %s", err) + } } if backupID == "" { diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 350e9bb7..22a4fed1 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -235,8 +235,6 @@ func (c *PGConfig) setRecoveryTargetConfig(configURL string) error { c.internalConfig["recovery_target_name"] = fmt.Sprintf("barman_%s", barmanRestore.recoveryTargetName) case barmanRestore.recoveryTargetTime != "": c.internalConfig["recovery_target_time"] = fmt.Sprintf("'%s'", barmanRestore.recoveryTargetTime) - default: - return fmt.Errorf("recovery target name or time must be specified") } return nil From 5a6eae6126087cda90fdd3735146c9b606779e2d Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Fri, 5 Jul 2024 17:38:26 -0500 Subject: [PATCH 51/51] Cleanup --- internal/flypg/barman_restore.go | 22 ++--- internal/flypg/barman_restore_test.go | 119 ++++++++++++++++++-------- internal/flypg/node.go | 92 +++++++++++--------- internal/flypg/pg.go | 18 ++-- 4 files changed, 154 insertions(+), 97 deletions(-) diff --git a/internal/flypg/barman_restore.go b/internal/flypg/barman_restore.go index 316aba44..8ec6d51a 100644 --- a/internal/flypg/barman_restore.go +++ b/internal/flypg/barman_restore.go @@ -14,16 +14,21 @@ import ( type BarmanRestore struct { *Barman - recoveryTarget string - recoveryTargetName string - recoveryTargetTime string + // Target parameters + // For more information on these parameters, see: + // https://www.postgresql.org/docs/current/runtime-config-wal.html#RUNTIME-CONFIG-WAL-RECOVERY-TARGET + recoveryTarget string + recoveryTargetName string + recoveryTargetTime string + recoveryTargetTimeline string recoveryTargetAction string recoveryTargetInclusive string } const ( - defaultRestoreDir = "/data/postgresql" + defaultRestoreDir = "/data/postgresql" + waitOnRecoveryTimeout = 10 * time.Minute ) func NewBarmanRestore(configURL string) (*BarmanRestore, error) { @@ -38,9 +43,7 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { return nil, fmt.Errorf("invalid restore config url: %w", err) } - restore := &BarmanRestore{ - Barman: barman, - } + restore := &BarmanRestore{Barman: barman} for key, value := range url.Query() { v := value[0] @@ -63,7 +66,7 @@ func NewBarmanRestore(configURL string) (*BarmanRestore, error) { case "targetTimeline": restore.recoveryTargetTimeline = v default: - return nil, fmt.Errorf("unknown query parameter: %s", key) + log.Printf("[WARN] unknown query parameter: %s. ignoring.", key) } } @@ -134,7 +137,6 @@ func (*BarmanRestore) walReplayAndReset(ctx context.Context, node *Node) error { } return nil - } func (b *BarmanRestore) restoreFromBackup(ctx context.Context) error { @@ -277,7 +279,7 @@ func waitOnRecovery(ctx context.Context, privateIP string) error { defer func() { _ = conn.Close(ctx) }() // TODO - Figure out a more reasonable timeout to use here. 
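+	// waitOnRecoveryTimeout (defined in the const block above) bounds how long
+	// we wait for recovery to finish before giving up.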
- timeout := time.After(10 * time.Minute) + timeout := time.After(waitOnRecoveryTimeout) ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() diff --git a/internal/flypg/barman_restore_test.go b/internal/flypg/barman_restore_test.go index f8306da0..b6f3f32f 100644 --- a/internal/flypg/barman_restore_test.go +++ b/internal/flypg/barman_restore_test.go @@ -141,55 +141,98 @@ const backupsResponse = `{ func TestNewBarmanRestore(t *testing.T) { setRestoreDefaultEnv(t) + t.Run("defaults", func(t *testing.T) { + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } - restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) - if err != nil { - t.Fatalf("NewBarmanRestore failed with: %v", err) - } + if restore.bucket != "my-bucket" { + t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) + } - if restore.bucket != "my-bucket" { - t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) - } + if restore.BucketURL() != "s3://my-bucket" { + t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) + } - if restore.BucketURL() != "s3://my-bucket" { - t.Fatalf("expected bucket to be my-bucket, got %s", restore.bucket) - } + if restore.bucketDirectory != "my-directory" { + t.Fatalf("expected bucket directory to be my-directory, got %s", restore.bucketDirectory) + } - if restore.bucketDirectory != "my-directory" { - t.Fatalf("expected bucket directory to be my-directory, got %s", restore.bucketDirectory) - } + if restore.appName != "postgres-flex" { + t.Fatalf("expected app name to be postgres-flex, got %s", restore.appName) + } - if restore.appName != "postgres-flex" { - t.Fatalf("expected app name to be postgres-flex, got %s", restore.appName) - } + if restore.provider != "aws-s3" { + t.Fatalf("expected provider to be aws-s3, got %s", restore.provider) + } - if restore.provider != "aws-s3" { - t.Fatalf("expected provider to be aws-s3, got %s", restore.provider) - } + if restore.endpoint != "https://fly.storage.tigris.dev" { + t.Fatalf("expected endpoint to be https://fly.storage.tigris.dev, got %s", restore.endpoint) + } - if restore.endpoint != "https://fly.storage.tigris.dev" { - t.Fatalf("expected endpoint to be https://fly.storage.tigris.dev, got %s", restore.endpoint) - } + }) - if restore.recoveryTargetName != "" { - t.Fatalf("expected recovery target name to be empty, got %s", restore.recoveryTargetName) - } + t.Run("target", func(t *testing.T) { + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?target=immediate") - if restore.recoveryTargetTime != "2024-07-03T17:55:22+00:00" { - t.Fatalf("expected recovery target time to be 2024-07-03T17:55:22+00:00, got %s", restore.recoveryTargetTime) - } + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } - if restore.recoveryTargetTimeline != "2" { - t.Fatalf("expected recovery target timeline to be 2, got %s", restore.recoveryTargetTimeline) - } + if restore.recoveryTarget != "immediate" { + t.Fatalf("expected recovery target to be 'immediate', got %s", restore.recoveryTarget) + } + }) - if restore.recoveryTargetInclusive != "false" { - t.Fatalf("expected recovery target inclusive to be false, got %s", restore.recoveryTargetInclusive) - } + t.Run("target-time", func(t *testing.T) { + 
t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-07-03T17:55:22Z") - if restore.recoveryTargetAction != "shutdown" { - t.Fatalf("expected recovery target action to be shutdown, got %s", restore.recoveryTargetAction) - } + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } + + if restore.recoveryTargetTime != "2024-07-03T17:55:22+00:00" { + t.Fatalf("expected recovery target time to be 2024-07-03T17:55:22+00:00, got %s", restore.recoveryTargetTime) + } + }) + + t.Run("target-name", func(t *testing.T) { + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetName=20240705T051010") + + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } + + if restore.recoveryTargetName != "20240705T051010" { + t.Fatalf("expected recovery target name to be 20240705T051010, got %s", restore.recoveryTargetName) + } + }) + + t.Run("target-name-with-options", func(t *testing.T) { + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetName=20240705T051010&targetAction=shutdown&targetTimeline=2&targetInclusive=false") + + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) + if err != nil { + t.Fatalf("NewBarmanRestore failed with: %v", err) + } + + if restore.recoveryTargetName != "20240705T051010" { + t.Fatalf("expected recovery target name to be 20240705T051010, got %s", restore.recoveryTargetName) + } + + if restore.recoveryTargetAction != "shutdown" { + t.Fatalf("expected recovery target action to be shutdown, got %s", restore.recoveryTargetAction) + } + + if restore.recoveryTargetTimeline != "2" { + t.Fatalf("expected recovery target timeline to be 2, got %s", restore.recoveryTargetTimeline) + } + + }) } func TestWALRestoreCommand(t *testing.T) { @@ -320,6 +363,6 @@ func TestResolveBackupTarget(t *testing.T) { } func setRestoreDefaultEnv(t *testing.T) { - t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory?targetTime=2024-07-03T17:55:22Z&targetTimeline=2&targetInclusive=false&targetAction=shutdown") + t.Setenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG", "https://my-key:my-secret@fly.storage.tigris.dev/my-bucket/my-directory") t.Setenv("FLY_APP_NAME", "postgres-flex") } diff --git a/internal/flypg/node.go b/internal/flypg/node.go index 5820153f..f309092a 100644 --- a/internal/flypg/node.go +++ b/internal/flypg/node.go @@ -133,55 +133,18 @@ func (n *Node) Init(ctx context.Context) error { return fmt.Errorf("failed initialize cluster state store: %s", err) } - // Snapshot restore - if os.Getenv("FLY_RESTORED_FROM") != "" { - active, err := isRestoreActive() - if err != nil { - return fmt.Errorf("failed to verify active restore: %s", err) - } - - if active { - if err := prepareRemoteRestore(ctx, n); err != nil { - return fmt.Errorf("failed to issue restore: %s", err) - } - } + // Determine if we are performing a remote restore. + if err := n.handleRemoteRestore(ctx, store); err != nil { + return fmt.Errorf("failed to handle remote restore: %s", err) } - // Ensure we have the necessary credentials to access S3 when Barman is enabled. + // Ensure we have the required s3 credentials set. 
if os.Getenv("S3_ARCHIVE_CONFIG") != "" || os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { if err := writeS3Credentials(ctx, s3AuthDir); err != nil { return fmt.Errorf("failed to write s3 credentials: %s", err) } } - // Remote PITR - if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { - if !n.PGConfig.isInitialized() { - log.Println("[INFO] Postgres directory not present, proceeding with remote restore.") - - configURL := os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") - restore, err := NewBarmanRestore(configURL) - if err != nil { - return fmt.Errorf("failed to initialize barman restore: %s", err) - } - - if err := restore.restoreFromBackup(ctx); err != nil { - return fmt.Errorf("failed to restore base backup: %s", err) - } - - // Set restore configuration - if err := n.PGConfig.initialize(store); err != nil { - return fmt.Errorf("failed to initialize pg config: %s", err) - } - - if err := restore.walReplayAndReset(ctx, n); err != nil { - return fmt.Errorf("failed to replay WAL: %s", err) - } - } else { - log.Println("[WARN] Postgres directory present, ignoring `S3_ARCHIVE_REMOTE_RESTORE_CONFIG` ") - } - } - // Verify whether we are a booting zombie. if ZombieLockExists() { if err := handleZombieLock(ctx, n); err != nil { @@ -514,3 +477,50 @@ func setDirOwnership(ctx context.Context, pathToDir string) error { return nil } + +func (n *Node) handleRemoteRestore(ctx context.Context, store *state.Store) error { + // Handle snapshot restore + if os.Getenv("FLY_RESTORED_FROM") != "" { + active, err := isRestoreActive() + if err != nil { + return fmt.Errorf("failed to verify active restore: %s", err) + } + + if !active { + return nil + } + + return prepareRemoteRestore(ctx, n) + } + + // WAL-based Restore + if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") != "" { + if n.PGConfig.isInitialized() { + log.Println("[INFO] Postgres directory present, ignoring `S3_ARCHIVE_REMOTE_RESTORE_CONFIG` ") + return nil + } + + log.Println("[INFO] Postgres directory not present, proceeding with remote restore.") + + // Initialize barman restore + restore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) + if err != nil { + return fmt.Errorf("failed to initialize barman restore: %s", err) + } + + // Restore base backup + if err := restore.restoreFromBackup(ctx); err != nil { + return fmt.Errorf("failed to restore base backup: %s", err) + } + + // Set restore configuration + if err := n.PGConfig.initialize(store); err != nil { + return fmt.Errorf("failed to initialize pg config: %s", err) + } + + // Replay WAL and reset the cluster + return restore.walReplayAndReset(ctx, n) + } + + return nil +} diff --git a/internal/flypg/pg.go b/internal/flypg/pg.go index 22a4fed1..730cbd5b 100644 --- a/internal/flypg/pg.go +++ b/internal/flypg/pg.go @@ -173,24 +173,26 @@ func (c *PGConfig) SetDefaults(store *state.Store) error { "shared_preload_libraries": fmt.Sprintf("'%s'", strings.Join(sharedPreloadLibraries, ",")), } - if err := c.setArchiveConfig(os.Getenv("S3_ARCHIVE_CONFIG"), store); err != nil { + // Set WAL Archive specific settings + if err := c.setArchiveConfig(store); err != nil { return fmt.Errorf("failed to set archive config: %s", err) } - if err := c.setRecoveryTargetConfig(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")); err != nil { + // Set recovery target settings + if err := c.setRecoveryTargetConfig(); err != nil { return fmt.Errorf("failed to set recovery target config: %s", err) } return nil } -func (c *PGConfig) setArchiveConfig(configURL string, store *state.Store) error 
{ - if configURL == "" { +func (c *PGConfig) setArchiveConfig(store *state.Store) error { + if os.Getenv("S3_ARCHIVE_CONFIG") == "" { c.internalConfig["archive_mode"] = "off" return nil } - barman, err := NewBarman(store, configURL, DefaultAuthProfile) + barman, err := NewBarman(store, os.Getenv("S3_ARCHIVE_CONFIG"), DefaultAuthProfile) if err != nil { return fmt.Errorf("failed to initialize barman instance: %s", err) } @@ -206,12 +208,12 @@ func (c *PGConfig) setArchiveConfig(configURL string, store *state.Store) error return nil } -func (c *PGConfig) setRecoveryTargetConfig(configURL string) error { - if configURL == "" { +func (c *PGConfig) setRecoveryTargetConfig() error { + if os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG") == "" { return nil } - barmanRestore, err := NewBarmanRestore(configURL) + barmanRestore, err := NewBarmanRestore(os.Getenv("S3_ARCHIVE_REMOTE_RESTORE_CONFIG")) if err != nil { return err }
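As a reference for the query parameters handled by NewBarmanRestore above, here is a minimal sketch of assembling a restore configuration URL. It is not taken from the patches themselves: it assumes only the Go standard library, and the credentials, host, and bucket path are placeholders borrowed from the tests.

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// Placeholder credentials and bucket path; substitute real values.
	u := url.URL{
		Scheme: "https",
		User:   url.UserPassword("my-key", "my-secret"),
		Host:   "fly.storage.tigris.dev",
		Path:   "/my-bucket/my-directory",
	}

	q := url.Values{}
	// A trailing 'Z' is accepted; formatTimestamp normalizes it to +00:00.
	q.Set("targetTime", "2024-07-03T17:55:22Z")
	q.Set("targetTimeline", "2")
	q.Set("targetInclusive", "false")
	q.Set("targetAction", "promote")
	u.RawQuery = q.Encode()

	// The resulting string is the shape S3_ARCHIVE_REMOTE_RESTORE_CONFIG expects.
	fmt.Println(u.String())
}

Encode percent-encodes the colons in targetTime; Query() on the parsed URL decodes them again, so the value round-trips unchanged.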