diff --git a/.github/workflows/q2q-candidate-upgrade.yml b/.github/workflows/q2q-candidate-upgrade.yml index 9963d7ec..fbf00651 100644 --- a/.github/workflows/q2q-candidate-upgrade.yml +++ b/.github/workflows/q2q-candidate-upgrade.yml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/q2r-candidate-upgrade.yaml b/.github/workflows/q2r-candidate-upgrade.yaml index c06b67da..d2a08c57 100644 --- a/.github/workflows/q2r-candidate-upgrade.yaml +++ b/.github/workflows/q2r-candidate-upgrade.yaml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/r2r-candidate-upgrade.yaml b/.github/workflows/r2r-candidate-upgrade.yaml index f00f9c3e..c1f94b1c 100644 --- a/.github/workflows/r2r-candidate-upgrade.yaml +++ b/.github/workflows/r2r-candidate-upgrade.yaml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6cbbdd32..8d10a426 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -255,6 +255,9 @@ jobs: - name: Setup cluster run: ~/actionutils.sh cluster_nodes custom + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Add 2 OSDs run: | for c in node-wrk1 node-wrk2 ; do @@ -489,4 +492,3 @@ jobs: - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw - diff --git a/microceph/ceph/config.go b/microceph/ceph/config.go index 82859024..eabc547a 100644 --- 
a/microceph/ceph/config.go +++ b/microceph/ceph/config.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "github.com/canonical/microceph/microceph/interfaces" + "net" "os" "path/filepath" "strings" @@ -162,36 +163,98 @@ func ListConfigs() (types.Configs, error) { return configs, nil } -// updates the ceph config file. -func UpdateConfig(s interfaces.StateInterface) error { - confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf") - runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run") +// backwardCompatPubnet ensures that the public_network is set in the database. +// This is a backward-compat shim to accommodate older versions of microceph: when the stored +// value is missing or is not a valid CIDR, the network of the node's default address is recorded. +func backwardCompatPubnet(s interfaces.StateInterface) error { + config, err := getConfigDb(s) + if err != nil { + return fmt.Errorf("failed to get config from db: %w", err) + } - // Get the configuration and servers. + // do we have a public_network configured? + // if it is unset, the below will evaluate to the empty string + // and subsequently fail the net.ParseCIDR check + pubNet := config["public_network"] + _, _, err = net.ParseCIDR(pubNet) + if err != nil { + // get public network from default address + pubNet, err = common.Network.FindNetworkAddress(s.ClusterState().Address().Hostname()) + if err != nil { + return fmt.Errorf("failed to locate public network: %w", err) + } + // update the database, returning any transaction failure instead of dropping it + return s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { + _, err = database.CreateConfigItem(ctx, tx, database.ConfigItem{Key: "public_network", Value: pubNet}) + if err != nil { + return fmt.Errorf("failed to record public_network: %w", err) + } + return nil + }) + } + + return nil +} + +// backwardCompatMonitors retrieves monitor addresses from the node list and returns them. +// This is a backward-compat shim to accommodate older versions of microceph. +func backwardCompatMonitors(s 
interfaces.StateInterface) ([]string, error) { var err error - var configItems []database.ConfigItem + var monitors []database.Service + serviceName := "mon" err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { - configItems, err = database.GetConfigItems(ctx, tx) + monitors, err = database.GetServices(ctx, tx, database.ServiceFilter{Service: &serviceName}) if err != nil { return err } - return nil }) if err != nil { - return err + return nil, err } - config := map[string]string{} - for _, item := range configItems { - config[item.Key] = item.Value + monitorAddresses := make([]string, 0, len(monitors)) + remotes := s.ClusterState().Remotes().RemotesByName() + for _, monitor := range monitors { + remote, ok := remotes[monitor.Member] + if !ok { + continue // no remote known for this member: skip it rather than emit an empty address + } + monitorAddresses = append(monitorAddresses, remote.Address.Addr().String()) + } + return monitorAddresses, nil +} + +// UpdateConfig updates the ceph.conf file with the current configuration. +func UpdateConfig(s interfaces.StateInterface) error { + confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf") + runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run") + + err := backwardCompatPubnet(s) + if err != nil { + return fmt.Errorf("failed to ensure backward compat: %w", err) + } + + config, err := getConfigDb(s) + if err != nil { + return fmt.Errorf("failed to get config db: %w", err) } // REF: https://docs.ceph.com/en/quincy/rados/configuration/network-config-ref/#ceph-daemons // The mon host configuration option only needs to be sufficiently up to date such that a // client can reach one monitor that is currently online. monitorAddresses := getMonitorAddresses(config) + + // backward compat: if no mon hosts found, get them from the node addresses but don't + // insert into db, as the join logic will take care of that. 
+ if len(monitorAddresses) == 0 { + monitorAddresses, err = backwardCompatMonitors(s) + if err != nil { + return fmt.Errorf("failed to get monitor addresses: %w", err) + } + } + conf := newCephConfig(confPath) // Check if host has IP address on the configured public network. @@ -199,6 +262,7 @@ func UpdateConfig(s interfaces.StateInterface) error { if err != nil { return fmt.Errorf("failed to locate IP on public network %s: %w", config["public_network"], err) } + clientConfig, err := GetClientConfigForHost(s, s.ClusterState().Name()) if err != nil { logger.Errorf("Failed to pull Client Configurations: %v", err) @@ -225,6 +289,7 @@ func UpdateConfig(s interfaces.StateInterface) error { if err != nil { return fmt.Errorf("couldn't render ceph.conf: %w", err) } + logger.Debugf("updated ceph.conf: %v", conf.GetPath()) // Generate ceph.client.admin.keyring keyring := newCephKeyring(confPath, "ceph.keyring") @@ -242,6 +307,30 @@ func UpdateConfig(s interfaces.StateInterface) error { return nil } +// getConfigDb reads all config items from the cluster database in one transaction and returns them as a key/value map; a database error is returned unchanged. +func getConfigDb(s interfaces.StateInterface) (map[string]string, error) { + var err error + var configItems []database.ConfigItem + + err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { + configItems, err = database.GetConfigItems(ctx, tx) + if err != nil { + return err + } + + return nil + }) + if err != nil { + return nil, err + } + + config := map[string]string{} + for _, item := range configItems { + config[item.Key] = item.Value + } + return config, nil +} + // getMonitorAddresses scans a provided config key/value map and returns a list of mon hosts found. 
func getMonitorAddresses(configs map[string]string) []string { monHosts := []string{} diff --git a/microceph/ceph/start.go b/microceph/ceph/start.go index aac8ec6b..6af259c6 100644 --- a/microceph/ceph/start.go +++ b/microceph/ceph/start.go @@ -3,6 +3,7 @@ package ceph import ( "context" "database/sql" + "github.com/canonical/lxd/shared/logger" "github.com/canonical/microceph/microceph/interfaces" "reflect" "time" @@ -19,6 +20,7 @@ func Start(s interfaces.StateInterface) error { for { // Check that the database is ready. if !s.ClusterState().Database.IsOpen() { + logger.Debug("start: database not ready, waiting...") time.Sleep(10 * time.Second) continue } @@ -39,26 +41,28 @@ func Start(s interfaces.StateInterface) error { return nil }) if err != nil { + logger.Warnf("start: failed to fetch monitors, retrying: %v", err) time.Sleep(10 * time.Second) continue } // Compare to the previous list. if reflect.DeepEqual(oldMonitors, monitors) { + logger.Debugf("start: monitors unchanged, sleeping: %v", monitors) time.Sleep(time.Minute) continue } err = UpdateConfig(s) if err != nil { + logger.Errorf("start: failed to update config, retrying: %v", err) time.Sleep(10 * time.Second) continue } - + logger.Debug("start: updated config, sleeping") oldMonitors = monitors time.Sleep(time.Minute) } - }() return nil diff --git a/tests/scripts/actionutils.sh b/tests/scripts/actionutils.sh index 78302f88..b49d3b84 100755 --- a/tests/scripts/actionutils.sh +++ b/tests/scripts/actionutils.sh @@ -388,6 +388,31 @@ function test_migration() { return -1 } +function test_ceph_conf() { + set -uex + for n in $( lxc ls -c n --format csv ); do + echo "checking node $n" + lxc exec "$n" -- sh <<'EOF' +# Test: configured rundir must live under the current snap data dir (quoted so a missing entry fails loudly) +current=$( realpath /var/snap/microceph/current ) +rundir=$( awk '/run dir/{ print $4 }' /var/snap/microceph/current/conf/ceph.conf ) +p=$( dirname "$rundir" ) +if [ "$p" != "$current" ]; then + echo "Error: snap data dir $current, configured run dir: $rundir" 
+ cat /var/snap/microceph/current/conf/ceph.conf + exit 1 +fi + +# Test: must contain public_network +if ! grep -q public_net /var/snap/microceph/current/conf/ceph.conf ; then + echo "Error: didn't find public_net in ceph.conf" + cat /var/snap/microceph/current/conf/ceph.conf + exit 1 +fi +EOF + done +} + function headexec() { local run="${1?missing}" shift