Skip to content

Commit

Permalink
Merge pull request #322 from sabaini/bug/318
Browse files Browse the repository at this point in the history
Fix: ensure backward compat config
  • Loading branch information
sabaini authored Feb 29, 2024
2 parents 4c0f3d0 + 3f3f0ef commit bde449f
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/q2q-candidate-upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/q2r-candidate-upgrade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/r2r-candidate-upgrade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
4 changes: 3 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,9 @@ jobs:
- name: Setup cluster
run: ~/actionutils.sh cluster_nodes custom

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Add 2 OSDs
run: |
for c in node-wrk1 node-wrk2 ; do
Expand Down Expand Up @@ -489,4 +492,3 @@ jobs:

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw

113 changes: 101 additions & 12 deletions microceph/ceph/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"encoding/json"
"fmt"
"github.com/canonical/microceph/microceph/interfaces"
"net"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -162,43 +163,106 @@ func ListConfigs() (types.Configs, error) {
return configs, nil
}

// updates the ceph config file.
func UpdateConfig(s interfaces.StateInterface) error {
confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf")
runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run")
// backwardCompatPubnet ensures that public_network is set in the database.
//
// This is a backward-compat shim to accommodate older versions of microceph:
// when the stored public_network is missing or is not a valid CIDR, the
// network is derived from the cluster member's own address and recorded as
// the public_network config item.
//
// It returns an error if the config cannot be read, if no network can be
// derived from the local address, or if recording the value fails.
func backwardCompatPubnet(s interfaces.StateInterface) error {
	config, err := getConfigDb(s)
	if err != nil {
		return fmt.Errorf("failed to get config from db: %w", err)
	}

	// Do we have a public_network configured? If it is unset, the map lookup
	// evaluates to the empty string, which fails net.ParseCIDR just like a
	// malformed value does.
	if _, _, parseErr := net.ParseCIDR(config["public_network"]); parseErr == nil {
		return nil
	}

	// Get the public network from the default address.
	pubNet, err := common.Network.FindNetworkAddress(s.ClusterState().Address().Hostname())
	if err != nil {
		return fmt.Errorf("failed to locate public network: %w", err)
	}

	// Update the database. The closure keeps its own error variable instead of
	// writing to the enclosing one, and the transaction result is checked so a
	// failed write is reported to the caller rather than silently dropped.
	err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
		_, createErr := database.CreateConfigItem(ctx, tx, database.ConfigItem{Key: "public_network", Value: pubNet})
		if createErr != nil {
			return fmt.Errorf("failed to record public_network: %w", createErr)
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("failed to persist public_network: %w", err)
	}

	return nil
}

// backwardCompatMonitors looks up the "mon" services in the database and
// returns the address of each monitor's cluster member, as found in the
// cluster remotes.
// This is a backward-compat shim to accommodate older versions of microceph.
func backwardCompatMonitors(s interfaces.StateInterface) ([]string, error) {
var err error
// NOTE(review): configItems, the single-value `return err` and the config map
// further down look like lines carried over from a different version of this
// code (rendered diff) — confirm against the real file.
var configItems []database.ConfigItem
var monitors []database.Service
serviceName := "mon"

// Fetch the services whose name matches serviceName inside one transaction.
err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
configItems, err = database.GetConfigItems(ctx, tx)
monitors, err = database.GetServices(ctx, tx, database.ServiceFilter{Service: &serviceName})
if err != nil {
return err
}

return nil
})
if err != nil {
return err
return nil, err
}

config := map[string]string{}
for _, item := range configItems {
config[item.Key] = item.Value
// NOTE(review): the slice is pre-sized to len(monitors) and filled by index,
// so a monitor whose member is skipped below leaves an empty string at its
// position in the returned list.
monitorAddresses := make([]string, len(monitors))
remotes := s.ClusterState().Remotes().RemotesByName()
for i, monitor := range monitors {
remote, ok := remotes[monitor.Member]
if !ok {
// No remote is known under this member name; skip it.
continue
}
monitorAddresses[i] = remote.Address.Addr().String()
}
return monitorAddresses, nil
}

// UpdateConfig updates the ceph.conf file with the current configuration.
func UpdateConfig(s interfaces.StateInterface) error {
confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf")
runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run")

err := backwardCompatPubnet(s)
if err != nil {
return fmt.Errorf("failed to ensure backward compat: %w", err)
}

config, err := getConfigDb(s)
if err != nil {
return fmt.Errorf("failed to get config db: %w", err)
}

// REF: https://docs.ceph.com/en/quincy/rados/configuration/network-config-ref/#ceph-daemons
// The mon host configuration option only needs to be sufficiently up to date such that a
// client can reach one monitor that is currently online.
monitorAddresses := getMonitorAddresses(config)

// backward compat: if no mon hosts found, get them from the node addresses but don't
// insert into db, as the join logic will take care of that.
if len(monitorAddresses) == 0 {
monitorAddresses, err = backwardCompatMonitors(s)
if err != nil {
return fmt.Errorf("failed to get monitor addresses: %w", err)
}
}

conf := newCephConfig(confPath)

// Check if host has IP address on the configured public network.
_, err = common.Network.FindIpOnSubnet(config["public_network"])
if err != nil {
return fmt.Errorf("failed to locate IP on public network %s: %w", config["public_network"], err)
}

clientConfig, err := GetClientConfigForHost(s, s.ClusterState().Name())
if err != nil {
logger.Errorf("Failed to pull Client Configurations: %v", err)
Expand All @@ -225,6 +289,7 @@ func UpdateConfig(s interfaces.StateInterface) error {
if err != nil {
return fmt.Errorf("couldn't render ceph.conf: %w", err)
}
logger.Debugf("updated ceph.conf: %v", conf.GetPath())

// Generate ceph.client.admin.keyring
keyring := newCephKeyring(confPath, "ceph.keyring")
Expand All @@ -242,6 +307,30 @@ func UpdateConfig(s interfaces.StateInterface) error {
return nil
}

// getConfigDb reads every config item from the database in a single
// transaction and returns them as a key/value map. On a transaction error the
// map is nil and the error is returned unwrapped.
func getConfigDb(s interfaces.StateInterface) (map[string]string, error) {
	var (
		err   error
		items []database.ConfigItem
	)

	// The closure fills items; its error is propagated through Transaction.
	load := func(ctx context.Context, tx *sql.Tx) error {
		items, err = database.GetConfigItems(ctx, tx)
		return err
	}

	err = s.ClusterState().Database.Transaction(s.ClusterState().Context, load)
	if err != nil {
		return nil, err
	}

	// Flatten the rows into a lookup map keyed by config name.
	settings := make(map[string]string)
	for i := range items {
		settings[items[i].Key] = items[i].Value
	}
	return settings, nil
}

// getMonitorAddresses scans a provided config key/value map and returns a list of mon hosts found.
func getMonitorAddresses(configs map[string]string) []string {
monHosts := []string{}
Expand Down
8 changes: 6 additions & 2 deletions microceph/ceph/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package ceph
import (
"context"
"database/sql"
"github.com/canonical/lxd/shared/logger"
"github.com/canonical/microceph/microceph/interfaces"
"reflect"
"time"
Expand All @@ -19,6 +20,7 @@ func Start(s interfaces.StateInterface) error {
for {
// Check that the database is ready.
if !s.ClusterState().Database.IsOpen() {
logger.Debug("start: database not ready, waiting...")
time.Sleep(10 * time.Second)
continue
}
Expand All @@ -39,26 +41,28 @@ func Start(s interfaces.StateInterface) error {
return nil
})
if err != nil {
logger.Warnf("start: failed to fetch monitors, retrying: %v", err)
time.Sleep(10 * time.Second)
continue
}

// Compare to the previous list.
if reflect.DeepEqual(oldMonitors, monitors) {
logger.Debugf("start: monitors unchanged, sleeping: %v", monitors)
time.Sleep(time.Minute)
continue
}

err = UpdateConfig(s)
if err != nil {
logger.Errorf("start: failed to update config, retrying: %v", err)
time.Sleep(10 * time.Second)
continue
}

logger.Debug("start: updated config, sleeping")
oldMonitors = monitors
time.Sleep(time.Minute)
}

}()

return nil
Expand Down
25 changes: 25 additions & 0 deletions tests/scripts/actionutils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,31 @@ function test_migration() {
return -1
}

# test_ceph_conf checks the rendered ceph.conf on every LXD instance listed by
# `lxc ls`. For each node it verifies that:
#   - the configured "run dir" lives directly under the real path of
#     /var/snap/microceph/current, and
#   - the file mentions public_net.
# The in-container script exits non-zero on the first failed check (after
# dumping ceph.conf); with `set -e` active that aborts the calling script.
function test_ceph_conf() {
    set -uex
    local n
    for n in $( lxc ls -c n --format csv ); do
        echo "checking node $n"
        # The heredoc delimiter is quoted so nothing below is expanded by the
        # outer shell; it all runs inside the container's sh.
        lxc exec "$n" -- sh <<'EOF'
conf=/var/snap/microceph/current/conf/ceph.conf
# Test: configured rundir must be current
current=$( realpath /var/snap/microceph/current )
rundir=$( awk '/run dir/{ print $4 }' "$conf" )
p=$( dirname "$rundir" )
# Quoted on purpose: if "run dir" is missing, an unquoted comparison turns
# into a malformed test that is treated as "no mismatch" and passes silently.
if [ "$p" != "$current" ]; then
    echo "Error: snap data dir $current, configured run dir: $rundir"
    cat "$conf"
    exit 1
fi
# Test: must contain public_network
if ! grep -q public_net "$conf" ; then
    echo "Error: didn't find public_net in ceph.conf"
    cat "$conf"
    exit 1
fi
EOF
    done
}

function headexec() {
local run="${1?missing}"
shift
Expand Down

0 comments on commit bde449f

Please sign in to comment.