diff --git a/pkg/config/system_test.go b/pkg/config/system_test.go
index 1e3d33a04f4a..3f06b6fbc692 100644
--- a/pkg/config/system_test.go
+++ b/pkg/config/system_test.go
@@ -566,7 +566,7 @@ func TestGetZoneConfigForKey(t *testing.T) {
 		{roachpb.RKey(keys.TimeseriesPrefix.PrefixEnd()), keys.SystemRangesID},
 		{roachpb.RKey(keys.TableDataMin), keys.SystemDatabaseID},
 		{roachpb.RKey(keys.SystemConfigSplitKey), keys.SystemDatabaseID},
-		{roachpb.RKey(keys.GracePeriodInitTimestamp), keys.SystemRangesID},
+		{roachpb.RKey(keys.ClusterInitGracePeriodTimestamp), keys.SystemRangesID},
 		{tkey(keys.ZonesTableID), keys.ZonesTableID},
 		{roachpb.RKey(keys.SystemZonesTableSpan.Key), keys.ZonesTableID},
diff --git a/pkg/keys/constants.go b/pkg/keys/constants.go
index 4aea293fd3f7..ebb5189b695c 100644
--- a/pkg/keys/constants.go
+++ b/pkg/keys/constants.go
@@ -301,10 +301,10 @@
 	// BootstrapVersionKey is the key at which clusters bootstrapped with a version
 	// > 1.0 persist the version at which they were bootstrapped.
 	BootstrapVersionKey = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("bootstrap-version")))
-	// GracePeriodInitTimestamp is used for license enforcement. It marks the timestamp
+	// ClusterInitGracePeriodTimestamp is used for license enforcement. It marks the timestamp
 	// set during cluster initialization, by which a license must be installed to avoid
 	// throttling. The value is stored as the number of seconds since the Unix epoch.
-	GracePeriodInitTimestamp = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("lic-gpi-ts")))
+	ClusterInitGracePeriodTimestamp = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("lic-gpi-ts")))
 	//
 	// NodeIDGenerator is the global node ID generator sequence.
 	NodeIDGenerator = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("node-idgen")))
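Note on the value format (not part of the patch): as the comment on ClusterInitGracePeriodTimestamp says, the value stored under "lic-gpi-ts" is a bare count of seconds since the Unix epoch rather than an encoded timestamp. A minimal, self-contained sketch of that round trip, with hypothetical variable names:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Write side: persist the grace-period end as seconds since the Unix
	// epoch, as the enforcer does with end.Unix().
	end := time.Date(2024, 9, 21, 0, 0, 0, 0, time.UTC)
	stored := end.Unix() // the int64 that would live under "lic-gpi-ts"

	// Read side: reconstruct the timestamp from the stored integer.
	fmt.Println(time.Unix(stored, 0).UTC()) // 2024-09-21 00:00:00 +0000 UTC
}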
diff --git a/pkg/keys/doc.go b/pkg/keys/doc.go
index e2407920d97f..2f9e216fcf4b 100644
--- a/pkg/keys/doc.go
+++ b/pkg/keys/doc.go
@@ -246,15 +246,15 @@ var _ = [...]interface{}{
 	// 2. System keys: This is where we store global, system data which is
 	// replicated across the cluster.
 	SystemPrefix,
-	NodeLivenessPrefix,       // "\x00liveness-"
-	BootstrapVersionKey,      // "bootstrap-version"
-	GracePeriodInitTimestamp, // "lic-gpi-ts"
-	NodeIDGenerator,          // "node-idgen"
-	RangeIDGenerator,         // "range-idgen"
-	StatusPrefix,             // "status-"
-	StatusNodePrefix,         // "status-node-"
-	StoreIDGenerator,         // "store-idgen"
-	StartupMigrationPrefix,   // "system-version/"
+	NodeLivenessPrefix,              // "\x00liveness-"
+	BootstrapVersionKey,             // "bootstrap-version"
+	ClusterInitGracePeriodTimestamp, // "lic-gpi-ts"
+	NodeIDGenerator,                 // "node-idgen"
+	RangeIDGenerator,                // "range-idgen"
+	StatusPrefix,                    // "status-"
+	StatusNodePrefix,                // "status-node-"
+	StoreIDGenerator,                // "store-idgen"
+	StartupMigrationPrefix,          // "system-version/"
 	// StartupMigrationLease,        // "system-version/lease" - removed in 23.1
 	TimeseriesPrefix,         // "tsd"
 	SystemSpanConfigPrefix,   // "xffsys-scfg"
diff --git a/pkg/server/license/BUILD.bazel b/pkg/server/license/BUILD.bazel
index 3d7c8a6f6aad..aa86a16c3b5a 100644
--- a/pkg/server/license/BUILD.bazel
+++ b/pkg/server/license/BUILD.bazel
@@ -14,6 +14,7 @@ go_library(
        "//pkg/sql/isql",
        "//pkg/sql/pgwire/pgcode",
        "//pkg/sql/pgwire/pgerror",
+       "//pkg/sql/sem/tree",
        "//pkg/util/envutil",
        "//pkg/util/log",
        "//pkg/util/syncutil",
diff --git a/pkg/server/license/enforcer.go b/pkg/server/license/enforcer.go
index aa8a5a6291bf..dac05918f50a 100644
--- a/pkg/server/license/enforcer.go
+++ b/pkg/server/license/enforcer.go
@@ -22,6 +22,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/isql"
 	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
 	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
+	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
 	"github.com/cockroachdb/cockroach/pkg/util/envutil"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -110,6 +111,11 @@ type TestingKnobs struct {
 	// OverrideMaxOpenTransactions if set, overrides the maximum open transactions
 	// when checking if active throttling.
 	OverrideMaxOpenTransactions *int64
+
+	// OverwriteClusterInitGracePeriodTS, if true, forces the enforcer to
+	// overwrite the existing cluster initialization grace period timestamp,
+	// even if one is already set.
+	OverwriteClusterInitGracePeriodTS bool
 }
 
 // TelemetryStatusReporter is the interface we use to find the last ping
@@ -163,9 +169,7 @@ func (e *Enforcer) GetTestingKnobs() *TestingKnobs {
 // Start will load the necessary metadata for the enforcer. It reads from the
 // KV license metadata and will populate any missing data as needed. The DB
 // passed in must have access to the system tenant.
-func (e *Enforcer) Start(
-	ctx context.Context, st *cluster.Settings, db isql.DB, initialStart bool,
-) error {
+func (e *Enforcer) Start(ctx context.Context, st *cluster.Settings, db isql.DB) error {
 	// We always start disabled. If an error occurs, the enforcer setup will be
 	// incomplete, but the server will continue to start. To ensure stability in
 	// that case, we leave throttling disabled.
@@ -175,7 +179,7 @@
 	e.maybeLogActiveOverrides(ctx)
 
 	if !startDisabled {
-		if err := e.maybeWriteClusterInitGracePeriodTS(ctx, db, initialStart); err != nil {
+		if err := e.maybeWriteClusterInitGracePeriodTS(ctx, db); err != nil {
 			return err
 		}
 	}
@@ -199,20 +203,24 @@
 
 // maybeWriteClusterInitGracePeriodTS checks if the cluster init grace period
 // timestamp needs to be written to the KV layer and writes it if needed.
-func (e *Enforcer) maybeWriteClusterInitGracePeriodTS(
-	ctx context.Context, db isql.DB, initialStart bool,
-) error {
+func (e *Enforcer) maybeWriteClusterInitGracePeriodTS(ctx context.Context, db isql.DB) error {
 	return db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
 		// We could use a conditional put for this logic. However, we want to read
 		// and cache the value, and the common case is that the value will be read.
 		// Only during the initialization of the first node in the cluster will we
 		// need to write a new timestamp. So, we optimize for the case where the
 		// timestamp already exists.
-		val, err := txn.KV().Get(ctx, keys.GracePeriodInitTimestamp)
+		val, err := txn.KV().Get(ctx, keys.ClusterInitGracePeriodTimestamp)
 		if err != nil {
 			return err
 		}
-		if val.Value == nil {
+		tk := e.GetTestingKnobs()
+		if val.Value == nil || (tk != nil && tk.OverwriteClusterInitGracePeriodTS) {
+			initialStart, err := e.getIsNewClusterEstimate(ctx, txn)
+			if err != nil {
+				return err
+			}
+
 			// The length of the grace period without a license varies based on the
 			// cluster's creation time. Older databases built when we had a
 			// CockroachDB core license are given more time.
@@ -224,7 +232,7 @@
 			end := e.getStartTime().Add(gracePeriodLength)
 			log.Infof(ctx, "generated new cluster init grace period end time: %s", end.UTC().String())
 			e.clusterInitGracePeriodEndTS.Store(end.Unix())
-			return txn.KV().Put(ctx, keys.GracePeriodInitTimestamp, e.clusterInitGracePeriodEndTS.Load())
+			return txn.KV().Put(ctx, keys.ClusterInitGracePeriodTimestamp, e.clusterInitGracePeriodEndTS.Load())
 		}
 		e.clusterInitGracePeriodEndTS.Store(val.ValueInt())
 		log.Infof(ctx, "fetched existing cluster init grace period end time: %s", e.GetClusterInitGracePeriodEndTS().String())
@@ -487,3 +495,36 @@ func (e *Enforcer) getInitialIsDisabledValue() bool {
 	}
 	return !tk.Enable
 }
+
+// getIsNewClusterEstimate is a helper to determine if the cluster is a newly
+// created one. This is used in Start processing to help determine the length
+// of the grace period when no license is installed.
+func (e *Enforcer) getIsNewClusterEstimate(ctx context.Context, txn isql.Txn) (bool, error) {
+	// We are going to lean on system.migrations to determine if the cluster is
+	// new or not. If the cluster is new, the minimum value for the completed_at
+	// column should roughly match the start time of this enforcer. We will query
+	// that value and see if it's within an hour of it. If it's within that
+	// range, we treat it as a new cluster.
+	data, err := txn.QueryRow(ctx, "check if enforcer start is near cluster init time", txn.KV(),
+		`SELECT min(completed_at) FROM system.migrations`)
+	if err != nil {
+		return false, err
+	}
+	if len(data) == 0 {
+		return false, errors.New("no rows found in system.migrations")
+	}
+	var ts time.Time
+	switch t := data[0].(type) {
+	case *tree.DTimestampTZ:
+		ts = t.Time
+	default:
+		return false, errors.Newf("unexpected data type: %v", t)
+	}
+
+	st := e.getStartTime()
+	if st.After(ts.Add(-1*time.Hour)) && st.Before(ts.Add(1*time.Hour)) {
+		return true, nil
+	}
+	log.Infof(ctx, "cluster init time %v is not within an hour of the enforcer start time", ts)
+	return false, nil
+}
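Note on the heuristic (not part of the patch): getIsNewClusterEstimate reduces to a pure time comparison, so its behavior can be restated in a standalone sketch. The helper name and grace-period constants below are hypothetical; the 7-day and 30-day lengths are taken from the test expectations in the next file.

package main

import (
	"fmt"
	"time"
)

// Grace-period lengths implied by the tests: 7 days when the cluster looks
// newly initialized, 30 days otherwise.
const (
	newClusterGracePeriod = 7 * 24 * time.Hour
	oldClusterGracePeriod = 30 * 24 * time.Hour
)

// isNewCluster restates the check above: the cluster counts as new when the
// enforcer start time falls within an hour of the earliest completed_at
// value in system.migrations.
func isNewCluster(enforcerStart, minMigrationTime time.Time) bool {
	return enforcerStart.After(minMigrationTime.Add(-1*time.Hour)) &&
		enforcerStart.Before(minMigrationTime.Add(1*time.Hour))
}

func main() {
	start := time.Unix(1631494860, 0) // the same instant as ts1 in the new test
	fmt.Println(isNewCluster(start, start.Add(-59*time.Minute))) // true: 7-day grace period
	fmt.Println(isNewCluster(start, start.Add(-61*time.Minute))) // false: 30-day grace period
}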
diff --git a/pkg/server/license/enforcer_test.go b/pkg/server/license/enforcer_test.go
index eec6317842d6..52c75e69722c 100644
--- a/pkg/server/license/enforcer_test.go
+++ b/pkg/server/license/enforcer_test.go
@@ -40,14 +40,14 @@ func (m mockTelemetryStatusReporter) GetLastSuccessfulTelemetryPing() time.Time
 	return m.lastPingTime
 }
 
-func TestGracePeriodInitTSCache(t *testing.T) {
+func TestClusterInitGracePeriod_NoOverwrite(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
 
 	// This is the timestamp that we'll override the grace period init timestamp with.
 	// This will be set when bringing up the server.
 	ts1 := timeutil.Unix(1724329716, 0)
-	ts1End := ts1.Add(7 * 24 * time.Hour) // Calculate the end of the grace period based on ts1
+	ts1End := ts1.Add(30 * 24 * time.Hour) // Calculate the end of the grace period based on ts1
 
 	ctx := context.Background()
 	srv := serverutils.StartServerOnly(t, base.TestServerArgs{
@@ -76,7 +76,7 @@
 	require.Equal(t, ts2End, enforcer.GetClusterInitGracePeriodEndTS())
 	// Start the enforcer to read the timestamp from the KV.
 	enforcer.SetTelemetryStatusReporter(&mockTelemetryStatusReporter{lastPingTime: ts1})
-	err := enforcer.Start(ctx, srv.ClusterSettings(), srv.SystemLayer().InternalDB().(descs.DB), false /* initialStart */)
+	err := enforcer.Start(ctx, srv.ClusterSettings(), srv.SystemLayer().InternalDB().(descs.DB))
 	require.NoError(t, err)
 	require.Equal(t, ts1End, enforcer.GetClusterInitGracePeriodEndTS())
 
@@ -86,6 +86,63 @@
 	require.Equal(t, ts1End, srv.ApplicationLayer().ExecutorConfig().(sql.ExecutorConfig).LicenseEnforcer.GetClusterInitGracePeriodEndTS())
 }
 
+func TestClusterInitGracePeriod_NewClusterEstimation(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// This will be the start time of the enforcer for each unit test
+	ts1 := timeutil.Unix(1631494860, 0)
+
+	ctx := context.Background()
+	srv := serverutils.StartServerOnly(t, base.TestServerArgs{
+		Knobs: base.TestingKnobs{
+			Server: &server.TestingKnobs{
+				LicenseTestingKnobs: license.TestingKnobs{
+					Enable:            true,
+					OverrideStartTime: &ts1,
+				},
+			},
+		},
+	})
+	defer srv.Stopper().Stop(ctx)
+
+	for _, tc := range []struct {
+		desc                string
+		minSysMigrationTime time.Time
+		expGracePeriodEndTS time.Time
+	}{
+		{"init-2020", timeutil.Unix(1577836800, 0), ts1.Add(30 * 24 * time.Hour)},
+		{"init-5min-ago", ts1.Add(-5 * time.Minute), ts1.Add(7 * 24 * time.Hour)},
+		{"init-59min-ago", ts1.Add(-59 * time.Minute), ts1.Add(7 * 24 * time.Hour)},
+		{"init-1h1min-ago", ts1.Add(-61 * time.Minute), ts1.Add(30 * 24 * time.Hour)},
+	} {
+		t.Run(tc.desc, func(t *testing.T) {
+			enforcer := &license.Enforcer{}
+			enforcer.SetTestingKnobs(&license.TestingKnobs{
+				Enable:                            true,
+				OverrideStartTime:                 &ts1,
+				OverwriteClusterInitGracePeriodTS: true,
+			})
+
+			// Set up the min time in system.migrations. This is used by the enforcer
+			// to figure out if the cluster is new or old. The grace period length is
+			// adjusted based on this.
+			db := srv.SystemLayer().InternalDB().(descs.DB)
+			err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
+				_, err := txn.Exec(ctx, "add new min to system.migrations", txn.KV(),
+					"UPDATE system.migrations SET completed_at=$1 LIMIT 1",
+					tc.minSysMigrationTime)
+				return err
+			})
+			require.NoError(t, err)
+
+			err = enforcer.Start(ctx, srv.ClusterSettings(), srv.SystemLayer().InternalDB().(descs.DB))
+			require.NoError(t, err)
+			require.Equal(t, tc.expGracePeriodEndTS, enforcer.GetClusterInitGracePeriodEndTS())
+		})
+	}
+}
+
 func TestThrottle(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
diff --git a/pkg/server/server.go b/pkg/server/server.go
index 33b037d9a6c4..fad3ad9922d1 100644
--- a/pkg/server/server.go
+++ b/pkg/server/server.go
@@ -2096,7 +2096,6 @@ func (s *topLevelServer) PreStart(ctx context.Context) error {
 		s.stopper,
 		s.cfg.TestingKnobs,
 		orphanedLeasesTimeThresholdNanos,
-		s.InitialStart(),
 	); err != nil {
 		return err
 	}
diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go
index 61eaa163a765..c76258ffb5ac 100644
--- a/pkg/server/server_sql.go
+++ b/pkg/server/server_sql.go
@@ -1445,7 +1445,6 @@ func (s *SQLServer) preStart(
 	stopper *stop.Stopper,
 	knobs base.TestingKnobs,
 	orphanedLeasesTimeThresholdNanos int64,
-	initialStart bool,
 ) error {
 	// If necessary, start the tenant proxy first, to ensure all other
 	// components can properly route to KV nodes. The Start method will block
@@ -1763,7 +1762,7 @@
 	)
 	s.execCfg.SyntheticPrivilegeCache.Start(ctx)
 
-	s.startLicenseEnforcer(ctx, knobs, initialStart)
+	s.startLicenseEnforcer(ctx, knobs)
 
 	// Report a warning if the server is being shut down via the stopper
 	// before it was gracefully drained. This warning may be innocuous
@@ -1914,9 +1913,7 @@
 	s.diagnosticsReporter.PeriodicallyReportDiagnostics(ctx, s.stopper)
 }
 
-func (s *SQLServer) startLicenseEnforcer(
-	ctx context.Context, knobs base.TestingKnobs, initialStart bool,
-) {
+func (s *SQLServer) startLicenseEnforcer(ctx context.Context, knobs base.TestingKnobs) {
 	// Start the license enforcer. This is only started for the system tenant since
 	// it requires access to the system keyspace. For secondary tenants, this struct
 	// is shared to provide access to the values cached from the KV read.
@@ -1930,7 +1927,7 @@
 	// diagnostics reporter. This will be handled in CRDB-39991
 	err := startup.RunIdempotentWithRetry(ctx, s.stopper.ShouldQuiesce(),
 		"license enforcer start", func(ctx context.Context) error {
-			return licenseEnforcer.Start(ctx, s.cfg.Settings, s.internalDB, initialStart)
+			return licenseEnforcer.Start(ctx, s.cfg.Settings, s.internalDB)
 		})
 	// This is not a critical component. If it fails to start, we log a warning
 	// rather than prevent the entire server from starting.
diff --git a/pkg/server/tenant.go b/pkg/server/tenant.go
index 99e809d1fc9e..402ed52934b5 100644
--- a/pkg/server/tenant.go
+++ b/pkg/server/tenant.go
@@ -844,7 +844,6 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
 		s.stopper,
 		s.sqlServer.cfg.TestingKnobs,
 		orphanedLeasesTimeThresholdNanos,
-		false, /* initialStart */
 	); err != nil {
 		return err
 	}
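Design note (not part of the patch): maybeWriteClusterInitGracePeriodTS deliberately reads before writing instead of issuing a conditional put, because every server start needs the value to populate its cache and only the very first start of a new cluster writes it. A toy sketch of that read-first shape; the kv interface and readOrInit helper are hypothetical stand-ins, not CockroachDB APIs:

package main

import "fmt"

// kv is a deliberately tiny transaction interface used only for illustration.
type kv interface {
	Get(key string) ([]byte, error) // a nil slice means the key is absent
	Put(key string, val []byte) error
}

// readOrInit mirrors the structure of maybeWriteClusterInitGracePeriodTS:
// optimize for the common case where the key already exists.
func readOrInit(txn kv, key string, initVal []byte) ([]byte, error) {
	val, err := txn.Get(key)
	if err != nil {
		return nil, err
	}
	if val != nil {
		return val, nil // common case: read and cache the existing value
	}
	if err := txn.Put(key, initVal); err != nil { // rare case: first start of a new cluster
		return nil, err
	}
	return initVal, nil
}

// memKV is an in-memory stand-in for a transactional KV store.
type memKV map[string][]byte

func (m memKV) Get(key string) ([]byte, error)   { return m[key], nil }
func (m memKV) Put(key string, val []byte) error { m[key] = val; return nil }

func main() {
	txn := memKV{}
	v, _ := readOrInit(txn, "lic-gpi-ts", []byte("1724329716")) // first call initializes
	w, _ := readOrInit(txn, "lic-gpi-ts", []byte("ignored"))    // later calls read the existing value
	fmt.Println(string(v), string(w))
}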