Skip to content

Commit

Permalink
spill to disk based on flow-worker memory usage (#1231)
Browse files Browse the repository at this point in the history
the existing number of records based threshold is still present, both
can be active at the same time.

Best used with `GOMEMLIMIT` to keep heap more aligned with set memory
limit.
  • Loading branch information
heavycrystal authored Feb 9, 2024
1 parent 68e9ef1 commit 094b69d
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 11 deletions.
38 changes: 34 additions & 4 deletions flow/connectors/utils/cdc_records/cdc_records_storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"log/slog"
"math/big"
"os"
"runtime"
"time"

"github.com/cockroachdb/pebble"
Expand All @@ -34,6 +35,9 @@ type cdcRecordsStore struct {
flowJobName string
dbFolderName string
numRecordsSwitchThreshold int
memThresholdBytes uint64
thresholdReason string
memStats runtime.MemStats
}

func NewCDCRecordsStore(flowJobName string) *cdcRecordsStore {
Expand All @@ -43,7 +47,16 @@ func NewCDCRecordsStore(flowJobName string) *cdcRecordsStore {
numRecords: 0,
flowJobName: flowJobName,
dbFolderName: fmt.Sprintf("%s/%s_%s", os.TempDir(), flowJobName, shared.RandomString(8)),
numRecordsSwitchThreshold: peerdbenv.PeerDBCDCDiskSpillThreshold(),
numRecordsSwitchThreshold: peerdbenv.PeerDBCDCDiskSpillRecordsThreshold(),
memThresholdBytes: func() uint64 {
memPercent := peerdbenv.PeerDBCDCDiskSpillMemPercentThreshold()
maxMemBytes := peerdbenv.PeerDBFlowWorkerMaxMemBytes()
if memPercent > 0 && maxMemBytes > 0 {
return maxMemBytes * uint64(memPercent) / 100
}
return 0
}(),
thresholdReason: "",
}
}

Expand Down Expand Up @@ -72,15 +85,32 @@ func (c *cdcRecordsStore) initPebbleDB() error {
return nil
}

func (c *cdcRecordsStore) diskSpillThresholdsExceeded() bool {
if len(c.inMemoryRecords) >= c.numRecordsSwitchThreshold {
c.thresholdReason = fmt.Sprintf("more than %d primary keys read, spilling to disk",
c.numRecordsSwitchThreshold)
return true
}
if c.memThresholdBytes > 0 {
runtime.ReadMemStats(&c.memStats)

if c.memStats.Alloc >= c.memThresholdBytes {
c.thresholdReason = fmt.Sprintf("memalloc greater than %d bytes, spilling to disk",
c.memThresholdBytes)
return true
}
}
return false
}

func (c *cdcRecordsStore) Set(key *model.TableWithPkey, rec model.Record) error {
if key != nil {
_, ok := c.inMemoryRecords[*key]
if ok || len(c.inMemoryRecords) < c.numRecordsSwitchThreshold {
if ok || !c.diskSpillThresholdsExceeded() {
c.inMemoryRecords[*key] = rec
} else {
if c.pebbleDB == nil {
slog.Info(fmt.Sprintf("more than %d primary keys read, spilling to disk",
c.numRecordsSwitchThreshold),
slog.Info(c.thresholdReason,
slog.String(string(shared.FlowNameKey), c.flowJobName))
err := c.initPebbleDB()
if err != nil {
Expand Down
18 changes: 14 additions & 4 deletions flow/peerdbenv/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,19 @@ func PeerDBCDCIdleTimeoutSeconds(providedValue int) time.Duration {
return time.Duration(x) * time.Second
}

// PEERDB_CDC_DISK_SPILL_THRESHOLD
func PeerDBCDCDiskSpillThreshold() int {
return getEnvInt("PEERDB_CDC_DISK_SPILL_THRESHOLD", 1_000_000)
// PEERDB_CDC_DISK_SPILL_RECORDS_THRESHOLD
func PeerDBCDCDiskSpillRecordsThreshold() int {
return getEnvInt("PEERDB_CDC_DISK_SPILL_RECORDS_THRESHOLD", 1_000_000)
}

// PEERDB_CDC_DISK_SPILL_RECORDS_THRESHOLD, negative numbers means memory threshold disabled
func PeerDBCDCDiskSpillMemPercentThreshold() int {
return getEnvInt("PEERDB_CDC_DISK_SPILL_MEM_PERCENT_THRESHOLD", -1)
}

// GOMEMLIMIT is a variable internal to Golang itself, we use this for internal targets, 0 means no maximum
func PeerDBFlowWorkerMaxMemBytes() uint64 {
return getEnvUint[uint64]("GOMEMLIMIT", 0)
}

// PEERDB_CATALOG_HOST
Expand All @@ -53,7 +63,7 @@ func PeerDBCatalogHost() string {

// PEERDB_CATALOG_PORT
func PeerDBCatalogPort() uint32 {
return getEnvUint32("PEERDB_CATALOG_PORT", 5432)
return getEnvUint[uint32]("PEERDB_CATALOG_PORT", 5432)
}

// PEERDB_CATALOG_USER
Expand Down
9 changes: 6 additions & 3 deletions flow/peerdbenv/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package peerdbenv
import (
"os"
"strconv"

"golang.org/x/exp/constraints"
)

// GetEnv returns the value of the environment variable with the given name
Expand Down Expand Up @@ -32,18 +34,19 @@ func getEnvInt(name string, defaultValue int) int {
// getEnvUint32 returns the value of the environment variable with the given name
// or defaultValue if the environment variable is not set or is not a valid
// uint32 value.
func getEnvUint32(name string, defaultValue uint32) uint32 {
func getEnvUint[T constraints.Unsigned](name string, defaultValue T) T {
val, ok := getEnv(name)
if !ok {
return defaultValue
}

i, err := strconv.ParseUint(val, 10, 32)
// widest bit size, truncate later
i, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return defaultValue
}

return uint32(i)
return T(i)
}

// getEnvBool returns the value of the environment variable with the given name
Expand Down

0 comments on commit 094b69d

Please sign in to comment.