Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

spill to disk based on flow-worker memory usage #1231

Merged
merged 2 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions flow/connectors/utils/cdc_records/cdc_records_storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"log/slog"
"math/big"
"os"
"runtime"
"time"

"github.com/cockroachdb/pebble"
Expand All @@ -34,6 +35,9 @@ type cdcRecordsStore struct {
flowJobName string
dbFolderName string
numRecordsSwitchThreshold int
memThresholdBytes uint64
thresholdReason string
memStats runtime.MemStats
}

func NewCDCRecordsStore(flowJobName string) *cdcRecordsStore {
Expand All @@ -43,7 +47,16 @@ func NewCDCRecordsStore(flowJobName string) *cdcRecordsStore {
numRecords: 0,
flowJobName: flowJobName,
dbFolderName: fmt.Sprintf("%s/%s_%s", os.TempDir(), flowJobName, shared.RandomString(8)),
numRecordsSwitchThreshold: peerdbenv.PeerDBCDCDiskSpillThreshold(),
numRecordsSwitchThreshold: peerdbenv.PeerDBCDCDiskSpillRecordsThreshold(),
memThresholdBytes: func() uint64 {
memPercent := peerdbenv.PeerDBCDCDiskSpillMemPercentThreshold()
maxMemBytes := peerdbenv.PeerDBFlowWorkerMaxMemBytes()
if memPercent > 0 && maxMemBytes > 0 {
return maxMemBytes * uint64(memPercent) / 100
}
return 0
}(),
thresholdReason: "",
}
}

Expand Down Expand Up @@ -72,15 +85,32 @@ func (c *cdcRecordsStore) initPebbleDB() error {
return nil
}

func (c *cdcRecordsStore) diskSpillThresholdsExceeded() bool {
if len(c.inMemoryRecords) >= c.numRecordsSwitchThreshold {
c.thresholdReason = fmt.Sprintf("more than %d primary keys read, spilling to disk",
c.numRecordsSwitchThreshold)
return true
}
if c.memThresholdBytes > 0 {
runtime.ReadMemStats(&c.memStats)

if c.memStats.Alloc >= c.memThresholdBytes {
c.thresholdReason = fmt.Sprintf("memalloc greater than %d bytes, spilling to disk",
c.memThresholdBytes)
return true
}
}
return false
}

func (c *cdcRecordsStore) Set(key *model.TableWithPkey, rec model.Record) error {
if key != nil {
_, ok := c.inMemoryRecords[*key]
if ok || len(c.inMemoryRecords) < c.numRecordsSwitchThreshold {
if ok || !c.diskSpillThresholdsExceeded() {
c.inMemoryRecords[*key] = rec
} else {
if c.pebbleDB == nil {
slog.Info(fmt.Sprintf("more than %d primary keys read, spilling to disk",
c.numRecordsSwitchThreshold),
slog.Info(c.thresholdReason,
slog.String(string(shared.FlowNameKey), c.flowJobName))
err := c.initPebbleDB()
if err != nil {
Expand Down
18 changes: 14 additions & 4 deletions flow/peerdbenv/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,19 @@ func PeerDBCDCIdleTimeoutSeconds(providedValue int) time.Duration {
return time.Duration(x) * time.Second
}

// PEERDB_CDC_DISK_SPILL_THRESHOLD
func PeerDBCDCDiskSpillThreshold() int {
return getEnvInt("PEERDB_CDC_DISK_SPILL_THRESHOLD", 1_000_000)
// PEERDB_CDC_DISK_SPILL_RECORDS_THRESHOLD
func PeerDBCDCDiskSpillRecordsThreshold() int {
return getEnvInt("PEERDB_CDC_DISK_SPILL_RECORDS_THRESHOLD", 1_000_000)
}

// PEERDB_CDC_DISK_SPILL_RECORDS_THRESHOLD, negative numbers means memory threshold disabled
func PeerDBCDCDiskSpillMemPercentThreshold() int {
return getEnvInt("PEERDB_CDC_DISK_SPILL_MEM_PERCENT_THRESHOLD", -1)
}

// GOMEMLIMIT is a variable internal to Golang itself, we use this for internal targets, 0 means no maximum
func PeerDBFlowWorkerMaxMemBytes() uint64 {
return getEnvUint[uint64]("GOMEMLIMIT", 0)
}

// PEERDB_CATALOG_HOST
Expand All @@ -53,7 +63,7 @@ func PeerDBCatalogHost() string {

// PEERDB_CATALOG_PORT
func PeerDBCatalogPort() uint32 {
return getEnvUint32("PEERDB_CATALOG_PORT", 5432)
return getEnvUint[uint32]("PEERDB_CATALOG_PORT", 5432)
}

// PEERDB_CATALOG_USER
Expand Down
9 changes: 6 additions & 3 deletions flow/peerdbenv/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package peerdbenv
import (
"os"
"strconv"

"golang.org/x/exp/constraints"
)

// GetEnv returns the value of the environment variable with the given name
Expand Down Expand Up @@ -32,18 +34,19 @@ func getEnvInt(name string, defaultValue int) int {
// getEnvUint32 returns the value of the environment variable with the given name
// or defaultValue if the environment variable is not set or is not a valid
// uint32 value.
func getEnvUint32(name string, defaultValue uint32) uint32 {
func getEnvUint[T constraints.Unsigned](name string, defaultValue T) T {
val, ok := getEnv(name)
if !ok {
return defaultValue
}

i, err := strconv.ParseUint(val, 10, 32)
// widest bit size, truncate later
i, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return defaultValue
}

return uint32(i)
return T(i)
}

// getEnvBool returns the value of the environment variable with the given name
Expand Down
Loading