Skip to content

Commit

Permalink
Search service with local storage (#2701)
Browse files Browse the repository at this point in the history
* search service with local storage

* Unmount all references to volumeDuckDBIndex

* fix style

* Remove duckdb mount

* Add clean directory call randomly

* Fix missing env vars

* validate disk space

* profile only when running cleaning

* fix space validation

* remove extra profile step

* fix style and apply code review suggestion

* Apply code review suggestion
  • Loading branch information
AndreaFrancis authored Apr 16, 2024
1 parent 6de4bbf commit a489c0b
Show file tree
Hide file tree
Showing 36 changed files with 173 additions and 394 deletions.
12 changes: 1 addition & 11 deletions chart/env/prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -227,16 +227,6 @@ backfillRetryableErrors:
cpu: 2
memory: "8Gi"

cleanDuckdbIndexCache:
nodeSelector:
role-datasets-server: "true"
expiredTimeIntervalSeconds: 259_200 # 3 days

cleanDuckdbIndexDownloads:
nodeSelector:
role-datasets-server: "true"
expiredTimeIntervalSeconds: 259_200 # 3 days

postMessages:
nodeSelector:
role-datasets-server: "true"
Expand Down Expand Up @@ -387,7 +377,7 @@ search:
uvicornNumWorkers: "2"
nodeSelector:
role-datasets-server-search: "true"
replicas: 7
replicas: 4
service:
type: NodePort
ingress:
Expand Down
10 changes: 0 additions & 10 deletions chart/templates/_common/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -109,16 +109,6 @@ app.kubernetes.io/component: "{{ include "name" . }}-backfill"
app.kubernetes.io/component: "{{ include "name" . }}-backfill-retryable-errors"
{{- end -}}

{{- define "labels.cleanDuckdbIndexCache" -}}
{{ include "hf.labels.commons" . }}
app.kubernetes.io/component: "{{ include "name" . }}-clean-duckdb-cache"
{{- end -}}

{{- define "labels.cleanDuckdbIndexDownloads" -}}
{{ include "hf.labels.commons" . }}
app.kubernetes.io/component: "{{ include "name" . }}-clean-duckdb-downloads"
{{- end -}}

{{- define "labels.postMessages" -}}
{{ include "hf.labels.commons" . }}
app.kubernetes.io/component: "{{ include "name" . }}-post-messages"
Expand Down
21 changes: 0 additions & 21 deletions chart/templates/_initContainers/_initContainerDuckDBIndex.tpl

This file was deleted.

18 changes: 0 additions & 18 deletions chart/templates/_volumeMounts/_volumeMountDuckDBIndex.tpl

This file was deleted.

27 changes: 0 additions & 27 deletions chart/templates/cron-jobs/clean-duckdb-index-cache/_container.tpl

This file was deleted.

27 changes: 0 additions & 27 deletions chart/templates/cron-jobs/clean-duckdb-index-cache/job.yaml

This file was deleted.

This file was deleted.

27 changes: 0 additions & 27 deletions chart/templates/cron-jobs/clean-duckdb-index-downloads/job.yaml

This file was deleted.

3 changes: 0 additions & 3 deletions chart/templates/services/admin/_container.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
{{ include "envAssets" . | nindent 2 }}
{{ include "envCachedAssets" . | nindent 2 }}
{{ include "envParquetMetadata" . | nindent 2 }}
- name: DUCKDB_INDEX_CACHE_DIRECTORY
value: {{ .Values.duckDBIndex.cacheDirectory | quote }}
# service
- name: ADMIN_HF_ORGANIZATION
value: {{ .Values.admin.hfOrganization | quote }}
Expand All @@ -41,7 +39,6 @@
- name: ADMIN_UVICORN_PORT
value: {{ .Values.admin.uvicornPort | quote }}
volumeMounts:
{{ include "volumeMountDuckDBIndexRO" . | nindent 2 }}
{{ include "volumeMountParquetMetadataRO" . | nindent 2 }}
securityContext:
allowPrivilegeEscalation: false
Expand Down
2 changes: 0 additions & 2 deletions chart/templates/services/admin/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,10 @@ spec:
{{- include "dnsConfig" . | nindent 6 }}
{{- include "image.imagePullSecrets" . | nindent 6 }}
initContainers:
{{ include "initContainerDuckDBIndex" . | nindent 8 }}
{{ include "initContainerParquetMetadata" . | nindent 8 }}
containers: {{ include "containerAdmin" . | nindent 8 }}
nodeSelector: {{ toYaml .Values.admin.nodeSelector | nindent 8 }}
tolerations: {{ toYaml .Values.admin.tolerations | nindent 8 }}
volumes:
{{ include "volumeDuckDBIndex" . | nindent 8 }}
{{ include "volumeParquetMetadata" . | nindent 8 }}
securityContext: {{ include "securityContext" . | nindent 8 }}
6 changes: 4 additions & 2 deletions chart/templates/services/search/_container.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@
value: {{ .Values.duckDBIndex.targetRevision | quote }}
- name: DUCKDB_INDEX_CACHE_DIRECTORY
value: {{ .Values.duckDBIndex.cacheDirectory | quote }}
- name: DUCKDB_INDEX_CACHE_CLEAN_CACHE_PROBA
value: {{ .Values.search.cleanCacheProba | quote }}
- name: DUCKDB_INDEX_CACHE_EXPIRED_TIME_INTERVAL_SECONDS
value: {{ .Values.search.expiredTimeIntervalSeconds | quote }}
- name: DUCKDB_INDEX_EXTENSIONS_DIRECTORY
value: "/tmp/duckdb-extensions"
- name: HF_HUB_ENABLE_HF_TRANSFER
value: "1"
volumeMounts:
{{ include "volumeMountDuckDBIndexRW" . | nindent 2 }}
securityContext:
allowPrivilegeEscalation: false
readinessProbe:
Expand Down
4 changes: 0 additions & 4 deletions chart/templates/services/search/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@ spec:
spec:
{{- include "dnsConfig" . | nindent 6 }}
{{- include "image.imagePullSecrets" . | nindent 6 }}
initContainers:
{{ include "initContainerDuckDBIndex" . | nindent 8 }}
containers: {{ include "containerSearch" . | nindent 8 }}
nodeSelector: {{ toYaml .Values.search.nodeSelector | nindent 8 }}
tolerations: {{ toYaml .Values.search.tolerations | nindent 8 }}
volumes:
{{ include "volumeDuckDBIndex" . | nindent 8 }}
securityContext: {{ include "securityContext" . | nindent 8 }}
5 changes: 0 additions & 5 deletions chart/templates/storage-admin/_container.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,7 @@
image: {{ include "services.storageAdmin.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
volumeMounts:
{{ include "volumeMountDuckDBIndexRW" . | nindent 2 }}
{{ include "volumeMountParquetMetadataRW" . | nindent 2 }}
- mountPath: /volumes/duckdb-index
mountPropagation: None
name: volume-duckdb-index
readOnly: false
- mountPath: /volumes/parquet-metadata
mountPropagation: None
name: volume-parquet-metadata
Expand Down
2 changes: 0 additions & 2 deletions chart/templates/storage-admin/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,9 @@ spec:
labels: {{ include "labels.storageAdmin" . | nindent 8 }}
spec:
initContainers:
{{ include "initContainerDuckDBIndex" . | nindent 8 }}
{{ include "initContainerParquetMetadata" . | nindent 8 }}
containers: {{ include "containerStorageAdmin" . | nindent 8 }}
nodeSelector: {{ toYaml .Values.storageAdmin.nodeSelector | nindent 8 }}
tolerations: {{ toYaml .Values.storageAdmin.tolerations | nindent 8 }}
volumes:
{{ include "volumeDuckDBIndex" . | nindent 8 }}
{{ include "volumeParquetMetadata" . | nindent 8 }}
46 changes: 6 additions & 40 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,6 @@ uid: 1000
gid: 3000

persistence:
duckDBIndex:
existingClaim: ""
parquetMetadata:
existingClaim: ""

Expand Down Expand Up @@ -259,8 +257,8 @@ parquetMetadata:
storageDirectory: "/storage/parquet-metadata"

duckDBIndex:
# Directory on the shared storage (used to cache the duckdb files for /filter and /search)
cacheDirectory: "/storage/duckdb-index"
# Directory on the local storage (used to cache the duckdb files for /filter and /search)
cacheDirectory: "/tmp/duckdb-index"
# Directory on the worker (used temporarily to prepare the duckdb indexes before sending to the Hub)
workerDirectory: "/tmp/duckdb-index"
# the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`.
Expand Down Expand Up @@ -337,42 +335,6 @@ backfillRetryableErrors:
cpu: 0
tolerations: []

cleanDuckdbIndexCache:
enabled: true
log:
level: "info"
action: "clean-directory"
schedule: "0 */3 * * *"
# every 3 hours
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
# the time interval at which cached downloaded data files will be considered as expired and will be deleted
expiredTimeIntervalSeconds: 600
subfolderPattern: "cache/*"

cleanDuckdbIndexDownloads:
enabled: true
log:
level: "info"
action: "clean-directory"
schedule: "0 */3 * * *"
# every 3 hours
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
# the time interval at which cached downloaded data files will be considered as expired and will be deleted
expiredTimeIntervalSeconds: 600
subfolderPattern: "downloads/*"

postMessages:
enabled: true
log:
Expand Down Expand Up @@ -572,6 +534,10 @@ search:
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
# Probability of cleaning the downloads folder at each request.
cleanCacheProba: 0.05
# Retention period for downloads.
expiredTimeIntervalSeconds: 43_200 # 12 hours

nodeSelector: {}
replicas: 1
Expand Down
6 changes: 0 additions & 6 deletions e2e/tests/test_31_admin_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,6 @@ def test_metrics() -> None:
), f"responses_in_cache_total - cache kind {cache_kind} found in {metrics}"

# the disk usage metrics, on the other hand, are computed at runtime, so we should see them
assert has_metric(
name="duckdb_disk_usage",
labels={"type": "total", "pid": "[0-9]*"},
metric_names=metric_names,
), "duckdb_disk_usage"

assert has_metric(
name="parquet_metadata_disk_usage",
labels={"type": "total", "pid": "[0-9]*"},
Expand Down
Loading

0 comments on commit a489c0b

Please sign in to comment.