913 replace minio s3 with hdfs #930

Open: wants to merge 12 commits into base: master
5 changes: 4 additions & 1 deletion .ciux
@@ -28,10 +28,13 @@ dependencies:
dev: "true"
itest: "true"
release: "true"
- image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/stackable-hadoop:v24.11.0
labels:
itest: "true"
- image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/spark-py:k8s-3.4.1
labels:
build: "true"
- package: github.com/k8s-school/[email protected]rc1
- package: github.com/k8s-school/[email protected]rc4
labels:
itest: "optional"
- package: github.com/astrolabsoftware/finkctl/[email protected]
24 changes: 14 additions & 10 deletions .github/workflows/e2e-common.yml
@@ -6,28 +6,29 @@ on:
required: true
type: string
ci_repo:
required: true
description: 'Intermediate registry to use'
required: false
type: string
default: ""
runner:
required: true
type: string
kind_version:
required: true
description: 'Kind version to use'
required: false
type: string
default: "v0.20.0"
secrets:
registry_username:
required: true
registry_token:
required: true
private_registry_username:
required: true
private_registry_token:
required: true
env:
CIUX_VERSION: v0.0.4-rc9
CIUX_VERSION: v0.0.4-rc10
GHA_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
SUFFIX: ${{ inputs.suffix }}
CI_REPO: ${{ inputs.ci_repo }}
STORAGE: ${{ inputs.storage }}
# Override the self-hosted runner value
POD_NAMESPACE: default
jobs:
@@ -78,6 +79,9 @@ jobs:
name: docker-artifact
path: artifacts
integration-tests:
strategy:
matrix:
storage: [hdfs, s3]
name: Run integration tests
runs-on: ${{ fromJSON(inputs.runner) }}
outputs:
@@ -134,13 +138,13 @@
else
echo "Using pre-existing image from registry (See "Ciux project ignition" section)"
fi
- name: Run argoCD
run: |
./e2e/argocd.sh
# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3
# with:
# detached: true
- name: Run argoCD
run: |
./e2e/argocd.sh -S "${{ matrix.storage }}"
- name: Check results
run: |
./e2e/check-results.sh
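
The matrix legs above only differ by the `-S` flag passed to the test script, so a single leg can presumably be reproduced outside the workflow like this (paths and flag taken from the steps above):

```sh
# Reproduce the hdfs leg of the integration-tests matrix locally
./e2e/argocd.sh -S hdfs
./e2e/check-results.sh
```
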
4 changes: 0 additions & 4 deletions .github/workflows/e2e-gha.yml
@@ -12,11 +12,7 @@ jobs:
uses: ./.github/workflows/e2e-common.yml
with:
suffix: "noscience"
ci_repo: ""
runner: "['ubuntu-22.04']"
kind_version: "v0.20.0"
secrets:
registry_username: ${{ secrets.REGISTRY_USERNAME }}
registry_token: ${{ secrets.REGISTRY_TOKEN }}
private_registry_username: ${{ secrets.PRIVATE_REGISTRY_USERNAME }}
private_registry_token: ${{ secrets.PRIVATE_REGISTRY_TOKEN }}
1 change: 1 addition & 0 deletions Dockerfile
@@ -27,6 +27,7 @@ RUN apt-get update && \
apt install -y --no-install-recommends wget git apt-transport-https ca-certificates gnupg-agent apt-utils build-essential && \
rm -rf /var/cache/apt/*

# Download and install Spark dependencies listed in jars-urls.txt
ADD deps/jars-urls.txt $FINK_HOME/
RUN xargs -n 1 curl --fail --output-dir /opt/spark/jars -O < $FINK_HOME/jars-urls.txt

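
For context, the RUN step above feeds each line of deps/jars-urls.txt to curl; a single iteration looks roughly like this (the URL is illustrative, not taken from the real file):

```sh
# What the Dockerfile step does for each URL listed in jars-urls.txt
curl --fail --output-dir /opt/spark/jars -O \
  https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.4/hadoop-aws-3.2.4.jar
```
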
3 changes: 0 additions & 3 deletions TODO.argocd

This file was deleted.

72 changes: 40 additions & 32 deletions TODO.org
@@ -1,10 +1,42 @@
* DONE use gitlab@virtualdata as a CI repo
* DONE check fink-alert-simulator error message in CI:
⚠ fink-alert-simulator-cjxv2 main fink-alert-simulator-cjxv2 5m Error (exit code 1): pods "fink-alert-simulator-cjxv2" is forbidden: User "system:serviceaccount:argocd:default" cannot patch resource "pods" in API group "" in the namespace "argocd"
* DONE trigger ci for OOMkill
* 729
** DONE use "kubectl get kafkatopics.kafka.strimzi.io -n kafka" to check success of integration tests, maybe in fnkctl?
** TODO DELAYED BECAUSE IT DOES NOT BLOCK BUT ONLY WARNS: create the topic in distribute before sending alerts in order to avoid the error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
#+TITLE: current
* TODO hdfs operator management

** TODO limit and request for memory: monitor issue https://github.com/stackabletech/hdfs-operator/issues/625
** TODO open issue: zkfc on datanode is not compliant with memory setting
In the example below the memory limit is 256Mi for the nameNode in the hdfscluster CR, but it becomes 768Mi in each related pod because the `zkfc` container is not affected by the CR configuration.
This should be fixed because it prevents running the setup on CI platforms with low memory, like GitHub Actions for instance.

kubectl get -n hdfs hdfscluster simple-hdfs -o jsonpath='{.spec.nameNodes.config.resources}'
{"cpu":{"min":"0"},"memory":{"limit":"256Mi"}}

kubectl describe nodes | grep namenode
hdfs simple-hdfs-namenode-default-0 100m (0%) 1400m (1%) 768Mi (0%) 768Mi (0%) 34m
hdfs simple-hdfs-namenode-default-1 100m (0%) 1400m (1%) 768Mi (0%) 768Mi (0%) 31m

kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath='{.spec.containers[1].name}'
zkfc

kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath='{.spec.containers[1].resources}' | jq
{
"limits": {
"cpu": "400m",
"memory": "512Mi"
},
"requests": {
"cpu": "100m",
"memory": "512Mi"
}
}


** TODO management of argoCD default values (jqpath expression): monitor issue https://github.com/stackabletech/hdfs-operator/issues/626
** TODO open issue: be able to run only one dataNode on CI

** TODO Add helm option on HDFS cpu.min (also for operators!)
** TODO Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.

#+TITLE: previous
* TODO DELAYED BECAUSE IT DOES NOT BLOCK BUT ONLY WARNS: create the topic in distribute before sending alerts in order to avoid the error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
So it works with a user account, but I did not enable authorization in Kafka because fink-alert-simulator would no longer have been able to write to the topic without authentication.
12:28
I now get this error message:
@@ -15,41 +47,17 @@ In fact it is because the topic does not exist; it works if we relaunch the
Do you think we could pre-create the topics to avoid this problem, @JulienPeloton?
** DONE add user authentication in kafka https://stackoverflow.com/questions/65729535/how-to-do-i-connect-kafka-python-to-accept-username-and-password-for-jaas-like-i
* TODO Enable authZ in kafka (require authN setup in fink-alert-simulator)
* TODO [#B] distribute should wait for data to appear instead of crashing in connect_to_raw_database()
* TODO move nodeport to internal for svc kafka-cluster-kafka-external-bootstrap
* DONE improve final test in CI (check Kafka with fink-client https://github.com/astrolabsoftware/fink-client)
* TODO run code-check.sh in CI
* DONE add unit test for schema_converter
* TODO https://stackoverflow.com/questions/30385981/how-to-access-s3a-files-from-apache-spark
Document it (and add an SO post?):
Download the hadoop binary release: https://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-3.2.4/hadoop-3.2.4.tar.gz
Extract it and copy the jars:
cp ./share/hadoop/tools/lib/hadoop-aws-3.2.4.jar ~/src/k8s-spark-py/custom/jars
cp ./share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.901.jar ~/src/k8s-spark-py/custom/jars
// WARNING: packages are not deployed in the spark executors (see the sketch after this list)
// see https://stackoverflow.com/a/67299668/2784039
* TODO document hack to retrieve Maven URLs
kubectl logs stream2raw-py-f529af864f8dee60-driver | grep downlo | cut -d' ' -f2 > jars-urls.txt
OR add mvn dependency:copy-dependencies when building the image?
* TODO manage dependencies
What to do with:
1. hbase-spark-hbase2.4_spark3_scala2.12_hadoop3.2.jar
hbase-spark-protocol-shaded-hbase2.4_spark3_scala2.12_hadoop3.2.jar
which are both in k8s-spark-py/custom and fink-broker/libs (cf. FINK_JARS)
cf. Julien are they required?
2. custom/jars/commons-pool2-2.6.2.jar which was in k8s-spark-py/custom
* DONE document minio install and bucket creation:
curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o $HOME/minio-binaries/mc
chmod +x $HOME/minio-binaries/mc
export PATH=$PATH:$HOME/minio-binaries/
mc alias set s3 http://minio.minio:9000 minioadmin minioadmin
mc ls s3
mc mb s3/fink-broker-online
mc ls f1 --recursive fink-broker-online/
* TODO test removal of options below
* TODO test removal of the options below when using hdfs
+ --conf spark.driver.extraJavaOptions="-Divy.cache.dir=/tmp -Divy.home=/tmp" \
--conf spark.hadoop.fs.s3a.path.style.access=true \
+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
* DONE INSTALL MINIO https://min.io/docs/minio/kubernetes/upstream/index.html?
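
Regarding the s3a TODO above (hadoop-aws jars not reaching the executors): an alternative, sketched here under the assumption that Maven Central is reachable at submit time, is to let Spark resolve the connector itself. The versions are copied from the jars mentioned above and `my_job.py` is a placeholder:

```sh
# Sketch: pull the S3A connector via --packages so both driver and executors get it
spark-submit \
  --packages org.apache.hadoop:hadoop-aws:3.2.4,com.amazonaws:aws-java-sdk-bundle:1.11.901 \
  --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
  my_job.py
```
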
13 changes: 13 additions & 0 deletions chart/templates/_helpers.tpl
@@ -52,6 +52,7 @@ app.kubernetes.io/instance: {{ .Release.Name }}

{{/* Generate s3 configuration */}}
{{- define "fink.s3config" -}}
{{ if eq .Values.storage "s3" -}}
spark.hadoop.fs.s3a.endpoint: {{ .Values.s3.endpoint }}
spark.hadoop.fs.s3a.access.key: {{ .Values.s3.access_key }}
spark.hadoop.fs.s3a.secret.key: {{ .Values.s3.secret_key }}
@@ -62,7 +63,15 @@ spark.hadoop.fs.s3a.path.style.access: "true"
spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem"
{{- end }}
{{- end }}

{{/* Generate hdfs configuration */}}
{{- define "fink.hdfsconfig" -}}
{{ if eq .Values.storage "hdfs" -}}
- name: SPARK_USER
value: "{{ .Values.hdfs.hadoop_user_name }}"
{{- end }}
{{- end }}

{{/* Generate common configuration */}}
{{- define "fink.common" -}}
@@ -85,7 +94,11 @@ restartPolicy:
- '-log_level'
- '{{ .Values.log_level }}'
- '-online_data_prefix'
{{- if .Values.online_data_prefix }}
- '{{ .Values.online_data_prefix }}'
{{- else }}
- 's3a://{{ tpl .Values.s3.bucket . }}'
{{- end }}
- '-producer'
- '{{ .Values.producer }}'
- '-tinterval'
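
To see what these helpers actually inject, the chart can be rendered offline; a hedged check (the release name, chart path, and the assumption that default values render cleanly are not taken from this PR):

```sh
# Render the chart and inspect the storage-related parts of the output
helm template fink ./chart --set storage=hdfs \
  | grep -B1 -A1 -e online_data_prefix -e SPARK_USER
```
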
27 changes: 27 additions & 0 deletions chart/templates/job-hdfs-init.yaml
@@ -0,0 +1,27 @@
{{ if eq .Values.storage "hdfs" -}}
apiVersion: batch/v1
kind: Job
metadata:
name: hdfs-init
namespace: hdfs
annotations:
"helm.sh/hook": "pre-install"
spec:
template:
spec:
containers:
- name: hdfs-client
image: apache/hadoop:3.4.0
command: ["sh", "-c"]
args:
- |
hdfs dfs -fs $HDFS_URL -mkdir -p /user/185 && \
hdfs dfs -fs $HDFS_URL -chown 185:hdfs /user/185 && \
hdfs dfs -fs $HDFS_URL -chmod 700 /user/185
env:
- name: HDFS_URL
value: hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020
- name: HADOOP_USER_NAME
value: stackable
restartPolicy: OnFailure
{{- end }}
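
A quick way to confirm that this pre-install hook ran and created the Spark user's home directory; the `namenode` container name is inferred from the pod layout discussed in TODO.org, so treat this as a sketch:

```sh
# Wait for the init job, then list /user on the namenode as the stackable user
kubectl -n hdfs wait --for=condition=complete job/hdfs-init --timeout=120s
kubectl -n hdfs exec simple-hdfs-namenode-default-0 -c namenode -- \
  env HADOOP_USER_NAME=stackable hdfs dfs -ls /user
```
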
2 changes: 2 additions & 0 deletions chart/templates/spark-fink-distribution.yaml
@@ -23,6 +23,7 @@ spec:
driver:
cores: {{ tpl .Values.distribution.cores . }}
coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
env: {{- include "fink.hdfsconfig" . | nindent 6 }}
memory: "{{ tpl .Values.distribution.memory . }}"
javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
labels:
@@ -31,6 +32,7 @@
executor:
cores: {{ tpl .Values.distribution.cores . }}
coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
env: {{- include "fink.hdfsconfig" . | nindent 6 }}
memory: "{{ tpl .Values.distribution.memory . }}"
instances: {{ tpl .Values.distribution.instances . }}
javaOptions: "-Djava.security.auth.login.config=/etc/fink-broker/kafka-jaas.conf -Dcom.amazonaws.sdk.disableCertChecking=true"
2 changes: 2 additions & 0 deletions chart/templates/spark-fink-raw2science.yaml
@@ -12,6 +12,7 @@ spec:
driver:
cores: {{ tpl .Values.raw2science.cores . }}
coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
env: {{- include "fink.hdfsconfig" . | nindent 6 }}
memory: "{{ tpl .Values.raw2science.memory . }}"
javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
labels:
@@ -20,6 +21,7 @@
executor:
cores: {{ tpl .Values.raw2science.cores . }}
coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
env: {{- include "fink.hdfsconfig" . | nindent 6 }}
memory: "{{ tpl .Values.raw2science.memory . }}"
javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
instances: {{ tpl .Values.raw2science.instances . }}
2 changes: 2 additions & 0 deletions chart/templates/spark-fink-stream2raw.yaml
@@ -21,6 +21,7 @@ spec:
driver:
cores: {{ tpl .Values.distribution.cores . }}
coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
env: {{- include "fink.hdfsconfig" . | nindent 6 }}
memory: "{{ tpl .Values.stream2raw.memory . }}"
labels:
version: 3.4.1
@@ -29,6 +30,7 @@
executor:
cores: {{ tpl .Values.distribution.cores . }}
coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
env: {{- include "fink.hdfsconfig" . | nindent 6 }}
memory: "{{ tpl .Values.stream2raw.memory . }}"
instances: {{ tpl .Values.distribution.instances . }}
javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
4 changes: 0 additions & 4 deletions chart/values-ci-noscience.yaml
@@ -9,10 +9,6 @@ instances: 1

fink_trigger_update: "2"

# Can be overriden using --image option

# Default to s3a://<s3.bucket>
# online_data_prefix: s3a://fink-broker-online
producer: sims

log_level: INFO
10 changes: 8 additions & 2 deletions chart/values.yaml
@@ -13,15 +13,15 @@ image:
cores: 1
coreRequest: 0
instances: 1
memory: 1500m
memory: "1000m"
# instances: 1

fink_trigger_update: "2"

# Can be overriden using --image option

# Default to s3a://<s3.bucket>
# online_data_prefix: s3a://fink-broker-online
online_data_prefix: hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020///user/185
producer: sims

log_level: INFO
@@ -62,6 +62,9 @@ distribution:
schema: "/home/fink/fink-alert-schemas/ztf/distribution_schema_0p2.avsc"
substream_prefix: "fink_"


storage: hdfs

#
# Parameters used to access the S3 bucket
#
@@ -73,6 +76,9 @@ s3:
access_key: "minio"
secret_key: "minio123"

hdfs:
hadoop_user_name: "185"

serviceAccount:
# Specifies whether a service account should be created
create: true
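
With the new `storage` key defaulting to `hdfs` and `online_data_prefix` now pointing at the HDFS namenode, switching a deployment back to S3 should only require overriding both values; a sketch, assuming the chart is installed directly with Helm under a hypothetical release name:

```sh
# Deploy with the S3/MinIO backend instead of the default HDFS one
helm install fink ./chart \
  --set storage=s3 \
  --set online_data_prefix=s3a://fink-broker-online
```
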
2 changes: 1 addition & 1 deletion doc/release.md
@@ -17,7 +17,7 @@ Url for the CI is: https://github.com/astrolabsoftware/fink-broker/actions
ciux get deps ./fink-broker -l release
```

Clone all the necessary repositories and ensure you are using their `main` branch.
Clone all the necessary repositories and ensure you are using their `master/main` branch.

## Get Release Tag
