diff --git a/.ciux b/.ciux
index 1e7af2ce..469da244 100644
--- a/.ciux
+++ b/.ciux
@@ -28,10 +28,13 @@ dependencies:
       dev: "true"
       itest: "true"
       release: "true"
+  - image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/stackable-hadoop:v24.11.0
+    labels:
+      itest: "true"
   - image: gitlab-registry.in2p3.fr/astrolabsoftware/fink/spark-py:k8s-3.4.1
     labels:
       build: "true"
-  - package: github.com/k8s-school/ktbx@v1.1.4-rc1
+  - package: github.com/k8s-school/ktbx@v1.1.4-rc4
    labels:
      itest: "optional"
  - package: github.com/astrolabsoftware/finkctl/v3@v3.1.3-rc1
diff --git a/.github/workflows/e2e-common.yml b/.github/workflows/e2e-common.yml
index 8c9f5395..98e40a50 100644
--- a/.github/workflows/e2e-common.yml
+++ b/.github/workflows/e2e-common.yml
@@ -6,28 +6,29 @@ on:
         required: true
         type: string
       ci_repo:
-        required: true
+        description: 'Intermediate registry to use'
+        required: false
         type: string
+        default: ""
       runner:
         required: true
         type: string
       kind_version:
-        required: true
+        description: 'Kind version to use'
+        required: false
        type: string
+        default: "v0.20.0"
     secrets:
       registry_username:
         required: true
       registry_token:
         required: true
-      private_registry_username:
-        required: true
-      private_registry_token:
-        required: true
 env:
-  CIUX_VERSION: v0.0.4-rc9
+  CIUX_VERSION: v0.0.4-rc10
   GHA_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   SUFFIX: ${{ inputs.suffix }}
   CI_REPO: ${{ inputs.ci_repo }}
+  STORAGE: ${{ inputs.storage }}
   # Override the self-hosted runner value
   POD_NAMESPACE: default
 jobs:
@@ -78,6 +79,9 @@ jobs:
           name: docker-artifact
           path: artifacts
   integration-tests:
+    strategy:
+      matrix:
+        storage: [hdfs, s3]
     name: Run integration tests
     runs-on: ${{ fromJSON(inputs.runner) }}
     outputs:
@@ -134,13 +138,13 @@ jobs:
           else
             echo "Using pre-existing image from registry (See "Ciux project ignition" section)"
           fi
-      - name: Run argoCD
-        run: |
-          ./e2e/argocd.sh
       # - name: Setup tmate session
       #   uses: mxschmitt/action-tmate@v3
       #   with:
       #     detached: true
+      - name: Run argoCD
+        run: |
+          ./e2e/argocd.sh -S "${{ matrix.storage }}"
       - name: Check results
         run: |
           ./e2e/check-results.sh
diff --git a/.github/workflows/e2e-gha.yml b/.github/workflows/e2e-gha.yml
index 52874541..26dc2803 100644
--- a/.github/workflows/e2e-gha.yml
+++ b/.github/workflows/e2e-gha.yml
@@ -12,11 +12,7 @@ jobs:
     uses: ./.github/workflows/e2e-common.yml
     with:
       suffix: "noscience"
-      ci_repo: ""
       runner: "['ubuntu-22.04']"
-      kind_version: "v0.20.0"
     secrets:
       registry_username: ${{ secrets.REGISTRY_USERNAME }}
       registry_token: ${{ secrets.REGISTRY_TOKEN }}
-      private_registry_username: ${{ secrets.PRIVATE_REGISTRY_USERNAME }}
-      private_registry_token: ${{ secrets.PRIVATE_REGISTRY_TOKEN }}
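
The storage matrix above runs the integration tests once per backend, and the `-S` flag is what carries the selection down to `e2e/argocd.sh`. A minimal sketch for reproducing both matrix legs locally with the updated `e2e/run.sh` (assuming a kind cluster and the usual ciux prerequisites are already available):

```bash
# Run the (noscience by default) e2e suite once per storage backend
./e2e/run.sh -S hdfs
./e2e/run.sh -S s3
```
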
diff --git a/Dockerfile b/Dockerfile
index d5ba75bc..7e1eaa48 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,6 +27,7 @@ RUN apt-get update && \
     apt install -y --no-install-recommends wget git apt-transport-https ca-certificates gnupg-agent apt-utils build-essential && \
     rm -rf /var/cache/apt/*
 
+# Download and install Spark dependencies listed in jars-urls.txt
 ADD deps/jars-urls.txt $FINK_HOME/
 RUN xargs -n 1 curl --fail --output-dir /opt/spark/jars -O < $FINK_HOME/jars-urls.txt
 
diff --git a/TODO.argocd b/TODO.argocd
deleted file mode 100644
index a35dfb35..00000000
--- a/TODO.argocd
+++ /dev/null
@@ -1,3 +0,0 @@
-- check https://stackoverflow.com/questions/78922618/how-to-enforce-sync-order-in-argocd-app-of-apps-pattern and argocd issue
-- WIP4 upgrade spark-operator chart to 2.0.0-rc0 and remove prereq from fink-cd
-- check https://github.com/argoproj/argocd-example-apps/tree/master/helm-dependency to install spark-operator, and other operator
diff --git a/TODO.org b/TODO.org
index eac8e4f5..2da5df81 100644
--- a/TODO.org
+++ b/TODO.org
@@ -1,10 +1,42 @@
-* DONE use gitlab@virtualdata as a CI repo
-* DONE check fink-alert-simulator error message in CI:
-  ⚠ fink-alert-simulator-cjxv2  main  fink-alert-simulator-cjxv2  5m  Error (exit code 1): pods "fink-alert-simulator-cjxv2" is forbidden: User "system:serviceaccount:argocd:default" cannot patch resource "pods" in API group "" in the namespace "argocd"
-* DONE trigger ci for OOMkill
-* 729
-** DONE use "kubectl get kafkatopics.kafka.strimzi.io -n kafka" to check success of integration tests, maybe in fnkctl?
-** TODO DELAYED BECAUSE IT NOT BLOCKS BUT WARN create topic in distribute before sending alerts in order to avoid error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
+#+TITLE: current
+* TODO hdfs operator management
+
+** TODO limit and request for memory: monitor issue https://github.com/stackabletech/hdfs-operator/issues/625
+** TODO open issue: zkfc on namenode is not compliant with memory setting
+In the example below the memory limit is 256Mi for the nameNode in the hdfscluster CR, but it becomes 768Mi in each related pod because the `zkfc` container is not impacted by the CR configuration.
+This should be fixed because it prevents running the setup on CI platforms with low memory, like GitHub Actions for instance.
+
+kubectl get -n hdfs hdfscluster simple-hdfs -o yaml -o jsonpath -o=jsonpath='{.spec.nameNodes.config.resources}'
+{"cpu":{"min":"0"},"memory":{"limit":"256Mi"}}
+
+kubectl describe nodes | grep namenode
+  hdfs        simple-hdfs-namenode-default-0    100m (0%)   1400m (1%)  768Mi (0%)  768Mi (0%)  34m
+  hdfs        simple-hdfs-namenode-default-1    100m (0%)   1400m (1%)  768Mi (0%)  768Mi (0%)  31m
+
+kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].name}'
+zkfc
+
+kubectl get pods -n hdfs simple-hdfs-namenode-default-0 -o jsonpath -o=jsonpath='{.spec.containers[1].resources}' | jq
+{
+  "limits": {
+    "cpu": "400m",
+    "memory": "512Mi"
+  },
+  "requests": {
+    "cpu": "100m",
+    "memory": "512Mi"
+  }
+}
+
+
+** TODO management of argoCD default values (jqpath expression): monitor issue https://github.com/stackabletech/hdfs-operator/issues/626
+** TODO open issue: be able to run only one dataNode on CI
+
+** TODO Add helm option on HDFS cpu.min (also for operators!)
+** TODO Move fink image to docker.stackable.tech/stackable/hadoop:3.3.6-stackable24.11.
+
+#+TITLE: previous
+* TODO DELAYED (IT DOES NOT BLOCK, ONLY WARNS): create topic in distribute before sending alerts in order to avoid the error below: https://fink-broker.slack.com/archives/D03KJ390F17/p1692008729660549
 Du coup ça fonctionne avec un compte utilisateur, par contre j'ai pas activé les autorisations dans kafka car le fink-alert-simulator aurait pu plus écrire dans le topic sans authentification.
 12 h 28
 J'ai maintenant ce message d'erreur:
@@ -15,24 +47,9 @@ En fait c'est du au fait que le topic existe pas, ça fonctionne si on relance l
 Tu crois qu'on pourrais pré-créer les topic pour éviter ce problème @JulienPeloton ?
-** DONE add user authentication in kafka https://stackoverflow.com/questions/65729535/how-to-do-i-connect-kafka-python-to-accept-username-and-password-for-jaas-like-i
 * TODO Enable authZ in kafka (require authN setup in fink-alert-simulator)
-* TODO [#B] distribute should wait for data to appear instead of crashing in connect_to_raw_database()
 * TODO move nodeport to internal for svc kafka-cluster-kafka-external-bootstrap
-* DONE improve final test in CI (check Kafka with fink-client https://github.com/astrolabsoftware/fink-client)
 * TODO run code-check.sh in CI
-* DONE add unit test for schema_converter
-* TODO https://stackoverflow.com/questions/30385981/how-to-access-s3a-files-from-apache-spark
-Document +add SO post?:
-Download hadoop binary release: https://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-3.2.4/hadoop-3.2.4.tar.gz
-extract and copy jar:
-  fjammes@clrinfopo18  ~/Downloads/hadoop-3.2.4  cp ./share/hadoop/tools/lib/hadoop-aws-3.2.4.jar ~/src/k8s-spark-py/custom/jars
-  fjammes@clrinfopo18  ~/Downloads/hadoop-3.2.4  cp ./share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.901.jar ~/src/k8s-spark-py/custom/jars
-  // WARNING package are not deployed in spark-executor
-  // see https://stackoverflow.com/a/67299668/2784039
-* TODO document hack to retrieve Maven URLs
-kubectl logs stream2raw-py-f529af864f8dee60-driver | grep downlo | cut -d' ' -f2 > jars-urls.txt
-OR add mnv copy:dependencies when building the image?
 * TODO manage dependencies
 What to do with:
 1. hbase-spark-hbase2.4_spark3_scala2.12_hadoop3.2.jar
@@ -40,16 +57,7 @@ hbase-spark-protocol-shaded-hbase2.4_spark3_scala2.12_hadoop3.2.jar
 which are both in k8s-spark-py/custom and fink-broker/libs (cf. FINK_JARS)
 cf. Julien are they required?
 2. custom/jars/commons-pool2-2.6.2.jar which was in k8s-spark-py/custom
-* DONE document minio install and bucket creation:
-  5  curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o $HOME/minio-binaries/mc
-  6  chmod +x $HOME/minio-binaries/mc
-  15 export PATH=$PATH:$HOME/minio-binaries/
-  17 mc alias set s3 http://minio.minio:9000 minioadmin minioadmin
-  19 mc ls s3
-  27 mc mb s3/fink-broker-online
-  mc ls f1 --recursive fink-broker-online/
-* TODO test removal of options below
+* TODO test removal of options below when using hdfs
+  --conf spark.driver.extraJavaOptions="-Divy.cache.dir=/tmp -Divy.home=/tmp" \
   --conf spark.hadoop.fs.s3a.path.style.access=true \
+  --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
-* DONE INSTALL MINIO https://min.io/docs/minio/kubernetes/upstream/index.html?
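
Until the hdfs-operator issues tracked above are resolved, one possible workaround for the oversized `zkfc` sidecar could be a role-level `podOverrides` on the HdfsCluster CR. This is only an untested sketch; the exact field layout is an assumption based on the generic Stackable override mechanism and should be validated against the pinned operator version:

```bash
# Untested sketch: cap the zkfc sidecar of the namenode role via Stackable podOverrides
# (assumed field layout, to be validated against the deployed hdfs-operator version)
kubectl -n hdfs patch hdfscluster simple-hdfs --type merge --patch '
spec:
  nameNodes:
    podOverrides:
      spec:
        containers:
        - name: zkfc
          resources:
            limits:
              memory: 256Mi
            requests:
              memory: 256Mi
'
```
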
diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl
index 0cc22e35..f37207e5 100644
--- a/chart/templates/_helpers.tpl
+++ b/chart/templates/_helpers.tpl
@@ -52,6 +52,7 @@ app.kubernetes.io/instance: {{ .Release.Name }}
 
 {{/* Generate s3 configuration */}}
 {{- define "fink.s3config" -}}
+{{ if eq .Values.storage "s3" -}}
 spark.hadoop.fs.s3a.endpoint: {{ .Values.s3.endpoint }}
 spark.hadoop.fs.s3a.access.key: {{ .Values.s3.access_key }}
 spark.hadoop.fs.s3a.secret.key: {{ .Values.s3.secret_key }}
@@ -62,7 +63,15 @@ spark.hadoop.fs.s3a.path.style.access: "true"
 spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
 spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem"
 {{- end }}
+{{- end }}
+
+{{/* Generate hdfs configuration */}}
+{{- define "fink.hdfsconfig" -}}
+{{ if eq .Values.storage "hdfs" -}}
+- name: SPARK_USER
+  value: "{{ .Values.hdfs.hadoop_user_name }}"
+{{- end }}
+{{- end }}
 
 {{/* Generate common configuration */}}
 {{- define "fink.common" -}}
@@ -85,7 +94,11 @@ restartPolicy:
 - '-log_level'
 - '{{ .Values.log_level }}'
 - '-online_data_prefix'
+{{- if .Values.online_data_prefix }}
+- '{{ .Values.online_data_prefix }}'
+{{- else }}
 - 's3a://{{ tpl .Values.s3.bucket . }}'
+{{- end }}
 - '-producer'
 - '{{ .Values.producer }}'
 - '-tinterval'
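
A quick way to check which helper is rendered for a given backend is `helm template` against the chart defaults; a sketch, assuming the chart renders with its default values:

```bash
# hdfs: the SPARK_USER env var should be injected and the s3a settings dropped
helm template fink ./chart --set storage=hdfs | grep -B1 -A1 SPARK_USER

# s3: the s3a settings should be back and no SPARK_USER env var rendered
helm template fink ./chart --set storage=s3 | grep 'fs.s3a'
```
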
diff --git a/chart/templates/job-hdfs-init.yaml b/chart/templates/job-hdfs-init.yaml
new file mode 100644
index 00000000..4f966be6
--- /dev/null
+++ b/chart/templates/job-hdfs-init.yaml
@@ -0,0 +1,27 @@
+{{ if eq .Values.storage "hdfs" -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: hdfs-init
+  namespace: hdfs
+  annotations:
+    "helm.sh/hook": "pre-install"
+spec:
+  template:
+    spec:
+      containers:
+      - name: hdfs-client
+        image: apache/hadoop:3.4.0
+        command: ["sh", "-c"]
+        args:
+          - |
+            hdfs dfs -fs $HDFS_URL -mkdir -p /user/185 && \
+            hdfs dfs -fs $HDFS_URL -chown 185:hdfs /user/185 && \
+            hdfs dfs -fs $HDFS_URL -chmod 700 /user/185
+        env:
+        - name: HDFS_URL
+          value: hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020
+        - name: HADOOP_USER_NAME
+          value: stackable
+      restartPolicy: OnFailure
+{{- end }}
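
Since this job runs as a `pre-install` hook and is only rendered when `storage` is `hdfs`, it is worth confirming it completed before debugging the Spark applications; a possible check:

```bash
# The hook job should be in the hdfs namespace and report completion
kubectl get job -n hdfs hdfs-init
kubectl logs -n hdfs job/hdfs-init
```
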
diff --git a/chart/templates/spark-fink-distribution.yaml b/chart/templates/spark-fink-distribution.yaml
index 3fb5232e..3ac81084 100644
--- a/chart/templates/spark-fink-distribution.yaml
+++ b/chart/templates/spark-fink-distribution.yaml
@@ -23,6 +23,7 @@ spec:
   driver:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.distribution.memory . }}"
     javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
     labels:
@@ -31,6 +32,7 @@ spec:
   executor:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.distribution.coreRequest . }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.distribution.memory . }}"
     instances: {{ tpl .Values.distribution.instances . }}
     javaOptions: "-Djava.security.auth.login.config=/etc/fink-broker/kafka-jaas.conf -Dcom.amazonaws.sdk.disableCertChecking=true"
diff --git a/chart/templates/spark-fink-raw2science.yaml b/chart/templates/spark-fink-raw2science.yaml
index bdc56953..b08920da 100644
--- a/chart/templates/spark-fink-raw2science.yaml
+++ b/chart/templates/spark-fink-raw2science.yaml
@@ -12,6 +12,7 @@ spec:
   driver:
     cores: {{ tpl .Values.raw2science.cores . }}
     coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.raw2science.memory . }}"
     javaOptions: "-Divy.cache.dir=/tmp -Divy.home=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true"
     labels:
@@ -20,6 +21,7 @@ spec:
   executor:
     cores: {{ tpl .Values.raw2science.cores . }}
     coreRequest: "{{ tpl .Values.raw2science.coreRequest . }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.raw2science.memory . }}"
     javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
     instances: {{ tpl .Values.raw2science.instances . }}
diff --git a/chart/templates/spark-fink-stream2raw.yaml b/chart/templates/spark-fink-stream2raw.yaml
index c54a68a6..783bdb83 100644
--- a/chart/templates/spark-fink-stream2raw.yaml
+++ b/chart/templates/spark-fink-stream2raw.yaml
@@ -21,6 +21,7 @@ spec:
   driver:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.stream2raw.memory . }}"
     labels:
       version: 3.4.1
@@ -29,6 +30,7 @@ spec:
   executor:
     cores: {{ tpl .Values.distribution.cores . }}
     coreRequest: "{{ tpl .Values.stream2raw.coreRequest . }}"
+    env: {{- include "fink.hdfsconfig" . | nindent 6 }}
     memory: "{{ tpl .Values.stream2raw.memory . }}"
     instances: {{ tpl .Values.distribution.instances . }}
     javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true"
diff --git a/chart/values-ci-noscience.yaml b/chart/values-ci-noscience.yaml
index 3a7c0dc4..1d2ce651 100644
--- a/chart/values-ci-noscience.yaml
+++ b/chart/values-ci-noscience.yaml
@@ -9,10 +9,6 @@ instances: 1
 
 fink_trigger_update: "2"
 
-# Can be overriden using --image option
-
-# Default to s3a://
-# online_data_prefix: s3a://fink-broker-online
 producer: sims
 
 log_level: INFO
diff --git a/chart/values.yaml b/chart/values.yaml
index 81e17cf8..115e00d5 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -13,7 +13,7 @@ image:
 cores: 1
 coreRequest: 0
 instances: 1
-memory: 1500m
+memory: "1000m"
 # instances: 1
 
 fink_trigger_update: "2"
@@ -21,7 +21,7 @@ fink_trigger_update: "2"
 # Can be overriden using --image option
 
 # Default to s3a://
-# online_data_prefix: s3a://fink-broker-online
+online_data_prefix: hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020///user/185
 producer: sims
 
 log_level: INFO
@@ -62,6 +62,9 @@ distribution:
   schema: "/home/fink/fink-alert-schemas/ztf/distribution_schema_0p2.avsc"
   substream_prefix: "fink_"
 
+
+storage: hdfs
+
 #
 # Parameters used to access the S3 bucket
 #
@@ -73,6 +76,9 @@ s3:
   access_key: "minio"
   secret_key: "minio123"
 
+hdfs:
+  hadoop_user_name: "185"
+
 serviceAccount:
   # Specifies whether a service account should be created
   create: true
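
With `storage: hdfs` and the HDFS `online_data_prefix` now the chart defaults, switching a deployment back to S3/MinIO only requires overriding two values; this mirrors what `e2e/argocd.sh -S s3` does, and an empty `online_data_prefix` falls back to the `s3a://<bucket>` default in `fink.common`. A sketch:

```bash
# Deploy the chart against S3/MinIO instead of the HDFS defaults
helm install fink ./chart -f ./chart/values-ci-noscience.yaml \
  --set storage=s3 \
  --set online_data_prefix=""
```
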
diff --git a/doc/release.md b/doc/release.md
index ecb6ad97..5708c430 100644
--- a/doc/release.md
+++ b/doc/release.md
@@ -17,7 +17,7 @@ Url for the CI is: https://github.com/astrolabsoftware/fink-broker/actions
   ciux get deps ./fink-broker -l release
   ```
 
-  Clone all the necessary repositories and ensure you are using their `main` branch.
+  Clone all the necessary repositories and ensure you are using their default branch (`master` or `main`).
 
 ## Get Release Tag
diff --git a/doc/troubleshoot.md b/doc/troubleshoot.md
index dfaf0e78..704d10da 100644
--- a/doc/troubleshoot.md
+++ b/doc/troubleshoot.md
@@ -1,6 +1,5 @@
 # Troubleshooting guide
 
-
 ## Run s5cmd (s3 client)
 
 From inside the k8s cluster:
@@ -22,4 +21,32 @@ kubectl run -it --rm s5cmd --image=peakcom/s5cmd --env AWS_ACCESS_KEY_ID=minio -
 ## Use --all if needed
 kubectl delete -n spark sparkapplication fink-broker-distribution
 argocd app sync fink-broker
+```
+
+## Debug fink-broker helm chart
+
+```shell
+cd fink-broker
+helm install --debug fink ./chart -f ./chart/values-ci-noscience.yaml --dry-run
+```
+
+## ArgoCD
+
+### Access argoCD web UI
+
+```bash
+kubectl port-forward -n argocd $(kubectl get pods --selector=app.kubernetes.io/name=argocd-server -n argocd --output=jsonpath="{.items..metadata.name}") 8080
+# Login is "admin", password is set to "password"; fix this in production
+kubectl -n argocd patch secret argocd-secret -p '{"stringData": {"admin.password": "$2a$10$rRyBsGSHK6.uc8fntPwVIuLVHgsAhAX7TcdrqW/RADU0uh7CaChLa", "admin.passwordMtime": "'$(date +%FT%T%Z)'" }}'
+```
+
+### Fine-tune "ignoreDifferences" field of an ArgoCD Application
+
+```bash
+# Install yq
+sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && sudo chmod +x /usr/bin/yq
+# Retrieve failedsyncmanifest.yaml file in ArgoCD web UI
+yq failedsyncmanifest.yaml -o json > failedsyncmanifest.json
+# Fine-tune 'jqPathExpressions'
+cat failedsyncmanifest.json | jq '.spec.versions[].additionalPrinterColumns | select(. == [])'
 ```
\ No newline at end of file
diff --git a/e2e/argocd.sh b/e2e/argocd.sh
index 2e2bd3df..27f2a968 100755
--- a/e2e/argocd.sh
+++ b/e2e/argocd.sh
@@ -9,6 +9,27 @@ set -euxo pipefail
 
 DIR=$(cd "$(dirname "$0")"; pwd -P)
 
+storage="hdfs"
+
+usage() {
+  cat << EOD
+Usage: $(basename "$0") [options]
+Available options:
+  -h          This message
+  -S          Storage to use (hdfs or s3)
+EOD
+}
+
+# Get the options
+while getopts hS: c ; do
+    case $c in
+        h) usage ; exit 0 ;;
+        S) storage="$OPTARG" ;;
+        \?) usage ; exit 2 ;;
+    esac
+done
+shift "$((OPTIND-1))"
+
 CIUXCONFIG=${CIUXCONFIG:-"$HOME/.ciux/ciux.sh"}
 . $CIUXCONFIG
@@ -36,11 +57,26 @@ e2e_enabled="true"
 argocd login --core
 kubectl config set-context --current --namespace="$NS"
 
+if [ $storage == "s3" ]
+then
+  hdfs_enabled="false"
+  s3_enabled="true"
+  online_data_prefix=""
+elif [ $storage == "hdfs" ]
+then
+  hdfs_enabled="true"
+  s3_enabled="false"
+  online_data_prefix="hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020///user/185"
+fi
+
+
 # Create fink app
 argocd app create fink --dest-server https://kubernetes.default.svc \
     --dest-namespace "$NS" \
     --repo https://github.com/astrolabsoftware/fink-cd.git \
     --path apps --revision "$FINK_CD_WORKBRANCH" \
+    -p s3.enabled="$s3_enabled" \
+    -p hdfs.enabled="$hdfs_enabled" \
     -p spec.source.targetRevision.default="$FINK_CD_WORKBRANCH" \
     -p spec.source.targetRevision.finkbroker="$FINK_BROKER_WORKBRANCH" \
     -p spec.source.targetRevision.finkalertsimulator="$FINK_ALERT_SIMULATOR_WORKBRANCH"
@@ -61,7 +97,9 @@ argocd app set fink-broker -p image.repository="$CIUX_IMAGE_REGISTRY" \
     -p e2e.enabled="$e2e_enabled" \
     -p image.tag="$CIUX_IMAGE_TAG" \
     -p log_level="DEBUG" \
-    -p night="20200101"
+    -p night="20200101" \
+    -p online_data_prefix="$online_data_prefix" \
+    -p storage="$storage"
 
 argocd app set fink-alert-simulator -p image.tag="$FINK_ALERT_SIMULATOR_VERSION"
 
@@ -76,7 +114,8 @@ argocd app wait -l app.kubernetes.io/part-of=fink,app.kubernetes.io/component=st
 # Sync fink-broker
 argocd app sync -l app.kubernetes.io/instance=fink
 
-if [ $e2e_enabled == "true" ]; then
+if [ $e2e_enabled == "true" ]
+then
   echo "Retrieve kafka secrets for e2e tests"
   while ! kubectl get secret fink-producer --namespace kafka
   do
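
For reference, `hdfs` stays the default backend in this script, so the previous S3 behaviour now has to be requested explicitly:

```bash
./e2e/argocd.sh           # deploys the stack with HDFS storage
./e2e/argocd.sh -S s3     # deploys the stack with S3/MinIO storage
```
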
diff --git a/e2e/run.sh b/e2e/run.sh
index ab7c7cda..fc58b8aa 100755
--- a/e2e/run.sh
+++ b/e2e/run.sh
@@ -21,7 +21,7 @@ usage () {
 
 SUFFIX="noscience"
 
-ciux_version=v0.0.4-rc8
+ciux_version=v0.0.4-rc10
 export CIUXCONFIG=$HOME/.ciux/ciux.sh
 
 src_dir=$DIR/..
@@ -30,25 +30,29 @@ build=false
 e2e=false
 monitoring=false
 push=false
+storage="hdfs"
+CIUX_IMAGE_URL="undefined"
 
 token="${TOKEN:-}"
 
 # Get options for suffix
-while getopts hcms opt; do
+while getopts hcmsS: opt; do
   case ${opt} in
-    s )
-      SUFFIX=""
-      ;;
+
     c )
       cleanup=true
       ;;
-    m )
-      monitoring=true
-      ;;
     h )
       usage
       exit 0
       ;;
+    m )
+      monitoring=true
+      ;;
+    s )
+      SUFFIX=""
+      ;;
+    S) storage="$OPTARG" ;;
     \? )
       usage
       exit 1
@@ -120,13 +124,14 @@ then
 fi
 
 $DIR/prereq-install.sh $monitoring_opt
+
 . $CIUXCONFIG
 if [ $CIUX_BUILD = true ]; then
     kind load docker-image $CIUX_IMAGE_URL --name "$cluster"
 fi
 
 echo "Run ArgoCD to install the whole fink e2e tests stack"
-$DIR/argocd.sh
+$DIR/argocd.sh -S "$storage"
 
 echo "Check the results of the tests."
 $DIR/check-results.sh
diff --git a/fink_broker/spark_utils.py b/fink_broker/spark_utils.py
index 0df4470b..088e477b 100644
--- a/fink_broker/spark_utils.py
+++ b/fink_broker/spark_utils.py
@@ -336,7 +336,17 @@ def connect_to_raw_database(basepath: str, path: str, latestfirst: bool) -> Data
         wait_sec = increase_wait_time(wait_sec)
 
     # Create a DF from the database
-    userschema = spark.read.parquet(basepath).schema
+    # We need to wait for the schema to be available
+    while True:
+        try:
+            userschema = spark.read.parquet(basepath).schema
+        except Exception as e:  # noqa: PERF203
+            _LOG.error("Error while reading %s, %s", basepath, e)
+            time.sleep(wait_sec)
+            wait_sec = increase_wait_time(wait_sec)
+            continue
+        else:
+            break
 
     df = (
         spark.readStream.format("parquet")
diff --git a/hdfs-tmp/README.md b/hdfs-tmp/README.md
new file mode 100644
index 00000000..c21051b4
--- /dev/null
+++ b/hdfs-tmp/README.md
@@ -0,0 +1,11 @@
+# Minio should be disabled in fink-cd!
+
+kubectl run --image apache/hadoop:3.4.0 hdfs-client -- sleep infinity
+
+kubectl exec -it hdfs-client bash
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -df
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -ls
+  export HADOOP_USER_NAME=stackable
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -touch /toto
+  hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:8020 -ls /
+
diff --git a/hdfs-tmp/install.sh b/hdfs-tmp/install.sh
new file mode 100755
index 00000000..8e076bd8
--- /dev/null
+++ b/hdfs-tmp/install.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+DIR=$(cd "$(dirname "$0")"; pwd -P)
+
+STCK_BIN="/tmp/stackable"
+
+# TODO move to a job inside argoCD
+echo "Create hdfs directory"
+$DIR/mkdir.sh
diff --git a/hdfs-tmp/mkdir.sh b/hdfs-tmp/mkdir.sh
new file mode 100755
index 00000000..b7cddb55
--- /dev/null
+++ b/hdfs-tmp/mkdir.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# This script creates a directory in HDFS and sets the owner to user 185
+
+set -euxo pipefail
+
+DIR=$(cd "$(dirname "$0")"; pwd -P)
+
+NS=hdfs
+
+timeout=300s
+
+# Wait for HDFS statefulset to be available
+# TODO improve this
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=hdfs --timeout=$timeout -n $NS
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=zookeeper --timeout=$timeout -n $NS
+sleep 60
+
+hdfs_url="hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.$NS:8020"
+
+# Check if pod hdfs-client exists
+if ! kubectl get -n "$NS" pod hdfs-client &> /dev/null; then
+    kubectl run -n "$NS" --image apache/hadoop:3.4.0 hdfs-client -- sleep infinity
+fi
+
+kubectl wait -n "$NS" --for=condition=ready pod/hdfs-client --timeout=$timeout
+
+kubectl exec -n "$NS" -it hdfs-client -- sh -c "export HADOOP_USER_NAME=stackable && \
+    hdfs dfs -fs $hdfs_url -mkdir -p /user/185 && \
+    hdfs dfs -fs $hdfs_url -chown 185:hdfs /user/185 && \
+    hdfs dfs -fs $hdfs_url -chmod 700 /user/185"
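
Once either `mkdir.sh` or the `hdfs-init` chart hook has run, the resulting layout can be checked from the same helper pod; a sketch reusing the namenode address from the script above:

```bash
# List the user directory created for the Spark (UID 185) user
kubectl exec -n hdfs hdfs-client -- sh -c \
  "export HADOOP_USER_NAME=stackable && \
   hdfs dfs -fs hdfs://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.hdfs:8020 -ls /user"
```
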
diff --git a/hdfs-tmp/test-hdfs.sh b/hdfs-tmp/test-hdfs.sh
new file mode 100755
index 00000000..3339692f
--- /dev/null
+++ b/hdfs-tmp/test-hdfs.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+set -euo pipefail
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Step 1: Create the webhdfs.yaml file
+cat <<EOF > $DIR/webhdfs.yaml
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: webhdfs
+  labels:
+    app: webhdfs
+spec:
+  replicas: 1
+  serviceName: webhdfs-svc
+  selector:
+    matchLabels:
+      app: webhdfs
+  template:
+    metadata:
+      labels:
+        app: webhdfs
+    spec:
+      containers:
+      - name: webhdfs
+        image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable0.0.0-dev
+        stdin: true
+        tty: true
+EOF
+
+# Step 2: Apply the StatefulSet and monitor progress
+echo "Applying webhdfs StatefulSet..."
+kubectl apply -f $DIR/webhdfs.yaml
+echo "Waiting for webhdfs pod to be ready..."
+kubectl rollout status --watch --timeout=5m statefulset/webhdfs
+
+# Step 3: Check the root directory status in HDFS (should be empty)
+echo "Checking root directory in HDFS..."
+kubectl exec -n default webhdfs-0 -- curl -s -XGET "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/?op=LISTSTATUS"
+
+# Step 4: Create a sample file for uploading
+echo "Creating sample file testdata.txt..."
+echo "This is a test file for HDFS upload." > $DIR/testdata.txt
+
+# Step 5: Copy the file to the helper pod
+echo "Copying testdata.txt to webhdfs pod..."
+kubectl cp -n default $DIR/testdata.txt webhdfs-0:/tmp
+
+# Step 6: Initiate a two-step PUT request to create the file in HDFS
+echo "Initiating file creation in HDFS (first step)..."
+create_response=$(kubectl exec -n default webhdfs-0 -- \
+curl -s -XPUT -T /tmp/testdata.txt "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=CREATE&noredirect=true")
+
+# Extract the location for the second PUT request
+location=$(echo "$create_response" | grep -o 'http://[^"]*')
+echo "Location for second PUT request: $location"
+
+# Step 7: Complete the file upload
+echo "Completing file creation in HDFS (second step)..."
+kubectl exec -n default webhdfs-0 -- curl -s -XPUT -T /tmp/testdata.txt "$location"
+
+# Step 8: Verify that the file has been created in HDFS
+echo "Re-checking root directory in HDFS to verify file creation..."
+kubectl exec -n default webhdfs-0 -- curl -s -XGET "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/?op=LISTSTATUS"
+
+# Step 9: Delete the file from HDFS to clean up
+echo "Deleting testdata.txt from HDFS..."
+kubectl exec -n default webhdfs-0 -- curl -s -XDELETE "http://simple-hdfs-namenode-default-0.simple-hdfs-namenode-default.default.svc.cluster.local:9870/webhdfs/v1/testdata.txt?user.name=stackable&op=DELETE"
+
+# Clean up local files
+rm $DIR/webhdfs.yaml $DIR/testdata.txt
+echo "Cleanup completed. HDFS testing script finished."