From 551a6f3f56e4257c271ddfb30fd598877ec00968 Mon Sep 17 00:00:00 2001 From: SteBaum Date: Mon, 19 Aug 2024 17:21:12 +0200 Subject: [PATCH] feat: node decommissioning --- playbooks/utils/decommissioning/README.md | 27 ++++++++++++ .../utils/decommissioning/excluded_nodes.yml | 7 ++++ ...hadoop-component-decommissioning-check.yml | 14 +++++++ .../hdfs_namenode_decomm_datanode.yml | 15 +++++++ ...arn_resourcemanager_decomm_nodemanager.yml | 15 +++++++ .../hadoop-decommissioning.yml | 9 ++++ roles/hdfs/namenode/tasks/config.yml | 1 + .../tasks/main.yml | 36 ++++++++++++++++ .../defaults/main.yml | 5 +++ .../tasks/main.yml | 41 +++++++++++++++++++ .../templates/dfs.exclude.j2 | 3 ++ .../defaults/main.yml | 6 +++ .../tasks/main.yml | 41 +++++++++++++++++++ .../templates/yarn.exclude.j2 | 3 ++ roles/yarn/resourcemanager/tasks/config.yml | 1 + tdp_vars_defaults/hdfs/hdfs.yml | 4 -- tdp_vars_defaults/yarn/yarn.yml | 4 -- 17 files changed, 224 insertions(+), 8 deletions(-) create mode 100644 playbooks/utils/decommissioning/README.md create mode 100644 playbooks/utils/decommissioning/excluded_nodes.yml create mode 100644 playbooks/utils/decommissioning/hadoop-component-decommissioning-check.yml create mode 100644 playbooks/utils/decommissioning/hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml create mode 100644 playbooks/utils/decommissioning/hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml create mode 100644 playbooks/utils/decommissioning/hadoop-decommissioning.yml create mode 100644 roles/utils/hadoop_decommissioning_check/tasks/main.yml create mode 100644 roles/utils/hdfs_namenode_decommissioning/defaults/main.yml create mode 100644 roles/utils/hdfs_namenode_decommissioning/tasks/main.yml create mode 100644 roles/utils/hdfs_namenode_decommissioning/templates/dfs.exclude.j2 create mode 100644 roles/utils/yarn_resourcemanager_decommissioning/defaults/main.yml create mode 100644 
roles/utils/yarn_resourcemanager_decommissioning/tasks/main.yml create mode 100644 roles/utils/yarn_resourcemanager_decommissioning/templates/yarn.exclude.j2 diff --git a/playbooks/utils/decommissioning/README.md b/playbooks/utils/decommissioning/README.md new file mode 100644 index 00000000..c4ad4040 --- /dev/null +++ b/playbooks/utils/decommissioning/README.md @@ -0,0 +1,27 @@ +# Node decommissioning + +The procedure followed is the same as the one described by [Cloudera](https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.1.4/administration/content/decommissioning-slave-nodes.html). + +Check Application, Nodemanager and Datanode statuses before starting a decommissioning process by executing the playbook `hadoop-component-decommissioning-check.yml`. This same playbook can be run several times after the decommissioning process has begun to see its status. + +To see which application is running on which node, execute the command `yarn app -status <application_id>` on a node with a Yarn client. + +## Yarn Nodemanager decommissioning + +Set the hostnames of the Nodemanagers to decommission in `yarn_nodemanagers_decommission` of the `excluded_nodes.yml` file, separated by commas, in the Yarn tdp_variables, then set the timeout for the graceful decommissioning. The node is decommissioned once all applications running on it have terminated, or after the timeout, in which case the applications still running on it are restarted on another node. The value `-1` means an infinite timeout. Then execute the playbook `hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml`. + +## HDFS Datanode decommissioning + +Set the hostnames of the Datanodes to decommission in `hdfs_datanodes_decommission` of the `excluded_nodes.yml` file, separated by commas, in the HDFS tdp_variables, then execute the playbook `hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml`. + +*NB*: the decommissioning of the HDFS datanode can take several hours depending on the size of the file system.
+ +## Hadoop decommissioning + +The playbook `hadoop-decommissioning.yml` executes both playbooks above and starts decommissioning the Yarn Nodemanager and the HDFS Datanode. Before that, it executes the `yarn_capacity_scheduler.yml` playbook to reconfigure the Yarn capacity scheduler. + +## Recommissioning a node + +For HDFS, just delete the node from `hdfs_datanodes_decommission` and execute the playbook `hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml`. + +Concerning Yarn, delete the node from `yarn_nodemanagers_decommission`, execute the playbook `hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml`, then restart the decommissioned Nodemanager with the playbook `yarn_nodemanager_restart.yml` and finally execute the same playbook `hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml` again. diff --git a/playbooks/utils/decommissioning/excluded_nodes.yml b/playbooks/utils/decommissioning/excluded_nodes.yml new file mode 100644 index 00000000..5313dda4 --- /dev/null +++ b/playbooks/utils/decommissioning/excluded_nodes.yml @@ -0,0 +1,7 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +hdfs_datanodes_decommission: [] +yarn_nodemanagers_decommission: [] +graceful_decommission_timeout_seconds: -1 diff --git a/playbooks/utils/decommissioning/hadoop-component-decommissioning-check.yml b/playbooks/utils/decommissioning/hadoop-component-decommissioning-check.yml new file mode 100644 index 00000000..3715247a --- /dev/null +++ b/playbooks/utils/decommissioning/hadoop-component-decommissioning-check.yml @@ -0,0 +1,14 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- name: Hadoop Yarn Nodemanager and HDFS Datanode check + hosts: hdfs_nn, yarn_rm + tasks: + - tosit.tdp.resolve: # noqa unnamed-task + node_name: hdfs_namenode, yarn_resourcemanager + - name: Print application, node and datastorage information + ansible.builtin.import_role: + name:
tosit.tdp.utils.hadoop_decommissioning_check + tasks_from: main + - ansible.builtin.meta: clear_facts # noqa unnamed-task diff --git a/playbooks/utils/decommissioning/hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml b/playbooks/utils/decommissioning/hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml new file mode 100644 index 00000000..9687f217 --- /dev/null +++ b/playbooks/utils/decommissioning/hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml @@ -0,0 +1,15 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- name: Hadoop HDFS datanode Decommissioning + hosts: hdfs_nn + vars_files: ../excluded_nodes.yml + tasks: + - tosit.tdp.resolve: # noqa unnamed-task + node_name: hdfs_namenode + - name: Decommission HDFS datanode + ansible.builtin.import_role: + name: tosit.tdp.utils.hdfs_namenode_decommissioning + tasks_from: main + - ansible.builtin.meta: clear_facts # noqa unnamed-task diff --git a/playbooks/utils/decommissioning/hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml b/playbooks/utils/decommissioning/hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml new file mode 100644 index 00000000..11666a17 --- /dev/null +++ b/playbooks/utils/decommissioning/hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml @@ -0,0 +1,15 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- name: Hadoop Yarn resourcemanager decommissioning + hosts: yarn_rm + vars_files: ../excluded_nodes.yml + tasks: + - tosit.tdp.resolve: # noqa unnamed-task + node_name: yarn_resourcemanager + - name: Decommission YARN nodemanager + ansible.builtin.import_role: + name: tosit.tdp.utils.yarn_resourcemanager_decommissioning + tasks_from: main + - ansible.builtin.meta: clear_facts # noqa unnamed-task diff --git a/playbooks/utils/decommissioning/hadoop-decommissioning.yml b/playbooks/utils/decommissioning/hadoop-decommissioning.yml new
file mode 100644 index 00000000..e6245c80 --- /dev/null +++ b/playbooks/utils/decommissioning/hadoop-decommissioning.yml @@ -0,0 +1,9 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- ansible.builtin.import_playbook: ../yarn_capacity_scheduler.yml +# Decommission Yarn nodemanager +- ansible.builtin.import_playbook: hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml +# Decommission HDFS datanode +- ansible.builtin.import_playbook: hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml diff --git a/roles/hdfs/namenode/tasks/config.yml b/roles/hdfs/namenode/tasks/config.yml index ef20a73c..e85ea574 100644 --- a/roles/hdfs/namenode/tasks/config.yml +++ b/roles/hdfs/namenode/tasks/config.yml @@ -64,3 +64,4 @@ owner: root group: root mode: "644" + force: false # the file will only be rendered if the destination does not exist diff --git a/roles/utils/hadoop_decommissioning_check/tasks/main.yml b/roles/utils/hadoop_decommissioning_check/tasks/main.yml new file mode 100644 index 00000000..20cc08c9 --- /dev/null +++ b/roles/utils/hadoop_decommissioning_check/tasks/main.yml @@ -0,0 +1,36 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- name: Check yarn node status + ansible.builtin.command: yarn node -list -all + register: node_output + become: true + become_user: yarn + changed_when: false + +- name: Print output of Yarn node status + ansible.builtin.debug: + msg: "{{ node_output.stdout }}" + +- name: Check running applications on Yarn + ansible.builtin.command: yarn app -list + register: app_output + become: true + become_user: yarn + changed_when: false + +- name: Print output of running Yarn applications + ansible.builtin.debug: + msg: "{{ app_output.stdout }}" + +- name: Check HDFS datanodes usage + ansible.builtin.command: hdfs dfsadmin -report + register: storage_output + become: true + become_user: hdfs + changed_when: false + +- name: Print output of HDFS datanodes usage + ansible.builtin.debug:
+ msg: "{{ storage_output.stdout }}" diff --git a/roles/utils/hdfs_namenode_decommissioning/defaults/main.yml b/roles/utils/hdfs_namenode_decommissioning/defaults/main.yml new file mode 100644 index 00000000..41467de4 --- /dev/null +++ b/roles/utils/hdfs_namenode_decommissioning/defaults/main.yml @@ -0,0 +1,5 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +excluded_nodes: "{{ hdfs_datanodes_decommission }}" diff --git a/roles/utils/hdfs_namenode_decommissioning/tasks/main.yml b/roles/utils/hdfs_namenode_decommissioning/tasks/main.yml new file mode 100644 index 00000000..816d1694 --- /dev/null +++ b/roles/utils/hdfs_namenode_decommissioning/tasks/main.yml @@ -0,0 +1,41 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- name: Render dfs.exclude file + ansible.builtin.template: + src: dfs.exclude.j2 + dest: "{{ hdfs_site['dfs.hosts.exclude'] }}" + owner: root + group: root + mode: "644" + +- name: Update exclude nodes file + ansible.builtin.lineinfile: + path: "{{ hdfs_site['dfs.hosts.exclude'] }}" + line: "{{ item | tosit.tdp.access_fqdn(hostvars) }}" + state: present + loop: "{{ excluded_nodes }}" + +- name: kinit hdfs NN + ansible.builtin.command: kinit -kt /etc/security/keytabs/nn.service.keytab nn/{{ inventory_hostname | tosit.tdp.access_fqdn(hostvars) }}@{{ realm }} + become: true + become_user: hdfs + changed_when: false + +- name: RefreshNodes + ansible.builtin.command: hdfs dfsadmin -refreshNodes + become: true + become_user: hdfs + changed_when: false + +- name: Check node status + ansible.builtin.command: hdfs dfsadmin -report -decommissioning + register: hdfs_output + become: true + become_user: hdfs + changed_when: false + +- name: Print output of node status + ansible.builtin.debug: + msg: "{{ hdfs_output.stdout }}" diff --git a/roles/utils/hdfs_namenode_decommissioning/templates/dfs.exclude.j2 b/roles/utils/hdfs_namenode_decommissioning/templates/dfs.exclude.j2 new file mode 100644 index
00000000..74c31d83 --- /dev/null +++ b/roles/utils/hdfs_namenode_decommissioning/templates/dfs.exclude.j2 @@ -0,0 +1,3 @@ +{% for dn in hdfs_datanodes_decommission %} +{{ dn }} +{% endfor %} diff --git a/roles/utils/yarn_resourcemanager_decommissioning/defaults/main.yml b/roles/utils/yarn_resourcemanager_decommissioning/defaults/main.yml new file mode 100644 index 00000000..e7ff8016 --- /dev/null +++ b/roles/utils/yarn_resourcemanager_decommissioning/defaults/main.yml @@ -0,0 +1,6 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +excluded_nodes: "{{ yarn_nodemanagers_decommission }}" +timeout_seconds: "{{ graceful_decommission_timeout_seconds }}" diff --git a/roles/utils/yarn_resourcemanager_decommissioning/tasks/main.yml b/roles/utils/yarn_resourcemanager_decommissioning/tasks/main.yml new file mode 100644 index 00000000..6ee11a1d --- /dev/null +++ b/roles/utils/yarn_resourcemanager_decommissioning/tasks/main.yml @@ -0,0 +1,41 @@ +# Copyright 2022 TOSIT.IO +# SPDX-License-Identifier: Apache-2.0 + +--- +- name: Render yarn.exclude file + ansible.builtin.template: + src: yarn.exclude.j2 + dest: "{{ yarn_site['yarn.resourcemanager.nodes.exclude-path'] }}" + owner: root + group: root + mode: "644" + +- name: Update exclude nodes file + ansible.builtin.lineinfile: + path: "{{ yarn_site['yarn.resourcemanager.nodes.exclude-path'] }}" + line: "{{ item | tosit.tdp.access_fqdn(hostvars) }}" + state: present + loop: "{{ excluded_nodes }}" + +- name: kinit yarn RM + ansible.builtin.command: kinit -kt /etc/security/keytabs/rm.service.keytab rm/{{ inventory_hostname | tosit.tdp.access_fqdn(hostvars) }}@{{ realm }} + become: true + become_user: yarn + changed_when: false + +- name: RefreshNodes + ansible.builtin.command: yarn rmadmin -refreshNodes -g "{{ timeout_seconds }}" -server + become: true + become_user: yarn + changed_when: false + +- name: Check node status + ansible.builtin.command: yarn node -list -all + register: yarn_output + become: true +
become_user: yarn + changed_when: false + +- name: Print output of node status + ansible.builtin.debug: + msg: "{{ yarn_output.stdout }}" diff --git a/roles/utils/yarn_resourcemanager_decommissioning/templates/yarn.exclude.j2 b/roles/utils/yarn_resourcemanager_decommissioning/templates/yarn.exclude.j2 new file mode 100644 index 00000000..656a54ce --- /dev/null +++ b/roles/utils/yarn_resourcemanager_decommissioning/templates/yarn.exclude.j2 @@ -0,0 +1,3 @@ +{% for nm in yarn_nodemanagers_decommission %} +{{ nm }} +{% endfor %} diff --git a/roles/yarn/resourcemanager/tasks/config.yml b/roles/yarn/resourcemanager/tasks/config.yml index 46c4f7e6..66a2ef2a 100644 --- a/roles/yarn/resourcemanager/tasks/config.yml +++ b/roles/yarn/resourcemanager/tasks/config.yml @@ -91,3 +91,4 @@ owner: root group: root mode: "644" + force: false # the file will only be rendered if the destination does not exist diff --git a/tdp_vars_defaults/hdfs/hdfs.yml b/tdp_vars_defaults/hdfs/hdfs.yml index cc1b3cd4..ce0b9173 100644 --- a/tdp_vars_defaults/hdfs/hdfs.yml +++ b/tdp_vars_defaults/hdfs/hdfs.yml @@ -95,7 +95,3 @@ hdfs_zkfc_heapsize: 1024m hdfs_namenode_data_dirs: "{{ hadoop_hdfs_dir }}/nn" hdfs_datanode_data_dirs: "/data/hdfs/dn" hdfs_journalnode_data_dirs: "{{ hadoop_hdfs_dir }}/jn" - -# HDFS DataNodes decommission -# List of DataNodes to decommission. See https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.1.4/administration/content/decommissioning-slave-nodes.html -hdfs_datanodes_decommission: [] diff --git a/tdp_vars_defaults/yarn/yarn.yml b/tdp_vars_defaults/yarn/yarn.yml index 980628c3..0f1fe713 100644 --- a/tdp_vars_defaults/yarn/yarn.yml +++ b/tdp_vars_defaults/yarn/yarn.yml @@ -197,7 +197,3 @@ yarn_nm_restart: "no" yarn_rm_restart: "no" yarn_ts_restart: "no" mapred_jhs_restart: "no" - -# YARN NodeManagers decommission -# List of NodeManagers to decommission. 
See https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.1.4/administration/content/decommissioning-slave-nodes.html -yarn_nodemanagers_decommission: []