Skip to content

Commit

Permalink
feat: node decommissioning
Browse files Browse the repository at this point in the history
  • Loading branch information
SteBaum committed Aug 19, 2024
1 parent 99b0d0d commit c16adc5
Show file tree
Hide file tree
Showing 14 changed files with 206 additions and 0 deletions.
19 changes: 19 additions & 0 deletions playbooks/utils/decommissioning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Node decommissioning

Before starting a decommissioning process, check the application, Nodemanager and Datanode statuses by executing the playbook `hadoop-component-check.yml`.

To see which node an application is running on, run `yarn app -status <application-id>` on a node that has the Yarn client installed.

## Nodemanager decommissioning

Set the hostnames of the Nodemanagers to decommission in `yarn_nodemanagers_decommission`, separated by commas, in the Yarn tdp_variables, then execute the playbook `hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml`.

## Datanode decommissioning

Set the hostnames of the Datanodes to decommission in `hdfs_datanodes_decommission`, separated by commas, in the HDFS tdp_variables, then execute the playbook `hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml`.

*NB*: the decommissioning of the HDFS datanode can take several hours depending on the size of the file system.

## Hadoop decommissioning

The playbook `hadoop-decommissioning.yml` executes both playbooks above and starts decommissioning the Yarn Nodemanager and the HDFS Datanode.
14 changes: 14 additions & 0 deletions playbooks/utils/decommissioning/hadoop-component-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Pre-flight play: runs the hadoop_check role on the NameNode and
# ResourceManager hosts so the operator can inspect the cluster state
# before starting a decommissioning.
- name: Hadoop Yarn Nodemanager and HDFS Datanode check
  hosts: hdfs_nn, yarn_rm
  tasks:
    # presumably loads the TDP variables of the listed nodes - confirm
    # against the tosit.tdp collection
    - tosit.tdp.resolve:  # noqa unnamed-task
        node_name: hdfs_namenode, yarn_resourcemanager

    - name: Print application, node and datastorage information
      ansible.builtin.import_role:
        tasks_from: main
        name: tosit.tdp.utils.hadoop_check

    # Clear the facts gathered for the hosts of this play.
    - ansible.builtin.meta: clear_facts  # noqa unnamed-task
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Runs the hdfs_namenode_decommissioning role on the NameNode hosts.
- name: Hadoop HDFS datanode Decommissioning
  hosts: hdfs_nn
  tasks:
    # presumably loads the TDP variables of the NameNode - confirm against
    # the tosit.tdp collection
    - tosit.tdp.resolve:  # noqa unnamed-task
        node_name: hdfs_namenode

    - name: Decommission HDFS datanode
      ansible.builtin.import_role:
        tasks_from: main
        name: tosit.tdp.utils.hdfs_namenode_decommissioning

    # Clear the facts gathered for the hosts of this play.
    - ansible.builtin.meta: clear_facts  # noqa unnamed-task
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Runs the yarn_resourcemanager_decommissioning role on the ResourceManager
# hosts.
- name: Hadoop Yarn resourcemanager decommissioning
  hosts: yarn_rm
  tasks:
    # presumably loads the TDP variables of the ResourceManager - confirm
    # against the tosit.tdp collection
    - tosit.tdp.resolve:  # noqa unnamed-task
        node_name: yarn_resourcemanager

    - name: Decommission YARN NM
      ansible.builtin.import_role:
        name: tosit.tdp.utils.yarn_resourcemanager_decommissioning
        tasks_from: main

    # Clear the facts gathered for the hosts of this play.
    - ansible.builtin.meta: clear_facts  # noqa unnamed-task
9 changes: 9 additions & 0 deletions playbooks/utils/decommissioning/hadoop-decommissioning.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Run the capacity scheduler playbook from the parent directory first
- ansible.builtin.import_playbook: ../yarn_capacity_scheduler.yml
# Decommission Yarn nodemanager
- ansible.builtin.import_playbook: hadoop-components-decommissioning/yarn_resourcemanager_decomm_nodemanager.yml
# Decommission HDFS datanode
- ansible.builtin.import_playbook: hadoop-components-decommissioning/hdfs_namenode_decomm_datanode.yml
1 change: 1 addition & 0 deletions roles/hdfs/namenode/tasks/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,4 @@
owner: root
group: root
mode: "644"
force: false # the file will only be rendered if the destination does not exist
36 changes: 36 additions & 0 deletions roles/utils/hadoop_check/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Status report for the operator, in three command/debug pairs:
#   1. state of every YARN node,
#   2. YARN application list,
#   3. HDFS dfsadmin report.
# Every command is flagged `changed_when: false` so the role never reports
# a change.
# NOTE(review): unlike the decommissioning roles, no kinit is performed here;
# confirm that the yarn and hdfs users already hold a valid Kerberos ticket
# on kerberized clusters.
- name: Check yarn node status
  ansible.builtin.command: yarn node -list -all
  register: node_output
  become: true
  become_user: yarn
  changed_when: false

- name: Print output of node status
  ansible.builtin.debug:
    var: node_output.stdout

- name: Check running applications on Yarn
  ansible.builtin.command: yarn app -list
  register: app_output
  become: true
  become_user: yarn
  changed_when: false

- name: Print output of running applications
  ansible.builtin.debug:
    var: app_output.stdout

- name: Check HDFS datanodes usage
  ansible.builtin.command: hdfs dfsadmin -report
  register: storage_output
  become: true
  become_user: hdfs
  changed_when: false

- name: Print output of HDFS datanodes usage
  ansible.builtin.debug:
    var: storage_output.stdout
5 changes: 5 additions & 0 deletions roles/utils/hdfs_namenode_decommissioning/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Hosts whose FQDN the role's tasks add to the HDFS exclude file; sourced
# from `hdfs_datanodes_decommission`.
# The tasks loop over this value, so it is presumably a list - confirm.
excluded_nodes: "{{ hdfs_datanodes_decommission }}"
41 changes: 41 additions & 0 deletions roles/utils/hdfs_namenode_decommissioning/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Decommission HDFS datanodes:
#   1. render the exclude file at the path set in `dfs.hosts.exclude`,
#   2. make sure the FQDN of every node in `excluded_nodes` is listed in it,
#   3. kinit with the NameNode service keytab,
#   4. run `hdfs dfsadmin -refreshNodes`,
#   5. print the report of decommissioning datanodes.
- name: Render dfs.exclude file
  ansible.builtin.template:
    src: dfs.exclude.j2
    dest: "{{ hdfs_site['dfs.hosts.exclude'] }}"
    owner: root
    group: root
    mode: "644"

- name: Update exclude nodes file
  ansible.builtin.lineinfile:
    path: "{{ hdfs_site['dfs.hosts.exclude'] }}"
    line: "{{ item | tosit.tdp.access_fqdn(hostvars) }}"
    state: present
  loop: "{{ excluded_nodes }}"

# NOTE(review): the loop above passes inventory names to `access_fqdn`, while
# this task passes `ansible_hostname`; confirm both resolve to the same host
# entry when the inventory name differs from the short hostname.
- name: kinit hdfs NN
  # Folded scalar: the two lines are joined with a single space.
  ansible.builtin.command: >-
    kinit -kt /etc/security/keytabs/nn.service.keytab
    nn/{{ ansible_hostname | tosit.tdp.access_fqdn(hostvars) }}@{{ realm }}
  become: true
  become_user: hdfs
  changed_when: false

- name: RefreshNodes
  ansible.builtin.command: hdfs dfsadmin -refreshNodes
  become: true
  become_user: hdfs
  changed_when: false

- name: Check node status
  ansible.builtin.command: hdfs dfsadmin -report -decommissioning
  register: hdfs_output
  become: true
  become_user: hdfs
  changed_when: false

- name: Print output of node status
  ansible.builtin.debug:
    var: hdfs_output.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{# One entry of hdfs_datanodes_decommission per line. -#}
{% for datanode in hdfs_datanodes_decommission %}
{{ datanode }}
{% endfor %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Hosts whose FQDN the role's tasks add to the YARN exclude file; sourced
# from `yarn_nodemanagers_decommission`.
# The tasks loop over this value, so it is presumably a list - confirm.
excluded_nodes: "{{ yarn_nodemanagers_decommission }}"
41 changes: 41 additions & 0 deletions roles/utils/yarn_resourcemanager_decommissioning/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2022 TOSIT.IO
# SPDX-License-Identifier: Apache-2.0

---
# Decommission YARN nodemanagers:
#   1. render the exclude file at the path set in
#      `yarn.resourcemanager.nodes.exclude-path`,
#   2. make sure the FQDN of every node in `excluded_nodes` is listed in it,
#   3. kinit with the ResourceManager service keytab,
#   4. run `yarn rmadmin -refreshNodes -g -server`,
#   5. print the state of all YARN nodes.
- name: Render yarn.exclude file
  ansible.builtin.template:
    src: yarn.exclude.j2
    dest: "{{ yarn_site['yarn.resourcemanager.nodes.exclude-path'] }}"
    owner: root
    group: root
    mode: "644"

- name: Update exclude nodes file
  ansible.builtin.lineinfile:
    path: "{{ yarn_site['yarn.resourcemanager.nodes.exclude-path'] }}"
    line: "{{ item | tosit.tdp.access_fqdn(hostvars) }}"
    state: present
  loop: "{{ excluded_nodes }}"

# NOTE(review): the loop above passes inventory names to `access_fqdn`, while
# this task passes `ansible_hostname`; confirm both resolve to the same host
# entry when the inventory name differs from the short hostname.
- name: kinit yarn RM
  # Folded scalar: the two lines are joined with a single space.
  ansible.builtin.command: >-
    kinit -kt /etc/security/keytabs/rm.service.keytab
    rm/{{ ansible_hostname | tosit.tdp.access_fqdn(hostvars) }}@{{ realm }}
  become: true
  become_user: yarn
  changed_when: false

- name: RefreshNodes
  ansible.builtin.command: yarn rmadmin -refreshNodes -g -server
  become: true
  become_user: yarn
  changed_when: false

- name: Check node status
  ansible.builtin.command: yarn node -list -all
  register: yarn_output
  become: true
  become_user: yarn
  changed_when: false

- name: Print output of node status
  ansible.builtin.debug:
    var: yarn_output.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{# One entry of yarn_nodemanagers_decommission per line. -#}
{% for nodemanager in yarn_nodemanagers_decommission %}
{{ nodemanager }}
{% endfor %}
1 change: 1 addition & 0 deletions roles/yarn/resourcemanager/tasks/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,4 @@
owner: root
group: root
mode: "644"
force: false # the file will only be rendered if the destination does not exist

0 comments on commit c16adc5

Please sign in to comment.