Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -281,18 +281,16 @@
{% endif %}

# UCX and OpenMPI auto-compilation disabled
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
{% if hostvars['localhost']['ucx_support'] %}
- echo "===== UCX Configuration ====="
- echo "UCX version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA UCX 1.20.0 (system default)"
{% endif %}
{% if hostvars['localhost']['openmpi_support'] %}
- echo "===== OpenMPI Configuration ====="
- echo "OpenMPI version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
- bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
{% endif %}

{% if ldms_support %}
- echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -282,19 +282,16 @@
{% endif %}

# UCX and OpenMPI auto-compilation disabled
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
{% if hostvars['localhost']['ucx_support'] %}
- echo "===== UCX Configuration ====="
- echo "UCX version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA UCX 1.20.0 (system default)"
{% endif %}
{% if hostvars['localhost']['openmpi_support'] %}
- echo "===== OpenMPI Configuration ====="
- echo "OpenMPI version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
- bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"

{% endif %}

{% if ldms_support %}
- echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -422,9 +422,7 @@
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{% if hostvars['localhost']['openmpi_support'] %}
- bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
{% endif %}

{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined and cloud_init_groups_dict[functional_group_name].runcmd is not none %}
Expand Down Expand Up @@ -513,17 +511,15 @@
- mount -av
{% endif %}
# UCX and OpenMPI auto-compilation disabled
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
{% if hostvars['localhost']['ucx_support'] %}
- echo "===== UCX Configuration ====="
- echo "UCX version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA UCX 1.20.0 (system default)"
{% endif %}
{% if hostvars['localhost']['openmpi_support'] %}
- echo "===== OpenMPI Configuration ====="
- echo "OpenMPI version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
{% endif %}

{% if ldms_support %}
- echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,9 +428,7 @@
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || echo "DOCA install failed (non-critical)"
- bash /usr/local/bin/configure-ib-network.sh || echo "IB network configuration failed (non-critical)"
{% if hostvars['localhost']['openmpi_support'] %}
- bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
{% endif %}

{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined and cloud_init_groups_dict[functional_group_name].runcmd is not none %}
Expand Down Expand Up @@ -515,17 +513,15 @@
- mount -av
{% endif %}
# UCX and OpenMPI auto-compilation disabled
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
# DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
{% if hostvars['localhost']['ucx_support'] %}
- echo "===== UCX Configuration ====="
- echo "UCX version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA UCX 1.20.0 (system default)"
{% endif %}
{% if hostvars['localhost']['openmpi_support'] %}
- echo "===== OpenMPI Configuration ====="
- echo "OpenMPI version specified in software_config.json (available for manual compilation)"
- echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
{% endif %}

{% if ldms_support %}
- echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,10 @@
- name: Display k8s cleanup information
ansible.builtin.debug:
msg: |
WARNING: This will delete K8s-related directories from NFS shares:
WARNING: This will delete K8s-related directories and files from NFS shares:
{% for mount in k8s_storage_mounts %}
Storage: {{ mount.name }} ({{ mount.mount_point }})
Directories:
{% for item in k8s_static_dirs_stat.results %}
{% if item.stat.exists and item.item.startswith(mount.mount_point) %}
- {{ item.item }} ({{ item.item | basename }})
Expand All @@ -141,6 +142,10 @@
{% else %}
Node IP directories: Skipped (k8s_cleanup_node_ips: false)
{% endif %}
Root-level files:
{% for file in k8s_cleanup_files %}
- {{ mount.mount_point }}/{{ file }}
{% endfor %}
{% endfor %}

CRITICAL WARNING: Deleting NFS shared data will affect ALL nodes!
Expand Down Expand Up @@ -173,19 +178,44 @@
when: k8s_cleanup_needed | default(false)
loop: "{{ k8s_all_cleanup_paths }}"

# Remove every file named in k8s_cleanup_files from every path in
# all_k8s_base_paths. The loop is the cartesian product of the two lists,
# so item.0 is a base path and item.1 is a file name.
- name: Delete K8s root-level files
  when: k8s_cleanup_needed | default(false)
  loop: "{{ all_k8s_base_paths | product(k8s_cleanup_files) | list }}"
  loop_control:
    label: "{{ item.0 }}/{{ item.1 }}"
  ansible.builtin.file:
    state: absent
    path: "{{ item.0 }}/{{ item.1 }}"
  # Best-effort: a failure on one file is not reported as a task failure.
  failed_when: false
  register: k8s_files_cleanup_result

- name: Display k8s cleanup completion message
ansible.builtin.debug:
msg: |
K8s-related cleanup completed.
{% for mount in k8s_storage_mounts %}
Storage: {{ mount.name }} ({{ mount.mount_point }})
{% set mount_deleted = k8s_cleanup_result.results | selectattr('item', 'search', '^' + mount.mount_point) | selectattr('changed') | list %}
{% set mount_deleted = k8s_cleanup_result.results |
selectattr('item', 'search', '^' + mount.mount_point) |
selectattr('changed') |
list %}
{% if mount_deleted %}
{% for item in mount_deleted %}
-> Deleted: {{ item.item }}
-> Deleted directory: {{ item.item }}
{% endfor %}
{% else %}
-> No directories deleted from this storage
{% endif %}
{# The file-deletion task loops over (base_path, file_name) pairs, so each result's `item` is a two-element list. Match the mount on the base path (item.0); testing the pair itself never matches the anchored pattern. #}
{% set mount_files_deleted = k8s_files_cleanup_result.results |
selectattr('item.0', 'search', '^' + mount.mount_point) |
selectattr('changed') |
list %}
{% if mount_files_deleted %}
{% for file_result in mount_files_deleted %}
-> Deleted file: {{ file_result.item.0 }}/{{ file_result.item.1 }}
{% endfor %}
{% else %}
-> No files deleted from this storage
{% endif %}
{% endfor %}
when: k8s_cleanup_needed | default(false)
19 changes: 14 additions & 5 deletions utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,14 @@ oim_cleanup_note: |
- For Slurm configuration backup, use the separate utility: ansible-playbook utils/slurm_config_util.yml --tags config_backup
- To skip slurm cleanup, run: ansible-playbook utils/oim_cleanup.yml --skip-tags slurm

3. The playbook removes K8s-related directories from NFS shares:
- ssh, calico, metallb, helm, packages, telemetry, karavi-observability, csi-driver-powerscale, nfs-client-provisioner
3. The playbook removes K8s-related directories and files from NFS shares:
- Directories: ssh, calico, metallb, helm, packages, telemetry, karavi-observability, csi-driver-powerscale, nfs-client-provisioner
- Files: control-plane-join-command.sh, generate-control-plane-join.sh, worker-join-command.sh, pulp_webserver.crt
- Node IP directories (when k8s_cleanup_node_ips: true)
- Directory list is configurable via k8s_cleanup_directories variable in vars/main.yml
- Supports multi-storage: Cleans directories from all K8s storage mounts configured in omnia_config.yml
- Directory and file lists are configurable via k8s_cleanup_directories and k8s_cleanup_files variables in vars/main.yml
- Supports multi-storage: Cleans directories and files from all K8s storage mounts configured in omnia_config.yml
- To skip k8s cleanup, run: ansible-playbook utils/oim_cleanup.yml --skip-tags k8s
- No backup is created for k8s directories (directory deletion only)
- No backup is created for k8s directories and files (deletion only)

4. The omnia_core container is NOT removed by oim_cleanup.yml.
- To delete it, log in to the OIM node and run:
Expand All @@ -248,6 +249,14 @@ k8s_cleanup_directories:
- csi-driver-powerscale
- nfs-client-provisioner

# List of k8s root-level files to delete from NFS share
# Edit this list to add/remove files as needed
k8s_cleanup_files:
- control-plane-join-command.sh
- generate-control-plane-join.sh
- worker-join-command.sh
- pulp_webserver.crt

# Delete node IP directories (pattern: x.x.x.x)
# Set to false to skip node directories
k8s_cleanup_node_ips: true
Expand Down
Loading