diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index dfdaa37111..5879e0ba3d 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -281,18 +281,16 @@ {% endif %} # UCX and OpenMPI auto-compilation disabled - # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default + # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default {% if hostvars['localhost']['ucx_support'] %} - echo "===== UCX Configuration =====" - echo "UCX version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA UCX 1.20.0 (system default)" {% endif %} -{% if hostvars['localhost']['openmpi_support'] %} - echo "===== OpenMPI Configuration =====" - echo "OpenMPI version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)" - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)" -{% endif %} {% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 0c727b0c01..77109a4828 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -282,19 +282,16 @@ {% endif %} # UCX and OpenMPI auto-compilation disabled - # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default + # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by 
default {% if hostvars['localhost']['ucx_support'] %} - echo "===== UCX Configuration =====" - echo "UCX version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA UCX 1.20.0 (system default)" {% endif %} -{% if hostvars['localhost']['openmpi_support'] %} - echo "===== OpenMPI Configuration =====" - echo "OpenMPI version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)" - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)" - -{% endif %} {% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index cdce20193e..449cedfa8f 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -422,9 +422,7 @@ # DOCA and IB configuration - now ready before vendor_data mounts - bash /usr/local/bin/doca-install.sh || true - bash /usr/local/bin/configure-ib-network.sh -{% if hostvars['localhost']['openmpi_support'] %} - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)" -{% endif %} {# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined and cloud_init_groups_dict[functional_group_name].runcmd is not none %} @@ -513,17 +511,15 @@ - mount -av {% endif %} # UCX and OpenMPI auto-compilation disabled - # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default + # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default {% if hostvars['localhost']['ucx_support'] %} - echo "===== UCX Configuration =====" 
- echo "UCX version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA UCX 1.20.0 (system default)" {% endif %} -{% if hostvars['localhost']['openmpi_support'] %} - echo "===== OpenMPI Configuration =====" - echo "OpenMPI version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)" -{% endif %} {% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index ee33e0ff6f..ecf388cdeb 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -428,9 +428,7 @@ # DOCA and IB configuration - now ready before vendor_data mounts - bash /usr/local/bin/doca-install.sh || echo "DOCA install failed (non-critical)" - bash /usr/local/bin/configure-ib-network.sh || echo "IB network configuration failed (non-critical)" -{% if hostvars['localhost']['openmpi_support'] %} - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)" -{% endif %} {# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined and cloud_init_groups_dict[functional_group_name].runcmd is not none %} @@ -515,17 +513,15 @@ - mount -av {% endif %} # UCX and OpenMPI auto-compilation disabled - # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default + # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default {% if hostvars['localhost']['ucx_support'] %} - echo "===== UCX Configuration =====" - echo "UCX version specified in software_config.json (available for manual compilation)" - echo "Default 
stack - DOCA UCX 1.20.0 (system default)" {% endif %} -{% if hostvars['localhost']['openmpi_support'] %} - echo "===== OpenMPI Configuration =====" - echo "OpenMPI version specified in software_config.json (available for manual compilation)" - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)" -{% endif %} {% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml b/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml index 6f0da6b43e..52e324cd76 100644 --- a/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml +++ b/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml @@ -121,9 +121,10 @@ - name: Display k8s cleanup information ansible.builtin.debug: msg: | - WARNING: This will delete K8s-related directories from NFS shares: + WARNING: This will delete K8s-related directories and files from NFS shares: {% for mount in k8s_storage_mounts %} Storage: {{ mount.name }} ({{ mount.mount_point }}) + Directories: {% for item in k8s_static_dirs_stat.results %} {% if item.stat.exists and item.item.startswith(mount.mount_point) %} - {{ item.item }} ({{ item.item | basename }}) @@ -141,6 +142,10 @@ {% else %} Node IP directories: Skipped (k8s_cleanup_node_ips: false) {% endif %} + Root-level files: + {% for file in k8s_cleanup_files %} + - {{ mount.mount_point }}/{{ file }} + {% endfor %} {% endfor %} CRITICAL WARNING: Deleting NFS shared data will affect ALL nodes! 
@@ -173,19 +178,44 @@ when: k8s_cleanup_needed | default(false) loop: "{{ k8s_all_cleanup_paths }}" + - name: Delete K8s root-level files + ansible.builtin.file: + path: "{{ item.0 }}/{{ item.1 }}" + state: absent + register: k8s_files_cleanup_result + when: k8s_cleanup_needed | default(false) + loop: "{{ all_k8s_base_paths | product(k8s_cleanup_files) | list }}" + loop_control: + label: "{{ item.0 }}/{{ item.1 }}" + failed_when: false + - name: Display k8s cleanup completion message ansible.builtin.debug: msg: | K8s-related cleanup completed. {% for mount in k8s_storage_mounts %} Storage: {{ mount.name }} ({{ mount.mount_point }}) - {% set mount_deleted = k8s_cleanup_result.results | selectattr('item', 'search', '^' + mount.mount_point) | selectattr('changed') | list %} + {% set mount_deleted = k8s_cleanup_result.results | + selectattr('item', 'search', '^' + mount.mount_point) | + selectattr('changed') | + list %} {% if mount_deleted %} {% for item in mount_deleted %} - -> Deleted: {{ item.item }} + -> Deleted directory: {{ item.item }} {% endfor %} {% else %} -> No directories deleted from this storage {% endif %} + {# loop items are [base_path, file] pairs: match on the base path #}{% set mount_files_deleted = k8s_files_cleanup_result.results | + selectattr('item.0', 'search', '^' + mount.mount_point) | + selectattr('changed') | + list %} + {% if mount_files_deleted %} + {% for item in mount_files_deleted %} + -> Deleted file: {{ item.item | join('/') }} + {% endfor %} + {% else %} + -> No files deleted from this storage + {% endif %} {% endfor %} when: k8s_cleanup_needed | default(false) diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml index 28dd327351..5f3d60ea72 100644 --- a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml +++ b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml @@ -218,13 +218,14 @@ oim_cleanup_note: | - For Slurm configuration backup, use the separate utility: ansible-playbook utils/slurm_config_util.yml --tags
config_backup - To skip slurm cleanup, run: ansible-playbook utils/oim_cleanup.yml --skip-tags slurm - 3. The playbook removes K8s-related directories from NFS shares: - - ssh, calico, metallb, helm, packages, telemetry, karavi-observability, csi-driver-powerscale, nfs-client-provisioner + 3. The playbook removes K8s-related directories and files from NFS shares: + - Directories: ssh, calico, metallb, helm, packages, telemetry, karavi-observability, csi-driver-powerscale, nfs-client-provisioner + - Files: control-plane-join-command.sh, generate-control-plane-join.sh, worker-join-command.sh, pulp_webserver.crt - Node IP directories (when k8s_cleanup_node_ips: true) - - Directory list is configurable via k8s_cleanup_directories variable in vars/main.yml - - Supports multi-storage: Cleans directories from all K8s storage mounts configured in omnia_config.yml + - Directory and file lists are configurable via k8s_cleanup_directories and k8s_cleanup_files variables in vars/main.yml + - Supports multi-storage: Cleans directories and files from all K8s storage mounts configured in omnia_config.yml - To skip k8s cleanup, run: ansible-playbook utils/oim_cleanup.yml --skip-tags k8s - - No backup is created for k8s directories (directory deletion only) + - No backup is created for k8s directories and files (deletion only) 4. The omnia_core container is NOT removed by oim_cleanup.yml. - To delete it, log in to the OIM node and run: @@ -248,6 +249,14 @@ k8s_cleanup_directories: - csi-driver-powerscale - nfs-client-provisioner +# List of k8s root-level files to delete from NFS share +# Edit this list to add/remove files as needed +k8s_cleanup_files: + - control-plane-join-command.sh + - generate-control-plane-join.sh + - worker-join-command.sh + - pulp_webserver.crt + # Delete node IP directories (pattern: x.x.x.x) # Set to false to skip node directories k8s_cleanup_node_ips: true