From 2abf1ee2ee649cb564b41238aab78e6ae9f42934 Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Thu, 21 May 2026 10:23:55 -0400 Subject: [PATCH 1/7] [batch] Upgrade pooled VM machine family from N1 to N2 Switch GCP_MACHINE_FAMILY from 'n1' to 'n2' so all pool workers create N2 VMs. Add N2 memory ratios, machine type entries (standard/highmem/highcpu), and valid core counts. Update pricing pipeline to recognize N2 SKUs and handle N2 memory SKU descriptions. Existing N1 entries retained for GPU JPIM billing. Co-Authored-By: Claude Opus 4.6 --- batch/batch/cloud/gcp/driver/pricing.py | 6 ++- batch/batch/cloud/gcp/resource_utils.py | 60 +++++++++++++++++++++++-- batch/test/test_utils.py | 14 +++++- batch/test/utils.py | 2 +- 4 files changed, 73 insertions(+), 9 deletions(-) diff --git a/batch/batch/cloud/gcp/driver/pricing.py b/batch/batch/cloud/gcp/driver/pricing.py index 60d3495efc6..9429ed784ec 100644 --- a/batch/batch/cloud/gcp/driver/pricing.py +++ b/batch/batch/cloud/gcp/driver/pricing.py @@ -179,6 +179,8 @@ def instance_family_from_sku(sku: dict) -> Optional[str]: category = sku['category'] if category['resourceGroup'] == 'N1Standard': return 'n1' + if sku['description'].startswith("N2 Instance") or sku['description'].startswith("Spot Preemptible N2 Instance"): + return 'n2' if sku['description'].startswith("G2 Instance") or sku['description'].startswith("Spot Preemptible G2 Instance"): return 'g2' if sku['description'].startswith("A2 Instance") or sku['description'].startswith("Spot Preemptible A2 Instance"): @@ -283,7 +285,7 @@ def process_accelerator_sku(sku: dict, regions: List[str]) -> List[GCPAccelerato def process_memory_sku(sku: dict, regions: List[str]) -> List[GCPMemoryPrice]: category = sku['category'] assert category['resourceFamily'] == 'Compute', sku - assert 'Ram' in sku['description'] + assert 'Ram' in sku['description'] or 'Memory' in sku['description'] instance_family = instance_family_from_sku(sku) preemptible = preemptible_from_sku(sku) @@ -392,7 +394,7 @@ async def fetch_prices( if 'Core' in sku['description']: for compute_price in process_compute_sku(sku, regions): yield compute_price - elif 'Ram' in sku['description']: + elif 'Ram' in sku['description'] or 'Memory' in sku['description']: for memory_price in process_memory_sku(sku, regions): yield memory_price elif category['resourceFamily'] == 'Storage': diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py index 636d791f76e..4d1b88b4908 100644 --- a/batch/batch/cloud/gcp/resource_utils.py +++ b/batch/batch/cloud/gcp/resource_utils.py @@ -6,12 +6,15 @@ GCP_MAX_PERSISTENT_SSD_SIZE_GIB = 64 * 1024 -GCP_MACHINE_FAMILY = 'n1' +GCP_MACHINE_FAMILY = 'n2' MEMORY_PER_CORE_MIB = { ('n1', 'standard'): 3840, ('n1', 'highmem'): 6656, ('n1', 'highcpu'): 924, + ('n2', 'standard'): 4096, + ('n2', 'highmem'): 8192, + ('n2', 'highcpu'): 1024, } @@ -116,6 +119,45 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in for cores in [2, 4, 8, 16, 32, 64, 96] } +# N2 Standard cores: 2 4 8 16 32 48 64 80 96 128 +# N2 Standard mem: 4 * cores GiB +n2_standard_machines = { + f'n2-standard-{cores}': MachineTypeParts( + cores=cores, + memory=gib_to_bytes(4 * cores), + gpu_config=None, + machine_family='n2', + worker_type='standard', + ) + for cores in [2, 4, 8, 16, 32, 48, 64, 80, 96, 128] +} + +# N2 Highmem cores: 2 4 8 16 32 48 64 80 96 +# N2 Highmem mem: 8 * cores GiB +n2_highmem_machines = { + f'n2-highmem-{cores}': MachineTypeParts( + cores=cores, + memory=gib_to_bytes(8 * cores), + gpu_config=None, + machine_family='n2', + worker_type='highmem', + ) + for cores in [2, 4, 8, 16, 32, 48, 64, 80, 96] +} + +# N2 Highcpu cores: 2 4 8 16 32 48 64 80 96 +# N2 Highcpu mem: 1024 * cores MiB +n2_highcpu_machines = { + f'n2-highcpu-{cores}': MachineTypeParts( + cores=cores, + memory=mib_to_bytes(1024 * cores), + gpu_config=None, + machine_family='n2', + worker_type='highcpu', + ) + for cores in [2, 4, 8, 16, 32, 48, 64, 80, 96] +} + MACHINE_TYPE_TO_PARTS = { **n1_standard_t4_machines, **n1_highmem_t4_machines, @@ -123,6 +165,16 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in **n1_standard_machines, **n1_highmem_machines, **n1_highcpu_machines, + **n2_standard_machines, + **n2_highmem_machines, + **n2_highcpu_machines, + 'n2-highmem-128': MachineTypeParts( + cores=128, + memory=gib_to_bytes(864), + gpu_config=None, + machine_family='n2', + worker_type='highmem', + ), 'g2-standard-4': MachineTypeParts( cores=4, memory=gib_to_bytes(16), @@ -245,9 +297,9 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in } gcp_valid_cores_for_pool_worker_type = { - 'highcpu': [2, 4, 8, 16, 32, 64, 96], - 'standard': [1, 2, 4, 8, 16, 32, 64, 96], - 'highmem': [2, 4, 8, 16, 32, 64, 96], + 'standard': [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + 'highmem': [2, 4, 8, 16, 32, 48, 64, 80, 96], + 'highcpu': [2, 4, 8, 16, 32, 48, 64, 80, 96], } gcp_valid_machine_types = list(MACHINE_TYPE_TO_PARTS.keys()) diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py index 92947cc5462..ce28fec8101 100644 --- a/batch/test/test_utils.py +++ b/batch/test/test_utils.py @@ -33,11 +33,12 @@ def test_memory_str_to_bytes(): def test_gcp_worker_memory_per_core_mib(): - with pytest.raises(AssertionError): - assert gcp_worker_memory_per_core_mib('n2', 'standard') assert gcp_worker_memory_per_core_mib('n1', 'standard') == 3840 assert gcp_worker_memory_per_core_mib('n1', 'highmem') == 6656 assert gcp_worker_memory_per_core_mib('n1', 'highcpu') == 924 + assert gcp_worker_memory_per_core_mib('n2', 'standard') == 4096 + assert gcp_worker_memory_per_core_mib('n2', 'highmem') == 8192 + assert gcp_worker_memory_per_core_mib('n2', 'highcpu') == 1024 def test_gcp_machine_memory_per_core_mib(): @@ -48,6 +49,15 @@ def test_gcp_machine_memory_per_core_mib(): assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 6656 elif machine_parts.machine_family == 'n1' and machine_parts.worker_type == 'highcpu': assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 924 + elif machine_parts.machine_family == 'n2' and machine_parts.worker_type == 'standard': + assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 4096 + elif machine_parts.machine_family == 'n2' and machine_parts.worker_type == 'highmem': + if machine_parts.cores == 128: + assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 6912 + else: + assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 8192 + elif machine_parts.machine_family == 'n2' and machine_parts.worker_type == 'highcpu': + assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 1024 elif machine_parts.machine_family == 'g2' and machine_parts.worker_type == 'standard': assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 4096 elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'highgpu': diff --git a/batch/test/utils.py b/batch/test/utils.py index a9e00e023a7..e352c569e5b 100644 --- a/batch/test/utils.py +++ b/batch/test/utils.py @@ -43,6 +43,6 @@ def smallest_machine_type(): cloud = os.environ['HAIL_CLOUD'] if cloud == 'gcp': - return 'n1-standard-1' + return 'n2-standard-2' assert cloud == 'azure' return 'Standard_D2ds_v4' From e0e96a0dc56ef37f5417d74fc9180dade4bb8cfd Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Thu, 21 May 2026 14:58:32 -0400 Subject: [PATCH 2/7] [batch] Fix local SSD count for N2 machines: attach minimum required partitions and RAID0 when >1 N2 machines require local SSDs in specific quantities (e.g., n2-standard-16 requires minimum 2). The previous code always attached exactly 1, which GCP rejects. This adds a per-core-count lookup for the minimum SSD count, attaches that many SCRATCH disks in the VM config, and combines them via mdadm RAID0 in the startup script when count > 1. Co-Authored-By: Claude Opus 4.6 --- .../batch/cloud/gcp/driver/create_instance.py | 49 +++++++++++++------ .../cloud/gcp/driver/resource_manager.py | 3 +- batch/batch/cloud/gcp/resource_utils.py | 35 ++++++++++++- batch/batch/cloud/resource_utils.py | 3 +- batch/test/test_utils.py | 25 +++++++++- 5 files changed, 94 insertions(+), 21 deletions(-) diff --git a/batch/batch/cloud/gcp/driver/create_instance.py b/batch/batch/cloud/gcp/driver/create_instance.py index e3c6e29ccdf..63a68b844a6 100644 --- a/batch/batch/cloud/gcp/driver/create_instance.py +++ b/batch/batch/cloud/gcp/driver/create_instance.py @@ -13,7 +13,7 @@ from ....instance_config import InstanceConfig from ...resource_utils import unreserved_worker_data_disk_size_gib from ...utils import ACCEPTABLE_QUERY_JAR_URL_PREFIX -from ..resource_utils import GPUConfig, gcp_machine_type_to_parts, machine_type_to_gpu +from ..resource_utils import GPUConfig, gcp_local_ssd_count, gcp_machine_type_to_parts, machine_type_to_gpu log = logging.getLogger('create_instance') @@ -62,21 +62,28 @@ def create_vm_config( region = instance_config.region_for(zone) docker_run_gpu_args = '--runtime=nvidia --gpus all' if machine_type_to_gpu(machine_type_full) else '' if local_ssd_data_disk: - worker_data_disk = { - 'type': 'SCRATCH', - 'autoDelete': True, - 'interface': 'NVME', - 'initializeParams': {'diskType': f'zones/{zone}/diskTypes/local-ssd'}, - } - worker_data_disk_name = 'nvme0n1' + num_local_ssds = gcp_local_ssd_count(parts.machine_family, cores) + worker_data_disks = [ + { + 'type': 'SCRATCH', + 'autoDelete': True, + 'interface': 'NVME', + 'initializeParams': {'diskType': f'zones/{zone}/diskTypes/local-ssd'}, + } + for _ in range(num_local_ssds) + ] + worker_data_disk_name = 'md0' if num_local_ssds > 1 else 'nvme0n1' else: - worker_data_disk = { - 'autoDelete': True, - 'initializeParams': { - 'diskType': f'projects/{project}/zones/{zone}/diskTypes/pd-ssd', - 'diskSizeGb': str(data_disk_size_gb), - }, - } + num_local_ssds = 0 + worker_data_disks = [ + { + 'autoDelete': True, + 'initializeParams': { + 'diskType': f'projects/{project}/zones/{zone}/diskTypes/pd-ssd', + 'diskSizeGb': str(data_disk_size_gb), + }, + } + ] worker_data_disk_name = 'nvme0n2' if 'g2' in machine_type else 'sdb' if job_private: @@ -123,7 +130,7 @@ def scheduling() -> dict: 'diskSizeGb': str(boot_disk_size_gb), }, }, - worker_data_disk, + *worker_data_disks, ], 'networkInterfaces': [ { @@ -175,6 +182,7 @@ def scheduling() -> dict: set -x WORKER_DATA_DISK_NAME="{worker_data_disk_name}" +NUM_LOCAL_SSDS="{num_local_ssds}" UNRESERVED_WORKER_DATA_DISK_SIZE_GB="{unreserved_disk_storage_gb}" ACCEPTABLE_QUERY_JAR_URL_PREFIX="{ACCEPTABLE_QUERY_JAR_URL_PREFIX}" @@ -252,6 +260,15 @@ def scheduling() -> dict: sudo systemctl restart google-cloud-ops-agent +# combine multiple local SSDs into a single RAID0 array +if [ "$NUM_LOCAL_SSDS" -gt 1 ]; then + DEVICES="" + for i in $(seq 1 $NUM_LOCAL_SSDS); do + DEVICES="$DEVICES /dev/nvme0n$i" + done + mdadm --create /dev/md0 --level=0 --raid-devices=$NUM_LOCAL_SSDS $DEVICES --force --run +fi + # format worker data disk sudo mkfs.xfs -m reflink=1 -n ftype=1 /dev/$WORKER_DATA_DISK_NAME sudo mkdir -p /mnt/disks/$WORKER_DATA_DISK_NAME diff --git a/batch/batch/cloud/gcp/driver/resource_manager.py b/batch/batch/cloud/gcp/driver/resource_manager.py index a6aa4c96c7c..4e2e051f768 100644 --- a/batch/batch/cloud/gcp/driver/resource_manager.py +++ b/batch/batch/cloud/gcp/driver/resource_manager.py @@ -21,6 +21,7 @@ from ....instance_config import InstanceConfig, QuantifiedResource from ..instance_config import GCPSlimInstanceConfig from ..resource_utils import ( + GCP_LOCAL_SSD_PARTITION_SIZE_GIB, GCP_MACHINE_FAMILY, family_worker_type_cores_to_gcp_machine_type, gcp_machine_type_to_cores_and_memory_bytes, @@ -111,7 +112,7 @@ async def create_vm( instance_config: InstanceConfig, ) -> List[QuantifiedResource]: if local_ssd_data_disk: - assert data_disk_size_gb == 375 + assert data_disk_size_gb % GCP_LOCAL_SSD_PARTITION_SIZE_GIB == 0 and data_disk_size_gb >= GCP_LOCAL_SSD_PARTITION_SIZE_GIB resource_rates = self.billing_manager.resource_rates diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py index 4d1b88b4908..cbe17d5ca69 100644 --- a/batch/batch/cloud/gcp/resource_utils.py +++ b/batch/batch/cloud/gcp/resource_utils.py @@ -343,8 +343,39 @@ def gcp_is_valid_storage_request(storage_in_gib: int) -> bool: return 10 <= storage_in_gib <= GCP_MAX_PERSISTENT_SSD_SIZE_GIB -def gcp_local_ssd_size() -> int: - return 375 +GCP_LOCAL_SSD_PARTITION_SIZE_GIB = 375 + +# N2 machines require local SSDs in specific quantities that vary by vCPU count. +# Verified: n2-standard-16 requires minimum 2 (valid: [0, 2, 4, 8, 16, 24]). +# Other thresholds are estimated and may need adjustment based on GCP API errors. +N2_MIN_LOCAL_SSD_COUNT_BY_CORES = { + 2: 1, + 4: 1, + 8: 1, + 16: 2, + 32: 2, + 48: 4, + 64: 4, + 80: 8, + 96: 8, + 128: 16, +} + + +def gcp_local_ssd_count(machine_family: str, cores: int) -> int: + if machine_family != 'n2': + return 1 + count = N2_MIN_LOCAL_SSD_COUNT_BY_CORES.get(cores) + if count is not None: + return count + for threshold_cores in sorted(N2_MIN_LOCAL_SSD_COUNT_BY_CORES.keys(), reverse=True): + if cores >= threshold_cores: + return N2_MIN_LOCAL_SSD_COUNT_BY_CORES[threshold_cores] + return 1 + + +def gcp_local_ssd_size(machine_family: str, cores: int) -> int: + return GCP_LOCAL_SSD_PARTITION_SIZE_GIB * gcp_local_ssd_count(machine_family, cores) def machine_type_to_gpu(machine_type: str) -> Optional[str]: diff --git a/batch/batch/cloud/resource_utils.py b/batch/batch/cloud/resource_utils.py index e44f2b2ee4a..d55c16c818d 100644 --- a/batch/batch/cloud/resource_utils.py +++ b/batch/batch/cloud/resource_utils.py @@ -13,6 +13,7 @@ azure_valid_machine_types, ) from .gcp.resource_utils import ( + GCP_MACHINE_FAMILY, gcp_is_valid_storage_request, gcp_local_ssd_size, gcp_machine_type_to_cores_and_memory_bytes, @@ -115,4 +116,4 @@ def local_ssd_size(cloud: str, worker_type: str, cores: int) -> int: if cloud == 'azure': return azure_local_ssd_size(worker_type, cores) assert cloud == 'gcp', cloud - return gcp_local_ssd_size() + return gcp_local_ssd_size(GCP_MACHINE_FAMILY, cores) diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py index ce28fec8101..213ad3030f9 100644 --- a/batch/test/test_utils.py +++ b/batch/test/test_utils.py @@ -2,7 +2,7 @@ from batch.cloud.azure.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_AZURE from batch.cloud.gcp.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP -from batch.cloud.gcp.resource_utils import gcp_worker_memory_per_core_mib, machine_type_to_gpu_num +from batch.cloud.gcp.resource_utils import gcp_local_ssd_count, gcp_local_ssd_size, gcp_worker_memory_per_core_mib, machine_type_to_gpu_num from batch.cloud.gcp.resources import GCPAcceleratorResource, gcp_resource_from_dict from batch.cloud.resource_utils import adjust_cores_for_packability from batch.utils import rewrite_dockerhub_image @@ -81,6 +81,29 @@ def test_azure_machine_memory_per_core_mib(): assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 8192 +def test_gcp_local_ssd_count(): + assert gcp_local_ssd_count('n1', 16) == 1 + assert gcp_local_ssd_count('n1', 96) == 1 + assert gcp_local_ssd_count('n2', 2) == 1 + assert gcp_local_ssd_count('n2', 4) == 1 + assert gcp_local_ssd_count('n2', 8) == 1 + assert gcp_local_ssd_count('n2', 16) == 2 + assert gcp_local_ssd_count('n2', 32) == 2 + assert gcp_local_ssd_count('n2', 48) == 4 + assert gcp_local_ssd_count('n2', 64) == 4 + assert gcp_local_ssd_count('n2', 80) == 8 + assert gcp_local_ssd_count('n2', 96) == 8 + assert gcp_local_ssd_count('n2', 128) == 16 + + +def test_gcp_local_ssd_size(): + assert gcp_local_ssd_size('n1', 16) == 375 + assert gcp_local_ssd_size('n2', 2) == 375 + assert gcp_local_ssd_size('n2', 16) == 750 + assert gcp_local_ssd_size('n2', 48) == 1500 + assert gcp_local_ssd_size('n2', 128) == 6000 + + def test_gcp_resource_from_dict(): name = 'accelerator/l4-nonpreemptible/us-central1/1712657549063' gpu_data_dic_single = {'name': name, 'number': 1, 'type': 'gcp_accelerator', 'format_version': 2} From 8c2a5f0bc524ade2cc1f14b2cddcf4f31a79270a Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Thu, 21 May 2026 15:08:09 -0400 Subject: [PATCH 3/7] [batch] Fix N2 local SSD minimums from GCP docs Corrected the N2_MIN_LOCAL_SSD_COUNT_BY_CORES lookup table using the actual values from GCP documentation. Four entries were underestimated: 32-core (2->4), 48-core (4->8), 64-core (4->8), 96-core (8->16). Co-Authored-By: Claude Opus 4.6 --- batch/batch/cloud/gcp/resource_utils.py | 11 +++++------ batch/test/test_utils.py | 10 +++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py index cbe17d5ca69..0d74483b59f 100644 --- a/batch/batch/cloud/gcp/resource_utils.py +++ b/batch/batch/cloud/gcp/resource_utils.py @@ -346,18 +346,17 @@ def gcp_is_valid_storage_request(storage_in_gib: int) -> bool: GCP_LOCAL_SSD_PARTITION_SIZE_GIB = 375 # N2 machines require local SSDs in specific quantities that vary by vCPU count. -# Verified: n2-standard-16 requires minimum 2 (valid: [0, 2, 4, 8, 16, 24]). -# Other thresholds are estimated and may need adjustment based on GCP API errors. +# Source: https://docs.cloud.google.com/compute/docs/general-purpose-machines#n2-standard N2_MIN_LOCAL_SSD_COUNT_BY_CORES = { 2: 1, 4: 1, 8: 1, 16: 2, - 32: 2, - 48: 4, - 64: 4, + 32: 4, + 48: 8, + 64: 8, 80: 8, - 96: 8, + 96: 16, 128: 16, } diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py index 213ad3030f9..e81c0ab9689 100644 --- a/batch/test/test_utils.py +++ b/batch/test/test_utils.py @@ -88,11 +88,11 @@ def test_gcp_local_ssd_count(): assert gcp_local_ssd_count('n2', 4) == 1 assert gcp_local_ssd_count('n2', 8) == 1 assert gcp_local_ssd_count('n2', 16) == 2 - assert gcp_local_ssd_count('n2', 32) == 2 - assert gcp_local_ssd_count('n2', 48) == 4 - assert gcp_local_ssd_count('n2', 64) == 4 + assert gcp_local_ssd_count('n2', 32) == 4 + assert gcp_local_ssd_count('n2', 48) == 8 + assert gcp_local_ssd_count('n2', 64) == 8 assert gcp_local_ssd_count('n2', 80) == 8 - assert gcp_local_ssd_count('n2', 96) == 8 + assert gcp_local_ssd_count('n2', 96) == 16 assert gcp_local_ssd_count('n2', 128) == 16 @@ -100,7 +100,7 @@ def test_gcp_local_ssd_size(): assert gcp_local_ssd_size('n1', 16) == 375 assert gcp_local_ssd_size('n2', 2) == 375 assert gcp_local_ssd_size('n2', 16) == 750 - assert gcp_local_ssd_size('n2', 48) == 1500 + assert gcp_local_ssd_size('n2', 48) == 3000 assert gcp_local_ssd_size('n2', 128) == 6000 From fefae776c8e1270b5a55197bc277c18a67c1397c Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Wed, 27 May 2026 15:01:24 -0400 Subject: [PATCH 4/7] [batch] Fix ruff I001 import sorting in test_utils.py Merge duplicate imports from batch.cloud.gcp.resource_utils into a single statement to satisfy ruff's isort rules after merge with upstream main. Co-Authored-By: Claude Opus 4.6 --- batch/test/test_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py index e81c0ab9689..77f0dc4a916 100644 --- a/batch/test/test_utils.py +++ b/batch/test/test_utils.py @@ -1,8 +1,13 @@ import pytest from batch.cloud.azure.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_AZURE -from batch.cloud.gcp.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP -from batch.cloud.gcp.resource_utils import gcp_local_ssd_count, gcp_local_ssd_size, gcp_worker_memory_per_core_mib, machine_type_to_gpu_num +from batch.cloud.gcp.resource_utils import ( + MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP, + gcp_local_ssd_count, + gcp_local_ssd_size, + gcp_worker_memory_per_core_mib, + machine_type_to_gpu_num, +) from batch.cloud.gcp.resources import GCPAcceleratorResource, gcp_resource_from_dict from batch.cloud.resource_utils import adjust_cores_for_packability from batch.utils import rewrite_dockerhub_image From 64867f760b10b9704eff5e2d12a1188f81c19665 Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Wed, 27 May 2026 15:10:19 -0400 Subject: [PATCH 5/7] [batch] Fix ruff I001 import sorting for ruff 0.11.13 Split aliased import from regular imports per ruff 0.11.13's isort rules. Co-Authored-By: Claude Opus 4.6 --- batch/test/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py index 77f0dc4a916..fa1c3f22257 100644 --- a/batch/test/test_utils.py +++ b/batch/test/test_utils.py @@ -3,6 +3,8 @@ from batch.cloud.azure.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_AZURE from batch.cloud.gcp.resource_utils import ( MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP, +) +from batch.cloud.gcp.resource_utils import ( gcp_local_ssd_count, gcp_local_ssd_size, gcp_worker_memory_per_core_mib, From 54f2dcbafabf07609189ae8e238fe6714b2587a3 Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Wed, 27 May 2026 15:16:51 -0400 Subject: [PATCH 6/7] [batch] Fix ruff format long assert line in resource_manager.py Co-Authored-By: Claude Opus 4.6 --- batch/batch/cloud/gcp/driver/resource_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/batch/batch/cloud/gcp/driver/resource_manager.py b/batch/batch/cloud/gcp/driver/resource_manager.py index 4e2e051f768..1b7c069a471 100644 --- a/batch/batch/cloud/gcp/driver/resource_manager.py +++ b/batch/batch/cloud/gcp/driver/resource_manager.py @@ -112,7 +112,10 @@ async def create_vm( instance_config: InstanceConfig, ) -> List[QuantifiedResource]: if local_ssd_data_disk: - assert data_disk_size_gb % GCP_LOCAL_SSD_PARTITION_SIZE_GIB == 0 and data_disk_size_gb >= GCP_LOCAL_SSD_PARTITION_SIZE_GIB + assert ( + data_disk_size_gb % GCP_LOCAL_SSD_PARTITION_SIZE_GIB == 0 + and data_disk_size_gb >= GCP_LOCAL_SSD_PARTITION_SIZE_GIB + ) resource_rates = self.billing_manager.resource_rates From 8a8771fa6a66204d127bc408905b7d96465b7a03 Mon Sep 17 00:00:00 2001 From: grohli <22306963+grohli@users.noreply.github.com> Date: Thu, 28 May 2026 15:09:25 -0400 Subject: [PATCH 7/7] [batch] Address PR review comments for N2 machine type upgrade - Fix create_instance.py merge conflict: remove duplicated disk/docker block, keep containerd version for ubuntu 24, move ops agent setup before SSD wrangling, position RAID0 assembly correctly - Revert smallest_machine_type to n1-standard-1 for custom machine tests - Parameterize test_gcp_local_ssd_count and test_gcp_local_ssd_size - Add 128 to highmem valid cores to match n2-highmem-128 in MACHINE_TYPE_TO_PARTS - Combine duplicate imports in test_utils.py Co-Authored-By: Claude Opus 4.6 --- batch/batch/cloud/gcp/resource_utils.py | 2 +- batch/test/test_utils.py | 54 +++++++++++++++---------- batch/test/utils.py | 2 +- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py index 0d74483b59f..caf2930f5f5 100644 --- a/batch/batch/cloud/gcp/resource_utils.py +++ b/batch/batch/cloud/gcp/resource_utils.py @@ -298,7 +298,7 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in gcp_valid_cores_for_pool_worker_type = { 'standard': [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'highmem': [2, 4, 8, 16, 32, 48, 64, 80, 96], + 'highmem': [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], 'highcpu': [2, 4, 8, 16, 32, 48, 64, 80, 96], } diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py index fa1c3f22257..42f4b172a40 100644 --- a/batch/test/test_utils.py +++ b/batch/test/test_utils.py @@ -88,27 +88,39 @@ def test_azure_machine_memory_per_core_mib(): assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 8192 -def test_gcp_local_ssd_count(): - assert gcp_local_ssd_count('n1', 16) == 1 - assert gcp_local_ssd_count('n1', 96) == 1 - assert gcp_local_ssd_count('n2', 2) == 1 - assert gcp_local_ssd_count('n2', 4) == 1 - assert gcp_local_ssd_count('n2', 8) == 1 - assert gcp_local_ssd_count('n2', 16) == 2 - assert gcp_local_ssd_count('n2', 32) == 4 - assert gcp_local_ssd_count('n2', 48) == 8 - assert gcp_local_ssd_count('n2', 64) == 8 - assert gcp_local_ssd_count('n2', 80) == 8 - assert gcp_local_ssd_count('n2', 96) == 16 - assert gcp_local_ssd_count('n2', 128) == 16 - - -def test_gcp_local_ssd_size(): - assert gcp_local_ssd_size('n1', 16) == 375 - assert gcp_local_ssd_size('n2', 2) == 375 - assert gcp_local_ssd_size('n2', 16) == 750 - assert gcp_local_ssd_size('n2', 48) == 3000 - assert gcp_local_ssd_size('n2', 128) == 6000 +@pytest.mark.parametrize( + "family,cores,expected", + [ + ('n1', 16, 1), + ('n1', 96, 1), + ('n2', 2, 1), + ('n2', 4, 1), + ('n2', 8, 1), + ('n2', 16, 2), + ('n2', 32, 4), + ('n2', 48, 8), + ('n2', 64, 8), + ('n2', 80, 8), + ('n2', 96, 16), + ('n2', 128, 16), + ], +) +def test_gcp_local_ssd_count(family, cores, expected): + assert gcp_local_ssd_count(family, cores) == expected + + +@pytest.mark.parametrize( + "family,cores,expected", + [ + ('n1', 16, 375), + ('n2', 2, 375), + ('n2', 16, 750), + ('n2', 48, 3000), + ('n2', 128, 6000), + ], +) +def test_gcp_local_ssd_size(family, cores, expected): + assert gcp_local_ssd_size(family, cores) == expected def test_gcp_resource_from_dict(): diff --git a/batch/test/utils.py b/batch/test/utils.py index e352c569e5b..a9e00e023a7 100644 --- a/batch/test/utils.py +++ b/batch/test/utils.py @@ -43,6 +43,6 @@ def smallest_machine_type(): cloud = os.environ['HAIL_CLOUD'] if cloud == 'gcp': - return 'n2-standard-2' + return 'n1-standard-1' assert cloud == 'azure' return 'Standard_D2ds_v4'