From 2abf1ee2ee649cb564b41238aab78e6ae9f42934 Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Thu, 21 May 2026 10:23:55 -0400
Subject: [PATCH 1/7] [batch] Upgrade pooled VM machine family from N1 to N2

Switch GCP_MACHINE_FAMILY from 'n1' to 'n2' so that all pool workers
are created as N2 VMs. Add N2 memory ratios, machine type entries
(standard/highmem/highcpu), and valid core counts. Update the pricing
pipeline to recognize N2 SKUs and handle N2 memory SKU descriptions.
Existing N1 entries are retained for GPU JPIM billing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 batch/batch/cloud/gcp/driver/pricing.py |  6 ++-
 batch/batch/cloud/gcp/resource_utils.py | 60 +++++++++++++++++++++++--
 batch/test/test_utils.py                | 14 +++++-
 batch/test/utils.py                     |  2 +-
 4 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/batch/batch/cloud/gcp/driver/pricing.py b/batch/batch/cloud/gcp/driver/pricing.py
index 60d3495efc6..9429ed784ec 100644
--- a/batch/batch/cloud/gcp/driver/pricing.py
+++ b/batch/batch/cloud/gcp/driver/pricing.py
@@ -179,6 +179,8 @@ def instance_family_from_sku(sku: dict) -> Optional[str]:
     category = sku['category']
     if category['resourceGroup'] == 'N1Standard':
         return 'n1'
+    if sku['description'].startswith("N2 Instance") or sku['description'].startswith("Spot Preemptible N2 Instance"):
+        return 'n2'
     if sku['description'].startswith("G2 Instance") or sku['description'].startswith("Spot Preemptible G2 Instance"):
         return 'g2'
     if sku['description'].startswith("A2 Instance") or sku['description'].startswith("Spot Preemptible A2 Instance"):
@@ -283,7 +285,7 @@ def process_accelerator_sku(sku: dict, regions: List[str]) -> List[GCPAccelerato
 def process_memory_sku(sku: dict, regions: List[str]) -> List[GCPMemoryPrice]:
     category = sku['category']
     assert category['resourceFamily'] == 'Compute', sku
-    assert 'Ram' in sku['description']
+    assert 'Ram' in sku['description'] or 'Memory' in sku['description']
 
     instance_family = instance_family_from_sku(sku)
     preemptible = preemptible_from_sku(sku)
@@ -392,7 +394,7 @@ async def fetch_prices(
             if 'Core' in sku['description']:
                 for compute_price in process_compute_sku(sku, regions):
                     yield compute_price
-            elif 'Ram' in sku['description']:
+            elif 'Ram' in sku['description'] or 'Memory' in sku['description']:
                 for memory_price in process_memory_sku(sku, regions):
                     yield memory_price
         elif category['resourceFamily'] == 'Storage':
diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py
index 636d791f76e..4d1b88b4908 100644
--- a/batch/batch/cloud/gcp/resource_utils.py
+++ b/batch/batch/cloud/gcp/resource_utils.py
@@ -6,12 +6,15 @@
 
 GCP_MAX_PERSISTENT_SSD_SIZE_GIB = 64 * 1024
 
-GCP_MACHINE_FAMILY = 'n1'
+GCP_MACHINE_FAMILY = 'n2'
 
 MEMORY_PER_CORE_MIB = {
     ('n1', 'standard'): 3840,
     ('n1', 'highmem'): 6656,
     ('n1', 'highcpu'): 924,
+    ('n2', 'standard'): 4096,
+    ('n2', 'highmem'): 8192,
+    ('n2', 'highcpu'): 1024,
 }
 
 
@@ -116,6 +119,45 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in
     for cores in [2, 4, 8, 16, 32, 64, 96]
 }
 
+# N2 Standard cores: 2 4 8 16 32 48 64 80 96 128
+# N2 Standard mem: 4 * cores GiB
+n2_standard_machines = {
+    f'n2-standard-{cores}': MachineTypeParts(
+        cores=cores,
+        memory=gib_to_bytes(4 * cores),
+        gpu_config=None,
+        machine_family='n2',
+        worker_type='standard',
+    )
+    for cores in [2, 4, 8, 16, 32, 48, 64, 80, 96, 128]
+}
+
+# N2 Highmem cores: 2 4 8 16 32 48 64 80 96 (128 is defined separately below)
+# N2 Highmem mem: 8 * cores GiB (n2-highmem-128 is the exception: 864 GiB)
+n2_highmem_machines = {
+    f'n2-highmem-{cores}': MachineTypeParts(
+        cores=cores,
+        memory=gib_to_bytes(8 * cores),
+        gpu_config=None,
+        machine_family='n2',
+        worker_type='highmem',
+    )
+    for cores in [2, 4, 8, 16, 32, 48, 64, 80, 96]
+}
+
+# N2 Highcpu cores: 2 4 8 16 32 48 64 80 96
+# N2 Highcpu mem: 1024 * cores MiB
+n2_highcpu_machines = {
+    f'n2-highcpu-{cores}': MachineTypeParts(
+        cores=cores,
+        memory=mib_to_bytes(1024 * cores),
+        gpu_config=None,
+        machine_family='n2',
+        worker_type='highcpu',
+    )
+    for cores in [2, 4, 8, 16, 32, 48, 64, 80, 96]
+}
+
 MACHINE_TYPE_TO_PARTS = {
     **n1_standard_t4_machines,
     **n1_highmem_t4_machines,
@@ -123,6 +165,16 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in
     **n1_standard_machines,
     **n1_highmem_machines,
     **n1_highcpu_machines,
+    **n2_standard_machines,
+    **n2_highmem_machines,
+    **n2_highcpu_machines,
+    'n2-highmem-128': MachineTypeParts(
+        cores=128,
+        memory=gib_to_bytes(864),
+        gpu_config=None,
+        machine_family='n2',
+        worker_type='highmem',
+    ),
     'g2-standard-4': MachineTypeParts(
         cores=4,
         memory=gib_to_bytes(16),
@@ -245,9 +297,9 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in
 }
 
 gcp_valid_cores_for_pool_worker_type = {
-    'highcpu': [2, 4, 8, 16, 32, 64, 96],
-    'standard': [1, 2, 4, 8, 16, 32, 64, 96],
-    'highmem': [2, 4, 8, 16, 32, 64, 96],
+    'standard': [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+    'highmem': [2, 4, 8, 16, 32, 48, 64, 80, 96],
+    'highcpu': [2, 4, 8, 16, 32, 48, 64, 80, 96],
 }
 
 gcp_valid_machine_types = list(MACHINE_TYPE_TO_PARTS.keys())
diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
index 92947cc5462..ce28fec8101 100644
--- a/batch/test/test_utils.py
+++ b/batch/test/test_utils.py
@@ -33,11 +33,12 @@ def test_memory_str_to_bytes():
 
 
 def test_gcp_worker_memory_per_core_mib():
-    with pytest.raises(AssertionError):
-        assert gcp_worker_memory_per_core_mib('n2', 'standard')
     assert gcp_worker_memory_per_core_mib('n1', 'standard') == 3840
     assert gcp_worker_memory_per_core_mib('n1', 'highmem') == 6656
     assert gcp_worker_memory_per_core_mib('n1', 'highcpu') == 924
+    assert gcp_worker_memory_per_core_mib('n2', 'standard') == 4096
+    assert gcp_worker_memory_per_core_mib('n2', 'highmem') == 8192
+    assert gcp_worker_memory_per_core_mib('n2', 'highcpu') == 1024
 
 
 def test_gcp_machine_memory_per_core_mib():
@@ -48,6 +49,15 @@ def test_gcp_machine_memory_per_core_mib():
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 6656
         elif machine_parts.machine_family == 'n1' and machine_parts.worker_type == 'highcpu':
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 924
+        elif machine_parts.machine_family == 'n2' and machine_parts.worker_type == 'standard':
+            assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 4096
+        elif machine_parts.machine_family == 'n2' and machine_parts.worker_type == 'highmem':
+            if machine_parts.cores == 128:
+                assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 6912
+            else:
+                assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 8192
+        elif machine_parts.machine_family == 'n2' and machine_parts.worker_type == 'highcpu':
+            assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 1024
         elif machine_parts.machine_family == 'g2' and machine_parts.worker_type == 'standard':
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 4096
         elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'highgpu':
diff --git a/batch/test/utils.py b/batch/test/utils.py
index a9e00e023a7..e352c569e5b 100644
--- a/batch/test/utils.py
+++ b/batch/test/utils.py
@@ -43,6 +43,6 @@ def smallest_machine_type():
     cloud = os.environ['HAIL_CLOUD']
 
     if cloud == 'gcp':
-        return 'n1-standard-1'
+        return 'n2-standard-2'
     assert cloud == 'azure'
     return 'Standard_D2ds_v4'

From e0e96a0dc56ef37f5417d74fc9180dade4bb8cfd Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Thu, 21 May 2026 14:58:32 -0400
Subject: [PATCH 2/7] [batch] Fix local SSD count for N2 machines: attach
 minimum required partitions and RAID0 when >1

N2 machines require local SSDs in specific quantities (e.g., n2-standard-16
requires a minimum of 2). The previous code always attached exactly 1, which
GCP rejects. This adds a per-core-count lookup for the minimum SSD count,
attaches that many SCRATCH disks in the VM config, and combines them via mdadm
RAID0 in the startup script when the count is greater than 1.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../batch/cloud/gcp/driver/create_instance.py | 49 +++++++++++++------
 .../cloud/gcp/driver/resource_manager.py      |  3 +-
 batch/batch/cloud/gcp/resource_utils.py       | 35 ++++++++++++-
 batch/batch/cloud/resource_utils.py           |  3 +-
 batch/test/test_utils.py                      | 25 +++++++++-
 5 files changed, 94 insertions(+), 21 deletions(-)

diff --git a/batch/batch/cloud/gcp/driver/create_instance.py b/batch/batch/cloud/gcp/driver/create_instance.py
index e3c6e29ccdf..63a68b844a6 100644
--- a/batch/batch/cloud/gcp/driver/create_instance.py
+++ b/batch/batch/cloud/gcp/driver/create_instance.py
@@ -13,7 +13,7 @@
 from ....instance_config import InstanceConfig
 from ...resource_utils import unreserved_worker_data_disk_size_gib
 from ...utils import ACCEPTABLE_QUERY_JAR_URL_PREFIX
-from ..resource_utils import GPUConfig, gcp_machine_type_to_parts, machine_type_to_gpu
+from ..resource_utils import GPUConfig, gcp_local_ssd_count, gcp_machine_type_to_parts, machine_type_to_gpu
 
 log = logging.getLogger('create_instance')
 
@@ -62,21 +62,28 @@ def create_vm_config(
     region = instance_config.region_for(zone)
     docker_run_gpu_args = '--runtime=nvidia --gpus all' if machine_type_to_gpu(machine_type_full) else ''
     if local_ssd_data_disk:
-        worker_data_disk = {
-            'type': 'SCRATCH',
-            'autoDelete': True,
-            'interface': 'NVME',
-            'initializeParams': {'diskType': f'zones/{zone}/diskTypes/local-ssd'},
-        }
-        worker_data_disk_name = 'nvme0n1'
+        num_local_ssds = gcp_local_ssd_count(parts.machine_family, cores)
+        worker_data_disks = [
+            {
+                'type': 'SCRATCH',
+                'autoDelete': True,
+                'interface': 'NVME',
+                'initializeParams': {'diskType': f'zones/{zone}/diskTypes/local-ssd'},
+            }
+            for _ in range(num_local_ssds)
+        ]
+        worker_data_disk_name = 'md0' if num_local_ssds > 1 else 'nvme0n1'
     else:
-        worker_data_disk = {
-            'autoDelete': True,
-            'initializeParams': {
-                'diskType': f'projects/{project}/zones/{zone}/diskTypes/pd-ssd',
-                'diskSizeGb': str(data_disk_size_gb),
-            },
-        }
+        num_local_ssds = 0
+        worker_data_disks = [
+            {
+                'autoDelete': True,
+                'initializeParams': {
+                    'diskType': f'projects/{project}/zones/{zone}/diskTypes/pd-ssd',
+                    'diskSizeGb': str(data_disk_size_gb),
+                },
+            }
+        ]
         worker_data_disk_name = 'nvme0n2' if 'g2' in machine_type else 'sdb'
 
     if job_private:
@@ -123,7 +130,7 @@ def scheduling() -> dict:
                     'diskSizeGb': str(boot_disk_size_gb),
                 },
             },
-            worker_data_disk,
+            *worker_data_disks,
         ],
         'networkInterfaces': [
             {
@@ -175,6 +182,7 @@ def scheduling() -> dict:
 set -x
 
 WORKER_DATA_DISK_NAME="{worker_data_disk_name}"
+NUM_LOCAL_SSDS="{num_local_ssds}"
 UNRESERVED_WORKER_DATA_DISK_SIZE_GB="{unreserved_disk_storage_gb}"
 ACCEPTABLE_QUERY_JAR_URL_PREFIX="{ACCEPTABLE_QUERY_JAR_URL_PREFIX}"
 
@@ -252,6 +260,15 @@ def scheduling() -> dict:
 
 sudo systemctl restart google-cloud-ops-agent
 
+# combine multiple local SSDs into a single RAID0 array
+if [ "$NUM_LOCAL_SSDS" -gt 1 ]; then
+    DEVICES=""
+    for i in $(seq 1 $NUM_LOCAL_SSDS); do
+        DEVICES="$DEVICES /dev/nvme0n$i"
+    done
+    mdadm --create /dev/md0 --level=0 --raid-devices=$NUM_LOCAL_SSDS $DEVICES --force --run
+fi
+
 # format worker data disk
 sudo mkfs.xfs -m reflink=1 -n ftype=1 /dev/$WORKER_DATA_DISK_NAME
 sudo mkdir -p /mnt/disks/$WORKER_DATA_DISK_NAME
diff --git a/batch/batch/cloud/gcp/driver/resource_manager.py b/batch/batch/cloud/gcp/driver/resource_manager.py
index a6aa4c96c7c..4e2e051f768 100644
--- a/batch/batch/cloud/gcp/driver/resource_manager.py
+++ b/batch/batch/cloud/gcp/driver/resource_manager.py
@@ -21,6 +21,7 @@
 from ....instance_config import InstanceConfig, QuantifiedResource
 from ..instance_config import GCPSlimInstanceConfig
 from ..resource_utils import (
+    GCP_LOCAL_SSD_PARTITION_SIZE_GIB,
     GCP_MACHINE_FAMILY,
     family_worker_type_cores_to_gcp_machine_type,
     gcp_machine_type_to_cores_and_memory_bytes,
@@ -111,7 +112,7 @@ async def create_vm(
         instance_config: InstanceConfig,
     ) -> List[QuantifiedResource]:
         if local_ssd_data_disk:
-            assert data_disk_size_gb == 375
+            assert data_disk_size_gb % GCP_LOCAL_SSD_PARTITION_SIZE_GIB == 0 and data_disk_size_gb >= GCP_LOCAL_SSD_PARTITION_SIZE_GIB
 
         resource_rates = self.billing_manager.resource_rates
 
diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py
index 4d1b88b4908..cbe17d5ca69 100644
--- a/batch/batch/cloud/gcp/resource_utils.py
+++ b/batch/batch/cloud/gcp/resource_utils.py
@@ -343,8 +343,39 @@ def gcp_is_valid_storage_request(storage_in_gib: int) -> bool:
     return 10 <= storage_in_gib <= GCP_MAX_PERSISTENT_SSD_SIZE_GIB
 
 
-def gcp_local_ssd_size() -> int:
-    return 375
+GCP_LOCAL_SSD_PARTITION_SIZE_GIB = 375
+
+# N2 machines require local SSDs in specific quantities that vary by vCPU count.
+# Verified: n2-standard-16 requires minimum 2 (valid: [0, 2, 4, 8, 16, 24]).
+# Other thresholds are estimated and may need adjustment based on GCP API errors.
+N2_MIN_LOCAL_SSD_COUNT_BY_CORES = {
+    2: 1,
+    4: 1,
+    8: 1,
+    16: 2,
+    32: 2,
+    48: 4,
+    64: 4,
+    80: 8,
+    96: 8,
+    128: 16,
+}
+
+
+def gcp_local_ssd_count(machine_family: str, cores: int) -> int:
+    if machine_family != 'n2':
+        return 1
+    count = N2_MIN_LOCAL_SSD_COUNT_BY_CORES.get(cores)
+    if count is not None:
+        return count
+    for threshold_cores in sorted(N2_MIN_LOCAL_SSD_COUNT_BY_CORES.keys(), reverse=True):
+        if cores >= threshold_cores:
+            return N2_MIN_LOCAL_SSD_COUNT_BY_CORES[threshold_cores]
+    return 1
+
+
+def gcp_local_ssd_size(machine_family: str, cores: int) -> int:
+    return GCP_LOCAL_SSD_PARTITION_SIZE_GIB * gcp_local_ssd_count(machine_family, cores)
 
 
 def machine_type_to_gpu(machine_type: str) -> Optional[str]:
diff --git a/batch/batch/cloud/resource_utils.py b/batch/batch/cloud/resource_utils.py
index e44f2b2ee4a..d55c16c818d 100644
--- a/batch/batch/cloud/resource_utils.py
+++ b/batch/batch/cloud/resource_utils.py
@@ -13,6 +13,7 @@
     azure_valid_machine_types,
 )
 from .gcp.resource_utils import (
+    GCP_MACHINE_FAMILY,
     gcp_is_valid_storage_request,
     gcp_local_ssd_size,
     gcp_machine_type_to_cores_and_memory_bytes,
@@ -115,4 +116,4 @@ def local_ssd_size(cloud: str, worker_type: str, cores: int) -> int:
     if cloud == 'azure':
         return azure_local_ssd_size(worker_type, cores)
     assert cloud == 'gcp', cloud
-    return gcp_local_ssd_size()
+    return gcp_local_ssd_size(GCP_MACHINE_FAMILY, cores)
diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
index ce28fec8101..213ad3030f9 100644
--- a/batch/test/test_utils.py
+++ b/batch/test/test_utils.py
@@ -2,7 +2,7 @@
 
 from batch.cloud.azure.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_AZURE
 from batch.cloud.gcp.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP
-from batch.cloud.gcp.resource_utils import gcp_worker_memory_per_core_mib, machine_type_to_gpu_num
+from batch.cloud.gcp.resource_utils import gcp_local_ssd_count, gcp_local_ssd_size, gcp_worker_memory_per_core_mib, machine_type_to_gpu_num
 from batch.cloud.gcp.resources import GCPAcceleratorResource, gcp_resource_from_dict
 from batch.cloud.resource_utils import adjust_cores_for_packability
 from batch.utils import rewrite_dockerhub_image
@@ -81,6 +81,29 @@ def test_azure_machine_memory_per_core_mib():
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 8192
 
 
+def test_gcp_local_ssd_count():
+    assert gcp_local_ssd_count('n1', 16) == 1
+    assert gcp_local_ssd_count('n1', 96) == 1
+    assert gcp_local_ssd_count('n2', 2) == 1
+    assert gcp_local_ssd_count('n2', 4) == 1
+    assert gcp_local_ssd_count('n2', 8) == 1
+    assert gcp_local_ssd_count('n2', 16) == 2
+    assert gcp_local_ssd_count('n2', 32) == 2
+    assert gcp_local_ssd_count('n2', 48) == 4
+    assert gcp_local_ssd_count('n2', 64) == 4
+    assert gcp_local_ssd_count('n2', 80) == 8
+    assert gcp_local_ssd_count('n2', 96) == 8
+    assert gcp_local_ssd_count('n2', 128) == 16
+
+
+def test_gcp_local_ssd_size():
+    assert gcp_local_ssd_size('n1', 16) == 375
+    assert gcp_local_ssd_size('n2', 2) == 375
+    assert gcp_local_ssd_size('n2', 16) == 750
+    assert gcp_local_ssd_size('n2', 48) == 1500
+    assert gcp_local_ssd_size('n2', 128) == 6000
+
+
 def test_gcp_resource_from_dict():
     name = 'accelerator/l4-nonpreemptible/us-central1/1712657549063'
     gpu_data_dic_single = {'name': name, 'number': 1, 'type': 'gcp_accelerator', 'format_version': 2}

From 8c2a5f0bc524ade2cc1f14b2cddcf4f31a79270a Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Thu, 21 May 2026 15:08:09 -0400
Subject: [PATCH 3/7] [batch] Fix N2 local SSD minimums from GCP docs

Corrected the N2_MIN_LOCAL_SSD_COUNT_BY_CORES lookup table using the
actual values from GCP documentation. Four entries were underestimated:
32-core (2->4), 48-core (4->8), 64-core (4->8), 96-core (8->16).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 batch/batch/cloud/gcp/resource_utils.py | 11 +++++------
 batch/test/test_utils.py                | 10 +++++-----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py
index cbe17d5ca69..0d74483b59f 100644
--- a/batch/batch/cloud/gcp/resource_utils.py
+++ b/batch/batch/cloud/gcp/resource_utils.py
@@ -346,18 +346,17 @@ def gcp_is_valid_storage_request(storage_in_gib: int) -> bool:
 GCP_LOCAL_SSD_PARTITION_SIZE_GIB = 375
 
 # N2 machines require local SSDs in specific quantities that vary by vCPU count.
-# Verified: n2-standard-16 requires minimum 2 (valid: [0, 2, 4, 8, 16, 24]).
-# Other thresholds are estimated and may need adjustment based on GCP API errors.
+# Source: https://docs.cloud.google.com/compute/docs/general-purpose-machines#n2-standard
 N2_MIN_LOCAL_SSD_COUNT_BY_CORES = {
     2: 1,
     4: 1,
     8: 1,
     16: 2,
-    32: 2,
-    48: 4,
-    64: 4,
+    32: 4,
+    48: 8,
+    64: 8,
     80: 8,
-    96: 8,
+    96: 16,
     128: 16,
 }
 
diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
index 213ad3030f9..e81c0ab9689 100644
--- a/batch/test/test_utils.py
+++ b/batch/test/test_utils.py
@@ -88,11 +88,11 @@ def test_gcp_local_ssd_count():
     assert gcp_local_ssd_count('n2', 4) == 1
     assert gcp_local_ssd_count('n2', 8) == 1
     assert gcp_local_ssd_count('n2', 16) == 2
-    assert gcp_local_ssd_count('n2', 32) == 2
-    assert gcp_local_ssd_count('n2', 48) == 4
-    assert gcp_local_ssd_count('n2', 64) == 4
+    assert gcp_local_ssd_count('n2', 32) == 4
+    assert gcp_local_ssd_count('n2', 48) == 8
+    assert gcp_local_ssd_count('n2', 64) == 8
     assert gcp_local_ssd_count('n2', 80) == 8
-    assert gcp_local_ssd_count('n2', 96) == 8
+    assert gcp_local_ssd_count('n2', 96) == 16
     assert gcp_local_ssd_count('n2', 128) == 16
 
 
@@ -100,7 +100,7 @@ def test_gcp_local_ssd_size():
     assert gcp_local_ssd_size('n1', 16) == 375
     assert gcp_local_ssd_size('n2', 2) == 375
     assert gcp_local_ssd_size('n2', 16) == 750
-    assert gcp_local_ssd_size('n2', 48) == 1500
+    assert gcp_local_ssd_size('n2', 48) == 3000
     assert gcp_local_ssd_size('n2', 128) == 6000
 
 

From fefae776c8e1270b5a55197bc277c18a67c1397c Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Wed, 27 May 2026 15:01:24 -0400
Subject: [PATCH 4/7] [batch] Fix ruff I001 import sorting in test_utils.py

Merge duplicate imports from batch.cloud.gcp.resource_utils into a single
statement to satisfy ruff's isort rules after merge with upstream main.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 batch/test/test_utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
index e81c0ab9689..77f0dc4a916 100644
--- a/batch/test/test_utils.py
+++ b/batch/test/test_utils.py
@@ -1,8 +1,13 @@
 import pytest
 
 from batch.cloud.azure.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_AZURE
-from batch.cloud.gcp.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP
-from batch.cloud.gcp.resource_utils import gcp_local_ssd_count, gcp_local_ssd_size, gcp_worker_memory_per_core_mib, machine_type_to_gpu_num
+from batch.cloud.gcp.resource_utils import (
+    MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP,
+    gcp_local_ssd_count,
+    gcp_local_ssd_size,
+    gcp_worker_memory_per_core_mib,
+    machine_type_to_gpu_num,
+)
 from batch.cloud.gcp.resources import GCPAcceleratorResource, gcp_resource_from_dict
 from batch.cloud.resource_utils import adjust_cores_for_packability
 from batch.utils import rewrite_dockerhub_image

From 64867f760b10b9704eff5e2d12a1188f81c19665 Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Wed, 27 May 2026 15:10:19 -0400
Subject: [PATCH 5/7] [batch] Fix ruff I001 import sorting for ruff 0.11.13

Split aliased import from regular imports per ruff 0.11.13's isort rules.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 batch/test/test_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
index 77f0dc4a916..fa1c3f22257 100644
--- a/batch/test/test_utils.py
+++ b/batch/test/test_utils.py
@@ -3,6 +3,8 @@
 from batch.cloud.azure.resource_utils import MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_AZURE
 from batch.cloud.gcp.resource_utils import (
     MACHINE_TYPE_TO_PARTS as MACHINE_TYPE_TO_PARTS_GCP,
+)
+from batch.cloud.gcp.resource_utils import (
     gcp_local_ssd_count,
     gcp_local_ssd_size,
     gcp_worker_memory_per_core_mib,

From 54f2dcbafabf07609189ae8e238fe6714b2587a3 Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Wed, 27 May 2026 15:16:51 -0400
Subject: [PATCH 6/7] [batch] Fix ruff format long assert line in
 resource_manager.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 batch/batch/cloud/gcp/driver/resource_manager.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/batch/batch/cloud/gcp/driver/resource_manager.py b/batch/batch/cloud/gcp/driver/resource_manager.py
index 4e2e051f768..1b7c069a471 100644
--- a/batch/batch/cloud/gcp/driver/resource_manager.py
+++ b/batch/batch/cloud/gcp/driver/resource_manager.py
@@ -112,7 +112,10 @@ async def create_vm(
         instance_config: InstanceConfig,
     ) -> List[QuantifiedResource]:
         if local_ssd_data_disk:
-            assert data_disk_size_gb % GCP_LOCAL_SSD_PARTITION_SIZE_GIB == 0 and data_disk_size_gb >= GCP_LOCAL_SSD_PARTITION_SIZE_GIB
+            assert (
+                data_disk_size_gb % GCP_LOCAL_SSD_PARTITION_SIZE_GIB == 0
+                and data_disk_size_gb >= GCP_LOCAL_SSD_PARTITION_SIZE_GIB
+            )
 
         resource_rates = self.billing_manager.resource_rates
 

From 8a8771fa6a66204d127bc408905b7d96465b7a03 Mon Sep 17 00:00:00 2001
From: grohli <22306963+grohli@users.noreply.github.com>
Date: Thu, 28 May 2026 15:09:25 -0400
Subject: [PATCH 7/7] [batch] Address PR review comments for N2 machine type
 upgrade

- Fix create_instance.py merge conflict: remove duplicated disk/docker
  block, keep containerd version for ubuntu 24, move ops agent setup
  before SSD wrangling, position RAID0 assembly correctly
- Revert smallest_machine_type to n1-standard-1 for custom machine tests
- Parameterize test_gcp_local_ssd_count and test_gcp_local_ssd_size
- Add 128 to highmem valid cores to match n2-highmem-128 in MACHINE_TYPE_TO_PARTS
- Combine duplicate imports in test_utils.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 batch/batch/cloud/gcp/resource_utils.py |  2 +-
 batch/test/test_utils.py                | 54 +++++++++++++++----------
 batch/test/utils.py                     |  2 +-
 3 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py
index 0d74483b59f..caf2930f5f5 100644
--- a/batch/batch/cloud/gcp/resource_utils.py
+++ b/batch/batch/cloud/gcp/resource_utils.py
@@ -298,7 +298,7 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in
 
 gcp_valid_cores_for_pool_worker_type = {
     'standard': [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
-    'highmem': [2, 4, 8, 16, 32, 48, 64, 80, 96],
+    'highmem': [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
     'highcpu': [2, 4, 8, 16, 32, 48, 64, 80, 96],
 }
 
diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
index fa1c3f22257..42f4b172a40 100644
--- a/batch/test/test_utils.py
+++ b/batch/test/test_utils.py
@@ -88,27 +88,39 @@ def test_azure_machine_memory_per_core_mib():
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 8192
 
 
-def test_gcp_local_ssd_count():
-    assert gcp_local_ssd_count('n1', 16) == 1
-    assert gcp_local_ssd_count('n1', 96) == 1
-    assert gcp_local_ssd_count('n2', 2) == 1
-    assert gcp_local_ssd_count('n2', 4) == 1
-    assert gcp_local_ssd_count('n2', 8) == 1
-    assert gcp_local_ssd_count('n2', 16) == 2
-    assert gcp_local_ssd_count('n2', 32) == 4
-    assert gcp_local_ssd_count('n2', 48) == 8
-    assert gcp_local_ssd_count('n2', 64) == 8
-    assert gcp_local_ssd_count('n2', 80) == 8
-    assert gcp_local_ssd_count('n2', 96) == 16
-    assert gcp_local_ssd_count('n2', 128) == 16
-
-
-def test_gcp_local_ssd_size():
-    assert gcp_local_ssd_size('n1', 16) == 375
-    assert gcp_local_ssd_size('n2', 2) == 375
-    assert gcp_local_ssd_size('n2', 16) == 750
-    assert gcp_local_ssd_size('n2', 48) == 3000
-    assert gcp_local_ssd_size('n2', 128) == 6000
+@pytest.mark.parametrize(
+    "family,cores,expected",
+    [
+        ('n1', 16, 1),
+        ('n1', 96, 1),
+        ('n2', 2, 1),
+        ('n2', 4, 1),
+        ('n2', 8, 1),
+        ('n2', 16, 2),
+        ('n2', 32, 4),
+        ('n2', 48, 8),
+        ('n2', 64, 8),
+        ('n2', 80, 8),
+        ('n2', 96, 16),
+        ('n2', 128, 16),
+    ],
+)
+def test_gcp_local_ssd_count(family, cores, expected):
+    assert gcp_local_ssd_count(family, cores) == expected
+
+
+@pytest.mark.parametrize(
+    "family,cores,expected",
+    [
+        ('n1', 16, 375),
+        ('n2', 2, 375),
+        ('n2', 16, 750),
+        ('n2', 48, 3000),
+        ('n2', 128, 6000),
+    ],
+)
+def test_gcp_local_ssd_size(family, cores, expected):
+    assert gcp_local_ssd_size(family, cores) == expected
 
 
 def test_gcp_resource_from_dict():
diff --git a/batch/test/utils.py b/batch/test/utils.py
index e352c569e5b..a9e00e023a7 100644
--- a/batch/test/utils.py
+++ b/batch/test/utils.py
@@ -43,6 +43,6 @@ def smallest_machine_type():
     cloud = os.environ['HAIL_CLOUD']
 
     if cloud == 'gcp':
-        return 'n2-standard-2'
+        return 'n1-standard-1'
     assert cloud == 'azure'
     return 'Standard_D2ds_v4'