Andyyyy64 · SuperMarioYL · Jun 20, 2026
diff --git a/src/whichllm/models/fetcher.py b/src/whichllm/models/fetcher.py
@@ -212,22 +212,41 @@ def _normalize_param_count(
     return extracted
 
 
+# Filename quant tokens that name the same format under a different spelling
+# than the canonical key used by QUANT_BYTES_PER_WEIGHT / QUANT_QUALITY_PENALTY.
+# llama.cpp GGUFs are frequently published as ``*-FP16.gguf`` / ``*-FP32.gguf``,
+# but the byte/penalty tables key full precision as ``F16`` / ``F32``. Without
+# this mapping the extracted token misses the table and _estimate_gguf_size
+# silently falls back to the Q4_K_M default, under-sizing an FP16 GGUF by ~3.6x.
+_QUANT_ALIASES = {
+    "FP16": "F16",
+    "FP32": "F32",
+}
+
+
 def _extract_quant_type(filename: str) -> str:
-    """Extract quantization type from GGUF filename."""
+    """Extract quantization type from a GGUF filename.
+
+    Alias spellings (FP16/FP32) are canonicalized to the QUANT_BYTES_PER_WEIGHT
+    keys (F16/F32); ``test_extract_quant_type_keys_resolve_in_byte_table`` guards
+    against drift. Returns ``"unknown"`` when nothing matches.
+    """
     # Common patterns: model-Q4_K_M.gguf, model.Q4_K_M.gguf
     patterns = [
         r"[.-](Q\d+_K_[SMLA])",
         r"[.-](Q\d+_\d+)",
         r"[.-](Q\d+_K)",
+        r"[.-](TQ\d+_\d+)",
         r"[.-](IQ\d+_\w+)",
         r"[.-](MXFP4|NVFP4)",
-        r"[.-](F16|FP16|BF16|F32)",
+        r"[.-](F16|FP16|BF16|F32|FP32)",
     ]
     upper = filename.upper()
     for pattern in patterns:
         m = re.search(pattern, upper)
         if m:
-            return m.group(1)
+            quant = m.group(1)
+            return _QUANT_ALIASES.get(quant, quant)
     return "unknown"
 
 

diff --git a/tests/test_quantization.py b/tests/test_quantization.py
@@ -75,3 +75,67 @@ def test_extract_quant_type_parses_fp4_gguf_filenames():
 
     assert _extract_quant_type("gpt-oss-20b-MXFP4.gguf") == "MXFP4"
     assert _extract_quant_type("model.NVFP4.gguf") == "NVFP4"
+
+
+def test_extract_quant_type_canonicalizes_full_precision_aliases():
+    # Full-precision llama.cpp GGUFs are often published as *-FP16/*-FP32; the
+    # byte and penalty tables key these as F16/F32, so the extractor must canonicalize.
+    from whichllm.models.fetcher import _extract_quant_type
+
+    assert _extract_quant_type("Meta-Llama-3-8B-FP16.gguf") == "F16"
+    assert _extract_quant_type("model.FP32.gguf") == "F32"
+    # Canonical spellings still pass through unchanged.
+    assert _extract_quant_type("model-F16.gguf") == "F16"
+    assert _extract_quant_type("model.BF16.gguf") == "BF16"
+
+
+def test_extract_quant_type_recognizes_ternary_gguf():
+    # BitNet-class ternary GGUFs (TQ1_0/TQ2_0) are fully priced in the tables
+    # but were previously extracted as "unknown" and dropped at fetch.
+    from whichllm.models.fetcher import _extract_quant_type
+
+    assert _extract_quant_type("BitNet-b1.58-2B-4T-TQ1_0.gguf") == "TQ1_0"
+    assert _extract_quant_type("model.TQ2_0.gguf") == "TQ2_0"
+
+
+def test_estimate_gguf_size_does_not_undersize_fp16():
+    # An FP16 GGUF must size at full precision (2.0 bytes/weight), not collapse
+    # to the Q4_K_M 0.5625 default that an unrecognized token falls back to.
+    from whichllm.models.fetcher import _estimate_gguf_size, _extract_quant_type
+
+    params = 7_000_000_000
+    quant = _extract_quant_type("model-FP16.gguf")
+    size = _estimate_gguf_size(params, quant)
+    assert size == params * 2  # 14 GB, not the ~3.94 GB default
+    assert size == _estimate_gguf_size(params, "F16")
+
+
+def test_extract_quant_type_keys_resolve_in_byte_table():
+    # Drift guard: every quant the extractor surfaces from a real GGUF filename
+    # must resolve in QUANT_BYTES_PER_WEIGHT, otherwise it is silently mis-sized
+    # by the default or dropped at fetch. Keeps the extractor and tables aligned.
+    from whichllm.data.quantization import QUANT_BYTES_PER_WEIGHT
+    from whichllm.models.fetcher import _extract_quant_type
+
+    filenames = [
+        "model-Q4_K_M.gguf",
+        "model-Q8_0.gguf",
+        "model-Q6_K.gguf",
+        "model-IQ4_NL.gguf",
+        "model-IQ3_XXS.gguf",
+        "model-TQ1_0.gguf",
+        "model-TQ2_0.gguf",
+        "model-F16.gguf",
+        "model-FP16.gguf",
+        "model-BF16.gguf",
+        "model-F32.gguf",
+        "model-FP32.gguf",
+        "model-MXFP4.gguf",
+        "model-NVFP4.gguf",
+    ]
+    for fname in filenames:
+        quant = _extract_quant_type(fname)
+        assert quant != "unknown", f"{fname} not recognized by extractor"
+        assert quant in QUANT_BYTES_PER_WEIGHT, (
+            f"{fname} -> {quant!r} missing from QUANT_BYTES_PER_WEIGHT"
+        )