Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ markers = [
packages = [
"src/evalml",
"src/verification",
"src/data_input"
"src/data_input",
"src/spectra"
]

[tool.uv.sources]
Expand Down
59 changes: 57 additions & 2 deletions src/evalml/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path
from typing import Dict, List, Any, ClassVar, FrozenSet, Optional
from typing import Dict, List, Any, ClassVar, FrozenSet, Literal, Optional

from pydantic import BaseModel, Field, RootModel, field_validator
from pydantic import BaseModel, Field, RootModel, field_validator, model_validator

PROJECT_ROOT = Path(__file__).parents[2]

Expand Down Expand Up @@ -302,6 +302,57 @@ class ExperimentScorecardConfig(BaseModel):
model_config = {"extra": "forbid"}


class SpectraConfig(BaseModel):
    """Settings for the optional power-spectra QC plots of an experiment.

    The feature is off by default. Unknown keys are rejected, `variables`
    must be a non-empty subset of the supported names, and switching
    `enabled` on requires at least one entry in `lead_times`.
    """

    enabled: bool = Field(
        default=False,
        description="Whether to compute and plot power spectra.",
    )
    method: Literal["dct", "fft"] = Field(
        default="dct",
        description="Spectral method: 'dct' (default, recommended for LAM) or 'fft'.",
    )
    lead_times: List[int] = Field(
        default_factory=list,
        description="Representative lead times (hours) at which to compute spectra.",
    )
    variables: List[str] = Field(
        default=["T_2M", "WIND_KE", "TOT_PREC"],
        description="Spectra variables. Supported: T_2M, WIND_KE (from U/V_10M), TOT_PREC.",
    )
    init_hours: Optional[List[int]] = Field(
        default=None,
        description="Optional subset of init hours to average over. None = all.",
    )

    @field_validator("variables")
    @classmethod
    def validate_variables(cls, v: List[str]) -> List[str]:
        """Reject an empty list and any name outside the supported set."""
        if len(v) == 0:
            raise ValueError(
                "`variables` must list at least one variable when spectra is configured."
            )

        supported = {"T_2M", "WIND_KE", "TOT_PREC"}
        unknown = set(v) - supported
        if unknown:
            raise ValueError(
                f"Unsupported spectra variable(s) {unknown!r}. Must be subset of {supported}."
            )
        return v

    @model_validator(mode="after")
    def check_enabled_requirements(self):
        """Require at least one lead time once spectra are enabled."""
        lead_times_missing = self.enabled and not self.lead_times
        if lead_times_missing:
            raise ValueError(
                "experiment.spectra.enabled is true but `lead_times` is empty; "
                "specify at least one lead time (hours)."
            )
        return self

    # Any key that is not declared above is a validation error.
    model_config = {"extra": "forbid"}


class ShowcaseConfig(BaseModel):
"""Configuration for the showcase workflow."""

Expand Down Expand Up @@ -373,6 +424,10 @@ class ExperimentConfig(BaseModel):
default=None,
description="Scorecard generation configuration. Omit or set enabled: false to disable.",
)
spectra: SpectraConfig = Field(
default_factory=SpectraConfig,
description="Power-spectra QC configuration. Disabled by default.",
)

@field_validator("thresholds")
@classmethod
Expand Down
7 changes: 7 additions & 0 deletions src/spectra/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Variance power-spectra diagnostics for evalml.

Public surface re-exported for convenience; submodules can also be imported
directly. The spectral `core` is numpy-only and reusable for any model.
"""

from spectra import compute, core, io, regrid # noqa: F401
119 changes: 119 additions & 0 deletions src/spectra/compute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Orchestration: per-source spectra computation, over-init aggregation, and the
experiment overlay plot. Defines the spectra.nc schema."""

from __future__ import annotations

from itertools import cycle
from pathlib import Path

import numpy as np
import xarray as xr

from spectra import core, io, regrid

# Hex colours handed out in order to participant curves in the overlay plots;
# the sequence is cycled, so it repeats when there are more participants than
# entries.
_PALETTE = ["#1f3b73", "#c1272d", "#1b7837", "#6a3d9a", "#e08214", "#4d4d4d"]


def compute_source_spectra(ds: xr.Dataset, variables, lead_times, method, label):
    """Compute power spectra for one source from its already-loaded native dataset.

    For every (variable, lead time) pair the native components are fetched,
    regridded, and combined into a single spectrum with the function
    registered for `method` in `core.SPECTRUM_FUNCS`. The regridder is looked
    up from the length of the first required field at the first lead time.

    Returns an xr.Dataset whose `power` variable has dims
    (variable, leadtime, wavenumber) plus a `wavelength` coordinate along
    `wavenumber`. The attributes `dx_km`, `npoints`, `label` and `method`
    record how the spectra were produced.

    Raises ValueError when `variables` or `lead_times` is empty, or when
    `method` has no entry in `core.SPECTRUM_FUNCS`.
    """
    if not (variables and lead_times):
        raise ValueError(
            "compute_source_spectra requires non-empty `variables` and `lead_times`."
        )

    spectrum_fn = core.SPECTRUM_FUNCS.get(method)
    if spectrum_fn is None:
        raise ValueError(
            f"Unknown spectrum method {method!r}. Valid: {list(core.SPECTRUM_FUNCS)}."
        )

    # Probe one field to learn the native point count, which selects the regridder.
    probe_param = io.required_params(variables)[0]
    probe_field = io.native_field(ds, probe_param, lead_times[0])
    npoints = probe_field.shape[0]
    matrix, ny, nx, dx_km = regrid.load_regridder(npoints)

    n_wavenumbers = min(ny, nx) // 2
    power = np.full((len(variables), len(lead_times), n_wavenumbers), np.nan)
    wavelength = None
    for var_idx, var_name in enumerate(variables):
        for lead_idx, lead in enumerate(lead_times):
            components, factor = io.native_components(ds, var_name, lead)
            gridded = [
                regrid.regrid(component, matrix, ny, nx) for component in components
            ]
            # The wavelength axis of the last computed spectrum is the one stored.
            wavelength, spectrum = core.combined_spectrum(
                spectrum_fn, gridded, dx_km, factor=factor
            )
            power[var_idx, lead_idx, :] = spectrum

    coords = {
        "variable": list(variables),
        "leadtime": list(lead_times),
        "wavenumber": np.arange(power.shape[-1]),
        "wavelength": ("wavenumber", wavelength),
    }
    attrs = {
        "dx_km": float(dx_km),
        "npoints": int(npoints),
        "label": label,
        "method": method,
    }
    data_vars = {"power": (("variable", "leadtime", "wavenumber"), power)}
    return xr.Dataset(data_vars, coords=coords, attrs=attrs)


def aggregate_spectra(spectra_files) -> xr.Dataset:
    """Average per-init spectra files into one dataset.

    Every file is opened, the datasets are concatenated along a new `init`
    dimension, and the mean over `init` is taken with NaNs skipped. The
    attributes and the `wavelength` coordinate are copied from the first
    file; all inputs are expected to share one grid. The result is loaded
    into memory before the input files are closed.

    Raises ValueError when `spectra_files` is empty.
    """
    paths = list(spectra_files)
    if not paths:
        raise ValueError("aggregate_spectra requires at least one spectra file.")

    opened = []
    try:
        # Open inside the try block, one file at a time, so that a failure on
        # a later file still closes the handles that were already opened.
        for path in paths:
            opened.append(xr.open_dataset(path))
        first = opened[0]
        stacked = xr.concat(opened, dim="init")
        agg = stacked.mean(dim="init", skipna=True)
        agg.attrs = first.attrs
        agg["wavelength"] = first["wavelength"].copy()
        # Materialise the result while the source files are still open.
        agg.load()
    finally:
        for ds in opened:
            ds.close()
    return agg


def plot_experiment_spectra(
    truth_file, participant_files, out_dir, variables, lead_times
):
    """Write one overlay figure per (variable, lead time): truth + all participants.

    `out_dir` is created if it does not exist. Each figure is saved there as
    `spectrum_{var}_{step:03d}.png` and is drawn by `core.plot_power_spectra`
    from a mapping of curve label to a style tuple. Participant colours are
    taken from `_PALETTE` in file order and restart for every figure. All
    opened datasets are closed before returning, including on error.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    participants = []
    try:
        # Open inside the try block, one file at a time, so that a failure on
        # a later file (or on the truth file) still closes the earlier ones.
        for path in participant_files:
            participants.append(xr.open_dataset(path))

        with xr.open_dataset(truth_file) as truth:
            for var in variables:
                for step in lead_times:
                    # Tuple layout is whatever core.plot_power_spectra expects;
                    # presumably (wavelength, power, colour, linestyle, alpha,
                    # linewidth) - TODO confirm against core.
                    spectra = {
                        f"{truth.attrs['label']} (truth)": (
                            truth["wavelength"].values,
                            truth["power"].sel(variable=var, leadtime=step).values,
                            "k",
                            "-",
                            1.0,
                            2.0,
                        )
                    }
                    # NOTE(review): curves are keyed by label, so participants
                    # sharing a label overwrite each other in `spectra`.
                    for ds, color in zip(participants, cycle(_PALETTE)):
                        spectra[ds.attrs["label"]] = (
                            ds["wavelength"].values,
                            ds["power"].sel(variable=var, leadtime=step).values,
                            color,
                            "-",
                            0.9,
                            1.6,
                        )
                    out = out_dir / f"spectrum_{var}_{step:03d}.png"
                    # Reference grid/eff-res/Nyquist lines use the TRUTH grid's dx
                    # (labelled "truth"); each participant curve keeps its own
                    # wavelength axis.
                    core.plot_power_spectra(
                        spectra,
                        out,
                        f"Power spectrum — {var} @ +{step}h",
                        grid_dx_km=float(truth.attrs["dx_km"]),
                        model_short="truth",
                        model_color="k",
                    )
    finally:
        for ds in participants:
            ds.close()
Loading
Loading