diff --git a/gwlearn/ensemble.py b/gwlearn/ensemble.py index 814934ac..e5972a31 100644 --- a/gwlearn/ensemble.py +++ b/gwlearn/ensemble.py @@ -14,6 +14,7 @@ RandomForestClassifier, RandomForestRegressor, ) +from sklearn.metrics import accuracy_score, r2_score from .base import BaseClassifier, BaseRegressor @@ -148,6 +149,10 @@ class GWRandomForestClassifier(BaseClassifier): Pooled out-of-bag (OOB) true labels across all fitted local models. oob_pred_pooled_ : numpy.ndarray Pooled out-of-bag (OOB) predictions/scores across all fitted local models. + oob_pooled_score_ : float + Accuracy computed from all out-of-bag predictions pooled together. + score_ : float + Alias for ``oob_pooled_score_``. Examples -------- @@ -228,11 +233,12 @@ def __init__( self._model_type = "random_forest" self._model_kwargs["oob_score"] = self._get_oob_score_data - self._empty_score_data = (np.array([]).reshape(-1, 1), np.array([])) + self._empty_score_data = (np.array([]), np.array([])) def _get_oob_score_data(self, true, pred): """Callback used by scikit-learn to collect OOB targets/predictions.""" - return true, pred + # sklearn passes true as 2D for classifiers, flatten it + return true.ravel(), pred def fit( self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = None @@ -297,6 +303,32 @@ def fit( return self + @property + def oob_pooled_score_(self) -> float: + """Accuracy on pooled out-of-bag predictions vs pooled OOB true labels. + + Returns + ------- + float + Accuracy computed from all out-of-bag predictions pooled together. + """ + if self.oob_y_pooled_.size == 0 or self.oob_pred_pooled_.size == 0: + return np.nan + y_true = self.oob_y_pooled_ + y_pred = self.oob_pred_pooled_ + return accuracy_score(y_true, y_pred) + + @property + def score_(self) -> float: + """Alias for oob_pooled_score_. + + Returns + ------- + float + Accuracy computed from all out-of-bag predictions pooled together. + """ + return self.oob_pooled_score_ + def _get_score_data( self, local_model: BaseEstimator, @@ -646,6 +678,10 @@ class GWRandomForestRegressor(BaseRegressor): Pooled out-of-bag (OOB) true values across all fitted local models. oob_pred_pooled_ : numpy.ndarray Pooled out-of-bag (OOB) predictions across all fitted local models. + oob_pooled_score_ : float + R² computed from all out-of-bag predictions pooled together. + score_ : float + Alias for ``oob_pooled_score_``. Examples -------- @@ -720,11 +756,12 @@ def __init__( self._model_type = "random_forest" self._model_kwargs["oob_score"] = self._get_oob_score_data - self._empty_score_data = (np.array([]).reshape(-1, 1), np.array([])) + self._empty_score_data = (np.array([]), np.array([])) def _get_oob_score_data(self, true, pred): """Callback used by scikit-learn to collect OOB targets/predictions.""" - return true, pred + # sklearn passes true as 2D array (-1, 1) for regressors, flatten it + return true.ravel(), pred def fit( self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = None @@ -789,6 +826,32 @@ def fit( return self + @property + def oob_pooled_score_(self) -> float: + """R² on pooled out-of-bag predictions vs pooled OOB true values. + + Returns + ------- + float + R² computed from all out-of-bag predictions pooled together. + """ + if len(self.oob_y_pooled_) == 0: + return np.nan + y_true = self.oob_y_pooled_ + y_pred = self.oob_pred_pooled_ + return r2_score(y_true, y_pred) + + @property + def score_(self) -> float: + """Alias for oob_pooled_score_. + + Returns + ------- + float + R² computed from all out-of-bag predictions pooled together. + """ + return self.oob_pooled_score_ + def _get_score_data( self, local_model: BaseEstimator, diff --git a/gwlearn/linear_model.py b/gwlearn/linear_model.py index 338564b6..341aa4f4 100644 --- a/gwlearn/linear_model.py +++ b/gwlearn/linear_model.py @@ -8,6 +8,7 @@ from libpysal import graph from sklearn.base import BaseEstimator from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.metrics import accuracy_score, r2_score from .base import BaseClassifier, BaseRegressor @@ -138,6 +139,10 @@ class GWLogisticRegression(BaseClassifier): left_out_w_ : np.ndarray Array of weights on left out observations in local models when ``leave_out`` is set. + pooled_score_ : float + Accuracy computed from all local model predictions pooled together. + score_ : float + Alias for ``pooled_score_``. Examples -------- @@ -164,7 +169,6 @@ class GWLogisticRegression(BaseClassifier): dtype: boolean """ - # TODO: score_ should be an alias of pooled_score_ - this is different from MGWR def __init__( self, bandwidth: float | None = None, @@ -261,6 +265,30 @@ def fit(self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = No return self + @property + def pooled_score_(self) -> float: + """Accuracy on pooled predictions vs pooled true labels. + + Returns + ------- + float + Accuracy computed from all local model predictions pooled together. + """ + if self.y_pooled_.size == 0 or self.pred_pooled_.size == 0: + return np.nan + return accuracy_score(self.y_pooled_, self.pred_pooled_) + + @property + def score_(self) -> float: + """Alias for pooled_score_. + + Returns + ------- + float + Accuracy computed from all local model predictions pooled together. + """ + return self.pooled_score_ + def _get_score_data( self, local_model: BaseEstimator, @@ -382,6 +410,10 @@ class GWLinearRegression(BaseRegressor): each location local_intercept_ : pd.Series Local intercept values at each location + pooled_score_ : float + R² computed from all local model predictions pooled together. + score_ : float + Alias for ``pooled_score_``. Examples -------- @@ -458,6 +490,8 @@ def _get_score_data( y: pd.Series, # noqa: ARG002 ) -> tuple: return ( + y, + local_model.predict(X), pd.Series( local_model.coef_.flatten(), index=local_model.feature_names_in_, @@ -471,6 +505,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = No else: self.feature_names_in_ = np.arange(X.shape[1]) self._empty_score_data = ( + np.array([]), + np.array([]), pd.Series(np.nan, index=self.feature_names_in_), # local coefficients np.array([np.nan]), ) # intercept @@ -478,10 +514,52 @@ def fit(self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = No super().fit(X=X, y=y, geometry=geometry) self.local_coef_ = pd.concat( - [x[0] for x in self._score_data], axis=1, keys=self._names + [x[2] for x in self._score_data], axis=1, keys=self._names ).T self.local_intercept_ = pd.Series( - [x[1] for x in self._score_data], index=self._names + [x[3] for x in self._score_data], index=self._names ) + self._y_local = [x[0] for x in self._score_data] + self._pred_local = [x[1] for x in self._score_data] + + del self._score_data + + if self._y_local and any(arr.size > 0 for arr in self._y_local): + self.y_pooled_ = np.concatenate( + [arr for arr in self._y_local if arr.size > 0] + ) + else: + self.y_pooled_ = np.array([]) + if self._pred_local and any(arr.size > 0 for arr in self._pred_local): + self.pred_pooled_ = np.concatenate( + [arr for arr in self._pred_local if arr.size > 0] + ) + else: + self.pred_pooled_ = np.array([]) + return self + + @property + def pooled_score_(self) -> float: + """R² on pooled predictions vs pooled true values. + + Returns + ------- + float + R² computed from all local model predictions pooled together. + """ + if len(self.y_pooled_) == 0: + return np.nan + return r2_score(self.y_pooled_, self.pred_pooled_) + + @property + def score_(self) -> float: + """Alias for pooled_score_. + + Returns + ------- + float + R² computed from all local model predictions pooled together. + """ + return self.pooled_score_ diff --git a/gwlearn/tests/test_ensemble.py b/gwlearn/tests/test_ensemble.py index ecf23eb8..a6041918 100644 --- a/gwlearn/tests/test_ensemble.py +++ b/gwlearn/tests/test_ensemble.py @@ -255,3 +255,121 @@ def test_gwgb_regressor_fit_basic(sample_regression_data): # noqa: F811 assert hasattr(model, "aic_") assert hasattr(model, "aicc_") assert hasattr(model, "bic_") + + +def test_gwrf_classifier_pooled_score(sample_data): + """Test oob_pooled_score_ and score_ on GWRandomForestClassifier.""" + X, y, geometry = sample_data + + model = GWRandomForestClassifier( + bandwidth=150000, + fixed=True, + random_state=42, + strict=False, + n_estimators=50, + n_jobs=1, + ) + model.fit(X, y, geometry) + + # oob_pooled_score_ and score_ should exist and be a float in [0, 1] + assert hasattr(model, "oob_pooled_score_") + assert hasattr(model, "score_") + assert isinstance(model.oob_pooled_score_, float) + assert 0.0 <= model.oob_pooled_score_ <= 1.0 + + # score_ is an alias for oob_pooled_score_ + assert model.score_ == model.oob_pooled_score_ + + # The underlying pooled OOB arrays should be non-empty + assert model.oob_y_pooled_.size > 0 + assert model.oob_pred_pooled_.size > 0 + + +def test_gwrf_regressor_pooled_score(sample_regression_data): + """Test oob_pooled_score_ and score_ on GWRandomForestRegressor.""" + X, y, geometry = sample_regression_data + + model = GWRandomForestRegressor( + bandwidth=150000, + fixed=True, + random_state=42, + strict=False, + n_estimators=50, + n_jobs=1, + ) + model.fit(X, y, geometry) + + # oob_pooled_score_ should be a valid float (R²) + assert hasattr(model, "oob_pooled_score_") + assert hasattr(model, "score_") + assert isinstance(model.oob_pooled_score_, float) + + # score_ is an alias for oob_pooled_score_ + assert model.score_ == model.oob_pooled_score_ + + # The underlying pooled OOB arrays should be non-empty + assert model.oob_y_pooled_.size > 0 + assert model.oob_pred_pooled_.size > 0 + + +def test_gwgb_no_pooled_score(sample_data): + """GWGradientBoostingClassifier should NOT expose pooled_score_ or score_.""" + X, y, geometry = sample_data + + model = GWGradientBoostingClassifier( + bandwidth=150000, + fixed=True, + random_state=42, + strict=False, + n_estimators=10, + n_jobs=1, + ) + model.fit(X, y, geometry) + + assert not hasattr(model, "pooled_score_") + assert not hasattr(model, "oob_pooled_score_") + assert not hasattr(model, "score_") + + +def test_gwgb_regressor_no_pooled_score(sample_regression_data): + """GWGradientBoostingRegressor should NOT expose pooled_score_ or score_.""" + X, y, geometry = sample_regression_data + + model = GWGradientBoostingRegressor( + bandwidth=150000, + fixed=True, + random_state=42, + strict=False, + n_estimators=10, + n_jobs=1, + ) + model.fit(X, y, geometry) + + assert not hasattr(model, "pooled_score_") + assert not hasattr(model, "oob_pooled_score_") + assert not hasattr(model, "score_") + + +def test_gwrf_classifier_oob_pooled_score_empty(): + """oob_pooled_score_ returns nan when pooled OOB arrays are empty.""" + import numpy as np + + model = GWRandomForestClassifier(bandwidth=10, fixed=True, n_estimators=10) + # Manually simulate the state after fit where all local models were skipped + model.oob_y_pooled_ = np.array([], dtype=bool) + model.oob_pred_pooled_ = np.array([], dtype=float) + + assert np.isnan(model.oob_pooled_score_) + assert np.isnan(model.score_) + + +def test_gwrf_regressor_oob_pooled_score_empty(): + """oob_pooled_score_ returns nan when pooled OOB arrays are empty.""" + import numpy as np + + model = GWRandomForestRegressor(bandwidth=10, fixed=True, n_estimators=10) + model.oob_y_pooled_ = np.array([], dtype=float) + model.oob_pred_pooled_ = np.array([], dtype=float) + + assert np.isnan(model.oob_pooled_score_) + assert np.isnan(model.score_) diff --git a/gwlearn/tests/test_linear_model.py b/gwlearn/tests/test_linear_model.py index eb390f8f..5ad2f80b 100644 --- a/gwlearn/tests/test_linear_model.py +++ b/gwlearn/tests/test_linear_model.py @@ -279,3 +279,133 @@ def test_against_mgwr(): assert_almost_equal(gwlr.aicc_, res.aicc, decimal=0) assert_almost_equal(gwlr.effective_df_, res.ENP) assert_almost_equal(gwlr.log_likelihood_, res.llf) + + +def test_gwlogistic_pooled_score(sample_data): + """Test pooled_score_ and score_ on GWLogisticRegression. + + Verifies that y_pooled_ contains true pooled local-neighborhood labels + (not just focal y), meaning its length exceeds len(y) when include_focal=True. + """ + X, y, geometry = sample_data + + model = GWLogisticRegression( + bandwidth=150000, + fixed=True, + random_state=42, + strict=False, + max_iter=500, + n_jobs=1, + include_focal=True, + ) + model.fit(X, y, geometry) + + # pooled_score_ and score_ should exist and be a float in [0, 1] + assert hasattr(model, "pooled_score_") + assert hasattr(model, "score_") + assert isinstance(model.pooled_score_, float) + assert 0.0 <= model.pooled_score_ <= 1.0 + + # score_ is an alias for pooled_score_ + assert model.score_ == model.pooled_score_ + + # pooled arrays are truly pooled: each obs appears in multiple neighborhoods + assert model.y_pooled_.size > 0 + assert model.pred_pooled_.size > 0 + assert model.y_pooled_.size == model.pred_pooled_.size + assert model.y_pooled_.size > len(y) + + +def test_gwlogistic_pooled_score_include_focal_false(sample_data): + """Test pooled_score_ when include_focal=False (leave-out predictions).""" + X, y, geometry = sample_data + + model = GWLogisticRegression( + bandwidth=150000, + fixed=True, + random_state=42, + strict=False, + max_iter=500, + n_jobs=1, + include_focal=False, + ) + model.fit(X, y, geometry) + + # pooled_score_ should still be a valid float + assert hasattr(model, "pooled_score_") + assert isinstance(model.pooled_score_, float) + assert model.score_ == model.pooled_score_ + assert model.y_pooled_.size > 0 + + +def test_gwlogistic_pooled_score_empty(): + """pooled_score_ returns nan when no local models produced data.""" + import numpy as np + + model = GWLogisticRegression(bandwidth=10, fixed=True, max_iter=100) + model.y_pooled_ = np.array([]) + model.pred_pooled_ = np.array([]) + + assert np.isnan(model.pooled_score_) + assert np.isnan(model.score_) + + +def test_gwlinear_pooled_score(sample_regression_data): + """Test pooled_score_ and score_ on GWLinearRegression. + + Verifies that y_pooled_ contains true pooled local-neighborhood values + (not just focal y), meaning its length exceeds len(y) when include_focal=True. + """ + X, y, geometry = sample_regression_data + + model = GWLinearRegression( + bandwidth=150000, + fixed=True, + n_jobs=1, + include_focal=True, + ) + model.fit(X, y, geometry) + + # pooled_score_ and score_ should exist and be a float + assert hasattr(model, "pooled_score_") + assert hasattr(model, "score_") + assert isinstance(model.pooled_score_, float) + + # score_ is an alias for pooled_score_ + assert model.score_ == model.pooled_score_ + + # pooled arrays are truly pooled: size exceeds len(y) + assert model.y_pooled_.size > 0 + assert model.pred_pooled_.size > 0 + assert model.y_pooled_.size == model.pred_pooled_.size + assert model.y_pooled_.size > len(y) + + +def test_gwlinear_pooled_score_include_focal_false(sample_regression_data): + """Test pooled_score_ when include_focal=False (leave-out predictions).""" + X, y, geometry = sample_regression_data + + model = GWLinearRegression( + bandwidth=150000, + fixed=True, + n_jobs=1, + include_focal=False, + ) + model.fit(X, y, geometry) + + assert hasattr(model, "pooled_score_") + assert isinstance(model.pooled_score_, float) + assert model.score_ == model.pooled_score_ + assert model.y_pooled_.size > 0 + + +def test_gwlinear_pooled_score_empty(): + """pooled_score_ returns nan when no local models produced data.""" + import numpy as np + + model = GWLinearRegression(bandwidth=10, fixed=True) + model.y_pooled_ = np.array([]) + model.pred_pooled_ = np.array([]) + + assert np.isnan(model.pooled_score_) + assert np.isnan(model.score_)