Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion api/openapi/catalog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,9 @@ paths:
parameters:
- name: recommendations
in: query
description: Sort models by lowest recommended latency using Pareto filtering
deprecated: true
description: >-
Deprecated: use `orderBy=RECOMMENDED` instead. Sort models by lowest recommended latency using Pareto filtering.
required: false
schema:
type: boolean
Expand Down Expand Up @@ -288,11 +290,19 @@ paths:
- ID
- NAME
- ACCURACY
- RECOMMENDED

Defaults to `NAME`.

The `ACCURACY` sort will sort by the `overall_average` property in any linked metrics artifact.

The `RECOMMENDED` sort applies Pareto filtering and ranks models by recommended latency.
The Pareto-related parameters (`targetRPS`, `latencyProperty`, `rpsProperty`,
`hardwareCountProperty`, `hardwareTypeProperty`) are applied the same way as with the
deprecated `recommendations` parameter. With the default `sortOrder=ASC`, the most
recommended models (lowest latency) appear first. Use `sortOrder=DESC` to show the least
recommended configurations first.

In addition, models can be sorted by properties. For example:
- `provider.string_value` sorts by provider name
- `artifacts.ifeval.double_value` sorts by the min/max value a property called ifeval across all associated artifacts
Expand Down
13 changes: 12 additions & 1 deletion api/openapi/src/plugins/model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ paths:
parameters:
- name: recommendations
in: query
description: Sort models by lowest recommended latency using Pareto filtering
deprecated: true
description: >-
Deprecated: use `orderBy=RECOMMENDED` instead.
Sort models by lowest recommended latency using Pareto filtering.
required: false
schema:
type: boolean
Expand Down Expand Up @@ -98,11 +101,19 @@ paths:
- ID
- NAME
- ACCURACY
- RECOMMENDED

Defaults to `NAME`.

The `ACCURACY` sort will sort by the `overall_average` property in any linked metrics artifact.

The `RECOMMENDED` sort applies Pareto filtering and ranks models by recommended latency.
The Pareto-related parameters (`targetRPS`, `latencyProperty`, `rpsProperty`,
`hardwareCountProperty`, `hardwareTypeProperty`) are applied the same way as with the
deprecated `recommendations` parameter. With the default `sortOrder=ASC`, the most
recommended models (lowest latency) appear first. Use `sortOrder=DESC` to show the least
recommended configurations first.

In addition, models can be sorted by properties. For example:
- `provider.string_value` sorts by provider name
- `artifacts.ifeval.double_value` sorts by the min/max value a property called ifeval across all associated artifacts
Expand Down
3 changes: 2 additions & 1 deletion catalog/internal/catalog/modelcatalog/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ type APIProvider interface {
// Models without computable latency appear at the end of results.
// If sourceIDs is provided, filter models by source IDs.
// If query is provided, filter models by text search.
FindModelsWithRecommendedLatency(ctx context.Context, pagination mrmodels.Pagination, paretoParams ParetoFilteringParams, sourceIDs []string, query string) (*model.CatalogModelList, error)
// sortOrder controls direction: "ASC" (default) puts lowest latency first; "DESC" puts highest first.
FindModelsWithRecommendedLatency(ctx context.Context, pagination mrmodels.Pagination, paretoParams ParetoFilteringParams, sourceIDs []string, query string, sortOrder string) (*model.CatalogModelList, error)

// GetArtifacts returns all artifacts for a particular model. If no
// model is found with that name, it returns nil. If the model is
Expand Down
16 changes: 11 additions & 5 deletions catalog/internal/catalog/modelcatalog/db_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ func (d *dbCatalogImpl) FindModelsWithRecommendedLatency(
paretoParams ParetoFilteringParams,
sourceIDs []string,
query string,
sortOrder string,
) (*apimodels.CatalogModelList, error) {
// Get all models first (without pagination)
var sourceIDsPtr *[]string
Expand Down Expand Up @@ -684,20 +685,25 @@ func (d *dbCatalogImpl) FindModelsWithRecommendedLatency(
})
}

// Sort: models with latency first (ascending), then models without latency
// Sort: models with latency first, then models without latency.
// ASC (default) = lowest latency first (most recommended); DESC = highest latency first.
descending := strings.EqualFold(sortOrder, "DESC")
sort.Slice(modelsWithLatency, func(i, j int) bool {
latencyI, latencyJ := modelsWithLatency[i].Latency, modelsWithLatency[j].Latency

if latencyI == nil && latencyJ == nil {
return false // Maintain original order for models without latency
return false
}
if latencyI == nil {
return false // Models without latency go last
return false // Models without latency always last
}
if latencyJ == nil {
return true // Models with latency go first
return true // Models with latency always first
}
return *latencyI < *latencyJ // Sort by latency ascending
if descending {
return *latencyI > *latencyJ
}
return *latencyI < *latencyJ
})

// Apply pagination to sorted results
Expand Down
140 changes: 140 additions & 0 deletions catalog/internal/catalog/modelcatalog/db_catalog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2123,6 +2123,7 @@ func TestFindModelsWithRecommendedLatency(t *testing.T) {
paretoParams,
[]string{"latency-test-source"}, // Filter by this test's source ID
"", // No query filter
"", // Default sort order (ASC)
)

require.NoError(t, err)
Expand All @@ -2142,6 +2143,145 @@ func TestFindModelsWithRecommendedLatency(t *testing.T) {
}
}

func TestFindModelsWithRecommendedLatencyDescending(t *testing.T) {
sharedDB, cleanup := testutils.SetupPostgresWithMigrations(t, testhelpers.MustDatastoreSpec(t))
defer cleanup()

catalogModelTypeID := testhelpers.GetCatalogModelTypeIDForDBTest(t, sharedDB)
modelArtifactTypeID := testhelpers.GetCatalogModelArtifactTypeIDForDBTest(t, sharedDB)
metricsArtifactTypeID := testhelpers.GetCatalogMetricsArtifactTypeIDForDBTest(t, sharedDB)
catalogSourceTypeID := testhelpers.GetCatalogSourceTypeIDForDBTest(t, sharedDB)

catalogModelRepo := modelservice.NewCatalogModelRepository(sharedDB, catalogModelTypeID)
catalogArtifactRepo := service.NewCatalogArtifactRepository(sharedDB, map[string]int32{
service.CatalogModelArtifactTypeName: modelArtifactTypeID,
service.CatalogMetricsArtifactTypeName: metricsArtifactTypeID,
})
modelArtifactRepo := modelservice.NewCatalogModelArtifactRepository(sharedDB, modelArtifactTypeID)
metricsArtifactRepo := modelservice.NewCatalogMetricsArtifactRepository(sharedDB, metricsArtifactTypeID)
catalogSourceRepo := service.NewCatalogSourceRepository(sharedDB, catalogSourceTypeID)

svcs := Services{
CatalogModelRepository: catalogModelRepo,
CatalogArtifactRepository: catalogArtifactRepo,
CatalogModelArtifactRepository: modelArtifactRepo,
CatalogMetricsArtifactRepository: metricsArtifactRepo,
CatalogSourceRepository: catalogSourceRepo,
PropertyOptionsRepository: service.NewPropertyOptionsRepository(sharedDB),
}

dbCatalog := NewDBCatalog(svcs, nil)
ctx := context.Background()

// modelLow has latency=50 (fastest / most recommended)
modelLow := &models.CatalogModelImpl{
TypeID: new(int32(catalogModelTypeID)),
Attributes: &models.CatalogModelAttributes{
Name: new("desc-test-source:desc-model-low"),
ExternalID: new("desc-model-low-ext"),
},
Properties: &[]mr_models.Properties{
{Name: "source_id", StringValue: new("desc-test-source")},
},
}
// modelHigh has latency=200 (slowest / least recommended)
modelHigh := &models.CatalogModelImpl{
TypeID: new(int32(catalogModelTypeID)),
Attributes: &models.CatalogModelAttributes{
Name: new("desc-test-source:desc-model-high"),
ExternalID: new("desc-model-high-ext"),
},
Properties: &[]mr_models.Properties{
{Name: "source_id", StringValue: new("desc-test-source")},
},
}
// modelNone has no performance artifacts
modelNone := &models.CatalogModelImpl{
TypeID: new(int32(catalogModelTypeID)),
Attributes: &models.CatalogModelAttributes{
Name: new("desc-test-source:desc-model-none"),
ExternalID: new("desc-model-none-ext"),
},
Properties: &[]mr_models.Properties{
{Name: "source_id", StringValue: new("desc-test-source")},
},
}

savedModelLow, err := catalogModelRepo.Save(modelLow)
require.NoError(t, err)
savedModelHigh, err := catalogModelRepo.Save(modelHigh)
require.NoError(t, err)
_, err = catalogModelRepo.Save(modelNone)
require.NoError(t, err)

perfLow := &models.CatalogMetricsArtifactImpl{
TypeID: new(int32(metricsArtifactTypeID)),
Attributes: &models.CatalogMetricsArtifactAttributes{
Name: new("desc-perf-low"),
ExternalID: new("desc-perf-low-ext"),
MetricsType: models.MetricsTypePerformance,
},
Properties: &[]mr_models.Properties{},
CustomProperties: &[]mr_models.Properties{
{Name: "ttft_p90", DoubleValue: new(float64(50.0))},
{Name: "requests_per_second", DoubleValue: new(float64(100.0))},
{Name: "hardware_count", IntValue: new(int32(1))},
{Name: "hardware_type", StringValue: new("gpu")},
},
}
perfHigh := &models.CatalogMetricsArtifactImpl{
TypeID: new(int32(metricsArtifactTypeID)),
Attributes: &models.CatalogMetricsArtifactAttributes{
Name: new("desc-perf-high"),
ExternalID: new("desc-perf-high-ext"),
MetricsType: models.MetricsTypePerformance,
},
Properties: &[]mr_models.Properties{},
CustomProperties: &[]mr_models.Properties{
{Name: "ttft_p90", DoubleValue: new(float64(200.0))},
{Name: "requests_per_second", DoubleValue: new(float64(30.0))},
{Name: "hardware_count", IntValue: new(int32(1))},
{Name: "hardware_type", StringValue: new("gpu")},
},
}

_, err = metricsArtifactRepo.Save(perfLow, savedModelLow.GetID())
require.NoError(t, err)
_, err = metricsArtifactRepo.Save(perfHigh, savedModelHigh.GetID())
require.NoError(t, err)

pagination := mr_models.Pagination{PageSize: new(int32(10))}
paretoParams := ParetoFilteringParams{
LatencyProperty: "ttft_p90",
RpsProperty: "requests_per_second",
HardwareCountProperty: "hardware_count",
HardwareTypeProperty: "hardware_type",
}

// ASC should put lowest latency first
ascResult, err := dbCatalog.(*dbCatalogImpl).FindModelsWithRecommendedLatency(
ctx, pagination, paretoParams, []string{"desc-test-source"}, "", "ASC",
)
require.NoError(t, err)
require.NotNil(t, ascResult)
require.Len(t, ascResult.Items, 3)
// First two have latency data; last has none. With ASC, low-latency model comes before high-latency.
assert.Equal(t, "desc-model-low", ascResult.Items[0].Name)
assert.Equal(t, "desc-model-high", ascResult.Items[1].Name)
assert.Equal(t, "desc-model-none", ascResult.Items[2].Name)

// DESC should put highest latency first; models without latency still last
descResult, err := dbCatalog.(*dbCatalogImpl).FindModelsWithRecommendedLatency(
ctx, pagination, paretoParams, []string{"desc-test-source"}, "", "DESC",
)
require.NoError(t, err)
require.NotNil(t, descResult)
require.Len(t, descResult.Items, 3)
assert.Equal(t, "desc-model-high", descResult.Items[0].Name)
assert.Equal(t, "desc-model-low", descResult.Items[1].Name)
assert.Equal(t, "desc-model-none", descResult.Items[2].Name)
}

// TestGetFilterOptions_NoMCPServerContamination verifies that the model catalog's
// GetFilterOptions only returns properties from kf.CatalogModel contexts, not
// properties from kf.MCPServer contexts. This is a regression test for the bug
Expand Down
2 changes: 1 addition & 1 deletion catalog/internal/catalog/modelcatalog/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ func BenchmarkRecommendedLatencySorting(b *testing.B) {
for i := 0; i < b.N; i++ {
_, err := provider.FindModelsWithRecommendedLatency(ctx, mr_models.Pagination{
PageSize: new(int32(20)),
}, paretoParams, []string{"benchmark-source"}, "")
}, paretoParams, []string{"benchmark-source"}, "", "")

require.NoError(b, err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ func (m *ModelCatalogServiceAPIService) FindModels(ctx context.Context, recommen
}
}

// Handle recommended latency sorting
if recommended {
// Handle recommended latency sorting (triggered by recommendations=true or orderBy=RECOMMENDED)
if recommended || string(orderBy) == "RECOMMENDED" {
// Build Pareto filtering parameters with defaults
var targetRPSPtr *int32
if targetRPS != 0 {
Expand Down Expand Up @@ -292,8 +292,8 @@ func (m *ModelCatalogServiceAPIService) FindModels(ctx context.Context, recommen
FilterQuery: &filterQuery,
}

// Use recommended latency sorting (ignores orderBy)
models, err := m.provider.FindModelsWithRecommendedLatency(ctx, pagination, paretoParams, sourceIDs, q)
// Use recommended latency sorting
models, err := m.provider.FindModelsWithRecommendedLatency(ctx, pagination, paretoParams, sourceIDs, q, string(sortOrder))
if err != nil {
return ErrorResponse(api.ErrToStatus(err), fmt.Errorf("failed to find models with recommended latency: %w", err)), err
}
Expand Down
Loading
Loading