diff --git a/soda-duckdb/src/soda_duckdb/common/data_sources/duckdb_data_source.py b/soda-duckdb/src/soda_duckdb/common/data_sources/duckdb_data_source.py index a70c7eaac..d15ac467f 100644 --- a/soda-duckdb/src/soda_duckdb/common/data_sources/duckdb_data_source.py +++ b/soda-duckdb/src/soda_duckdb/common/data_sources/duckdb_data_source.py @@ -128,14 +128,19 @@ def format_metadata_data_type(self, data_type: str) -> str: def _get_data_type_name_synonyms(self) -> list[list[str]]: # Implements data type synonyms - # Each list should represent a list of synonyms + # Each list should represent a list of synonyms — every member of a + # group must reverse-map to the same SodaDataTypeName (or to a Soda + # type pair that is_same_soda_data_type_with_synonyms treats as equal). return [ ["varchar", "text", "string"], - ["number", "decimal", "numeric", "int", "integer", "bigint", "smallint", "tinyint", "byteint"], - ["float", "float4", "float8", "double", "double precision", "real"], - ["timestamp", "datetime", "timestamp_ntz", "timestamp without time zone"], - ["timestamp_ltz", "timestamp with local time zone"], - ["timestamp_tz", "timestamp with time zone"], + ["decimal", "numeric"], + ["smallint", "int2"], + ["integer", "int", "int4"], + ["bigint", "int8"], + ["real", "float4", "float"], + ["float8", "double", "double precision"], + ["timestamp", "datetime", "timestamp without time zone"], + ["timestamptz", "timestamp with time zone"], ] def get_data_source_data_type_name_by_soda_data_type_names(self) -> dict: diff --git a/soda-redshift/src/soda_redshift/common/data_sources/redshift_data_source.py b/soda-redshift/src/soda_redshift/common/data_sources/redshift_data_source.py index e9236bb3b..c34ca9aec 100644 --- a/soda-redshift/src/soda_redshift/common/data_sources/redshift_data_source.py +++ b/soda-redshift/src/soda_redshift/common/data_sources/redshift_data_source.py @@ -135,8 +135,8 @@ def _get_data_type_name_synonyms(self) -> list[list[str]]: ["smallint", "int2"], 
["integer", "int", "int4"], ["bigint", "int8"], - ["real", "float4", "float"], - [REDSHIFT_DOUBLE_PRECISION, "float8"], + ["real", "float4"], + [REDSHIFT_DOUBLE_PRECISION, "float8", "float"], ["timestamp", "timestamp without time zone"], ["time", "time without time zone"], ] diff --git a/soda-tests/tests/integration/test_conformance_discovery.py b/soda-tests/tests/integration/test_conformance_discovery.py new file mode 100644 index 000000000..8607a85c1 --- /dev/null +++ b/soda-tests/tests/integration/test_conformance_discovery.py @@ -0,0 +1,401 @@ +""" +Adapter Conformance Tests: Metadata Discovery + +Validates that every adapter's metadata discovery correctly: +- Filters out internal/temporary objects +- Returns accurate column type information through a full round-trip +- Maps type synonyms bidirectionally +- Reports column type parameters (precision, scale, length) + +This is the #2 source of field bugs (~20% of historical fixes). + +See: projects/enhancements/common_bugs_tests/historical-bug-analysis.md +""" + +import pytest +from helpers.data_source_test_helper import DataSourceTestHelper +from helpers.test_table import TestTableSpecification +from soda_core.common.metadata_types import ColumnMetadata, SodaDataTypeName +from soda_core.common.sql_dialect import SqlDialect +from soda_core.common.statements.table_types import ( + FullyQualifiedTableName, + FullyQualifiedViewName, +) + +# --------------------------------------------------------------------------- +# Test tables +# --------------------------------------------------------------------------- + +# A table with all Soda data types to exercise the full type mapping round-trip. 
all_types_table = (
    TestTableSpecification.builder()
    .table_purpose("conf_discovery_types")
    .column_varchar("col_varchar")
    .column_text("col_text")
    .column_integer("col_integer")
    .column_bigint("col_bigint")
    .column_smallint("col_smallint")
    .column_float("col_float")
    .column_double("col_double")
    .column_boolean("col_boolean")
    .column_date("col_date")
    .column_timestamp("col_timestamp")
    .column_timestamp_tz("col_timestamp_tz")
    .column_numeric("col_numeric")
    .column_decimal("col_decimal")
    .column_char("col_char")
    .column_time("col_time")
    .build()
)

# Columns declared with explicit length / precision / scale so the tests can
# check whether those parameters come back from metadata discovery.
typed_params_table = (
    TestTableSpecification.builder()
    .table_purpose("conf_discovery_params")
    .column_varchar("varchar_100", character_maximum_length=100)
    .column_char("char_10", character_maximum_length=10)
    .column_numeric("numeric_18_4", numeric_precision=18, numeric_scale=4)
    .column_decimal("decimal_10_2", numeric_precision=10, numeric_scale=2)
    .column_timestamp("ts_precision_3", datetime_precision=3)
    .column_timestamp_tz("ts_tz_precision_6", datetime_precision=6)
    .build()
)

# Minimal two-row table used by the discovery-filtering and view tests.
simple_table = (
    TestTableSpecification.builder()
    .table_purpose("conf_discovery_filter")
    .column_varchar("name")
    .column_integer("value")
    .rows(
        [
            ("alpha", 1),
            ("bravo", 2),
        ]
    )
    .build()
)


# ---------------------------------------------------------------------------
# Internal object filtering
# ---------------------------------------------------------------------------


def test_discovery_excludes_soda_internal_tables(data_source_test_helper: DataSourceTestHelper):
    """No table or view whose name starts with ``__soda`` may show up in discovery.

    Historical bug: commit a16b99c8 — __soda_temp tables were appearing in discovery results.
    """
    # Make sure at least one user table exists, so the query has something to return.
    data_source_test_helper.ensure_test_table(simple_table)

    tables_query = data_source_test_helper.data_source_impl.create_metadata_tables_query()
    discovered = tables_query.execute(
        database_name=data_source_test_helper.extract_database_from_prefix(),
        schema_name=data_source_test_helper.extract_schema_from_prefix(),
    )

    def _object_name(entry):
        # Tables and views expose their name under different attributes;
        # any other entry type is ignored (None).
        if isinstance(entry, FullyQualifiedTableName):
            return entry.table_name
        if isinstance(entry, FullyQualifiedViewName):
            return entry.view_name
        return None

    leaked = [
        object_name
        for object_name in map(_object_name, discovered)
        if object_name and object_name.lower().startswith("__soda")
    ]

    assert leaked == [], f"Internal Soda tables leaked into discovery results: {leaked}"


def test_discovery_finds_test_table(data_source_test_helper: DataSourceTestHelper):
    """A freshly ensured test table must be returned by the metadata tables query."""
    test_table = data_source_test_helper.ensure_test_table(simple_table)

    tables_query = data_source_test_helper.data_source_impl.create_metadata_tables_query()
    discovered = tables_query.execute(
        database_name=data_source_test_helper.extract_database_from_prefix(),
        schema_name=data_source_test_helper.extract_schema_from_prefix(),
        include_table_name_like_filters=[f"{test_table.unique_name}"],
    )

    # Only table entries are considered; comparison is case-insensitive.
    table_names = [found.table_name.lower() for found in discovered if isinstance(found, FullyQualifiedTableName)]
    wanted = test_table.unique_name.lower()
    assert wanted in table_names, f"Test table {test_table.unique_name} not found in discovery. Found: {table_names}"


# ---------------------------------------------------------------------------
# View discovery via contract
# ---------------------------------------------------------------------------


def test_view_contract_check_end_to_end(data_source_test_helper: DataSourceTestHelper):
    """Verify a contract (missing + row_count) against a view built on the simple table."""
    dialect = data_source_test_helper.data_source_impl.sql_dialect
    if not dialect.supports_views():
        pytest.skip("Views not supported")

    base_table = data_source_test_helper.ensure_test_table(simple_table)
    view_table = data_source_test_helper.create_view_from_test_table(base_table)

    data_source_test_helper.assert_contract_pass(
        test_table=view_table,
        contract_yaml_str="""
            columns:
              - name: name
                checks:
                  - missing:
            checks:
              - row_count:
                  threshold:
                    must_be: 2
        """,
    )


def test_materialized_view_contract_check_end_to_end(data_source_test_helper: DataSourceTestHelper):
    """Verify the same contract against a materialized view built on the simple table."""
    dialect = data_source_test_helper.data_source_impl.sql_dialect
    if not dialect.supports_materialized_views():
        pytest.skip("Materialized views not supported")

    base_table = data_source_test_helper.ensure_test_table(simple_table)
    mv_table = data_source_test_helper.create_materialized_view_from_test_table(base_table)

    data_source_test_helper.assert_contract_pass(
        test_table=mv_table,
        contract_yaml_str="""
            columns:
              - name: name
                checks:
                  - missing:
            checks:
              - row_count:
                  threshold:
                    must_be: 2
        """,
    )


# ---------------------------------------------------------------------------
# Type mapping round-trip: create → discover → map back to SodaDataTypeName
# ---------------------------------------------------------------------------

# Expected SodaDataTypeName for each column in all_types_table.
+EXPECTED_TYPE_MAP = { + "col_varchar": SodaDataTypeName.VARCHAR, + "col_text": SodaDataTypeName.TEXT, + "col_integer": SodaDataTypeName.INTEGER, + "col_bigint": SodaDataTypeName.BIGINT, + "col_smallint": SodaDataTypeName.SMALLINT, + "col_float": SodaDataTypeName.FLOAT, + "col_double": SodaDataTypeName.DOUBLE, + "col_boolean": SodaDataTypeName.BOOLEAN, + "col_date": SodaDataTypeName.DATE, + "col_timestamp": SodaDataTypeName.TIMESTAMP, + "col_timestamp_tz": SodaDataTypeName.TIMESTAMP_TZ, + "col_numeric": SodaDataTypeName.NUMERIC, + "col_decimal": SodaDataTypeName.DECIMAL, + "col_char": SodaDataTypeName.CHAR, + "col_time": SodaDataTypeName.TIME, +} + + +def test_all_types_round_trip(data_source_test_helper: DataSourceTestHelper): + """Every SodaDataTypeName must survive a create→discover→map-back round-trip. + + Tighter than test_soda_data_types.py: this test asserts the exact expected + SodaDataTypeName (with synonym awareness) for each column, not just that a + mapping exists. + """ + test_table = data_source_test_helper.ensure_test_table(all_types_table) + sql_dialect: SqlDialect = data_source_test_helper.data_source_impl.sql_dialect + + actual_columns: list[ColumnMetadata] = data_source_test_helper.data_source_impl.get_columns_metadata( + dataset_prefixes=test_table.dataset_prefix, + dataset_name=test_table.unique_name, + ) + + assert len(actual_columns) == len( + EXPECTED_TYPE_MAP + ), f"Column count mismatch: expected {len(EXPECTED_TYPE_MAP)}, got {len(actual_columns)}" + + reverse_map = sql_dialect.get_soda_data_type_name_by_data_source_data_type_names() + + for col in actual_columns: + col_name = col.column_name.lower() + expected_soda_type = EXPECTED_TYPE_MAP.get(col_name) + assert expected_soda_type is not None, f"Unexpected column in metadata: {col_name}" + + ds_type_name = col.sql_data_type.name + actual_soda_type = reverse_map.get(ds_type_name) + assert ( + actual_soda_type is not None + ), f"Column '{col_name}': data source type '{ds_type_name}' has no 
reverse mapping" + assert sql_dialect.is_same_soda_data_type_with_synonyms(expected_soda_type, actual_soda_type), ( + f"Column '{col_name}': expected SodaDataType {expected_soda_type}, " + f"got {actual_soda_type} (from DS type '{ds_type_name}')" + ) + + +# --------------------------------------------------------------------------- +# Type synonym bidirectionality +# --------------------------------------------------------------------------- + + +def test_type_synonyms_are_bidirectional(data_source_test_helper: DataSourceTestHelper): + """For each data source type synonym, both the canonical and synonym names + must map to the same SodaDataTypeName through the reverse mapping. + + This catches silent bugs where a type synonym is defined but the reverse + mapping only recognizes the canonical form. + """ + sql_dialect: SqlDialect = data_source_test_helper.data_source_impl.sql_dialect + # Deliberately reaches into _get_data_type_name_synonyms (dialect-internal) + # because the synonym list is the unique input the test needs and is not + # exposed via any public method. 
+ synonym_lists = sql_dialect._get_data_type_name_synonyms() + reverse_map = sql_dialect.get_soda_data_type_name_by_data_source_data_type_names() + + mismatches = [] + for synonym_group in synonym_lists: + # All names in a synonym group should resolve to the same SodaDataTypeName + resolved = {} + for type_name in synonym_group: + soda_type = reverse_map.get(type_name.lower()) or reverse_map.get(type_name) + if soda_type is not None: + resolved[type_name] = soda_type + + if len(resolved) < 2: + # Only one or zero names in this group have a reverse mapping — skip + continue + + soda_types = set(resolved.values()) + # Allow synonym-aware comparison: all resolved types should be considered equivalent + canonical = next(iter(soda_types)) + for type_name, soda_type in resolved.items(): + if not sql_dialect.is_same_soda_data_type_with_synonyms(canonical, soda_type): + mismatches.append( + f"Synonym group {synonym_group}: '{type_name}' maps to {soda_type}, " + f"but others map to {canonical}" + ) + + assert mismatches == [], "Type synonym bidirectionality broken:\n" + "\n".join(mismatches) + + +# --------------------------------------------------------------------------- +# Column type parameters: precision, scale, length +# --------------------------------------------------------------------------- + + +def test_column_type_parameters_preserved(data_source_test_helper: DataSourceTestHelper): + """Column type parameters (length, precision, scale, datetime precision) must + survive the create→discover round-trip for adapters that support them.""" + test_table = data_source_test_helper.ensure_test_table(typed_params_table) + sql_dialect: SqlDialect = data_source_test_helper.data_source_impl.sql_dialect + + actual_columns: list[ColumnMetadata] = data_source_test_helper.data_source_impl.get_columns_metadata( + dataset_prefixes=test_table.dataset_prefix, + dataset_name=test_table.unique_name, + ) + + cols_by_name = {c.column_name.lower(): c for c in actual_columns} + + # 
character_maximum_length + if sql_dialect.supports_data_type_character_maximum_length(): + varchar_col = cols_by_name.get("varchar_100") + assert varchar_col is not None, "Column varchar_100 not found" + if varchar_col.sql_data_type.character_maximum_length is not None: + assert ( + varchar_col.sql_data_type.character_maximum_length == 100 + ), f"varchar_100: expected length 100, got {varchar_col.sql_data_type.character_maximum_length}" + + char_col = cols_by_name.get("char_10") + assert char_col is not None, "Column char_10 not found" + if char_col.sql_data_type.character_maximum_length is not None: + assert ( + char_col.sql_data_type.character_maximum_length == 10 + ), f"char_10: expected length 10, got {char_col.sql_data_type.character_maximum_length}" + + # numeric_precision and numeric_scale + if sql_dialect.supports_data_type_numeric_precision(): + numeric_col = cols_by_name.get("numeric_18_4") + assert numeric_col is not None, "Column numeric_18_4 not found" + if numeric_col.sql_data_type.numeric_precision is not None: + assert ( + numeric_col.sql_data_type.numeric_precision == 18 + ), f"numeric_18_4: expected precision 18, got {numeric_col.sql_data_type.numeric_precision}" + + decimal_col = cols_by_name.get("decimal_10_2") + assert decimal_col is not None, "Column decimal_10_2 not found" + if decimal_col.sql_data_type.numeric_precision is not None: + assert ( + decimal_col.sql_data_type.numeric_precision == 10 + ), f"decimal_10_2: expected precision 10, got {decimal_col.sql_data_type.numeric_precision}" + + if sql_dialect.supports_data_type_numeric_scale(): + numeric_col = cols_by_name.get("numeric_18_4") + assert numeric_col is not None, "Column numeric_18_4 not found" + if numeric_col.sql_data_type.numeric_scale is not None: + assert ( + numeric_col.sql_data_type.numeric_scale == 4 + ), f"numeric_18_4: expected scale 4, got {numeric_col.sql_data_type.numeric_scale}" + + decimal_col = cols_by_name.get("decimal_10_2") + assert decimal_col is not None, 
"Column decimal_10_2 not found" + if decimal_col.sql_data_type.numeric_scale is not None: + assert ( + decimal_col.sql_data_type.numeric_scale == 2 + ), f"decimal_10_2: expected scale 2, got {decimal_col.sql_data_type.numeric_scale}" + + # datetime_precision — assert the discovered precision is *at least* the + # requested value. Some adapters (e.g. Trino-iceberg) normalize datetime + # precision to a connector-specific default (often 6) regardless of DDL, + # which still satisfies the contract that precision is preserved or extended. + if sql_dialect.supports_data_type_datetime_precision(): + ts_col = cols_by_name.get("ts_precision_3") + assert ts_col is not None, "Column ts_precision_3 not found" + if ts_col.sql_data_type.datetime_precision is not None: + assert ts_col.sql_data_type.datetime_precision >= 3, ( + f"ts_precision_3: expected datetime_precision >= 3, " f"got {ts_col.sql_data_type.datetime_precision}" + ) + + ts_tz_col = cols_by_name.get("ts_tz_precision_6") + assert ts_tz_col is not None, "Column ts_tz_precision_6 not found" + if ts_tz_col.sql_data_type.datetime_precision is not None: + assert ts_tz_col.sql_data_type.datetime_precision >= 6, ( + f"ts_tz_precision_6: expected datetime_precision >= 6, " + f"got {ts_tz_col.sql_data_type.datetime_precision}" + ) + + +# --------------------------------------------------------------------------- +# Every SodaDataTypeName has both forward and reverse mappings +# --------------------------------------------------------------------------- + + +def test_every_soda_type_has_forward_mapping(data_source_test_helper: DataSourceTestHelper): + """Every SodaDataTypeName must have a forward mapping (Soda→data source).""" + forward_map = ( + data_source_test_helper.data_source_impl.sql_dialect.get_data_source_data_type_name_by_soda_data_type_names() + ) + unmapped = [str(t) for t in SodaDataTypeName if t not in forward_map] + assert unmapped == [], f"SodaDataTypeNames with no forward mapping: {unmapped}" + + +def 
test_every_forward_mapped_type_has_reverse(data_source_test_helper: DataSourceTestHelper): + """Every data source type produced by the forward mapping must have a reverse mapping.""" + sql_dialect = data_source_test_helper.data_source_impl.sql_dialect + forward_map = sql_dialect.get_data_source_data_type_name_by_soda_data_type_names() + reverse_map = sql_dialect.get_soda_data_type_name_by_data_source_data_type_names() + + unmapped = [] + for soda_type, ds_type in forward_map.items(): + ds_type_lower = ds_type.lower() if isinstance(ds_type, str) else ds_type + if ds_type not in reverse_map and ds_type_lower not in reverse_map: + # Check synonyms (dialect-internal mapping) + canonical = sql_dialect._data_type_name_synonym_mappings.get(ds_type_lower, ds_type_lower) + if canonical not in reverse_map: + unmapped.append(f"{soda_type} → '{ds_type}' (no reverse)") + + assert unmapped == [], "Forward-mapped types with no reverse mapping:\n" + "\n".join(unmapped) diff --git a/soda-tests/tests/integration/test_conformance_identifiers.py b/soda-tests/tests/integration/test_conformance_identifiers.py new file mode 100644 index 000000000..11574b46a --- /dev/null +++ b/soda-tests/tests/integration/test_conformance_identifiers.py @@ -0,0 +1,351 @@ +""" +Adapter Conformance Tests: Identifier Quoting + +Validates that every adapter correctly quotes identifiers containing special +characters in both DDL (CREATE TABLE) and DML (SELECT, INSERT) paths. +This is the #1 source of field bugs (~30% of historical fixes). + +These tests go beyond the existing dialect-level tests in test_hyphenated_identifiers.py +by running full end-to-end contract checks — creating tables with problematic column names, +inserting data, and executing checks against them. 
+ +See: projects/enhancements/common_bugs_tests/historical-bug-analysis.md +""" + +import pytest +from helpers.data_source_test_helper import DataSourceTestHelper +from helpers.test_table import TestTableSpecification + +# --------------------------------------------------------------------------- +# Test tables +# --------------------------------------------------------------------------- + +reserved_words_table = ( + TestTableSpecification.builder() + .table_purpose("conf_reserved_words") + .column_varchar("select") + .column_varchar("table") + .column_varchar("order") + .column_varchar("group") + .column_integer("count") + .rows( + [ + ("a", "t1", "asc", "g1", 1), + ("b", "t2", "desc", "g2", 2), + ("c", "t3", "asc", "g1", 3), + ] + ) + .build() +) + +hyphenated_columns_table = ( + TestTableSpecification.builder() + .table_purpose("conf_hyphenated_cols") + .column_varchar("first-name") + .column_varchar("last-name") + .column_integer("row-id") + .rows( + [ + ("Alice", "Smith", 1), + ("Bob", "Jones", 2), + (None, "Brown", 3), + ] + ) + .build() +) + +mixed_case_table = ( + TestTableSpecification.builder() + .table_purpose("conf_mixed_case") + .column_varchar("FirstName") + .column_varchar("LastName") + .column_integer("AccountBalance") + .rows( + [ + ("Alice", "Smith", 100), + ("Bob", "Jones", 200), + ("Charlie", "Brown", 300), + ] + ) + .build() +) + + +# --------------------------------------------------------------------------- +# Reserved SQL words as column names +# --------------------------------------------------------------------------- + + +def test_reserved_word_columns_row_count(data_source_test_helper: DataSourceTestHelper): + """Table creation and row_count check must work with reserved-word column names.""" + test_table = data_source_test_helper.ensure_test_table(reserved_words_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + checks: + - row_count: + threshold: + must_be: 3 + """, + ) + + +def 
test_reserved_word_columns_missing_check(data_source_test_helper: DataSourceTestHelper): + """Missing check must work on columns named with SQL reserved words.""" + test_table = data_source_test_helper.ensure_test_table(reserved_words_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + columns: + - name: select + checks: + - missing: + - name: table + checks: + - missing: + - name: order + checks: + - missing: + - name: group + checks: + - missing: + checks: + - row_count: + """, + ) + + +def test_reserved_word_columns_aggregate_check(data_source_test_helper: DataSourceTestHelper): + """Aggregate check (SUM) must work on a column named 'count' (reserved word).""" + test_table = data_source_test_helper.ensure_test_table(reserved_words_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + columns: + - name: count + checks: + - aggregate: + function: sum + threshold: + must_be: 6 + checks: + - row_count: + """, + ) + + +def test_reserved_word_columns_schema_check(data_source_test_helper: DataSourceTestHelper): + """Schema check must discover columns even when they are named with reserved words.""" + test_table = data_source_test_helper.ensure_test_table(reserved_words_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=f""" + checks: + - schema: + allow_extra_columns: true + columns: + - name: select + data_type: {test_table.data_type('select')} + - name: count + data_type: {test_table.data_type('count')} + """, + ) + + +# --------------------------------------------------------------------------- +# Hyphenated column names (end-to-end) +# --------------------------------------------------------------------------- + + +def test_hyphenated_columns_row_count(data_source_test_helper: DataSourceTestHelper): + """Table creation and row_count check with hyphenated column names.""" + test_table = 
data_source_test_helper.ensure_test_table(hyphenated_columns_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + checks: + - row_count: + threshold: + must_be: 3 + """, + ) + + +def test_hyphenated_columns_missing_detects_null(data_source_test_helper: DataSourceTestHelper): + """Missing check must correctly detect the NULL in 'first-name' column.""" + test_table = data_source_test_helper.ensure_test_table(hyphenated_columns_table) + data_source_test_helper.assert_contract_fail( + test_table=test_table, + contract_yaml_str=""" + columns: + - name: first-name + checks: + - missing: + checks: + - row_count: + """, + ) + + +def test_hyphenated_columns_aggregate(data_source_test_helper: DataSourceTestHelper): + """Aggregate check on a hyphenated integer column.""" + test_table = data_source_test_helper.ensure_test_table(hyphenated_columns_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + columns: + - name: row-id + checks: + - aggregate: + function: sum + threshold: + must_be: 6 + checks: + - row_count: + """, + ) + + +def test_hyphenated_columns_schema_check(data_source_test_helper: DataSourceTestHelper): + """Schema check must discover hyphenated column names correctly.""" + test_table = data_source_test_helper.ensure_test_table(hyphenated_columns_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=f""" + checks: + - schema: + columns: + - name: first-name + data_type: {test_table.data_type('first-name')} + - name: last-name + data_type: {test_table.data_type('last-name')} + - name: row-id + data_type: {test_table.data_type('row-id')} + """, + ) + + +# --------------------------------------------------------------------------- +# Mixed-case (CamelCase) column names +# --------------------------------------------------------------------------- + + +def test_mixed_case_columns_row_count(data_source_test_helper: 
DataSourceTestHelper): + """Row count check with CamelCase column names.""" + if not data_source_test_helper.data_source_impl.sql_dialect.supports_case_sensitive_column_names(): + pytest.skip("Case sensitive column names not supported") + test_table = data_source_test_helper.ensure_test_table(mixed_case_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + checks: + - row_count: + threshold: + must_be: 3 + """, + ) + + +def test_mixed_case_columns_missing_check(data_source_test_helper: DataSourceTestHelper): + """Missing check referencing CamelCase column names.""" + if not data_source_test_helper.data_source_impl.sql_dialect.supports_case_sensitive_column_names(): + pytest.skip("Case sensitive column names not supported") + test_table = data_source_test_helper.ensure_test_table(mixed_case_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + columns: + - name: FirstName + checks: + - missing: + - name: LastName + checks: + - missing: + checks: + - row_count: + """, + ) + + +def test_mixed_case_columns_aggregate(data_source_test_helper: DataSourceTestHelper): + """Aggregate check on a CamelCase integer column.""" + if not data_source_test_helper.data_source_impl.sql_dialect.supports_case_sensitive_column_names(): + pytest.skip("Case sensitive column names not supported") + test_table = data_source_test_helper.ensure_test_table(mixed_case_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + columns: + - name: AccountBalance + checks: + - aggregate: + function: avg + threshold: + must_be: 200 + checks: + - row_count: + """, + ) + + +def test_mixed_case_columns_schema_preserves_case(data_source_test_helper: DataSourceTestHelper): + """Schema check must preserve CamelCase column names.""" + if not data_source_test_helper.data_source_impl.sql_dialect.supports_case_sensitive_column_names(): + pytest.skip("Case 
sensitive column names not supported") + test_table = data_source_test_helper.ensure_test_table(mixed_case_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=f""" + checks: + - schema: + columns: + - name: FirstName + data_type: {test_table.data_type('FirstName')} + - name: LastName + data_type: {test_table.data_type('LastName')} + - name: AccountBalance + data_type: {test_table.data_type('AccountBalance')} + """, + ) + + +# --------------------------------------------------------------------------- +# Parametrized quoting consistency across special identifier patterns +# --------------------------------------------------------------------------- + +SPECIAL_IDENTIFIERS = [ + "my-table", + "col with spaces", + "123_starts_digit", + "SELECT", +] + + +@pytest.mark.parametrize("identifier", SPECIAL_IDENTIFIERS) +def test_quote_default_handles_special_identifiers(identifier: str, data_source_test_helper: DataSourceTestHelper): + """quote_default must return a quoted form (not the bare identifier) for each special pattern.""" + sql_dialect = data_source_test_helper.data_source_impl.sql_dialect + quoted = sql_dialect.quote_default(identifier) + assert quoted != identifier, f"quote_default returned bare identifier for '{identifier}'" + + +@pytest.mark.parametrize("identifier", SPECIAL_IDENTIFIERS) +def test_quote_for_ddl_handles_special_identifiers(identifier: str, data_source_test_helper: DataSourceTestHelper): + """quote_for_ddl must return a quoted form (not the bare identifier) for each special pattern.""" + sql_dialect = data_source_test_helper.data_source_impl.sql_dialect + quoted = sql_dialect.quote_for_ddl(identifier) + assert quoted != identifier, f"quote_for_ddl returned bare identifier for '{identifier}'" + + +@pytest.mark.parametrize("identifier", SPECIAL_IDENTIFIERS) +def test_ddl_and_dml_quoting_both_preserve_identifier(identifier: str, data_source_test_helper: DataSourceTestHelper): + """Both DDL and DML 
quoting must preserve the original identifier string. + + Comparison is case-insensitive: some dialects fold identifier case during + quoting, but the *characters* must survive the round-trip. + """ + sql_dialect = data_source_test_helper.data_source_impl.sql_dialect + dml_quoted = sql_dialect.quote_default(identifier) + ddl_quoted = sql_dialect.quote_for_ddl(identifier) + assert identifier.casefold() in dml_quoted.casefold(), f"DML quoting lost identifier: {dml_quoted}" + assert identifier.casefold() in ddl_quoted.casefold(), f"DDL quoting lost identifier: {ddl_quoted}" diff --git a/soda-tests/tests/integration/test_conformance_types_dialect.py b/soda-tests/tests/integration/test_conformance_types_dialect.py new file mode 100644 index 000000000..7faa956c3 --- /dev/null +++ b/soda-tests/tests/integration/test_conformance_types_dialect.py @@ -0,0 +1,356 @@ +""" +Adapter Conformance Tests: Type Mapping & SQL Dialect + +Validates that every adapter: +- Can create tables, insert data, and run checks for ALL Soda data types +- Generates valid sampling SQL for each supported sampler type +- Generates valid regex SQL +- Generates valid RANDOM() SQL +- Has consistent type synonym definitions + +These cover the #3-#4 sources of field bugs (type mapping ~10%, SQL dialect ~8%). 
+ +See: projects/enhancements/common_bugs_tests/conformance-test-dev-plan.md (Phase 3) +""" + +import datetime + +import pytest +from helpers.data_source_test_helper import DataSourceTestHelper +from helpers.test_table import TestTableSpecification +from soda_core.common.data_source_impl import DataSourceImpl +from soda_core.common.data_source_results import QueryResult +from soda_core.common.metadata_types import SamplerType, SodaDataTypeName +from soda_core.common.sql_ast import FROM, RANDOM, SELECT, STAR +from soda_core.common.sql_dialect import SqlDialect + +# --------------------------------------------------------------------------- +# Test table: one column per Soda data type, with actual data +# --------------------------------------------------------------------------- + +all_types_with_data_table = ( + TestTableSpecification.builder() + .table_purpose("conf_types_e2e") + .column_char("col_char") + .column_varchar("col_varchar") + .column_text("col_text") + .column_smallint("col_smallint") + .column_integer("col_integer") + .column_bigint("col_bigint") + .column_numeric("col_numeric") + .column_decimal("col_decimal") + .column_float("col_float") + .column_double("col_double") + .column_boolean("col_boolean") + .column_date("col_date") + .column_time("col_time") + .column_timestamp("col_timestamp") + .column_timestamp_tz("col_timestamp_tz") + .rows( + [ + ( + "a", # char + "hello", # varchar + "some text", # text + 1, # smallint + 42, # integer + 1000000, # bigint + 3.14, # numeric + 2.718, # decimal + 1.5, # float + 2.71828, # double + True, # boolean + datetime.date(2025, 6, 15), # date + datetime.time(10, 30, 0), # time + datetime.datetime(2025, 6, 15, 10, 30, 0), # timestamp + datetime.datetime(2025, 6, 15, 10, 30, 0), # timestamp_tz + ), + ( + "b", + "world", + "more text", + 2, + 99, + 2000000, + 6.28, + 5.436, + 2.5, + 3.14159, + False, + datetime.date(2025, 7, 20), + datetime.time(14, 0, 0), + datetime.datetime(2025, 7, 20, 14, 0, 0), + 
datetime.datetime(2025, 7, 20, 14, 0, 0), + ), + ( + None, # null char + None, # null varchar + None, # null text + None, # null smallint + None, # null integer + None, # null bigint + None, # null numeric + None, # null decimal + None, # null float + None, # null double + None, # null boolean + None, # null date + None, # null time + None, # null timestamp + None, # null timestamp_tz + ), + ] + ) + .build() +) + +# Columns to test with missing check (all of them) +ALL_TYPE_COLUMNS = [ + "col_char", + "col_varchar", + "col_text", + "col_smallint", + "col_integer", + "col_bigint", + "col_numeric", + "col_decimal", + "col_float", + "col_double", + "col_boolean", + "col_date", + "col_time", + "col_timestamp", + "col_timestamp_tz", +] + +# Numeric columns to test with aggregate checks +NUMERIC_COLUMNS = [ + "col_smallint", + "col_integer", + "col_bigint", + "col_numeric", + "col_decimal", + "col_float", + "col_double", +] + + +# --------------------------------------------------------------------------- +# End-to-end type tests: full pipeline for every data type +# --------------------------------------------------------------------------- + + +def test_all_types_table_creation_and_row_count(data_source_test_helper: DataSourceTestHelper): + """Create a table with all Soda data types, insert data, verify row count. + This exercises the full DDL + INSERT pipeline for every type.""" + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=""" + checks: + - row_count: + threshold: + must_be: 3 + """, + ) + + +@pytest.mark.parametrize("column_name", ALL_TYPE_COLUMNS) +def test_missing_check_per_type(column_name: str, data_source_test_helper: DataSourceTestHelper): + """Missing check must detect the NULL row for each data type. 
+ This verifies the full pipeline: type mapping → SQL generation → query → result parsing.""" + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + data_source_test_helper.assert_contract_fail( + test_table=test_table, + contract_yaml_str=f""" + columns: + - name: {column_name} + checks: + - missing: + checks: + - row_count: + """, + ) + + +@pytest.mark.parametrize("column_name", NUMERIC_COLUMNS) +def test_aggregate_check_per_numeric_type(column_name: str, data_source_test_helper: DataSourceTestHelper): + """Aggregate (avg) must work on every numeric type. Verifies type casting and aggregation.""" + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=f""" + columns: + - name: {column_name} + checks: + - aggregate: + function: avg + threshold: + must_be_greater_than: 0 + checks: + - row_count: + """, + ) + + +def test_schema_check_all_types(data_source_test_helper: DataSourceTestHelper): + """Schema check must discover all columns in the correct order. + Note: we don't compare data_type here because forward-mapped names may differ + from discovered names (e.g., Postgres maps FLOAT→'float' but discovers 'double precision'). 
+ Type round-trip accuracy is tested in Phase 2 (test_conformance_discovery.py).""" + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + + columns_yaml = "\n".join(f" - name: {col}" for col in ALL_TYPE_COLUMNS) + + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=f""" + checks: + - schema: + columns: +{columns_yaml} + """, + ) + + +# --------------------------------------------------------------------------- +# Sampling SQL conformance +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("sampler_type", list(SamplerType)) +def test_sampling_sql_executes(sampler_type: SamplerType, data_source_test_helper: DataSourceTestHelper): + """For each supported sampler type, generate a full SELECT with sampling and execute it. + Exercises the public sampling SQL path end-to-end: SAMPLE clause generation, + parameter substitution, and adapter execution.""" + sql_dialect: SqlDialect = data_source_test_helper.data_source_impl.sql_dialect + + if not sql_dialect.supports_sampler(sampler_type): + pytest.skip(f"{sql_dialect.__class__.__name__} does not support {sampler_type.name}") + + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + table_from_name = sql_dialect.get_from_name_from_qualified_name(test_table.qualified_name) + + sample_size = 50 if sampler_type == SamplerType.PERCENTAGE else 2 + select_sql = sql_dialect.build_select_sql( + [ + SELECT(STAR()), + FROM(table_from_name).SAMPLE(sampler_type, sample_size), + ] + ) + + # Successful execution implies non-empty parseable SQL — no extra assertion needed. 
+ data_source_test_helper.data_source_impl.execute_query(select_sql) + + +# --------------------------------------------------------------------------- +# Regex SQL conformance +# --------------------------------------------------------------------------- + + +def test_regex_via_invalid_check(data_source_test_helper: DataSourceTestHelper): + """Invalid check with regex must work end-to-end (the row with NULL is excluded, + the two data rows match the pattern, so no invalids among non-null rows).""" + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + + if data_source_test_helper.data_source_impl.sql_dialect.supports_regex_advanced(): + regex = "^[a-z]+$" + else: + regex = "[a-z]" + + data_source_test_helper.assert_contract_pass( + test_table=test_table, + contract_yaml_str=f""" + columns: + - name: col_varchar + valid_format: + regex: '{regex}' + name: lowercase-letters + checks: + - invalid: + checks: + - row_count: + """, + ) + + +# --------------------------------------------------------------------------- +# RANDOM() conformance +# --------------------------------------------------------------------------- + + +def test_random_generates_valid_sql(data_source_test_helper: DataSourceTestHelper): + """RANDOM() must generate valid SQL that returns values in [0.0, 1.0).""" + test_table = data_source_test_helper.ensure_test_table(all_types_with_data_table) + data_source_impl: DataSourceImpl = data_source_test_helper.data_source_impl + sql_dialect: SqlDialect = data_source_impl.sql_dialect + + table_from_name = sql_dialect.get_from_name_from_qualified_name(test_table.qualified_name) + select_sql = sql_dialect.build_select_sql( + [ + SELECT(RANDOM()), + FROM(table_from_name), + ] + ) + + result: QueryResult = data_source_impl.execute_query(select_sql) + assert len(result.rows) == 3 + + for row in result.rows: + value = float(row[0]) + assert 0.0 <= value < 1.0, f"RANDOM() returned {value}, expected [0.0, 1.0)" + + +# 
# ---------------------------------------------------------------------------
# Type mapping consistency
# ---------------------------------------------------------------------------


def test_forward_mapping_covers_all_types(data_source_test_helper: DataSourceTestHelper):
    """Expect the forward mapping to hold an entry for every SodaDataTypeName member."""
    sql_dialect = data_source_test_helper.data_source_impl.sql_dialect
    forward_map = sql_dialect.get_data_source_data_type_name_by_soda_data_type_names()

    unmapped = []
    for soda_type in SodaDataTypeName:
        if soda_type not in forward_map:
            unmapped.append(soda_type.name)

    assert unmapped == [], f"SodaDataTypeNames missing from forward mapping: {unmapped}"


def test_reverse_mapping_covers_forward(data_source_test_helper: DataSourceTestHelper):
    """Expect every forward-mapped data source type to be reachable in the reverse mapping.

    A type counts as reachable when the reverse mapping contains the name
    itself, its lower-cased form, or the name its lower-cased form resolves
    to in the dialect's synonym mapping.
    """
    sql_dialect = data_source_test_helper.data_source_impl.sql_dialect
    forward = sql_dialect.get_data_source_data_type_name_by_soda_data_type_names()
    reverse = sql_dialect.get_soda_data_type_name_by_data_source_data_type_names()

    def has_reverse_path(ds_type) -> bool:
        # Exact name first.
        if ds_type in reverse:
            return True
        # Only strings are lower-cased; any other key is looked up as-is.
        is_text = isinstance(ds_type, str)
        lookup_key = ds_type.lower() if is_text else ds_type
        if is_text and lookup_key in reverse:
            return True
        # Last resort: go through the synonym mapping.
        return sql_dialect._data_type_name_synonym_mappings.get(lookup_key) in reverse

    broken = [
        f"{soda_type.name} → '{ds_type}'"
        for soda_type, ds_type in forward.items()
        if not has_reverse_path(ds_type)
    ]

    assert broken == [], "Forward-mapped types with no reverse path:\n" + "\n".join(broken)


def test_data_type_synonyms_internally_consistent(data_source_test_helper: DataSourceTestHelper):
    """Expect each synonym group to resolve to at most one canonical name.

    Every group member is looked up, lower-cased, in the dialect's
    `_data_type_name_synonym_mappings`. Members with no entry are ignored; a
    group is reported when the remaining lookups yield more than one name.
    """
    sql_dialect = data_source_test_helper.data_source_impl.sql_dialect
    synonym_lists = sql_dialect._get_data_type_name_synonyms()

    inconsistencies = []
    for group in synonym_lists:
        resolved = (sql_dialect._data_type_name_synonym_mappings.get(name.lower()) for name in group)
        canonicals = {canonical for canonical in resolved if canonical is not None}
        if len(canonicals) > 1:
            inconsistencies.append(f"Group {group} maps to multiple canonicals: {canonicals}")

    assert inconsistencies == [], "Synonym groups with inconsistent canonical mappings:\n" + "\n".join(inconsistencies)