Skip to content

Commit 363d8bd

Browse files
authored
Surface more detailed error info when detecting metadata from dataframes (#2358)
1 parent ca7892c commit 363d8bd

File tree

5 files changed: +97 -22 lines changed

sdv/metadata/multi_table.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -544,7 +544,7 @@ def detect_table_from_dataframe(self, table_name, data):
544544
"""
545545
self._validate_table_not_detected(table_name)
546546
table = SingleTableMetadata()
547-
table._detect_columns(data)
547+
table._detect_columns(data, table_name)
548548
self.tables[table_name] = table
549549
self._log_detected_table(table)
550550

sdv/metadata/single_table.py

+36 -18
Original file line number | Diff line number | Diff line change
@@ -595,35 +595,53 @@ def _detect_primary_key(self, data):
595595

596596
return None
597597

598-
def _detect_columns(self, data):
598+
def _detect_columns(self, data, table_name=None):
599599
"""Detect the columns' sdtypes from the data.
600600
601601
Args:
602602
data (pandas.DataFrame):
603603
The data to be analyzed.
604+
table_name (str):
605+
The name of the table to be analyzed. Defaults to ``None``.
604606
"""
605607
old_columns = data.columns
606608
data.columns = data.columns.astype(str)
607609
for field in data:
608-
column_data = data[field]
609-
clean_data = column_data.dropna()
610-
dtype = clean_data.infer_objects().dtype.kind
611-
612-
sdtype = self._detect_pii_column(field)
613-
if sdtype is None:
614-
if dtype in self._DTYPES_TO_SDTYPES:
615-
sdtype = self._DTYPES_TO_SDTYPES[dtype]
616-
elif dtype in ['i', 'f', 'u']:
617-
sdtype = self._determine_sdtype_for_numbers(column_data)
618-
619-
elif dtype == 'O':
620-
sdtype = self._determine_sdtype_for_objects(column_data)
610+
try:
611+
column_data = data[field]
612+
clean_data = column_data.dropna()
613+
dtype = clean_data.infer_objects().dtype.kind
621614

615+
sdtype = self._detect_pii_column(field)
622616
if sdtype is None:
623-
raise InvalidMetadataError(
624-
f"Unsupported data type for column '{field}' (kind: {dtype})."
625-
"The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
626-
)
617+
if dtype in self._DTYPES_TO_SDTYPES:
618+
sdtype = self._DTYPES_TO_SDTYPES[dtype]
619+
elif dtype in ['i', 'f', 'u']:
620+
sdtype = self._determine_sdtype_for_numbers(column_data)
621+
622+
elif dtype == 'O':
623+
sdtype = self._determine_sdtype_for_objects(column_data)
624+
625+
if sdtype is None:
626+
table_str = f"table '{table_name}' " if table_name else ''
627+
error_message = (
628+
f"Unsupported data type for {table_str}column '{field}' (kind: {dtype}"
629+
"). The valid data types are: 'object', 'int', 'float', 'datetime',"
630+
" 'bool'."
631+
)
632+
raise InvalidMetadataError(error_message)
633+
634+
except Exception as e:
635+
error_type = type(e).__name__
636+
if error_type == 'InvalidMetadataError':
637+
raise e
638+
639+
table_str = f"table '{table_name}' " if table_name else ''
640+
error_message = (
641+
f"Unable to detect metadata for {table_str}column '{field}' due to an invalid "
642+
f'data format.\n {error_type}: {e}'
643+
)
644+
raise InvalidMetadataError(error_message) from e
627645

628646
column_dict = {'sdtype': sdtype}
629647
sdtype_in_reference = sdtype in self._REFERENCE_TO_SDTYPE.values()

tests/integration/metadata/test_metadata.py

+33
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,11 @@
22
import re
33
from copy import deepcopy
44

5+
import pandas as pd
56
import pytest
67

78
from sdv.datasets.demo import download_demo
9+
from sdv.metadata.errors import InvalidMetadataError
810
from sdv.metadata.metadata import Metadata
911
from sdv.metadata.multi_table import MultiTableMetadata
1012
from sdv.metadata.single_table import SingleTableMetadata
@@ -520,3 +522,34 @@ def test_anonymize():
520522

521523
assert anonymized.tables['table1'].to_dict() == table1_metadata.anonymize().to_dict()
522524
assert anonymized.tables['table2'].to_dict() == table2_metadata.anonymize().to_dict()
525+
526+
527+
def test_detect_from_dataframes_invalid_format():
528+
"""Test the ``detect_from_dataframes`` method with an invalid data format."""
529+
# Setup
530+
dict_data = [
531+
{
532+
'key1': i,
533+
'key2': f'string_{i}',
534+
'key3': 1.5,
535+
}
536+
for i in range(100)
537+
]
538+
data = {
539+
'table_1': pd.DataFrame({
540+
'dict_column': dict_data,
541+
'numerical': [1.2] * 100,
542+
}),
543+
'table_2': pd.DataFrame({
544+
'numerical': [1.5] * 10,
545+
'categorical': ['A'] * 10,
546+
}),
547+
}
548+
expected_error = re.escape(
549+
"Unable to detect metadata for table 'table_1' column 'dict_column' due to an "
550+
"invalid data format.\n TypeError: unhashable type: 'dict'"
551+
)
552+
553+
# Run and Assert
554+
with pytest.raises(InvalidMetadataError, match=expected_error):
555+
Metadata.detect_from_dataframes(data)

tests/unit/metadata/test_multi_table.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2499,7 +2499,7 @@ def test_detect_table_from_dataframe(self, single_table_mock, log_mock):
24992499
metadata.detect_table_from_dataframe('table', data)
25002500

25012501
# Assert
2502-
single_table_mock.return_value._detect_columns.assert_called_once_with(data)
2502+
single_table_mock.return_value._detect_columns.assert_called_once_with(data, 'table')
25032503
assert metadata.tables == {'table': single_table_mock.return_value}
25042504

25052505
expected_log_calls = call(

tests/unit/metadata/test_single_table.py

+26 -2
Original file line number | Diff line number | Diff line change
@@ -1277,9 +1277,8 @@ def test__detect_columns_with_error(self, mock__get_datetime_format):
12771277

12781278
expected_error_message = re.escape(
12791279
"Unsupported data type for column 'complex_dtype' (kind: c)."
1280-
"The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
1280+
" The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
12811281
)
1282-
12831282
with pytest.raises(InvalidMetadataError, match=expected_error_message):
12841283
instance._detect_columns(non_supported_data)
12851284

@@ -1296,6 +1295,31 @@ def test__detect_columns_with_error(self, mock__get_datetime_format):
12961295
instance._determine_sdtype_for_objects.assert_called_once()
12971296
mock__get_datetime_format.assert_called_once()
12981297

1298+
def test__detect_columns_invalid_data_format(self):
1299+
"""Test the ``_detect_columns`` method with an invalid data format."""
1300+
# Setup
1301+
instance = SingleTableMetadata()
1302+
dict_data = [
1303+
{
1304+
'key1': i,
1305+
'key2': f'string_{i}',
1306+
'key3': np.random.random(), # random float
1307+
}
1308+
for i in range(100)
1309+
]
1310+
data = pd.DataFrame({
1311+
'dict_column': dict_data,
1312+
'numerical': [1.2] * 100,
1313+
})
1314+
expected_error_message = re.escape(
1315+
"Unable to detect metadata for column 'dict_column' due to an invalid data format."
1316+
"\n TypeError: unhashable type: 'dict'"
1317+
)
1318+
1319+
# Run and Assert
1320+
with pytest.raises(InvalidMetadataError, match=expected_error_message):
1321+
instance._detect_columns(data)
1322+
12991323
def test__detect_primary_key_missing_sdtypes(self):
13001324
"""The method should raise an error if not all sdtypes were detected."""
13011325
# Setup

0 commit comments

Comments
 (0)