Skip to content

Commit 27e9acd

Browse files
committed
make release-tag: Merge branch 'main' into stable
2 parents 12e9082 + 66026b5 commit 27e9acd

File tree

26 files changed

+807
-142
lines changed

26 files changed

+807
-142
lines changed

HISTORY.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,29 @@
11
# Release Notes
22

3+
## 1.8.0 - 2023-12-05
4+
5+
This release adds support for the new Diagnostic Report from SDMetrics. This report calculates scores for three basic but important properties of your data: data validity, data structure and, in the multi-table case, relationship validity. Data validity checks that the columns of your data are valid (e.g. correct range or values). Data structure makes sure the synthetic data has the correct columns. Relationship validity checks to make sure key references are correct and the cardinality is within ranges seen in the real data.
6+
7+
Additionally, a few bugs were fixed and functionality was improved around synthesizers. It is now possible to access the loss values for the `TVAESynthesizer` and `CTGANSynthesizer` by using the `get_loss_values` method. The `get_parameters` method is now more detailed and returns all the parameters used to make a synthesizer. The metadata is now capable of detecting some common PII sdtypes. Finally, a bug that made every parent row generated by the `HMASynthesizer` have at least one child row was patched. This should improve cardinality.
8+
9+
### Maintenance
10+
11+
* Address `SettingWithCopyWarning` (HMASynthesizer) - Issue [#1557](https://github.com/sdv-dev/SDV/issues/1557) by @pvk-developer
12+
* Bump SDMetrics version - Issue [#1702](https://github.com/sdv-dev/SDV/issues/1702) by @amontanez24
13+
14+
### New Features
15+
16+
* Allow me to access loss values for GAN-based synthesizers - Issue [#1671](https://github.com/sdv-dev/SDV/issues/1671) by @frances-h
17+
* Create a unified `get_parameters` method for all multi-table synthesizers - Issue [#1674](https://github.com/sdv-dev/SDV/issues/1674) by @frances-h
18+
* Set credentials key as variables - Issue [#1680](https://github.com/sdv-dev/SDV/issues/1680) by @R-Palazzo
19+
* Identifying PII Sdtypes in Metadata - Issue [#1683](https://github.com/sdv-dev/SDV/issues/1683) by @R-Palazzo
20+
* Make SDV compatible with the latest SDMetrics - Issue [#1687](https://github.com/sdv-dev/SDV/issues/1687) by @fealho
21+
* SingleTablePreset uses FrequencyEncoder - Issue [#1695](https://github.com/sdv-dev/SDV/issues/1695) by @fealho
22+
23+
### Bugs Fixed
24+
25+
* HMASynthesizer creates too much synthetic data (always creates a child for every parent row) - Issue [#1673](https://github.com/sdv-dev/SDV/issues/1673) by @frances-h
26+
327
## 1.7.0 - 2023-11-16
428

529
This release adds an alert to the `CTGANSynthesizer` during preprocessing. The alert informs the user if the fitting of the synthesizer is likely to be slow on their schema. Additionally, it is now possible to enforce that sampled datetime values stay within the range of the fitted data!

sdv/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
__author__ = 'DataCebo, Inc.'
88
__email__ = 'info@sdv.dev'
9-
__version__ = '1.7.0'
9+
__version__ = '1.8.0.dev1'
1010

1111

1212
import sys

sdv/datasets/demo.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import json
55
import logging
66
import os
7-
import urllib.request
87
from collections import defaultdict
98
from pathlib import Path
109
from zipfile import ZipFile
@@ -14,13 +13,15 @@
1413
import pandas as pd
1514
from botocore import UNSIGNED
1615
from botocore.client import Config
16+
from botocore.exceptions import ClientError
1717

1818
from sdv.metadata.multi_table import MultiTableMetadata
1919
from sdv.metadata.single_table import SingleTableMetadata
2020

2121
LOGGER = logging.getLogger(__name__)
2222
BUCKET = 'sdv-demo-datasets'
2323
BUCKET_URL = 'https://sdv-demo-datasets.s3.amazonaws.com'
24+
SIGNATURE_VERSION = UNSIGNED
2425
METADATA_FILENAME = 'metadata.json'
2526

2627

@@ -38,19 +39,27 @@ def _validate_output_folder(output_folder_name):
3839
)
3940

4041

42+
def _get_data_from_bucket(object_key):
43+
session = boto3.Session()
44+
s3 = session.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
45+
response = s3.get_object(Bucket=BUCKET, Key=object_key)
46+
return response['Body'].read()
47+
48+
4149
def _download(modality, dataset_name):
4250
dataset_url = f'{BUCKET_URL}/{modality.upper()}/{dataset_name}.zip'
51+
object_key = f'{modality.upper()}/{dataset_name}.zip'
4352
LOGGER.info(f'Downloading dataset {dataset_name} from {dataset_url}')
4453
try:
45-
response = urllib.request.urlopen(dataset_url)
46-
except urllib.error.HTTPError:
54+
file_content = _get_data_from_bucket(object_key)
55+
except ClientError:
4756
raise ValueError(
4857
f"Invalid dataset name '{dataset_name}'. "
4958
'Make sure you have the correct modality for the dataset name or '
5059
"use 'get_available_demos' to get a list of demo datasets."
5160
)
5261

53-
return io.BytesIO(response.read())
62+
return io.BytesIO(file_content)
5463

5564

5665
def _extract_data(bytes_io, output_folder_name):
@@ -162,7 +171,7 @@ def get_available_demos(modality):
162171
* If ``modality`` is not ``'single_table'``, ``'multi_table'`` or ``'sequential'``.
163172
"""
164173
_validate_modalities(modality)
165-
client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
174+
client = boto3.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
166175
tables_info = defaultdict(list)
167176
for item in client.list_objects(Bucket=BUCKET)['Contents']:
168177
dataset_modality, dataset = item['Key'].split('/', 1)

sdv/lite/single_table.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import sys
55

66
import cloudpickle
7-
import rdt.transformers
87

98
from sdv.single_table import GaussianCopulaSynthesizer
109

@@ -38,10 +37,6 @@ def _setup_fast_preset(self, metadata, locales):
3837
enforce_rounding=False,
3938
locales=locales
4039
)
41-
self._synthesizer._data_processor._update_transformers_by_sdtypes(
42-
'categorical',
43-
rdt.transformers.FrequencyEncoder(add_noise=True)
44-
)
4540

4641
def __init__(self, metadata, name, locales=None):
4742
if name not in PRESETS:

sdv/metadata/single_table.py

Lines changed: 75 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,46 @@ class SingleTableMetadata:
5252
'sequence_index',
5353
'METADATA_SPEC_VERSION'
5454
])
55+
56+
_REFERENCE_TO_SDTYPE = {
57+
'phonenumber': 'phone_number',
58+
'email': 'email',
59+
'ssn': 'ssn',
60+
'firstname': 'first_name',
61+
'lastname': 'last_name',
62+
'countrycode': 'country_code',
63+
'administativeunit': 'administrative_unit',
64+
'state': 'administrative_unit',
65+
'province': 'administrative_unit',
66+
'stateabbr': 'state_abbr',
67+
'city': 'city',
68+
'postalcode': 'postcode',
69+
'zipcode': 'postcode',
70+
'postcode': 'postcode',
71+
'streetaddress': 'street_address',
72+
'line1': 'street_address',
73+
'secondaryaddress': 'secondary_address',
74+
'line2': 'secondary_address',
75+
'latitude': 'latitude',
76+
'longitude': 'longitude',
77+
'ipv4': 'ipv4_address',
78+
'ipv4address': 'ipv4_address',
79+
'ipv6': 'ipv6_address',
80+
'ipv6address': 'ipv6_address',
81+
'ipaddress': 'ipv6_address',
82+
'macaddress': 'mac_address',
83+
'useragent': 'user_agent_string',
84+
'useragentstring': 'user_agent_string',
85+
'iban': 'iban',
86+
'swift': 'swift11',
87+
'swift11': 'swift11',
88+
'swift8': 'swift8',
89+
'creditcardnumber': 'credit_card_number',
90+
'vin': 'vin',
91+
'licenseplate': 'license_plate',
92+
'license': 'license_plate',
93+
}
94+
5595
METADATA_SPEC_VERSION = 'SINGLE_TABLE_V1'
5696
_DEFAULT_SDTYPES = list(_SDTYPE_KWARGS) + list(SDTYPE_ANONYMIZERS)
5797

@@ -250,6 +290,19 @@ def to_dict(self):
250290

251291
return deepcopy(metadata)
252292

293+
def _detect_pii_column(self, column_name):
294+
"""Detect PII columns.
295+
296+
Args:
297+
column_name (str):
298+
The column name to be analyzed.
299+
"""
300+
cleaned_name = re.sub(r'[^a-zA-Z0-9]', '', column_name).lower()
301+
return next((
302+
sdtype for reference, sdtype in self._REFERENCE_TO_SDTYPE.items()
303+
if reference in cleaned_name
304+
), None)
305+
253306
def _determine_sdtype_for_numbers(self, data):
254307
"""Determine the sdtype for a numerical column.
255308
@@ -322,31 +375,32 @@ def _detect_columns(self, data):
322375
clean_data = column_data.dropna()
323376
dtype = clean_data.infer_objects().dtype.kind
324377

325-
sdtype = None
326-
if dtype in self._DTYPES_TO_SDTYPES:
327-
sdtype = self._DTYPES_TO_SDTYPES[dtype]
328-
elif dtype in ['i', 'f']:
329-
sdtype = self._determine_sdtype_for_numbers(column_data)
330-
331-
elif dtype == 'O':
332-
sdtype = self._determine_sdtype_for_objects(column_data)
333-
378+
sdtype = self._detect_pii_column(field)
334379
if sdtype is None:
335-
raise InvalidMetadataError(
336-
f"Unsupported data type for column '{field}' (kind: {dtype})."
337-
"The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
338-
)
339-
340-
# Set the first ID column we detect to be the primary key
341-
if sdtype == 'id':
342-
if self.primary_key is None:
343-
self.primary_key = field
344-
else:
345-
sdtype = 'unknown'
380+
if dtype in self._DTYPES_TO_SDTYPES:
381+
sdtype = self._DTYPES_TO_SDTYPES[dtype]
382+
elif dtype in ['i', 'f']:
383+
sdtype = self._determine_sdtype_for_numbers(column_data)
384+
385+
elif dtype == 'O':
386+
sdtype = self._determine_sdtype_for_objects(column_data)
387+
388+
if sdtype is None:
389+
raise InvalidMetadataError(
390+
f"Unsupported data type for column '{field}' (kind: {dtype})."
391+
"The valid data types are: 'object', 'int', 'float', 'datetime', 'bool'."
392+
)
393+
394+
# Set the first ID column we detect to be the primary key
395+
if sdtype == 'id':
396+
if self.primary_key is None:
397+
self.primary_key = field
398+
else:
399+
sdtype = 'unknown'
346400

347401
column_dict = {'sdtype': sdtype}
348402

349-
if sdtype == 'unknown':
403+
if sdtype in self._REFERENCE_TO_SDTYPE.values() or sdtype == 'unknown':
350404
column_dict['pii'] = True
351405
elif sdtype == 'datetime' and dtype == 'O':
352406
datetime_format = get_datetime_format(column_data.iloc[:100])

sdv/multi_table/base.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def set_address_columns(self, table_name, column_names, anonymization_level='ful
115115
self._table_synthesizers[table_name].set_address_columns(column_names, anonymization_level)
116116

117117
def get_table_parameters(self, table_name):
118-
"""Return the parameters that will be used to instantiate the table's synthesizer.
118+
"""Return the parameters for the given table's synthesizer.
119119
120120
Args:
121121
table_name (str):
@@ -126,21 +126,33 @@ def get_table_parameters(self, table_name):
126126
A dictionary representing the parameters that will be used to instantiate the
127127
table's synthesizer.
128128
"""
129-
return self._table_parameters.get(table_name, {})
129+
table_synthesizer = self._table_synthesizers.get(table_name)
130+
if not table_synthesizer:
131+
table_params = {'table_synthesizer': None, 'table_parameters': {}}
132+
else:
133+
table_params = {
134+
'table_synthesizer': type(table_synthesizer).__name__,
135+
'table_parameters': table_synthesizer.get_parameters()
136+
}
130137

131-
def get_parameters(self, table_name):
132-
"""Return the parameters used to instantiate the table's synthesizer.
138+
return table_params
133139

134-
Args:
135-
table_name (str):
136-
Table name for which the parameters should be retrieved.
140+
def get_parameters(self):
141+
"""Return the parameters used to instantiate the synthesizer and all table synthesizers.
137142
138143
Returns:
139144
parameters (dict):
140-
A dictionary representing the parameters used to instantiate the table's
141-
synthesizer.
145+
A dictionary representing the parameters used to instantiate the synthesizer.
142146
"""
143-
return self._table_synthesizers.get(table_name).get_parameters()
147+
parameters_dict = {
148+
'locales': self.locales,
149+
'verbose': self.verbose,
150+
'tables': {
151+
table: self.get_table_parameters(table) for table in self.metadata.tables
152+
}
153+
}
154+
155+
return parameters_dict
144156

145157
def set_table_parameters(self, table_name, table_parameters):
146158
"""Update the table's synthesizer instantiation parameters.
@@ -406,6 +418,34 @@ def get_learned_distributions(self, table_name):
406418
f"table because it uses the '{synthesizer.__class__.__name__}'."
407419
)
408420

421+
def get_loss_values(self, table_name):
422+
"""Get the loss values from a model for a table.
423+
424+
Return a pandas dataframe mapping of the loss values per epoch of GAN
425+
based synthesizers
426+
427+
Args:
428+
table_name (str):
429+
Table name for which the parameters should be retrieved.
430+
431+
Returns:
432+
pd.DataFrame:
433+
Dataframe of loss values per epoch
434+
"""
435+
if table_name not in self._table_synthesizers:
436+
raise ValueError(
437+
f"Table '{table_name}' is not present in the metadata."
438+
)
439+
440+
synthesizer = self._table_synthesizers[table_name]
441+
if hasattr(synthesizer, 'get_loss_values'):
442+
return synthesizer.get_loss_values()
443+
444+
raise SynthesizerInputError(
445+
f"Loss values are not available for table '{table_name}' "
446+
'because the table does not use a GAN-based model.'
447+
)
448+
409449
def _validate_constraints_to_be_added(self, constraints):
410450
for constraint_dict in constraints:
411451
if 'table_name' not in constraint_dict.keys():

sdv/multi_table/hma.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Hierarchical Modeling Algorithms."""
22

33
import logging
4-
import math
54
from copy import deepcopy
65

76
import numpy as np
@@ -110,7 +109,8 @@ def _get_num_extended_columns(self, table_name, parent_table, columns_per_table)
110109
if num_data_columns == 0:
111110
return num_rows_columns
112111

113-
distribution = self.get_table_parameters(table_name)['default_distribution']
112+
table_parameters = self.get_table_parameters(table_name)['table_parameters']
113+
distribution = table_parameters['default_distribution']
114114
num_parameters_columns = num_rows_columns * num_data_columns
115115
if distribution in {'beta', 'truncnorm'}:
116116
num_parameters_columns *= 4
@@ -431,7 +431,7 @@ def _extract_parameters(self, parent_row, table_name, foreign_key):
431431
num_rows = flat_parameters[num_rows_key]
432432
flat_parameters[num_rows_key] = min(
433433
self._max_child_rows[num_rows_key],
434-
math.ceil(num_rows)
434+
round(num_rows)
435435
)
436436

437437
return flat_parameters.rename(new_keys).to_dict()

0 commit comments

Comments
 (0)