From c641f9fb405298ad06bcb71f3d7b0e9cc82b0600 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Tue, 30 Dec 2025 19:53:11 +0530 Subject: [PATCH 01/12] Support 0 eval results --- tools/statvar_importer/property_value_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/statvar_importer/property_value_mapper.py b/tools/statvar_importer/property_value_mapper.py index 76d5209216..8c60f481a3 100644 --- a/tools/statvar_importer/property_value_mapper.py +++ b/tools/statvar_importer/property_value_mapper.py @@ -347,7 +347,7 @@ def _process_eval(self, pvs: dict, data_key: str) -> bool: self._log_every_n) if not eval_prop: eval_prop = data_key - if eval_data and eval_data != eval_str: + if eval_data is not None and eval_data != eval_str: pvs[eval_prop] = eval_data self._counters.add_counter('processed-eval', 1, eval_str) pvs.pop(eval_key) From f30e89550743d537d0aed8f59e3031cd2452cf19 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Wed, 25 Mar 2026 16:32:16 +0530 Subject: [PATCH 02/12] Use environment variable for DC API root --- util/dc_api_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/dc_api_wrapper.py b/util/dc_api_wrapper.py index f1a1fee247..98f7209d0e 100644 --- a/util/dc_api_wrapper.py +++ b/util/dc_api_wrapper.py @@ -15,7 +15,7 @@ It uses the DataCommonsClient library module for DC APIs and adds support for batched requests, retries and HTTP caching. -DC API requires an environment variable set for DC_API_KEY. +DC API requires an environment variable set for DC_API_KEY and DC_API_ROOT. Please refer to https://docs.datacommons.org/api/python/v2 for more details. 
""" @@ -265,7 +265,7 @@ def get_datacommons_client(config: dict = None) -> DataCommonsClient: """Returns a DataCommonsClient object initialized using config.""" config = _validate_v2_config(config) api_key = get_dc_api_key(config) - dc_instance = config.get('dc_api_root') + dc_instance = config.get('dc_api_root', os.environ.get('DC_API_ROOT')) url = None # Check if API root is a host or url endpoint. if dc_instance: From 2a0e93d8f73aeef6ff2b9b48bf9b5b819ed99182 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Wed, 25 Mar 2026 22:17:08 +0530 Subject: [PATCH 03/12] cleanup dc_api_root configs --- scripts/earthengine/utils.py | 2 -- scripts/earthengine/utils_test.py | 1 - .../common/datacommons_api_wrappers/datacommons_wrappers.py | 3 --- .../datacommons_api_wrappers/datacommons_wrappers_test.py | 2 -- .../india_rbi_state_statistics/environment_sdg_metadata.csv | 1 - .../india_rbi_state_statistics/infrastructure_metadata.csv | 1 - .../india_rbi_state_statistics/rbi_metadata.csv | 5 ----- .../denmark_demographics/denmark_demographics_metadata.csv | 1 - .../fema/flood_insurance_claims/us_flood_nfip_config.py | 3 --- .../state_domestic_product_metadata.csv | 1 - .../statistics_poland/StatisticsPoland_metadata.csv | 3 --- .../ap_ib_gt_enrollment/config/common_metadata.csv | 1 - .../state/config/SATorACT_Participation_metadata.csv | 1 - tools/statvar_importer/config_flags.py | 2 +- util/dc_api_wrapper.py | 3 ++- 15 files changed, 3 insertions(+), 27 deletions(-) diff --git a/scripts/earthengine/utils.py b/scripts/earthengine/utils.py index 44dbd71535..7e59202eaa 100644 --- a/scripts/earthengine/utils.py +++ b/scripts/earthengine/utils.py @@ -46,7 +46,6 @@ # Constants _MAX_LATITUDE = 90.0 _MAX_LONGITUDE = 180.0 -_DC_API_ROOT = 'https://api.datacommons.org' # Utilities for dicts. 
@@ -372,7 +371,6 @@ def place_id_to_lat_lng(placeid: str, { 'dc_api_version': 'V2', 'dc_api_use_cache': True, - 'dc_api_root': _DC_API_ROOT, }, ) node_props = resp.get(placeid) if resp else None diff --git a/scripts/earthengine/utils_test.py b/scripts/earthengine/utils_test.py index bfdd347661..f0f8e567cd 100644 --- a/scripts/earthengine/utils_test.py +++ b/scripts/earthengine/utils_test.py @@ -394,5 +394,4 @@ def test_place_id_to_lat_lng_dc_api(self): [placeid], ['latitude', 'longitude'], { 'dc_api_version': 'V2', 'dc_api_use_cache': True, - 'dc_api_root': utils._DC_API_ROOT, }) diff --git a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py index 39c93bc599..70e0936ee5 100644 --- a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py +++ b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py @@ -64,9 +64,6 @@ def dc_check_existence(dcid_list: list, wrapper_config = { 'dc_api_batch_size': max_items, - 'dc_api_root': - 'https://autopush.api.datacommons.org' - if use_autopush else 'https://api.datacommons.org' } return dc_api_is_defined_dcid(dcid_list, wrapper_config) diff --git a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py index 5eb9d2a497..e0374c7010 100644 --- a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py +++ b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py @@ -37,14 +37,12 @@ def test_dc_check_existence_mock(self, mock_is_defined): mock_is_defined.assert_called_with( ['node1'], { 'dc_api_batch_size': 450, - 'dc_api_root': 'https://autopush.api.datacommons.org' }) # Test 
2: use_autopush=False dc_check_existence(['node2'], use_autopush=False, max_items=10) mock_is_defined.assert_called_with(['node2'], { 'dc_api_batch_size': 10, - 'dc_api_root': 'https://api.datacommons.org' }) @mock.patch('datacommons_wrappers.request_post_json') diff --git a/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/environment_sdg_metadata.csv b/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/environment_sdg_metadata.csv index 782aa4c330..f11ac0b3c4 100644 --- a/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/environment_sdg_metadata.csv +++ b/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/environment_sdg_metadata.csv @@ -2,4 +2,3 @@ parameter,value header_rows,3 output_columns,"observationAbout,observationDate,variableMeasured,value,unit,observationPeriod" mapped_rows,3 -dc_api_root,https://api.datacommons.org diff --git a/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/infrastructure_metadata.csv b/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/infrastructure_metadata.csv index 475c900919..f5c45d8c4e 100644 --- a/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/infrastructure_metadata.csv +++ b/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/infrastructure_metadata.csv @@ -2,4 +2,3 @@ parameter,value header_rows,5 output_columns,"observationAbout,observationDate,variableMeasured,value,unit,observationPeriod" mapped_rows,5 -dc_api_root,https://api.datacommons.org diff --git a/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/rbi_metadata.csv b/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/rbi_metadata.csv index ad0d50f768..c2042f4fd4 100644 --- a/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/rbi_metadata.csv +++ b/statvar_imports/database_on_indian_economy/india_rbi_state_statistics/rbi_metadata.csv @@ -2,8 +2,3 @@ 
parameter,value output_columns,"observationAbout,observationDate,variableMeasured,value,unit,observationPeriod" header_rows,4 mapped_rows,4 -dc_api_root,https://api.datacommons.org - - - - diff --git a/statvar_imports/denmark_demographics/denmark_demographics_metadata.csv b/statvar_imports/denmark_demographics/denmark_demographics_metadata.csv index 41f8f31e37..95d252a541 100644 --- a/statvar_imports/denmark_demographics/denmark_demographics_metadata.csv +++ b/statvar_imports/denmark_demographics/denmark_demographics_metadata.csv @@ -1,3 +1,2 @@ parameter,value output_columns,"observationDate,value,observationAbout,variableMeasured" -dc_api_root,https://api.datacommons.org diff --git a/statvar_imports/fema/flood_insurance_claims/us_flood_nfip_config.py b/statvar_imports/fema/flood_insurance_claims/us_flood_nfip_config.py index 90e53db883..082ce8b7e3 100644 --- a/statvar_imports/fema/flood_insurance_claims/us_flood_nfip_config.py +++ b/statvar_imports/fema/flood_insurance_claims/us_flood_nfip_config.py @@ -68,7 +68,4 @@ 5, 'dc_api_use_cache': True, - #'dc_api_root': 'http://autopush.api.datacommons.org', - 'dc_api_root': - 'http://api.datacommons.org', } diff --git a/statvar_imports/india_rbistatedomesticproduct/state_domestic_product_metadata.csv b/statvar_imports/india_rbistatedomesticproduct/state_domestic_product_metadata.csv index ee630bdff4..0c90bd2702 100644 --- a/statvar_imports/india_rbistatedomesticproduct/state_domestic_product_metadata.csv +++ b/statvar_imports/india_rbistatedomesticproduct/state_domestic_product_metadata.csv @@ -11,4 +11,3 @@ comments, output_columns,"observationAbout,observationDate,variableMeasured,value,unit,measurementMethod,observationPeriod" #header_rows,6 #mapped_rows,5 -dc_api_root,https://api.datacommons.org diff --git a/statvar_imports/statistics_poland/StatisticsPoland_metadata.csv b/statvar_imports/statistics_poland/StatisticsPoland_metadata.csv index b909a13a08..a3a30ec1c2 100644 --- 
a/statvar_imports/statistics_poland/StatisticsPoland_metadata.csv +++ b/statvar_imports/statistics_poland/StatisticsPoland_metadata.csv @@ -9,6 +9,3 @@ places_within,country/POL #skip_rows,1 header_rows,5 mapped_columns,2 -dc_api_root,https://api.datacommons.org - - diff --git a/statvar_imports/us_urban_school/ap_ib_gt_enrollment/config/common_metadata.csv b/statvar_imports/us_urban_school/ap_ib_gt_enrollment/config/common_metadata.csv index 41a321a836..2c8f80a15c 100644 --- a/statvar_imports/us_urban_school/ap_ib_gt_enrollment/config/common_metadata.csv +++ b/statvar_imports/us_urban_school/ap_ib_gt_enrollment/config/common_metadata.csv @@ -3,4 +3,3 @@ mapped_rows,1 output_columns,"observationDate,observationAbout,variableMeasured,value" #input_rows,10 mapped_columns,2 -dc_api_root,https://api.datacommons.org diff --git a/statvar_imports/us_urban_school/sat_act_participation/state/config/SATorACT_Participation_metadata.csv b/statvar_imports/us_urban_school/sat_act_participation/state/config/SATorACT_Participation_metadata.csv index 4909fa4a53..75997951a4 100644 --- a/statvar_imports/us_urban_school/sat_act_participation/state/config/SATorACT_Participation_metadata.csv +++ b/statvar_imports/us_urban_school/sat_act_participation/state/config/SATorACT_Participation_metadata.csv @@ -1,3 +1,2 @@ parameter,value output_columns,"observationAbout,observationDate,value,variableMeasured,unit,scalingFactor" -dc_api_root,https://api.datacommons.org diff --git a/tools/statvar_importer/config_flags.py b/tools/statvar_importer/config_flags.py index d5214a3510..94a162f33c 100644 --- a/tools/statvar_importer/config_flags.py +++ b/tools/statvar_importer/config_flags.py @@ -370,7 +370,7 @@ def get_default_config() -> dict: True, # Settings for DC API. 
'dc_api_root': - 'http://api.datacommons.org', + os.environ.get('DC_API_ROOT', 'http://api.datacommons.org'), 'dc_api_use_cache': False, 'dc_api_batch_size': diff --git a/util/dc_api_wrapper.py b/util/dc_api_wrapper.py index 98f7209d0e..682d4aeeaf 100644 --- a/util/dc_api_wrapper.py +++ b/util/dc_api_wrapper.py @@ -520,7 +520,8 @@ def dc_api_resolve_latlng(lat_lngs: list, dictionary containing the resolved place information. """ config = _validate_v2_config(config) - api_root = config.get('dc_api_root', _DEFAULT_API_ROOT) + api_root = config.get('dc_api_root', + os.environ.get('DC_API_ROOT', _DEFAULT_API_ROOT)) v1_data = {} v1_data['coordinates'] = lat_lngs num_ids = len(lat_lngs) From dfad7506200cbf30ee076a9e258e975460c965a2 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Wed, 25 Mar 2026 23:12:48 +0530 Subject: [PATCH 04/12] lint fix --- scripts/earthengine/utils.py | 12 +-- scripts/earthengine/utils_test.py | 10 +- .../datacommons_wrappers.py | 3 +- .../datacommons_wrappers_test.py | 7 +- tools/statvar_importer/config_flags.py | 2 +- util/dc_api_wrapper.py | 100 +++++++++++------- 6 files changed, 77 insertions(+), 57 deletions(-) diff --git a/scripts/earthengine/utils.py b/scripts/earthengine/utils.py index 7e59202eaa..aeb94045ac 100644 --- a/scripts/earthengine/utils.py +++ b/scripts/earthengine/utils.py @@ -19,11 +19,11 @@ from datetime import datetime import glob import os +from pathlib import Path import pickle import re import sys import tempfile -from pathlib import Path from typing import Union from absl import logging @@ -305,8 +305,8 @@ def grid_get_neighbor_ids(grid_id: str) -> list: if lat_offset != 0 or lng_offset != 0: neighbour_lat = lat + lat_offset * deg neighbour_lng = lng + lng_offset * deg - if abs(neighbour_lat) < _MAX_LATITUDE and abs( - neighbour_lng) < _MAX_LONGITUDE: + if (abs(neighbour_lat) < _MAX_LATITUDE and + abs(neighbour_lng) < _MAX_LONGITUDE): neighbours.append( grid_id_from_lat_lng( deg, @@ -433,7 +433,7 @@ def 
add_namespace(dcid: str, prefix: str = 'dcid:') -> str: def str_get_numeric_value( - value: Union[str, list, int, float]) -> Union[int, float, None]: + value: Union[str, list, int, float],) -> Union[int, float, None]: """Returns the numeric value from input string or None.""" if isinstance(value, list): value = value[0] @@ -528,7 +528,7 @@ def date_advance_by_period(date_str: str, if not date_str: return '' dt = datetime.strptime(date_str, date_format) - (delta, unit) = date_parse_time_period(time_period) + delta, unit = date_parse_time_period(time_period) if not delta or not unit: logging.error( f'Unable to parse time period: {time_period} for date: {date_str}') @@ -545,7 +545,7 @@ def date_format_by_time_period(date_str: str, time_period: str) -> str: """ if not time_period: return date_str - (delta, unit) = date_parse_time_period(time_period) + delta, unit = date_parse_time_period(time_period) date_parts = date_str.split('-') if unit == 'years': return date_parts[0] diff --git a/scripts/earthengine/utils_test.py b/scripts/earthengine/utils_test.py index f0f8e567cd..e93be53fea 100644 --- a/scripts/earthengine/utils_test.py +++ b/scripts/earthengine/utils_test.py @@ -390,8 +390,8 @@ def test_place_id_to_lat_lng_dc_api(self): lat, lng = utils.place_id_to_lat_lng(placeid, dc_api_lookup=True) self.assertAlmostEqual(37.221614, lat) self.assertAlmostEqual(-121.68954, lng) - mock_get.assert_called_once_with( - [placeid], ['latitude', 'longitude'], { - 'dc_api_version': 'V2', - 'dc_api_use_cache': True, - }) + mock_get.assert_called_once_with([placeid], + ['latitude', 'longitude'], { + 'dc_api_version': 'V2', + 'dc_api_use_cache': True, + }) diff --git a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py index 70e0936ee5..eb0e487f2a 100644 --- 
a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py +++ b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers.py @@ -62,8 +62,7 @@ def dc_check_existence(dcid_list: list, Dict object with dcids as key values and boolean values signifying existence as values. """ wrapper_config = { - 'dc_api_batch_size': - max_items, + 'dc_api_batch_size': max_items, } return dc_api_is_defined_dcid(dcid_list, wrapper_config) diff --git a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py index e0374c7010..e605afb5a9 100644 --- a/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py +++ b/scripts/us_census/acs5yr/subject_tables/common/datacommons_api_wrappers/datacommons_wrappers_test.py @@ -34,10 +34,9 @@ def test_dc_check_existence_mock(self, mock_is_defined): # Test 1: Default (use_autopush=True by default in function signature) mock_is_defined.return_value = {'node1': True} dc_check_existence(['node1']) - mock_is_defined.assert_called_with( - ['node1'], { - 'dc_api_batch_size': 450, - }) + mock_is_defined.assert_called_with(['node1'], { + 'dc_api_batch_size': 450, + }) # Test 2: use_autopush=False dc_check_existence(['node2'], use_autopush=False, max_items=10) diff --git a/tools/statvar_importer/config_flags.py b/tools/statvar_importer/config_flags.py index 94a162f33c..a7449ca032 100644 --- a/tools/statvar_importer/config_flags.py +++ b/tools/statvar_importer/config_flags.py @@ -370,7 +370,7 @@ def get_default_config() -> dict: True, # Settings for DC API. 
'dc_api_root': - os.environ.get('DC_API_ROOT', 'http://api.datacommons.org'), + os.environ.get('DC_API_ROOT', 'https://api.datacommons.org'), 'dc_api_use_cache': False, 'dc_api_batch_size': diff --git a/util/dc_api_wrapper.py b/util/dc_api_wrapper.py index 682d4aeeaf..1265452f8c 100644 --- a/util/dc_api_wrapper.py +++ b/util/dc_api_wrapper.py @@ -21,16 +21,21 @@ import os import sys -import urllib -import requests from typing import Union +import urllib from absl import logging from datacommons_client.client import DataCommonsClient from datacommons_client.utils.error_handling import APIError, DCConnectionError, DCStatusError +import requests import requests_cache -from tenacity import (RetryCallState, Retrying, retry_if_exception, - stop_after_attempt, wait_fixed) +from tenacity import ( + RetryCallState, + Retrying, + retry_if_exception, + stop_after_attempt, + wait_fixed, +) _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(_SCRIPT_DIR) @@ -64,8 +69,14 @@ def _get_exception_status_code(exception): def _should_retry_exception(exception: Exception) -> bool: - if isinstance(exception, (DCConnectionError, requests.exceptions.Timeout, - requests.exceptions.ChunkedEncodingError)): + if isinstance( + exception, + ( + DCConnectionError, + requests.exceptions.Timeout, + requests.exceptions.ChunkedEncodingError, + ), + ): return True if isinstance(exception, (urllib.error.HTTPError, DCStatusError, APIError)): status_code = _get_exception_status_code(exception) @@ -105,6 +116,7 @@ def dc_api_wrapper( retries: Maximum number of attempts (including the first attempt). retry_sec: Interval in seconds between retries for which caller is blocked. use_cache: If True, uses request cache for faster response. + Returns: The response from the DataCommons API call. 
""" @@ -147,8 +159,9 @@ def dc_api_wrapper( logging.error(f'Got exception for api: {function}, {e}') return None except Exception as e: - e.add_note(f'DC API call failed for {function} with max attempts ' - f'{max_attempts}.') + e.add_note( + f'DC API call failed for {function} with max attempts {max_attempts}.' + ) raise @@ -252,11 +265,13 @@ def get_dc_api_key(config: dict = None) -> str: api_key = config.get('dc_api_key', os.environ.get('DC_API_KEY')) if not api_key: logging.log_first_n( - logging.WARNING, f'Using default DC API key with limited quota. ' - 'Please set an API key in the environment variable: DC_API_KEY.' - 'Refer https://docs.datacommons.org/api/python/v2/#authentication ' - 'for more details.', - n=1) + logging.WARNING, + f'Using default DC API key with limited quota. ' + f'Please set an API key in the environment variable: DC_API_KEY.' + f'Refer https://docs.datacommons.org/api/python/v2/#authentication ' + f'for more details.', + n=1, + ) api_key = _DEFAULT_DC_API_KEY return api_key @@ -286,6 +301,7 @@ def get_datacommons_client(config: dict = None) -> DataCommonsClient: def dc_api_is_defined_dcid(dcids: list, config: dict = {}) -> dict: """Returns a dictionary with dcids mapped to True/False based on whether + the dcid is defined in the API and has a 'typeOf' property. Uses the property_value() DC API to lookup 'typeOf' for each dcid. dcids not defined in KG get a value of False. @@ -301,11 +317,13 @@ def dc_api_is_defined_dcid(dcids: list, config: dict = {}) -> dict: # Set parameters for node API. 
client = get_datacommons_client(config) api_function = client.node.fetch_property_values - api_result = dc_api_batched_wrapper(function=api_function, - dcids=dcids, - args={'properties': 'typeOf'}, - dcid_arg_kw='node_dcids', - config=config) + api_result = dc_api_batched_wrapper( + function=api_function, + dcids=dcids, + args={'properties': 'typeOf'}, + dcid_arg_kw='node_dcids', + config=config, + ) response = {} for dcid in dcids: dcid_stripped = _strip_namespace(dcid) @@ -348,11 +366,13 @@ def _dc_api_get_node_property_v2(dcids: list, api_function = client.node.fetch_property_values args = {'properties': prop} dcid_arg_kw = 'node_dcids' - api_result = dc_api_batched_wrapper(function=api_function, - dcids=dcids, - args=args, - dcid_arg_kw=dcid_arg_kw, - config=config) + api_result = dc_api_batched_wrapper( + function=api_function, + dcids=dcids, + args=args, + dcid_arg_kw=dcid_arg_kw, + config=config, + ) response = {} for dcid in dcids: dcid_stripped = _strip_namespace(dcid) @@ -398,11 +418,13 @@ def dc_api_get_node_property_values(dcids: list, config: dict = {}) -> dict: api_function = client.node.fetch args = {'expression': '->*'} dcid_arg_kw = 'node_dcids' - api_result = dc_api_batched_wrapper(function=api_function, - dcids=dcids, - args=args, - dcid_arg_kw=dcid_arg_kw, - config=config) + api_result = dc_api_batched_wrapper( + function=api_function, + dcids=dcids, + args=args, + dcid_arg_kw=dcid_arg_kw, + config=config, + ) response = {} for dcid, arcs in api_result.items(): pvs = {} @@ -446,11 +468,13 @@ def dc_api_resolve_placeid(dcids: list, api_function = client.resolve.fetch args = {'expression': f'<-{in_prop}->dcid'} dcid_arg_kw = 'node_ids' - api_result = dc_api_batched_wrapper(function=api_function, - dcids=dcids, - args=args, - dcid_arg_kw=dcid_arg_kw, - config=config) + api_result = dc_api_batched_wrapper( + function=api_function, + dcids=dcids, + args=args, + dcid_arg_kw=dcid_arg_kw, + config=config, + ) results = {} if api_result: for node in 
api_result.get('entities', []): @@ -478,7 +502,7 @@ def dc_api_resolve_latlng(lat_lngs: list, } if return_v1_response is True, a v1 response of this form is returned: - + { "placeCoordinates": [ { @@ -552,8 +576,7 @@ def dc_api_resolve_latlng(lat_lngs: list, def _convert_v2_to_v1_coordinate_response(v2_response: dict) -> dict: - """Converts a v2 coordinate resolution response to a v1 response. - """ + """Converts a v2 coordinate resolution response to a v1 response.""" v1_response = {'placeCoordinates': []} for entity in v2_response.get('entities', []): node = entity.get('node', '') @@ -573,15 +596,14 @@ def _convert_v2_to_v1_coordinate_response(v2_response: dict) -> dict: candidate.get('dcid') for candidate in entity.get('candidates', []) ], - 'places': entity.get('candidates', []) + 'places': entity.get('candidates', []), } v1_response['placeCoordinates'].append(place_coordinate) return v1_response def _convert_v1_to_v2_coordinate_request(v1_request: dict) -> dict: - """Converts a v1 coordinate resolution request to a v2 request. 
- """ + """Converts a v1 coordinate resolution request to a v2 request.""" v2_request = {'nodes': [], 'property': '<-geoCoordinate->dcid'} for coordinate in v1_request.get('coordinates', []): lat = coordinate.get('latitude') From 9951b781977f14554b7a3f144974ed038e74d858 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Fri, 12 Jun 2026 19:21:12 +0530 Subject: [PATCH 05/12] Add script for UN20 imports --- .../schema/statvar_dcid_gen.py | 197 ++++++++++++++++++ .../schema/statvar_dcid_gen_test.py | 71 +++++++ tools/statvar_importer/stat_var_processor.py | 10 +- 3 files changed, 276 insertions(+), 2 deletions(-) create mode 100644 tools/statvar_importer/schema/statvar_dcid_gen.py create mode 100644 tools/statvar_importer/schema/statvar_dcid_gen_test.py diff --git a/tools/statvar_importer/schema/statvar_dcid_gen.py b/tools/statvar_importer/schema/statvar_dcid_gen.py new file mode 100644 index 0000000000..52c345aac2 --- /dev/null +++ b/tools/statvar_importer/schema/statvar_dcid_gen.py @@ -0,0 +1,197 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Utilities to generate statvar dcid.""" + +import os +import re +import sys + +from absl import app +from absl import flags +from absl import logging + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +_DATA_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append(os.path.join(_DATA_DIR, 'util')) + +from counters import Counters +from dc_api_wrapper import dc_api_get_node_property_values +from mcf_file_util import strip_namespace, add_namespace, add_mcf_node, is_leaf_object + + +def camel_to_snake(text: str, delim: str = '_') -> str: + """Convert a string from camelCase to snake_case. + + Args: + text: The camelCase string to convert. + delim: Delimiter to use between words (default is '_'). + + Returns: + The converted snake_case string in lowercase. + """ + s1 = re.sub(r'([a-z0-9])([A-Z])', r'\1' + delim + r'\2', text) + s2 = re.sub(r'([a-zA-Z])([0-9])', r'\1' + delim + r'\2', s1) + s3 = re.sub(r'([A-Z])([A-Z][a-z])', r'\1' + delim + r'\2', s2) + return s3.lower() + + +def get_dcid_name(dcid: str, schema_nodes: dict) -> str: + """Returns the name for a DCID if it exists in the schema. + + Args: + dcid: The DCID string to look up. + schema_nodes: Dictionary of schema nodes containing properties. + + Returns: + The name of the DCID if found, or stripped DCID if no name property is + defined. Returns None if the DCID is not in the schema. + """ + node = schema_nodes.get(strip_namespace(dcid)) + if not node: + node = schema_nodes.get(add_namespace(dcid)) + if not node: + return None + name = node.get('name') + if not name: + name = strip_namespace(dcid) + return name.strip('"').strip() + + +def get_dcid_token(word: str, + upper_case: bool = False, + remove_prefix: str = '') -> str: + """Returns the word normalized into a token suitable for a DCID. 
+ + Args: + word: The raw string to normalize. + upper_case: If True, converts camelCase to uppercase snake_case. + remove_prefix: Optional prefix string to remove from the token. + + Returns: + A normalized DCID token string. + """ + # Convert any non alphanumeric characters to '_' + token = re.sub(r'[^A-Za-z0-9_.-]+', '_', word.strip()) + token = re.sub(r'_+', '_', token).strip('_') + + if upper_case: + # Convert camelCase to snake case + token = camel_to_snake(token).upper() + if remove_prefix: + token = token.removeprefix(remove_prefix) + return token[0].upper() + token[1:] + + +def generate_dcid_for_statvar(pvs: dict, + config: dict, + schema_nodes: dict = None, + counters: Counters = None) -> str: + """Returns the generated statistical variable DCID using the configuration. + + Args: + pvs: Dictionary of property-value mappings representing the StatVar. + config: Configuration dictionary defining DCID generation parameters. + schema_nodes: Optional dictionary of loaded schema nodes. + counters: Optional Counters object to track statistics. + + Returns: + A generated DCID string for the StatVar. 
+ """ + + if schema_nodes is None: + schema_nodes = dict() + + # Get the order of properties for dcid with ignored values + dcid_props = config.get('statvar_dcid_fixed_properties', [ + 'statType<>measuredValue', + 'measurementQualifier', + 'measuredProperty', + 'populationType', + ]) + fixed_props = dict() + for prop in dcid_props: + val = '' + if '<>' in prop: + prop, val = prop.split('<>', 1) + fixed_props.setdefault(prop, set()).add(val) + + use_value_names = config.get('statvar_dcid_value_name', False) + dcid_prefix = config.get('statvar_dcid_prefix', '') + ignore_props = config.get('statvar_dcid_ignore_properties', [ + 'description', 'name', 'nameWithLanguage', 'descriptionUrl', + 'alternateName', 'footnote', 'unCode', 'Node', 'typeOf' + ]) + prop_delim = config.get('statvar_dcid_delimiter', '_') + val_delim = config.get('statvar_dcid_value_delimiter', '') + upper_case = config.get('statvar_dcid_upper_case', False) + remove_prefix = config.get('statvar_dcid_remove_prefix', '') + + add_prop = False + if val_delim: + add_prop = True + + # Lookup names for values. 
+ lookup_dcids = set() + dcid_pvs = dict() + for prop, value in pvs.items(): + if prop not in ignore_props: + dcid_pvs[prop] = value + if use_value_names and not get_dcid_name(prop, schema_nodes): + lookup_dcids.add(prop) + if use_value_names and not is_leaf_object( + value) and not get_dcid_name(value, schema_nodes): + lookup_dcids.add(value) + + if lookup_dcids: + if counters: + counters.add_counter('dc_api_lookup_name', len(lookup_dcids)) + node_names = dc_api_get_node_property_values(list(lookup_dcids)) + for pvs in node_names.values(): + add_mcf_node(pvs, schema_nodes) + + ordered_props = [] + # Add properties from template followed by constraint props + for prop, val in fixed_props.items(): + prop_val = dcid_pvs.get(prop) + if prop_val: + if val and prop_val in val: + dcid_pvs.pop(prop) + else: + ordered_props.append(prop) + for prop in sorted(dcid_pvs.keys()): + if prop not in ordered_props: + ordered_props.append(prop) + + # Get ordered list of dcid tokens + dcid_tokens = [] + for prop in ordered_props: + prop_value = dcid_pvs.pop(prop, None) + if prop_value: + value_name = prop_value + if use_value_names: + value_name = get_dcid_name(prop_value, schema_nodes) + value_name = get_dcid_token(value_name, upper_case, remove_prefix) + if val_delim and prop not in fixed_props: + prop_name = get_dcid_token(prop, upper_case, remove_prefix) + value_name = prop_name + val_delim + value_name + if upper_case: + value_name = value_name.upper() + dcid_tokens.append(value_name) + dcid = prop_delim.join(dcid_tokens) + if dcid_prefix: + dcid = dcid_prefix + dcid + return dcid diff --git a/tools/statvar_importer/schema/statvar_dcid_gen_test.py b/tools/statvar_importer/schema/statvar_dcid_gen_test.py new file mode 100644 index 0000000000..5a1737d6e6 --- /dev/null +++ b/tools/statvar_importer/schema/statvar_dcid_gen_test.py @@ -0,0 +1,71 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from statvar_dcid_gen import camel_to_snake +from statvar_dcid_gen import generate_dcid_for_statvar +from statvar_dcid_gen import get_dcid_name +from statvar_dcid_gen import get_dcid_token + + +class TestStatvarDcidGen(unittest.TestCase): + + def test_camel_to_snake(self): + self.assertEqual(camel_to_snake('camelCase'), 'camel_case') + self.assertEqual(camel_to_snake('CamelCase'), 'camel_case') + self.assertEqual(camel_to_snake('CaseACRONYM'), 'case_acronym') + self.assertEqual(camel_to_snake('CaseAbc123'), 'case_abc_123') + self.assertEqual(camel_to_snake('simple'), 'simple') + + def test_get_dcid_token(self): + self.assertEqual(get_dcid_token('Hello World!'), 'Hello_World') + self.assertEqual( + get_dcid_token('helloWorld', upper_case=True), 'HELLO_WORLD') + self.assertEqual( + get_dcid_token('prefixWorld', remove_prefix='prefix'), 'World') + + def test_get_dcid_name(self): + schema_nodes = { + 'Person': { + 'name': '"Human"' + }, + 'dcid:Count': { + 'name': 'TotalCount' + }, + } + self.assertEqual(get_dcid_name('Person', schema_nodes), 'Human') + self.assertEqual(get_dcid_name('dcid:Person', schema_nodes), 'Human') + self.assertEqual(get_dcid_name('Count', schema_nodes), 'TotalCount') + self.assertEqual(get_dcid_name('Unknown', schema_nodes), None) + + def test_generate_dcid(self): + pvs = { + 'statType': 'measuredValue', + 'measuredProperty': 'count', + 'populationType': 'Person', + } + dcid = generate_dcid_for_statvar(pvs, {}) + self.assertEqual(dcid, 'Count_Person') + + pvs2 = { + 'statType': 'index', 
+ 'measuredProperty': 'count', + 'populationType': 'Person', + } + dcid2 = generate_dcid_for_statvar(pvs2, {}) + self.assertEqual(dcid2, 'Index_Count_Person') + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/statvar_importer/stat_var_processor.py b/tools/statvar_importer/stat_var_processor.py index 487cce556f..cb4553cdcb 100644 --- a/tools/statvar_importer/stat_var_processor.py +++ b/tools/statvar_importer/stat_var_processor.py @@ -89,6 +89,7 @@ from schema_generator import generate_schema_nodes, generate_statvar_name from schema_checker import sanity_check_nodes from schema_reconciler import SchemaReconciler +from statvar_dcid_gen import generate_dcid_for_statvar # imports from ../../util from config_map import ConfigMap, read_py_dict_from_file @@ -375,14 +376,19 @@ def generate_statvar_dcid(self, pvs: dict) -> str: 'statvar_dcid_ignore_properties', [ 'description', 'name', 'nameWithLanguage', 'descriptionUrl', - 'alternateName' + 'alternateName', 'footnote', 'typeOf', 'Node' ], ) if not self._config.get( 'schemaless', False) or not self._get_schemaless_statvar_props(pvs): try: - dcid = get_statvar_dcid(pvs, ignore_props=dcid_ignore_props) + if self._config.get('statvar_dcid_fixed_properties'): + # Use the custom statvar dcid generator + dcid = generate_dcid_for_statvar(pvs, self._config, + self._counters) + else: + dcid = get_statvar_dcid(pvs, ignore_props=dcid_ignore_props) dcid = re.sub(r'[^A-Za-z_0-9/_\.-]+', '_', dcid) except TypeError as e: logging.log_every_n( From 4e305159f6a4a3bfc9e8f02cd53ff885dfee620f Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Fri, 12 Jun 2026 19:21:55 +0530 Subject: [PATCH 06/12] Add script for generating codelist mapping --- scripts/un/codes/generate_codelist_map.py | 174 ++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 scripts/un/codes/generate_codelist_map.py diff --git a/scripts/un/codes/generate_codelist_map.py b/scripts/un/codes/generate_codelist_map.py new file mode 100644 
index 0000000000..f7d13597e0 --- /dev/null +++ b/scripts/un/codes/generate_codelist_map.py @@ -0,0 +1,174 @@ +"""Script to geenrate codelist mapings for a specific codelist file.""" + +import os +import re +import sys + +from absl import app +from absl import flags +from absl import logging + +import file_util +import mcf_file_util +import eval_functions + +from counters import Counters + +flags.DEFINE_string('input_codelist', '', 'CSV file with codelist.') +flags.DEFINE_string('output_pvmap', '', 'Output pvmap csv.') +flags.DEFINE_string('namespace', 'un', 'Namespace prefix for agency') +flags.DEFINE_integer('logging_level', logging.INFO, 'Logging level.') +flags.DEFINE_string('pvmap_template', '', 'Python file with pvmap template.') + +_FLAGS = flags.FLAGS + +_DEFAULT_CODE_PROPS = [ + 'CONCEPT', + 'CODE', + 'NAME_EN', + 'PARENT', + 'SORT_ORDER', + 'NAME_FR', + 'NAME_ES', + 'DESCRIPTION', +] + +# Map from a code to a pvmap +_DEFAULT_CODE_PVMAP = { + 'key': '{CONCEPT}:{CODE}', + 'UnConceptProp': 'Property', + 'UnConcept': '"{CONCEPT}"', + 'UnCodeProp': 'UnCode', + 'UnCode': '"{CODE}"', + 'ConstraintProp': 'to_property(CONCEPT)', + 'ConstraintPropValue': 'to_dcid(NAMESPACE+"_"+CODE)', + 'ConstraintPropType': 'TypeOf', + 'ConstraintPropEnum': 'str(ConstraintProp[0].upper() + ConstraintProp[1:]+"Enum")', + 'NameProp': '{CONCEPT}_name', + 'ConstraintValueName': '"{NAME_EN}"', + 'DescriptionProp': '{CONCEPT}_description', + 'ConstraintValueDescription': '{DESCRIPTION}', + 'End': 'End', + 'Dummy': '.', +} + + +def to_property(concept: str) -> str: + """Returns a property for the concept.""" + c = eval_functions.str_to_camel_case(concept.lower().replace('_', ' ')) + return c[0].lower() + c[1:] + + +def to_dcid(code: str) -> str: + """Replace any non alphanumeric characters with '_'""" + value = re.sub(r'[^A-Za-z0-9\.]+', '_', code) + return value[0].upper() + value[1:] + + +def clean_value_str(val: str, + regex: str = 'r[^A-Za-z0-9()[]".-]+', + replace: str = '_') -> str: 
+ """Cleanup value string to remove redundant characters.""" + val = val.strip() + if val[0] == '"' and val[-1] == '"': + val = '"' + val[1:-1].strip() + '"' + val = re.sub(regex, replace, val) + return val + + +_EVAL_FUNCTIONS = dict(eval_functions.EVAL_GLOBALS) +_EVAL_FUNCTIONS.update({ + 'to_property': to_property, + 'to_dcid': to_dcid, + 'clean_value_str': clean_value_str, +}) + +def get_value(tpl_val: str, input_pvs: dict) -> str: + """Retuns a value with the pvs applied.""" + value = tpl_val + if '{' in tpl_val: + # Format string + try: + value = tpl_val.format(**input_pvs) + except Exception as e: + logging.error( + f'Failed to format "{tpl_val}" using dict: {input_pvs}, error:{e}' + ) + value = '' + elif '(' in tpl_val: + # Evaluate a function + try: + prop, value = eval_functions.evaluate_statement( + tpl_val, input_pvs, _EVAL_FUNCTIONS) + except Exception as e: + lgging.error( + f'Failed to evaluate "{tpl_val}" using dict: {pvs}, error:{e}' + ) + value = '' + if value: + # Cleanup value + value = clean_value_str(value) + return value + + +def generate_code_map(code_pvs: dict, + namespace: str = 'un', + template: dict = _DEFAULT_CODE_PVMAP) -> dict: + """Returns a pvmap pvs for a single code. + A code has keys listed in _DEFAULT_CODE_PROPS + It returns a dictionary with the keys in template. 
+ """ + output_pvs = dict() + input_pvs = dict(code_pvs) + input_pvs.setdefault('namespace', namespace.lower()) + input_pvs.setdefault('NAMESPACE', namespace.upper()) + for tpl_prop, tpl_val in template.items(): + tpl_prop = get_value(tpl_prop, input_pvs) + value = get_value(tpl_val, input_pvs) + output_pvs[tpl_prop] = value + input_pvs[tpl_prop] = value + logging.log(2, f'Mapped {tpl_prop} using {tpl_val} to {value}') + return output_pvs + + +def generate_codelist_pvmap(cl_file: str, + output: str, + namespace: str = 'un') -> dict: + """Generate a pvmap file for a codelist.""" + counters = Counters() + + input_codes = file_util.file_load_csv_dict(cl_file, key_index=True) + logging.info(f'Loaded {len(input_codes)} from codelist: {cl_file}') + counters.add_counter('input-codes', len(input_codes)) + + output_pvs = {} + for index, code_pvs in input_codes.items(): + pvs = generate_code_map(code_pvs, namespace) + output_pvs[index] = pvs + logging.debug(f'Mapped {code_pvs} to {pvs}') + + # Write to output file + if output: + file_util.file_write_csv_dict(output_pvs, output) + + # Get unique counts across output columns + unique_counts = dict() + for index, pvs in output_pvs.items(): + for prop, val in pvs.items(): + if val: + unique_counts.setdefault(prop, set()).add(val) + for prop, vals in unique_counts.items(): + counters.add_counter(f'output-unique-{prop}', len(vals)) + + counters.add_counter('output-rows', len(output_pvs)) + counters.print_counters() + + +def main(_): + logging.set_verbosity(_FLAGS.logging_level) + generate_codelist_pvmap(_FLAGS.input_codelist, _FLAGS.output_pvmap, + _FLAGS.namespace) + + +if __name__ == '__main__': + app.run(main) From 3cae0f3e93ce8b3efe150029f6ac16e3ab2c939e Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Sat, 13 Jun 2026 20:04:40 +0530 Subject: [PATCH 07/12] add dcid generator to flags --- tools/statvar_importer/config_flags.py | 15 +++++++ .../schema/statvar_dcid_gen_test.py | 39 +++++++++++++++++-- 2 files changed, 50 
insertions(+), 4 deletions(-) diff --git a/tools/statvar_importer/config_flags.py b/tools/statvar_importer/config_flags.py index a7449ca032..5b8f05e3cb 100644 --- a/tools/statvar_importer/config_flags.py +++ b/tools/statvar_importer/config_flags.py @@ -433,6 +433,21 @@ def get_default_config() -> dict: _FLAGS.generate_statvar_name, # Generate names for StatVars 'llm_generate_statvar_name': _FLAGS.llm_generate_statvar_name, + + # Settings for statvar dcid generator + 'statvar_dcid_fixed_properties': [], + 'statvar_dcid_prefix': + '', + 'statvar_dcid_remove_prefix': + '', + 'statvar_dcid_delimiter': + '', + 'statvar_dcid_value_delimiter': + '', + 'statvar_dcid_upper_case': + False, + 'statvar_dcid_remove_prefix': + '', } diff --git a/tools/statvar_importer/schema/statvar_dcid_gen_test.py b/tools/statvar_importer/schema/statvar_dcid_gen_test.py index 5a1737d6e6..0538e1456e 100644 --- a/tools/statvar_importer/schema/statvar_dcid_gen_test.py +++ b/tools/statvar_importer/schema/statvar_dcid_gen_test.py @@ -30,10 +30,10 @@ def test_camel_to_snake(self): def test_get_dcid_token(self): self.assertEqual(get_dcid_token('Hello World!'), 'Hello_World') - self.assertEqual( - get_dcid_token('helloWorld', upper_case=True), 'HELLO_WORLD') - self.assertEqual( - get_dcid_token('prefixWorld', remove_prefix='prefix'), 'World') + self.assertEqual(get_dcid_token('helloWorld', upper_case=True), + 'HELLO_WORLD') + self.assertEqual(get_dcid_token('prefixWorld', remove_prefix='prefix'), + 'World') def test_get_dcid_name(self): schema_nodes = { @@ -66,6 +66,37 @@ def test_generate_dcid(self): dcid2 = generate_dcid_for_statvar(pvs2, {}) self.assertEqual(dcid2, 'Index_Count_Person') + def test_generate_dcid_with_property(self): + config = { + 'statvar_dcid_fixed_properties': [ + 'statType<>measuredValue', 'measuredProperty<>value', + 'populationType' + ], + 'statvar_dcid_delimiter': '__', + 'statvar_dcid_value_delimiter': '--', + 'statvar_dcid_remove_prefix': 'TEST_', + 
'statvar_dcid_upper_case': True, + 'statvar_dcid_prefix': 'test/', + } + pvs = { + 'statType': 'measuredValue', + 'measuredProperty': 'count', + 'populationType': 'Person', + } + dcid = generate_dcid_for_statvar(pvs, config) + self.assertEqual(dcid, 'test/COUNT__PERSON') + + pvs2 = { + 'statType': 'medianValue', + 'measuredProperty': 'age', + 'populationType': 'Person', + 'gender': 'Male', + 'place': 'TEST_Urban', + } + dcid2 = generate_dcid_for_statvar(pvs2, config) + self.assertEqual( + dcid2, 'test/MEDIAN_VALUE__AGE__PERSON__GENDER--MALE__PLACE--URBAN') + if __name__ == '__main__': unittest.main() From 258bbdedd2c0086c9fe0b748a44ea9403a557755 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Tue, 16 Jun 2026 19:15:53 +0530 Subject: [PATCH 08/12] script to generate statvar names --- scripts/un/codes/codelist_pvmap_template.py | 23 +++ scripts/un/codes/codelist_schema.tmcf | 17 ++ scripts/un/codes/dsd_property_pvmap.py | 22 +++ .../un/codes/dsd_property_pvmap_template.py | 39 ++++ scripts/un/codes/dsd_property_schema.tmcf | 15 ++ scripts/un/codes/generate_codelist_map.py | 138 +++++++++----- scripts/un/codes/generate_statvar_name.py | 169 ++++++++++++++++++ tools/statvar_importer/config_flags.py | 5 +- .../schema/statvar_dcid_gen.py | 23 ++- .../schema/statvar_dcid_gen_test.py | 16 +- 10 files changed, 409 insertions(+), 58 deletions(-) create mode 100644 scripts/un/codes/codelist_pvmap_template.py create mode 100644 scripts/un/codes/codelist_schema.tmcf create mode 100644 scripts/un/codes/dsd_property_pvmap.py create mode 100644 scripts/un/codes/dsd_property_pvmap_template.py create mode 100644 scripts/un/codes/dsd_property_schema.tmcf create mode 100644 scripts/un/codes/generate_statvar_name.py diff --git a/scripts/un/codes/codelist_pvmap_template.py b/scripts/un/codes/codelist_pvmap_template.py new file mode 100644 index 0000000000..3a9b0f4196 --- /dev/null +++ b/scripts/un/codes/codelist_pvmap_template.py @@ -0,0 +1,23 @@ +# Template for converting UN 
codelist files to PVMap for statvar processor. +# The pvmap will have columns to generate statvar constraint propoerty:values +# and names. +# The pvmap will also have columns to generate schema MCF with a tMCF. +{ + 'key': '{CONCEPT}:{CODE}', + 'UnConceptProp': 'Property', + 'UnConcept': '"{CONCEPT}"', + 'UnCodeProp': 'UnCode', + 'UnCode': '"{CODE}"', + 'ConstraintProp': '{PROPERTY}', + 'ConstraintPropValue': 'to_dcid(NAMESPACE+"_"+CONCEPT+"-"+CODE)', + 'ConstraintPropType': 'TypeOf', + 'ConstraintPropEnum': 'str(ConstraintProp[0].upper() + ConstraintProp[1:]+"Enum")', + 'NameProp': 'ValueName_{CONCEPT}', + 'ConstraintValueName': '"{NAME_EN}"', + 'DescriptionProp': 'Desc_{CONCEPT}', + 'ConstraintValueDescription': 'quote(anyascii(DESCRIPTION))', + # Eond of line prop:value when description is empty. + '#End': 'End', + 'Dummy': '.', +} + diff --git a/scripts/un/codes/codelist_schema.tmcf b/scripts/un/codes/codelist_schema.tmcf new file mode 100644 index 0000000000..038dd3a5d9 --- /dev/null +++ b/scripts/un/codes/codelist_schema.tmcf @@ -0,0 +1,17 @@ +Node: E:UN->E0 +typeOf: C:UN->ConstraintPropEnum +dcid: C:UN->ConstraintPropValue +unCode: C:UN->UnCode +name: C:UN->ConstraintValueName +description: C:UN->ConstraintValueDescription + +Node: E:UN->E1 +typeOf: C:UN->UnConceptProp +dcid: C:UN->ConstraintProp +unConcept: C:UN->UnConcept +rangeIncludes: C:UN->ConstraintPropEnum + +Node: E:UN->E2 +typeOf: schema:Class +dcid: C:UN->ConstraintPropEnum +subClassOf: schema:Enumeration diff --git a/scripts/un/codes/dsd_property_pvmap.py b/scripts/un/codes/dsd_property_pvmap.py new file mode 100644 index 0000000000..aa1b9ef914 --- /dev/null +++ b/scripts/un/codes/dsd_property_pvmap.py @@ -0,0 +1,22 @@ +# Template for converting UN DSD file with column metadata +# to PVMap for statvar processor. +# The pvmap will have columns to generate statvar constraint +# propoerties with names. +# The pvmap will also have columns to generate schema MCF with a tMCF. 
+{ + 'key': '{CONCEPT}', + 'UnCodeProp': 'UnConceptCode', + 'UnCode': '"{CONCEPT}"', + 'ConceptProp': 'UnConceptProperty', + 'ConstraintProp': '{PROPERTY}', + 'ConstraintPropType': 'Property', + 'ConstraintPropEnum': 'str(ConstraintProp[0].upper() + ConstraintProp[1:]+"Enum")', + 'ConceptNameProp': 'PropertyName_{CONCEPT}', + 'ConceptName': '"{NAME_EN}"', + 'DescriptionProp': 'PropertyDesc_{CONCEPT}', + 'ConceptDescription': '{DESCRIPTION}', + # Eond of line prop:value when description is empty. + '#End': 'End', + 'Dummy': '.', +} + diff --git a/scripts/un/codes/dsd_property_pvmap_template.py b/scripts/un/codes/dsd_property_pvmap_template.py new file mode 100644 index 0000000000..708beac19f --- /dev/null +++ b/scripts/un/codes/dsd_property_pvmap_template.py @@ -0,0 +1,39 @@ +# Template for converting UN DSD file with column metadata +# to PVMap for statvar processor. +# The pvmap will have columns to generate statvar constraint +# propoerties with names. +# The pvmap will also have columns to generate schema MCF with a tMCF. +{ + 'key': + '{CONCEPT}', + 'UnCodeProp': + 'UnConceptCode', + 'UnCode': + '"{CONCEPT}"', + 'ConceptProp': + 'UnConceptProperty', + 'ConstraintProp': + '{PROPERTY}', + 'ConstraintPropType': + 'Property', + 'ConstraintPropEnum': + 'str(ConstraintProp[0].upper() + ConstraintProp[1:]+"Enum")', + 'ConceptNameProp': + 'PropertyName_{CONCEPT}', + 'ConceptName': + '"{NAME_EN}"', + 'DescriptionProp': + 'ValueDesc_{CONCEPT}', + 'ConceptDescription': + 'quote(anyascii(DESCRIPTION))', + # Initialize ValueName for specific codes for a concept to empty string + 'CodeNameProp': + 'ValueName_{CONCEPT}', + 'DefaultName': + '""', + # End of line prop:value when description is empty. 
+ '#End': + 'End', + 'Dummy': + '.', +} diff --git a/scripts/un/codes/dsd_property_schema.tmcf b/scripts/un/codes/dsd_property_schema.tmcf new file mode 100644 index 0000000000..8491b4b64f --- /dev/null +++ b/scripts/un/codes/dsd_property_schema.tmcf @@ -0,0 +1,15 @@ +Node: E:DSD->E0 +dcid: C:DSD->ConstraintProp +typeOf: C:DSD->ConstraintPropType +name: C:DSD->ConstraintProp +domainIncludes: dcid:UNSeries +alternateName: C:DSD->ConceptName +description: C:DSD->ConceptDescription +rangeIncludes: C:DSD->ConstraintPropEnum + +Node: E:DSD->E1 +dcid: C:DSD->ConstraintPropEnum +typeOf: schema:Class +subClassOf: schema:Enumeration +description: C:DSD->ConceptDescription + diff --git a/scripts/un/codes/generate_codelist_map.py b/scripts/un/codes/generate_codelist_map.py index f7d13597e0..c511807379 100644 --- a/scripts/un/codes/generate_codelist_map.py +++ b/scripts/un/codes/generate_codelist_map.py @@ -1,12 +1,15 @@ -"""Script to geenrate codelist mapings for a specific codelist file.""" +"""Script to generate codelist mapings for a specific codelist file.""" import os import re import sys +import unicodedata from absl import app from absl import flags from absl import logging +from anyascii import anyascii +from pprint import pprint import file_util import mcf_file_util @@ -17,8 +20,8 @@ flags.DEFINE_string('input_codelist', '', 'CSV file with codelist.') flags.DEFINE_string('output_pvmap', '', 'Output pvmap csv.') flags.DEFINE_string('namespace', 'un', 'Namespace prefix for agency') -flags.DEFINE_integer('logging_level', logging.INFO, 'Logging level.') flags.DEFINE_string('pvmap_template', '', 'Python file with pvmap template.') +flags.DEFINE_integer('logging_level', logging.INFO, 'Logging level.') _FLAGS = flags.FLAGS @@ -35,23 +38,49 @@ # Map from a code to a pvmap _DEFAULT_CODE_PVMAP = { - 'key': '{CONCEPT}:{CODE}', - 'UnConceptProp': 'Property', - 'UnConcept': '"{CONCEPT}"', - 'UnCodeProp': 'UnCode', - 'UnCode': '"{CODE}"', - 'ConstraintProp': 
'to_property(CONCEPT)', - 'ConstraintPropValue': 'to_dcid(NAMESPACE+"_"+CODE)', - 'ConstraintPropType': 'TypeOf', - 'ConstraintPropEnum': 'str(ConstraintProp[0].upper() + ConstraintProp[1:]+"Enum")', - 'NameProp': '{CONCEPT}_name', - 'ConstraintValueName': '"{NAME_EN}"', - 'DescriptionProp': '{CONCEPT}_description', - 'ConstraintValueDescription': '{DESCRIPTION}', - 'End': 'End', - 'Dummy': '.', + 'key': + '{CONCEPT}:{CODE}', + 'UnConceptProp': + 'Property', + 'UnConcept': + '"{CONCEPT}"', + 'UnCodeProp': + 'UnCode', + 'UnCode': + '"{CODE}"', + 'ConstraintProp': + '{PROPERTY}', + 'ConstraintPropValue': + 'to_dcid(NAMESPACE+"_"+CONCEPT+"-"+CODE)', + 'ConstraintPropType': + 'TypeOf', + 'ConstraintPropEnum': + 'str(ConstraintProp[0].upper() + ConstraintProp[1:]+"Enum")', + 'NameProp': + 'ValueName_{CONCEPT}', + 'ConstraintValueName': + '"{NAME_EN}"', + 'DescriptionProp': + 'ValueDesc_{CONCEPT}', + 'ConstraintValueDescription': + '{DESCRIPTION}', + 'End': + 'End', + 'Dummy': + '.', +} + +# Mapping from concept to properties. +# If not set it map, the concept is used as the property. 
+_DEFAULT_CONCEPT_PROP_MAP = { + 'SERIES': 'populationType', + 'UNIT_MEASURE': 'unit', } +def quote(value: str) -> str: + """Returns a string in double quotes.""" + value = value.strip().strip('"').strip() + return f'"{value}"' def to_property(concept: str) -> str: """Returns a property for the concept.""" @@ -61,7 +90,7 @@ def to_property(concept: str) -> str: def to_dcid(code: str) -> str: """Replace any non alphanumeric characters with '_'""" - value = re.sub(r'[^A-Za-z0-9\.]+', '_', code) + value = re.sub(r'[^A-Za-z0-9\._:-]+', '_', code) return value[0].upper() + value[1:] @@ -81,34 +110,39 @@ def clean_value_str(val: str, 'to_property': to_property, 'to_dcid': to_dcid, 'clean_value_str': clean_value_str, + 'quote': quote, + + # Additional modules for text manipulations + 'unicodedata': unicodedata, + 'anyascii': anyascii, }) + def get_value(tpl_val: str, input_pvs: dict) -> str: - """Retuns a value with the pvs applied.""" - value = tpl_val - if '{' in tpl_val: - # Format string - try: - value = tpl_val.format(**input_pvs) - except Exception as e: - logging.error( - f'Failed to format "{tpl_val}" using dict: {input_pvs}, error:{e}' - ) - value = '' - elif '(' in tpl_val: - # Evaluate a function - try: - prop, value = eval_functions.evaluate_statement( - tpl_val, input_pvs, _EVAL_FUNCTIONS) - except Exception as e: - lgging.error( - f'Failed to evaluate "{tpl_val}" using dict: {pvs}, error:{e}' - ) - value = '' - if value: - # Cleanup value - value = clean_value_str(value) - return value + """Retuns a value with the pvs applied.""" + value = tpl_val + if '{' in tpl_val: + # Format string + try: + value = tpl_val.format(**input_pvs) + except Exception as e: + logging.error( + f'Failed to format "{tpl_val}" using dict: {input_pvs}, error:{e}' + ) + value = '' + elif '(' in tpl_val: + # Evaluate a function + try: + prop, value = eval_functions.evaluate_statement( + tpl_val, input_pvs, _EVAL_FUNCTIONS) + except Exception as e: + lgging.error( + f'Failed to 
evaluate "{tpl_val}" using dict: {pvs}, error:{e}') + value = '' + if value: + # Cleanup value + value = clean_value_str(value) + return value def generate_code_map(code_pvs: dict, @@ -122,6 +156,11 @@ def generate_code_map(code_pvs: dict, input_pvs = dict(code_pvs) input_pvs.setdefault('namespace', namespace.lower()) input_pvs.setdefault('NAMESPACE', namespace.upper()) + concept = code_pvs.get('CONCEPT') + concept_prop = _DEFAULT_CONCEPT_PROP_MAP.get(concept) + if not concept_prop: + concept_prop = to_property(concept) + input_pvs['PROPERTY'] = concept_prop for tpl_prop, tpl_val in template.items(): tpl_prop = get_value(tpl_prop, input_pvs) value = get_value(tpl_val, input_pvs) @@ -133,7 +172,8 @@ def generate_code_map(code_pvs: dict, def generate_codelist_pvmap(cl_file: str, output: str, - namespace: str = 'un') -> dict: + namespace: str = 'un', + template_file: str = None) -> dict: """Generate a pvmap file for a codelist.""" counters = Counters() @@ -141,9 +181,15 @@ def generate_codelist_pvmap(cl_file: str, logging.info(f'Loaded {len(input_codes)} from codelist: {cl_file}') counters.add_counter('input-codes', len(input_codes)) + pvmap_template = _DEFAULT_CODE_PVMAP + if template_file: + pvmap_template = file_util.file_load_py_dict(template_file) + + logging.info(f'Using template: {pprint(pvmap_template)}') + output_pvs = {} for index, code_pvs in input_codes.items(): - pvs = generate_code_map(code_pvs, namespace) + pvs = generate_code_map(code_pvs, namespace, pvmap_template) output_pvs[index] = pvs logging.debug(f'Mapped {code_pvs} to {pvs}') @@ -167,7 +213,7 @@ def generate_codelist_pvmap(cl_file: str, def main(_): logging.set_verbosity(_FLAGS.logging_level) generate_codelist_pvmap(_FLAGS.input_codelist, _FLAGS.output_pvmap, - _FLAGS.namespace) + _FLAGS.namespace, _FLAGS.pvmap_template) if __name__ == '__main__': diff --git a/scripts/un/codes/generate_statvar_name.py b/scripts/un/codes/generate_statvar_name.py new file mode 100644 index 0000000000..079e5f096a 
--- /dev/null +++ b/scripts/un/codes/generate_statvar_name.py @@ -0,0 +1,169 @@ +"""Script to generate statvar names for UN statvars.""" + +import os +import re +import sys + +from absl import app +from absl import flags +from absl import logging + +import file_util +from mcf_file_util import add_namespace, strip_namespace, get_node_dcid +from mcf_file_util import load_mcf_nodes, write_mcf_nodes + +from config_map import ConfigMap +from counters import Counters + +flags.DEFINE_string('input_statvar_mcf', '', 'MCF files with statvar nodes.') +flags.DEFINE_string('output_statvar_mcf', '', + 'Output MCF files for statvar with names.') +flags.DEFINE_string('input_schema_mcf', '', + 'Schema file with names for propeorties.') +flags.DEFINE_integer('logging_level', logging.INFO, 'Logging level.') + +_FLAGS = flags.FLAGS +"""Wrapper to generate statvar names for UN StatVars.""" + +_DEFAULT_IGNORE_PROP = { + 'Node': '', + 'dcid': '', + 'typeOf': '', + 'memberOf': '', + 'footnote': '', + 'description': '', + 'name': '', + 'populationType': '', + 'measuredProperty': 'value', + 'statType': 'measuredValue', +} + + +def to_sentence_case(text: str) -> str: + """Returns a string in sentence case.""" + # convert camelCase + sentence = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', text) + + # convert '_' to spaces + sentence = re.sub(r'[_ ]+', ' ', sentence) + sentence = sentence.strip() + return sentence.capitalize() + + +class UNStatVarNameGenerator: + + def __init__( + self, + config_dict: dict = {}, + counters: Counters = None, + ): + self._config = ConfigMap() + self._config.update_config(config_dict) + self._counters = counters + if counters is None: + self._counters = Counters() + self._schema_nodes = {} + + def load_schema_mcf(self, mcf: str) -> dict: + """Loads schema nodes from MCF files.""" + load_mcf_nodes(mcf, nodes=self._schema_nodes) + self._counters.add_counter('input-schema-nodes', + len(self._schema_nodes)) + return self._schema_nodes + + def get_schema_node(self, dcid: 
str) -> dict: + """Returns a schema node for the dcid.""" + if not dcid: + return None + node = self._schema_nodes.get(strip_namespace(dcid)) + if not node: + node = self._schema_nodes.get(add_namespace(dcid)) + return node + + def get_schema_name(self, dcid: str) -> str: + """Returns the name for the dcid fomr the schema.""" + node = self.get_schema_node(dcid) + if not node: + return '' + name = node.get('alternateName') + if not name: + name = node.get('name', '') + return name.strip('"').strip() + + def generate_statvar_name(self, pvs: dict) -> dict: + """Adds a name to a statvar if it doesn't exist already.""" + name = pvs.get('name') + if name: + logging.debug(f'Using existing name for statvar:{name}') + self._counters.add_counter(f'input-existing-name', 1) + return pvs + + # Use the name from the schema if it already exists. + dcid = get_node_dcid(pvs) + name = self.get_schema_name(dcid) + if name: + pvs['name'] = '"' + name + '"' + self._counters.add_counter(f'input-schema-name', 1) + return pvs + + # Get the name from the populationType + name_prefix = self.get_schema_name(pvs.get('populationType')) + name_tokens = [] + # Collect names for constraint property:values + for prop, value in pvs.items(): + pv_tokens = [] + prop = strip_namespace(prop) + value = strip_namespace(value) + ignore_val = strip_namespace(_DEFAULT_IGNORE_PROP.get(prop)) + if ignore_val is not None: + if not ignore_val or ignore_val == value: + continue + prop_name = self.get_schema_name(prop) + if not prop_name: + prop_name = to_sentence_case(prop) + self._counters.add_counter('property-missing-name', 1) + if prop_name: + pv_tokens.append(prop_name) + val_name = self.get_schema_name(value) + if not val_name: + val_name = to_sentence_case(value) + self._counters.add_counter('value-missing-name', 1) + if val_name: + pv_tokens.append(val_name) + if pv_tokens: + name_tokens.append('='.join(pv_tokens)) + name_suffix = ', '.join(name_tokens) + name = name_prefix + if name_suffix: + 
self._counters.add_counter(f'generated-statvar-name-contraints', 1) + name = f'{name} [{name_suffix}]' + pvs['name'] = f'"{name}"' + self._counters.add_counter(f'generated-statvar-names', 1) + + +def generate_statvar_names(input_mcf: str, schema_mcf: str, output_mcf: str): + """Generate names for statvars in input_mcf.""" + counters = Counters() + config = {} + sv_name_generator = UNStatVarNameGenerator(config, counters) + sv_name_generator.load_schema_mcf(schema_mcf) + + statvar_nodes = load_mcf_nodes(input_mcf) + logging.info(f'Generating statvar names for {len(statvar_nodes)}') + for dcid, pvs in statvar_nodes.items(): + sv_name_generator.generate_statvar_name(pvs) + + if output_mcf: + write_mcf_nodes(statvar_nodes, output_mcf) + + counters.print_counters() + + +def main(_): + logging.set_verbosity(_FLAGS.logging_level) + generate_statvar_names(_FLAGS.input_statvar_mcf, _FLAGS.input_schema_mcf, + _FLAGS.output_statvar_mcf) + + +if __name__ == '__main__': + app.run(main) diff --git a/tools/statvar_importer/config_flags.py b/tools/statvar_importer/config_flags.py index 5b8f05e3cb..be5088526a 100644 --- a/tools/statvar_importer/config_flags.py +++ b/tools/statvar_importer/config_flags.py @@ -169,7 +169,8 @@ 'Generate names for Statvars.') flags.DEFINE_bool('enable_cloud_logging', False, 'Enable cloud logging when running on cloud.') - +flags.DEFINE_string('statvar_dcid_prefix', '', + 'Prefix for statvar dcid.') def get_default_config() -> dict: """Returns the default config as dictionary of config parameters and values.""" @@ -437,7 +438,7 @@ def get_default_config() -> dict: # Settings for statvar dcid generator 'statvar_dcid_fixed_properties': [], 'statvar_dcid_prefix': - '', + _FLAGS.statvar_dcid_prefix, 'statvar_dcid_remove_prefix': '', 'statvar_dcid_delimiter': diff --git a/tools/statvar_importer/schema/statvar_dcid_gen.py b/tools/statvar_importer/schema/statvar_dcid_gen.py index 52c345aac2..32d509950f 100644 --- 
a/tools/statvar_importer/schema/statvar_dcid_gen.py +++ b/tools/statvar_importer/schema/statvar_dcid_gen.py @@ -43,10 +43,9 @@ def camel_to_snake(text: str, delim: str = '_') -> str: Returns: The converted snake_case string in lowercase. """ - s1 = re.sub(r'([a-z0-9])([A-Z])', r'\1' + delim + r'\2', text) - s2 = re.sub(r'([a-zA-Z])([0-9])', r'\1' + delim + r'\2', s1) - s3 = re.sub(r'([A-Z])([A-Z][a-z])', r'\1' + delim + r'\2', s2) - return s3.lower() + s1 = re.sub(r'([a-z])([A-Z0-9])', r'\1' + delim + r'\2', text) + s2 = re.sub(r'([A-Z])([A-Z][a-z])', r'\1' + delim + r'\2', s1) + return s2.lower() def get_dcid_name(dcid: str, schema_nodes: dict) -> str: @@ -92,7 +91,7 @@ def get_dcid_token(word: str, # Convert camelCase to snake case token = camel_to_snake(token).upper() if remove_prefix: - token = token.removeprefix(remove_prefix) + token = re.sub(remove_prefix, '', token) return token[0].upper() + token[1:] @@ -136,6 +135,7 @@ def generate_dcid_for_statvar(pvs: dict, 'alternateName', 'footnote', 'unCode', 'Node', 'typeOf' ]) prop_delim = config.get('statvar_dcid_delimiter', '_') + fixed_prop_delim = config.get('statvar_dcid_fixed_delimiter', '_') val_delim = config.get('statvar_dcid_value_delimiter', '') upper_case = config.get('statvar_dcid_upper_case', False) remove_prefix = config.get('statvar_dcid_remove_prefix', '') @@ -177,7 +177,8 @@ def generate_dcid_for_statvar(pvs: dict, ordered_props.append(prop) # Get ordered list of dcid tokens - dcid_tokens = [] + dcid_fixed_tokens = [] + dcid_prop_tokens = [] for prop in ordered_props: prop_value = dcid_pvs.pop(prop, None) if prop_value: @@ -190,8 +191,14 @@ def generate_dcid_for_statvar(pvs: dict, value_name = prop_name + val_delim + value_name if upper_case: value_name = value_name.upper() - dcid_tokens.append(value_name) - dcid = prop_delim.join(dcid_tokens) + if prop in fixed_props: + dcid_fixed_tokens.append(value_name) + else: + dcid_prop_tokens.append(value_name) + prop_token = 
prop_delim.join(dcid_prop_tokens) + if prop_token: + dcid_fixed_tokens.append(prop_token) + dcid = fixed_prop_delim.join(dcid_fixed_tokens) if dcid_prefix: dcid = dcid_prefix + dcid return dcid diff --git a/tools/statvar_importer/schema/statvar_dcid_gen_test.py b/tools/statvar_importer/schema/statvar_dcid_gen_test.py index 0538e1456e..a74a52f9f6 100644 --- a/tools/statvar_importer/schema/statvar_dcid_gen_test.py +++ b/tools/statvar_importer/schema/statvar_dcid_gen_test.py @@ -73,6 +73,7 @@ def test_generate_dcid_with_property(self): 'populationType' ], 'statvar_dcid_delimiter': '__', + 'statvar_dcid_fixed_delimiter': '.', 'statvar_dcid_value_delimiter': '--', 'statvar_dcid_remove_prefix': 'TEST_', 'statvar_dcid_upper_case': True, @@ -84,7 +85,7 @@ def test_generate_dcid_with_property(self): 'populationType': 'Person', } dcid = generate_dcid_for_statvar(pvs, config) - self.assertEqual(dcid, 'test/COUNT__PERSON') + self.assertEqual(dcid, 'test/COUNT.PERSON') pvs2 = { 'statType': 'medianValue', @@ -95,7 +96,18 @@ def test_generate_dcid_with_property(self): } dcid2 = generate_dcid_for_statvar(pvs2, config) self.assertEqual( - dcid2, 'test/MEDIAN_VALUE__AGE__PERSON__GENDER--MALE__PLACE--URBAN') + dcid2, 'test/MEDIAN_VALUE.AGE.PERSON.GENDER--MALE__PLACE--URBAN') + pvs3 = { + 'statType': 'measuredValue', + 'measuredProperty': 'value', + 'populationType': 'AdultPerson', + 'gender': 'Male', + 'place': 'TEST_Urban', + } + dcid3 = generate_dcid_for_statvar(pvs3, config) + self.assertEqual( + dcid3, 'test/ADULT_PERSON.GENDER--MALE__PLACE--URBAN') + if __name__ == '__main__': From c1c2a676e4034db98fd09138a0292777f1db4d81 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Tue, 16 Jun 2026 21:23:31 +0530 Subject: [PATCH 09/12] convert large ints to float --- import-automation/executor/requirements.txt | 1 + tools/statvar_importer/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/import-automation/executor/requirements.txt 
b/import-automation/executor/requirements.txt index 7ee91417a2..9868d3656c 100644 --- a/import-automation/executor/requirements.txt +++ b/import-automation/executor/requirements.txt @@ -1,6 +1,7 @@ # Requirements for Python scripts in this repo that have automation enabled! absl-py +anyascii arcgis2geojson beautifulsoup4 chardet diff --git a/tools/statvar_importer/utils.py b/tools/statvar_importer/utils.py index 9c060de0b0..b40982bc9a 100644 --- a/tools/statvar_importer/utils.py +++ b/tools/statvar_importer/utils.py @@ -94,7 +94,10 @@ def str_from_number(number: Union[int, float], '123.45' """ # Check if number is an integer or float without any decimals. - if int(number) == number: + if abs(number) > sys.maxsize: + # Convert very large ints to float. + number = float(number) + elif int(number) == number: number_int = int(number) return f'{number_int}' # Return float rounded to precision digits. From 0d87b47573c5e1c5bf54ac1e1e98177bb8c98f18 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Tue, 16 Jun 2026 22:31:27 +0530 Subject: [PATCH 10/12] make max_int configurable --- tools/statvar_importer/stat_var_processor.py | 1 + tools/statvar_importer/utils.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/statvar_importer/stat_var_processor.py b/tools/statvar_importer/stat_var_processor.py index cb4553cdcb..2767a18d02 100644 --- a/tools/statvar_importer/stat_var_processor.py +++ b/tools/statvar_importer/stat_var_processor.py @@ -1222,6 +1222,7 @@ def format_svobs(self, svobs: dict) -> dict: numeric_value, precision_digits=self._config.get('output_precision_digits', 5), + max_int=self._config.get('max_integer', sys.maxsize), ) elif isinstance(value, str) and value: value = value.strip() diff --git a/tools/statvar_importer/utils.py b/tools/statvar_importer/utils.py index b40982bc9a..17c98832a6 100644 --- a/tools/statvar_importer/utils.py +++ b/tools/statvar_importer/utils.py @@ -67,7 +67,8 @@ def capitalize_first_char(string: str) -> 
str: def str_from_number(number: Union[int, float], - precision_digits: Optional[int] = None) -> str: + precision_digits: Optional[int] = None, + max_int: int = sys.maxsize) -> str: """Converts a number (int or float) to its string representation. Integers and floats that are whole numbers (e.g., 10.0) are returned as @@ -77,6 +78,7 @@ def str_from_number(number: Union[int, float], Args: number: The number to convert. precision_digits: Optional number of decimal places to round a float to. + max_int: Numbers larger than this are converted to float Returns: The string representation of the number. @@ -94,8 +96,8 @@ def str_from_number(number: Union[int, float], '123.45' """ # Check if number is an integer or float without any decimals. - if abs(number) > sys.maxsize: - # Convert very large ints to float. + if abs(number) > max_int: + # Convert very large ints to float with potential loss of precision number = float(number) elif int(number) == number: number_int = int(number) From eaecf244d07cf2151b7528198e81e9d27d3765ea Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Tue, 16 Jun 2026 22:31:58 +0530 Subject: [PATCH 11/12] support imports from other folders --- scripts/un/codes/generate_codelist_map.py | 9 +++++++++ scripts/un/codes/generate_statvar_name.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/scripts/un/codes/generate_codelist_map.py b/scripts/un/codes/generate_codelist_map.py index c511807379..88cb3b6bfc 100644 --- a/scripts/un/codes/generate_codelist_map.py +++ b/scripts/un/codes/generate_codelist_map.py @@ -11,6 +11,15 @@ from anyascii import anyascii from pprint import pprint +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +_DATA_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append(_DATA_DIR) +sys.path.append(os.path.join(_DATA_DIR, 'util'))
+sys.path.append(os.path.join(_DATA_DIR, 'tools', 'statvar_importer')) + import file_util import mcf_file_util import eval_functions diff --git a/scripts/un/codes/generate_statvar_name.py b/scripts/un/codes/generate_statvar_name.py index 079e5f096a..4f6a075b10 100644 --- a/scripts/un/codes/generate_statvar_name.py +++ b/scripts/un/codes/generate_statvar_name.py @@ -8,6 +8,15 @@ from absl import flags from absl import logging +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +_DATA_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append(_DATA_DIR) +sys.path.append(os.path.join(_DATA_DIR, 'util')) +sys.path.append(os.path.join(_DATA_DIR, 'tools', 'statvar_importer')) + import file_util from mcf_file_util import add_namespace, strip_namespace, get_node_dcid from mcf_file_util import load_mcf_nodes, write_mcf_nodes From 87b5b93fe5d26f2758ae1e5a61a991be6f6e09bf Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Thu, 18 Jun 2026 01:24:07 +0530 Subject: [PATCH 12/12] add script to generate statvar groups --- scripts/un/codes/generate_statvar_groups.py | 294 ++++++++++++++++++ scripts/un/codes/generate_statvar_name.py | 15 +- .../schema/statvar_dcid_gen.py | 6 +- 3 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 scripts/un/codes/generate_statvar_groups.py diff --git a/scripts/un/codes/generate_statvar_groups.py b/scripts/un/codes/generate_statvar_groups.py new file mode 100644 index 0000000000..4fa599f804 --- /dev/null +++ b/scripts/un/codes/generate_statvar_groups.py @@ -0,0 +1,294 @@ +"""Script to generate statvar groups for UN statvars.""" + +import itertools +import os +import re +import sys + +from absl import app +from absl import flags +from absl import logging + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) 
+sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +_DATA_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append(_DATA_DIR) +sys.path.append(os.path.join(_DATA_DIR, 'util')) +sys.path.append(os.path.join(_DATA_DIR, 'tools', 'statvar_importer')) + +import file_util +from mcf_file_util import add_namespace, strip_namespace, get_node_dcid +from mcf_file_util import load_mcf_nodes, write_mcf_nodes, add_mcf_node + +from config_map import ConfigMap +from counters import Counters + +flags.DEFINE_string('input_statvar_mcf', '', 'MCF files with statvar nodes.') +flags.DEFINE_string('output_statvar_group_mcf', '', + 'Output MCF files for statvar groups.') +flags.DEFINE_string('input_schema_mcf', '', + 'Schema file with names for properties.') +flags.DEFINE_string('statvar_root', 'dc/g/Root', 'Root for the statvar group.') +flags.DEFINE_string('statvar_group_prefix', 'custom/g/undata', + 'Prefix for the statvar group.') +flags.DEFINE_string('statvar_dcid_remove_prefix', '', + 'Prefix for the statvar group.') +flags.DEFINE_list('statvar_property_order', ['populationType'], + 'Statvar properties ordered by group heirarchy.') +flags.DEFINE_bool( + 'statvar_group_permutations', True, + 'Geneerate statvar groups for all permutations of properties.') +flags.DEFINE_integer('logging_level', logging.INFO, 'Logging level.') + +_FLAGS = flags.FLAGS +"""Wrapper to generate statvar groups for UN StatVars.""" + +_DEFAULT_IGNORE_PROP = { + 'Node': '', + 'dcid': '', + 'typeOf': '', + 'memberOf': '', + 'footnote': '', + 'description': '', + 'name': '', + 'measuredProperty': 'value', + 'statType': 'measuredValue', +} + + +def get_default_statvar_group_config() -> dict: + """Returns the default statvar group config.""" + return { + 'svg_root': _FLAGS.statvar_root, + 'svg_prefix': _FLAGS.statvar_group_prefix, + 'svg_properties': _FLAGS.statvar_property_order, + 'statvar_dcid_remove_prefix': 
_FLAGS.statvar_dcid_remove_prefix, + 'statvar_group_permutations': _FLAGS.statvar_group_permutations, + } + + +def to_snake_case(text: str, delim: str = '_', upper: bool = True) -> str: + """Returns text split at camelCase boundaries and joined with delim.""" + # convert camelCase + sentence = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', delim, text) + + # collapse runs of '_' and spaces into delim + sentence = re.sub(r'[_ ]+', delim, sentence) + sentence = sentence.strip() + if upper: + return sentence.upper() + return sentence + + +def to_quoted(text: str) -> str: + """Returns quoted string.""" + if not text: + return text + text = text.strip().strip('"').strip().replace('"', "'") + if text: + return '"' + text + '"' + return '' + + +class UNStatVarGroupGenerator: + + def __init__( + self, + config_dict: dict = {}, + counters: Counters = None, + ): + self._config = ConfigMap() + self._config.update_config(config_dict) + self._counters = counters + if counters is None: + self._counters = Counters() + # dictionary of schema nodes keyed by dcid. + self._schema_nodes = {} + # dictionary of statvar groups created.
+ self._statvar_groups = {} + + def load_schema_mcf(self, mcf: str) -> dict: + """Loads schema nodes from MCF files.""" + load_mcf_nodes(mcf, nodes=self._schema_nodes) + self._counters.add_counter('input-schema-nodes', + len(self._schema_nodes)) + return self._schema_nodes + + def get_schema_node(self, dcid: str) -> dict: + """Returns a schema node for the dcid.""" + if not dcid: + return None + node = self._schema_nodes.get(strip_namespace(dcid)) + if not node: + node = self._schema_nodes.get(add_namespace(dcid)) + return node + + def get_schema_name(self, dcid: str) -> str: + """Returns the name for the dcid from the schema.""" + node = self.get_schema_node(dcid) + if not node: + return '' + name = node.get('alternateName') + if not name: + name = node.get('name', '') + if not name: + # convert the dcid to a name string + remove_prefix = self._config.get('statvar_dcid_remove_prefix', '') + name = re.sub(remove_prefix, '', dcid[dcid.find('/') + 1:]) + name = to_snake_case(name).capitalize() + return name.strip('"').strip() + + def add_statvar_group(self, pvs: dict): + """Add a statvar group to schema.""" + add_mcf_node(pvs, self._schema_nodes) + add_mcf_node(pvs, self._statvar_groups) + + def get_statvar_groups(self) -> dict: + """Returns the new statvar groups created.""" + return self._statvar_groups + + def get_statvar_group_node(self, dcid, name, parent) -> dict: + return { + 'Node': add_namespace(dcid), + 'typeOf': 'dcid:StatVarGroup', + 'name': to_quoted(name), + 'specializationOf': add_namespace(parent), + } + + def generate_prop_value_svg(self, pvs: dict, grp_props: list, + svg_parent: str, svg_prefix: str): + """Generate statvar groups for the property values in the list.""" + strip_prefix = self._config.get('svg_dcid_remove_prefix', '') + depth = 0 + for prop in grp_props: + val = strip_namespace(pvs.get(prop, '')) + if not val: + continue + # Create svg for the property + prop_id = re.sub(strip_prefix, '', to_snake_case(prop)) + svg_dcid = svg_prefix +
prop_id + svg_name = self.get_schema_name(prop) + self.add_statvar_group( + self.get_statvar_group_node(svg_dcid, svg_name, svg_parent)) + depth += 1 + self._counters.add_counter( + f'generated-statvar-groups-depth-{depth}', 1) + svg_parent = svg_dcid + svg_prefix = svg_dcid + self._config.get( + 'statvar_dcid_value_delimiter', '--') + + # Generate statvar group for value + val_id = re.sub(strip_prefix, '', val) + svg_dcid = svg_prefix + val_id + svg_name = self.get_schema_name(val) + self.add_statvar_group( + self.get_statvar_group_node(svg_dcid, svg_name, svg_parent)) + depth += 1 + self._counters.add_counter( + f'generated-statvar-groups-depth-{depth}', 1) + svg_parent = svg_dcid + svg_prefix = svg_dcid + self._config.get('statvar_dcid_delimiter', + '__') + # Add the statvar to the leaf group. + sv = { + 'Node': add_namespace(get_node_dcid(pvs)), + 'typeOf': 'StatisticalVariable', + 'memberOf': svg_parent, + } + self.add_statvar_group(sv) + self._counters.add_counter(f'statvar-for-depth-{depth}', 1) + + def generate_groups_for_statvar(self, pvs: dict, svg_parent: str, + svg_prefix: str): + """Generates statvar groups for the hierarchy property:values in the statvar.""" + self._counters.add_counter('input-statvars', 1) + # Get the properties for the group + grp_props = dict() + for prop, value in pvs.items(): + prop = strip_namespace(prop) + value = strip_namespace(value) + ignore_val = strip_namespace(_DEFAULT_IGNORE_PROP.get(prop)) + if ignore_val is not None: + if not ignore_val or ignore_val == value: + continue + grp_props.setdefault(prop, value) + + # Get an ordered list of properties to create statvar groups. + # Also generate statvar for each set of properties. 
+ strip_prefix = self._config.get('svg_dcid_remove_prefix', '') + for prop in self._config.get('svg_properties', ['populationType']): + val = grp_props.pop(prop, None) + if not val: + continue + val = re.sub(strip_prefix, '', to_snake_case(val)) + svg_dcid = svg_prefix + val + svg_name = self.get_schema_name(val) + self.add_statvar_group( + self.get_statvar_group_node(svg_dcid, svg_name, svg_parent)) + self._counters.add_counter(f'generated-statvar-groups-{prop}', 1) + svg_parent = svg_dcid + svg_prefix = svg_dcid + self._config.get('statvar_dcid_delimiter', + '__') + + # Generate statvar group for all permutations of properties. + props_perm = sorted(grp_props.keys()) + if self._config.get('statvar_group_permutations', False): + props_perm = list(itertools.permutations(grp_props.keys())) + for props_list in props_perm: + self.generate_prop_value_svg(pvs, props_list, svg_parent, + svg_prefix) + + def generate_statvar_groups(self, sv_nodes: dict): + """Generate statvar groups for given statvar nodes.""" + svg_prefix = self._config.get('svg_prefix', 'dc/g/') + svg_root = self._config.get('svg_root', 'dc/g/Root') + self._counters.add_counter('total', len(sv_nodes)) + for dcid, pvs in sv_nodes.items(): + self._counters.add_counter('processed', 1) + typ = strip_namespace(pvs.get('typeOf', '')) + if typ and typ != 'StatisticalVariable': + self._counters.add_counter('input-non-statvar-ignored', 1) + continue + self.generate_groups_for_statvar(pvs, svg_root, svg_prefix) + + # Make the top SVG a child of root + if 'Root' not in svg_root and self._config.get( + 'generate_statvar_group_root', True): + name = to_snake_case(svg_root[svg_root.find('/') + 1:], ' ', False) + self.add_statvar_group( + self.get_statvar_group_node(svg_root, name, 'dc/g/Root')) + self._counters.add_counter(f'generated-statvar-groups-root', 1) + + +def generate_statvar_groups(input_mcf: str, + schema_mcf: str, + output_mcf: str, + config: dict = None): + """Generate groups for statvars in input_mcf.""" 
+ counters = Counters() + sv_grp_generator = UNStatVarGroupGenerator(config, counters) + sv_grp_generator.load_schema_mcf(schema_mcf) + + statvar_nodes = load_mcf_nodes(input_mcf) + logging.info(f'Generating statvar groups for {len(statvar_nodes)} nodes') + sv_grp_generator.generate_statvar_groups(statvar_nodes) + + sv_grps = sv_grp_generator.get_statvar_groups() + if output_mcf and sv_grps: + write_mcf_nodes(sv_grps, output_mcf) + counters.add_counter('output-nodes', len(sv_grps)) + + counters.print_counters() + + +def main(_): + logging.set_verbosity(_FLAGS.logging_level) + generate_statvar_groups(_FLAGS.input_statvar_mcf, _FLAGS.input_schema_mcf, + _FLAGS.output_statvar_group_mcf, + get_default_statvar_group_config()) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/un/codes/generate_statvar_name.py b/scripts/un/codes/generate_statvar_name.py index 4f6a075b10..60fbdec682 100644 --- a/scripts/un/codes/generate_statvar_name.py +++ b/scripts/un/codes/generate_statvar_name.py @@ -28,7 +28,7 @@ flags.DEFINE_string('output_statvar_mcf', '', 'Output MCF files for statvar with names.') flags.DEFINE_string('input_schema_mcf', '', - 'Schema file with names for propeorties.') + 'Schema file with names for properties.') flags.DEFINE_integer('logging_level', logging.INFO, 'Logging level.') _FLAGS = flags.FLAGS @@ -47,6 +47,14 @@ 'statType': 'measuredValue', } +def to_quoted(text: str) -> str: + """Returns quoted string.""" + if not text: + return text + text = text.strip().strip('"').strip().replace('"', "'") + if text: + return '"' + text + '"' + return '' def to_sentence_case(text: str) -> str: """Returns a string in sentence case.""" @@ -105,13 +113,14 @@ def generate_statvar_name(self, pvs: dict) -> dict: if name: logging.debug(f'Using existing name for statvar:{name}') self._counters.add_counter(f'input-existing-name', 1) + pvs['name'] = to_quoted(name) return pvs # Use the name from the schema if it already exists. 
dcid = get_node_dcid(pvs) name = self.get_schema_name(dcid) if name: - pvs['name'] = '"' + name + '"' + pvs['name'] = to_quoted(name) self._counters.add_counter(f'input-schema-name', 1) return pvs @@ -146,7 +155,7 @@ def generate_statvar_name(self, pvs: dict) -> dict: if name_suffix: self._counters.add_counter(f'generated-statvar-name-contraints', 1) name = f'{name} [{name_suffix}]' - pvs['name'] = f'"{name}"' + pvs['name'] = to_quoted(name) self._counters.add_counter(f'generated-statvar-names', 1) diff --git a/tools/statvar_importer/schema/statvar_dcid_gen.py b/tools/statvar_importer/schema/statvar_dcid_gen.py index 32d509950f..ad09b8a1cc 100644 --- a/tools/statvar_importer/schema/statvar_dcid_gen.py +++ b/tools/statvar_importer/schema/statvar_dcid_gen.py @@ -92,7 +92,9 @@ def get_dcid_token(word: str, token = camel_to_snake(token).upper() if remove_prefix: token = re.sub(remove_prefix, '', token) - return token[0].upper() + token[1:] + if token: + return token[0].upper() + token[1:] + return '' def generate_dcid_for_statvar(pvs: dict, @@ -187,7 +189,7 @@ def generate_dcid_for_statvar(pvs: dict, value_name = get_dcid_name(prop_value, schema_nodes) value_name = get_dcid_token(value_name, upper_case, remove_prefix) if val_delim and prop not in fixed_props: - prop_name = get_dcid_token(prop, upper_case, remove_prefix) + prop_name = get_dcid_token(prop, upper_case) value_name = prop_name + val_delim + value_name if upper_case: value_name = value_name.upper()