From c8446adbdf9cb749a8672813cc04761b17aff0a7 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 20:18:46 +0000 Subject: [PATCH 01/36] Start snapshot directory. Start snapshot directory as part of the v2 within #111. --- README.md | 2 +- snapshot/README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 snapshot/README.md diff --git a/README.md b/README.md index 4d59e08f..5c2f1dde 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ In addition to Github-provided [Github Actions](https://docs.github.com/en/actio - [sftp-action](https://github.com/Creepios/sftp-action) under the [MIT License](https://github.com/Creepios/sftp-action/blob/master/LICENSE) from Niklas Creepios. - [ssh-action](https://github.com/appleboy/ssh-action) under the [MIT License](https://github.com/appleboy/ssh-action/blob/master/LICENSE) from Bo-Yi Wu. -Next, the visualization tool has additional dependencies as documented in the [visualization readme](https://github.com/SchmidtDSE/afscgap/blob/main/afscgapviz/README.md). +Next, the visualization tool has additional dependencies as documented in the [visualization readme](https://github.com/SchmidtDSE/afscgap/blob/main/afscgapviz/README.md). Similarly, the community flat files snapshot updater has additional dependencies as documented in the [snapshot readme](https://github.com/SchmidtDSE/afscgap/blob/main/snapshot/README.md). Finally, note that the website uses assets from [The Noun Project](thenounproject.com/) under the NounPro plan. If used outside of https://pyafscgap.org, they may be subject to a [different license](https://thenounproject.com/pricing/#icons). diff --git a/snapshot/README.md b/snapshot/README.md new file mode 100644 index 00000000..e380e84a --- /dev/null +++ b/snapshot/README.md @@ -0,0 +1,2 @@ +# Snapshot Updater +Scripts to update the community Avro flat files as described at [data.pyafscgap.org](https://data.pyafscgap.org/). 
From 3e8a8701c57c81131c0f951e2f225981e50592c7 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 21:44:26 +0000 Subject: [PATCH 02/36] Additional README edits. --- README.md | 5 ++-- snapshot/README.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c2f1dde..d90a5ebe 100644 --- a/README.md +++ b/README.md @@ -133,9 +133,10 @@ at UC Berkeley](https://dse.berkeley.edu) where [Kevin Koy](https://github.com/k
## Open Source -We are happy to be part of the open source community. +We are happy to be part of the open source community. We use the following: -At this time, the only open source dependency used by this microlibrary is [Requests](https://docs.python-requests.org/en/latest/index.html) which is available under the [Apache v2 License](https://github.com/psf/requests/blob/main/LICENSE) from [Kenneth Reitz and other contributors](https://github.com/psf/requests/graphs/contributors). + - [Requests](https://docs.python-requests.org/en/latest/index.html) which is available under the [Apache v2 License](https://github.com/psf/requests/blob/main/LICENSE) from [Kenneth Reitz and other contributors](https://github.com/psf/requests/graphs/contributors). + - [fastavro](https://fastavro.readthedocs.io/en/latest/) by Miki Tebeka and Contributors under the [MIT License](https://github.com/fastavro/fastavro/blob/master/LICENSE). In addition to Github-provided [Github Actions](https://docs.github.com/en/actions), our build and documentation systems also use the following but are not distributed with or linked to the project itself: diff --git a/snapshot/README.md b/snapshot/README.md index e380e84a..d2543c41 100644 --- a/snapshot/README.md +++ b/snapshot/README.md @@ -1,2 +1,65 @@ # Snapshot Updater Scripts to update the community Avro flat files as described at [data.pyafscgap.org](https://data.pyafscgap.org/). + +## Purpose +Due to API limitations that prevent filtering joined data prior to downloading locally, community flat files in [Avro format](https://avro.apache.org/) offer pre-joined data with indices which can be used by `pyafscgap` to avoid downloading all catch data or specifying individual hauls. This directory contains scripts used to update those resources which are available at [data.pyafscgap.org](https://data.pyafscgap.org/). + +## Usage +The updater can be executed with individual scripts or in its entirety through bash. 
Note that some of these steps use environment variables specified in local setup. + +### Python library +These community files are used by default when interacting with the `pyafscgap` library. See [pyafscgap.org](https://pyafscgap.org/) for instructions. These Avro files will be requested and iterated by the client without the user needing to understand the underlying file format. Only the `pyafscgap` interface is intended to be maintained across major versions for backwards compatibility. + +### Prebuilt payloads +Prebuilt Avro files are available via HTTPS through [data.pyafscgap.org](https://data.pyafscgap.org/). There are two subdirectories of files. + +First, [index](https://data.pyafscgap.org/index) contains "index data files" which indicate where catch data can be found. These indices include filenames that can be found in `joined`. Each file maps from a value for the filename's variable to a set of joined flat files where those data can be found. Each value refers to a specific haul where floating point values are rounded to two decimal places. Note that, due to this rounding, more precise filters will have to further sub-filter after collecting relevant data from the `joined` subdirectory. + +Second, [joined](https://data.pyafscgap.org/joined) includes all catch data joined against the species list and hauls table to create a single "flat" file which fully describes all information available for each catch. Each record is a single catch and each file is a single haul where a haul takes place within a specific year and survey. + +Note that, while provided as a service to the community, these Avro files and directory structure may be changed in the future. These files exist to serve the `pyafscgap` functionality as the NOAA APIs change over time. Therefore, for a long term stable interface with documentation and further type annotation, please consider using the `pyafscgap` library instead. 
+ +### Manual execution +In order to build the Avro files yourself by requesting, joining, and indexing original upstream API data, you can simply execute `bash execute_all.sh` after local setup. These will build these files on S3 but they may be deployed to an SFTP server trivially. + +## Local setup +Local environment setup varies depending on how these files are used. + +### Python library setup +Simply install `pyafscgap` normally to have the library automatically use the flat files for queries. + +### Prebuilt payloads environment +These files may be used by any programming language or environment supporting Avro. For more information, see the official [Avro documentation](https://avro.apache.org/docs/) though [fastavro](https://fastavro.readthedocs.io/en/latest/) is recommended for use in Python. + +### Environment for manual execution +To perform manual execution, these scripts expect to use [AWS S3](https://aws.amazon.com/s3/) prior to deployment to a simple SFTP server. In order to use these scripts, the following environment variables need to be set after installing dependencies (optionally within a virtual environment) via `pip install -r requirements.txt`: + + - `AWS_ACCESS_KEY`: This is the access key used to upload completed payloads to AWS S3 or to request those data as part of distributed indexing and processing. + - `AWS_ACCESS_SECRET`: This is the secret associated with the access key used to upload completed payloads to AWS S3 or to request those data as part of distributed indexing and processing. + - `BUCKET_NAME`: This is the name of the bucket where completed uploads should be uploaded or requested within S3. + +These may be set within `.bashrc` files or similar through `export` commands. Finally, these scripts expect [Coiled](https://www.coiled.io/) to perform distributed tasks. + +## Testing +Unit tests can be executed by running `nose2` within the `snapshot` directory. 
+ +## Deployment +Files generated in S3 can be trivially deployed to an SFTP server or accessed directly from AWS. + +## Development +These scripts follow the same development guidelines as the overall `pyafscgap` project. Note that style and type checks are enforced through CI / CD systems. See [contributors documentation](https://github.com/SchmidtDSE/afscgap/blob/main/CONTRIBUTING.md). + +## Open source +The snapshots updater uses the following open source packages: + + - [bokeh](https://docs.bokeh.org/en/latest/) from Bokeh Contributors and NumFOCUS under the [BSD License](https://github.com/bokeh/demo.bokeh.org/blob/main/LICENSE.txt). + - [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) under the [Apache v2 License](https://github.com/boto/boto3/blob/develop/LICENSE). + - [dask](https://www.dask.org/) from Anaconda and Contributors under the [BSD License](https://github.com/dask/dask/blob/main/LICENSE.txt). + - [fastavro](https://fastavro.readthedocs.io/en/latest/) by Miki Tebeka and Contributors under the [MIT License](https://github.com/fastavro/fastavro/blob/master/LICENSE). + - [requests](https://docs.python-requests.org/en/latest/index.html) which is available under the [Apache v2 License](https://github.com/psf/requests/blob/main/LICENSE) from [Kenneth Reitz and other contributors](https://github.com/psf/requests/graphs/contributors). + - [toolz](https://toolz.readthedocs.io/en/latest/) under a [BSD License](https://github.com/pytoolz/toolz/blob/master/LICENSE.txt). + +We thank these projects for their contribution. Note that we also use [coiled](https://www.coiled.io/). + +## License +Code to generate these flat files is released alongside the rest of the pyafscgap project under the [BSD License](https://github.com/SchmidtDSE/afscgap/blob/main/LICENSE.md). See [data.pyafscgap.org](https://data.pyafscgap.org/) for further license details regarding prebuilt files. 
From 6aec7cb0de47b4e0cdf617fff45f56c2c01644af Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:23:45 +0000 Subject: [PATCH 03/36] Add combine shards. --- .github/workflows/build.yml | 4 +- afscgap/test/test_convert.py | 9 +++ snapshot/combine_shards.py | 127 ++++++++++++++++++++++++++++++++ snapshot/const.py | 28 +++++++ snapshot/norm_util.py | 39 ++++++++++ snapshot/test_combine_shards.py | 60 +++++++++++++++ snapshot/test_norm_util.py | 67 +++++++++++++++++ 7 files changed, 333 insertions(+), 1 deletion(-) create mode 100644 snapshot/combine_shards.py create mode 100644 snapshot/const.py create mode 100644 snapshot/norm_util.py create mode 100644 snapshot/test_combine_shards.py create mode 100644 snapshot/test_norm_util.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8e18e3f9..daa6318e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,8 +23,10 @@ jobs: run: nose2 - name: Unit test app run: nose2 --start-dir=afscgapviz + - name: Unit test snapshot + run: nose2 --start-dir=snapshot - name: Check types - run: mypy **/*.py + run: mypy **/*.py --check-untyped-defs - name: Check errors run: pyflakes **/*.py - name: Check style diff --git a/afscgap/test/test_convert.py b/afscgap/test/test_convert.py index 075f1961..08c2ac66 100644 --- a/afscgap/test/test_convert.py +++ b/afscgap/test/test_convert.py @@ -1,3 +1,12 @@ +""" +Tests for unit conversion. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" import unittest import unittest.mock diff --git a/snapshot/combine_shards.py b/snapshot/combine_shards.py new file mode 100644 index 00000000..19fd6319 --- /dev/null +++ b/snapshot/combine_shards.py @@ -0,0 +1,127 @@ +""" +Script to combine sharded indicies into a single index usable by the pyafscgap library. 
+ +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import io +import itertools +import os +import sys +import typing + +import boto3 +import fastavro + +import norm_util + +INDEX_SCHEMA = { + 'doc': 'Index from a value to an observations flat file.', + 'name': 'Index', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'value', 'type': ['string', 'long', 'double', 'null']}, + {'name': 'keys', 'type': { + 'type': 'array', + 'items': { + 'name': 'Key', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'haul', 'type': 'long'} + ] + } + }} + ] +} + +NUM_ARGS = 2 +USAGE_STR = 'python combine_shards.py [bucket] [key]' + + +def normalize_record(key: str, target: dict) -> dict: + """Normalize a record value. + + Normalize a record value so that it can be used to generate bins of haul keys, rounding or + truncating in an expected way. + + Args: + key: The property key for which a value should be normalized. + target: The record whose value should be updated. + + Returns: + The record after its value attribute has been normalized if required or target unmodified + if no changes made. 
+ """ + value = target['value'] + normalized = norm_util.normalize_value(key, value) + target['value'] = normalized + return target + + +def main(): + """Entry point for the shard combination script.""" + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + key = sys.argv[2] + + filename = key + '.txt' + loc = os.path.join('index_shards', filename) + with open(loc) as f: + batches = [int(x.strip()) for x in f] + + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def get_avro(full_loc: str) -> typing.Iterable[dict]: + """Get the contents of an Avro file as parsed dictionaries. + + Args: + full_loc: The location where the avro file can be found within the S3 bucket. + + Returns: + List of parsed Avro records with each element being one parsed Avro record. + """ + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + return list(fastavro.reader(target_buffer)) + + batch_locs = map(lambda x: 'index_sharded/%s_%d.avro' % (key, x), batches) + shards = map(get_avro, batch_locs) + combined = itertools.chain(*shards) + normalized = map(lambda x: normalize_record(key, x), combined) + + write_buffer = io.BytesIO() + fastavro.writer( + write_buffer, + INDEX_SCHEMA, + normalized + ) + write_buffer.seek(0) + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + output_loc = 'index/%s.avro' % key + s3_client.upload_fileobj(write_buffer, bucket, output_loc) + + +if __name__ == '__main__': + main() diff --git a/snapshot/const.py b/snapshot/const.py new file mode 100644 index 00000000..8078c15b --- /dev/null +++ b/snapshot/const.py @@ -0,0 +1,28 @@ +""" +Shared constants for flat file snapshot scripts. 
+ +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +REQUIRES_ROUNDING = { + 'latitude_dd_start', + 'longitude_dd_start', + 'latitude_dd_end', + 'longitude_dd_end', + 'bottom_temperature_c', + 'surface_temperature_c', + 'depth_m', + 'distance_fished_km', + 'duration_hr', + 'net_width_m', + 'net_height_m', + 'area_swept_km2', + 'cpue_kgkm2', + 'cpue_nokm2', + 'weight_kg', +} + +REQUIRES_DATE_ROUND = {'date_time'} diff --git a/snapshot/norm_util.py b/snapshot/norm_util.py new file mode 100644 index 00000000..6757007f --- /dev/null +++ b/snapshot/norm_util.py @@ -0,0 +1,39 @@ +""" +Logic to consistently normalize values for indicies. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import typing + +import const + +T = typing.TypeVar('T') + + +def normalize_value(key: str, value: T) -> T: + """Normalize a record value. + + Normalize a record value so that it can be used to generate bins of haul keys, rounding or + truncating in an expected way. + + Args: + key: The property key for which a value should be normalized. + target: The record whose value should be updated. + + Returns: + The record after its value attribute has been normalized if required or target unmodified + if no changes made. 
+ """ + if value is None: + return None + else: + if key in const.REQUIRES_ROUNDING: + return '%.2f' % value # type: ignore + elif key in const.REQUIRES_DATE_ROUND: + return value.split('T')[0] # type: ignore + else: + return value diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py new file mode 100644 index 00000000..375bf0c2 --- /dev/null +++ b/snapshot/test_combine_shards.py @@ -0,0 +1,60 @@ +""" +Tests for scripts to combine index shards. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import combine_shards + + +class NormUtilTests(unittest.TestCase): + + def test_unchanged(self): + normalized = combine_shards.normalize_record('test attr', {'value': 'test val'}) + self.assertEqual(normalized['value'], 'test val') + + def test_none(self): + normalized = combine_shards.normalize_record('depth_m', {'value': None}) + self.assertEqual(normalized['value'], None) + + def test_changed(self): + normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) + self.assertAlmostEqual(normalized, 1.24) + + def test_rounded_float_same(self): + normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) + normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.237}) + self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + + def test_rounded_float_different(self): + normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.234}) + normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.236}) + self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) + + def test_rounded_datetime_same(self): + normalized_1 = combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-31T13:25:50Z'} + ) + normalized_2 = 
combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-31T14:25:50Z'} + ) + self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + + def test_rounded_datetime_different(self): + normalized_1 = combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-31T13:25:50Z'} + ) + normalized_2 = combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-30T14:25:50Z'} + ) + self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py new file mode 100644 index 00000000..e63414a5 --- /dev/null +++ b/snapshot/test_norm_util.py @@ -0,0 +1,67 @@ +""" +Tests for normalization utilities. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import norm_util + + +class NormUtilTests(unittest.TestCase): + + def test_normalize_record_unknown_none(self): + normalized = norm_util.normalize_value('test attr', 'test val') + self.assertEqual(normalized, 'test val') + + def test_normalize_record_known_none(self): + normalized = norm_util.normalize_value('depth_m', None) + self.assertEqual(normalized, None) + + def test_normalize_record_pass_float(self): + normalized = norm_util.normalize_value('depth_m', 1.23) + self.assertAlmostEqual(normalized, 1.23) + + def test_normalize_record_pass_datetime(self): + normalized = norm_util.normalize_value('date_time', '2025-01-13') + self.assertEqual(normalized, '2025-01-13') + + def test_normalize_record_round_float_same_up(self): + normalized_1 = norm_util.normalize_value('depth_m', 1.237) + normalized_2 = norm_util.normalize_value('depth_m', 1.236) + self.assertAlmostEqual(normalized_1, normalized_2) + + def test_normalize_record_round_float_same_down(self): + normalized_1 = 
norm_util.normalize_value('depth_m', 1.231) + normalized_2 = norm_util.normalize_value('depth_m', 1.229) + self.assertAlmostEqual(normalized_1, normalized_2) + + def test_normalize_record_round_float_different(self): + normalized_1 = norm_util.normalize_value('depth_m', 1.236) + normalized_2 = norm_util.normalize_value('depth_m', 1.234) + self.assertNotAlmostEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_valid_same(self): + normalized_1 = norm_util.normalize_value('date_time', '2025-01-13T12:25:50Z') + normalized_2 = norm_util.normalize_value('date_time', '2025-01-13T13:25:50Z') + self.assertEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_valid_different(self): + normalized_1 = norm_util.normalize_value('date_time', '2025-01-13T12:25:50Z') + normalized_2 = norm_util.normalize_value('date_time', '2025-01-14T13:25:50Z') + self.assertNotEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_invalid_same(self): + normalized_1 = norm_util.normalize_value('date_time', 'test') + normalized_2 = norm_util.normalize_value('date_time', 'test') + self.assertEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_invalid_different(self): + normalized_1 = norm_util.normalize_value('date_time', 'test') + normalized_2 = norm_util.normalize_value('date_time', 'other') + self.assertNotEqual(normalized_1, normalized_2) From 01d46648aa092191856191e1616e0f58984d9448 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:27:12 +0000 Subject: [PATCH 04/36] Fixes for #114 initial code. 
--- .github/workflows/build.yml | 2 ++ snapshot/requirements.txt | 6 ++++++ snapshot/test_combine_shards.py | 8 ++++---- snapshot/test_norm_util.py | 14 +++++++------- 4 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 snapshot/requirements.txt diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index daa6318e..d2c3ea95 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,6 +17,8 @@ jobs: run: pip install -e .[dev] - name: Install dev dependencies for app run: pip install -r afscgapviz/requirements.txt + - name: Install dev dependencies for snapshot + run: pip install -r snapshot/requirements.txt - name: Install afscgap run: pip install . - name: Unit tests main diff --git a/snapshot/requirements.txt b/snapshot/requirements.txt new file mode 100644 index 00000000..530277ea --- /dev/null +++ b/snapshot/requirements.txt @@ -0,0 +1,6 @@ +bokeh!=3.0.*,>=2.4.2 +boto3==1.35.54 +coiled==1.59.0 +fastavro==1.9.7 +requests==2.32.3 +toolz==1.0.0 diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index 375bf0c2..e88ed30b 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -30,12 +30,12 @@ def test_changed(self): def test_rounded_float_same(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.237}) - self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertAlmostEqual(float(normalized_1['value']), float(normalized_2['value'])) def test_rounded_float_different(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.234}) normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.236}) - self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertNotAlmostEqual(float(normalized_1['value']), float(normalized_2['value'])) def test_rounded_datetime_same(self): normalized_1 = 
combine_shards.normalize_record( @@ -46,7 +46,7 @@ def test_rounded_datetime_same(self): 'date_time', {'value': '2025-12-31T14:25:50Z'} ) - self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertEqual(normalized_1['value'], normalized_2['value']) def test_rounded_datetime_different(self): normalized_1 = combine_shards.normalize_record( @@ -57,4 +57,4 @@ def test_rounded_datetime_different(self): 'date_time', {'value': '2025-12-30T14:25:50Z'} ) - self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertNotEqual(normalized_1['value'], normalized_2['value']) diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py index e63414a5..376524ad 100644 --- a/snapshot/test_norm_util.py +++ b/snapshot/test_norm_util.py @@ -24,7 +24,7 @@ def test_normalize_record_known_none(self): self.assertEqual(normalized, None) def test_normalize_record_pass_float(self): - normalized = norm_util.normalize_value('depth_m', 1.23) + normalized = float(norm_util.normalize_value('depth_m', 1.23)) self.assertAlmostEqual(normalized, 1.23) def test_normalize_record_pass_datetime(self): @@ -32,18 +32,18 @@ def test_normalize_record_pass_datetime(self): self.assertEqual(normalized, '2025-01-13') def test_normalize_record_round_float_same_up(self): - normalized_1 = norm_util.normalize_value('depth_m', 1.237) - normalized_2 = norm_util.normalize_value('depth_m', 1.236) + normalized_1 = float(norm_util.normalize_value('depth_m', 1.237)) + normalized_2 = float(norm_util.normalize_value('depth_m', 1.236)) self.assertAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_float_same_down(self): - normalized_1 = norm_util.normalize_value('depth_m', 1.231) - normalized_2 = norm_util.normalize_value('depth_m', 1.229) + normalized_1 = float(norm_util.normalize_value('depth_m', 1.231)) + normalized_2 = float(norm_util.normalize_value('depth_m', 1.229)) self.assertAlmostEqual(normalized_1, normalized_2) def 
test_normalize_record_round_float_different(self): - normalized_1 = norm_util.normalize_value('depth_m', 1.236) - normalized_2 = norm_util.normalize_value('depth_m', 1.234) + normalized_1 = float(norm_util.normalize_value('depth_m', 1.236)) + normalized_2 = float(norm_util.normalize_value('depth_m', 1.234)) self.assertNotAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_datetime_valid_same(self): From 8312d7a23a4667c6f45b368fea1342803e6dcd2c Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:28:59 +0000 Subject: [PATCH 05/36] Fix test_changed in test_combine_shards. --- snapshot/test_combine_shards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index e88ed30b..7969c3a4 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -25,7 +25,7 @@ def test_none(self): def test_changed(self): normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) - self.assertAlmostEqual(normalized, 1.24) + self.assertAlmostEqual(float(normalized), 1.24) def test_rounded_float_same(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) From d5d2373e8b76b433906f6190ac91a63f124bd22d Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:30:30 +0000 Subject: [PATCH 06/36] Additional bash scripts. 
--- snapshot/combine_shards.sh | 76 ++++++++++++++++++++++++++++++++++++++ snapshot/execute_all.sh | 25 +++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 snapshot/combine_shards.sh create mode 100644 snapshot/execute_all.sh diff --git a/snapshot/combine_shards.sh b/snapshot/combine_shards.sh new file mode 100644 index 00000000..1f5402b3 --- /dev/null +++ b/snapshot/combine_shards.sh @@ -0,0 +1,76 @@ +echo "area_swept_km2" +python combine_shards.py $BUCKET_NAME area_swept_km2 +echo "bottom_temperature_c" +python combine_shards.py $BUCKET_NAME bottom_temperature_c +echo "common_name" +python combine_shards.py $BUCKET_NAME common_name +echo "count" +python combine_shards.py $BUCKET_NAME count +echo "cpue_kgkm2" +python combine_shards.py $BUCKET_NAME cpue_kgkm2 +echo "cpue_nokm2" +python combine_shards.py $BUCKET_NAME cpue_nokm2 +echo "cruise" +python combine_shards.py $BUCKET_NAME cruise +echo "cruisejoin" +python combine_shards.py $BUCKET_NAME cruisejoin +echo "date_time" +python combine_shards.py $BUCKET_NAME date_time +echo "depth_m" +python combine_shards.py $BUCKET_NAME depth_m +echo "distance_fished_km" +python combine_shards.py $BUCKET_NAME distance_fished_km +echo "duration_hr" +python combine_shards.py $BUCKET_NAME duration_hr +echo "haul" +python combine_shards.py $BUCKET_NAME haul +echo "hauljoin" +python combine_shards.py $BUCKET_NAME hauljoin +echo "id_rank" +python combine_shards.py $BUCKET_NAME id_rank +echo "latitude_dd_end" +python combine_shards.py $BUCKET_NAME latitude_dd_end +echo "latitude_dd_start" +python combine_shards.py $BUCKET_NAME latitude_dd_start +echo "longitude_dd_end" +python combine_shards.py $BUCKET_NAME longitude_dd_end +echo "longitude_dd_start" +python combine_shards.py $BUCKET_NAME longitude_dd_start +echo "net_height_m" +python combine_shards.py $BUCKET_NAME net_height_m +echo "net_width_m" +python combine_shards.py $BUCKET_NAME net_width_m +echo "performance" +python combine_shards.py $BUCKET_NAME performance 
+echo "requirements" +python combine_shards.py $BUCKET_NAME requirements +echo "scientific_name" +python combine_shards.py $BUCKET_NAME scientific_name +echo "species_code" +python combine_shards.py $BUCKET_NAME species_code +echo "srvy" +python combine_shards.py $BUCKET_NAME srvy +echo "station" +python combine_shards.py $BUCKET_NAME station +echo "stratum" +python combine_shards.py $BUCKET_NAME stratum +echo "surface_temperature_c" +python combine_shards.py $BUCKET_NAME surface_temperature_c +echo "survey" +python combine_shards.py $BUCKET_NAME survey +echo "survey_definition_id" +python combine_shards.py $BUCKET_NAME survey_definition_id +echo "survey_name" +python combine_shards.py $BUCKET_NAME survey_name +echo "taxon_confidence" +python combine_shards.py $BUCKET_NAME taxon_confidence +echo "variables" +python combine_shards.py $BUCKET_NAME variables +echo "vessel_id" +python combine_shards.py $BUCKET_NAME vessel_id +echo "vessel_name" +python combine_shards.py $BUCKET_NAME vessel_name +echo "weight_kg" +python combine_shards.py $BUCKET_NAME weight_kg +echo "year" +python combine_shards.py $BUCKET_NAME year diff --git a/snapshot/execute_all.sh b/snapshot/execute_all.sh new file mode 100644 index 00000000..00847db4 --- /dev/null +++ b/snapshot/execute_all.sh @@ -0,0 +1,25 @@ +echo "Starting..." 
>> status.txt + +echo "== Get all years ==" +echo "== Get all years ==" >> status.txt +bash get_all_years.sh +echo "== Render flat ==" +echo "== Render flat ==" >> status.txt +python3 render_flat.py $BUCKET_NAME written_paths.csv +echo "== Index data ==" +echo "== Index data ==" >> status.txt +bash index_data.sh +echo "== Combine shards ==" +echo "== Combine shards ==" >> status.txt +bash combine_shards.sh +echo "== Write main ==" +echo "== Write main ==" >> status.txt +python3 write_main_index.py $BUCKET_NAME +echo "== Move index ==" +echo "== Move index ==" >> status.txt +python3 move_afscgap.py index +echo "== Move joined ==" +echo "== Move joined ==" >> status.txt +python3 move_afscgap.py joined + +echo "Done." >> status.txt From 7c02f93382fa59b7882c465f4fef14a89a010873 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:32:11 +0000 Subject: [PATCH 07/36] Fix in test for #114. --- snapshot/test_combine_shards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index 7969c3a4..d00cdd75 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -25,7 +25,7 @@ def test_none(self): def test_changed(self): normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) - self.assertAlmostEqual(float(normalized), 1.24) + self.assertAlmostEqual(float(normalized['value']), 1.24) def test_rounded_float_same(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) From 559166c8ddc86582ac1b18425c7f5a5bb98cb5ae Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:35:08 +0000 Subject: [PATCH 08/36] Type fixes for #114. 
--- snapshot/combine_shards.py | 2 +- snapshot/norm_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshot/combine_shards.py b/snapshot/combine_shards.py index 19fd6319..9d42f6f0 100644 --- a/snapshot/combine_shards.py +++ b/snapshot/combine_shards.py @@ -99,7 +99,7 @@ def get_avro(full_loc: str) -> typing.Iterable[dict]: target_buffer = io.BytesIO() s3_client.download_fileobj(bucket, full_loc, target_buffer) target_buffer.seek(0) - return list(fastavro.reader(target_buffer)) + return list(fastavro.reader(target_buffer)) # type: ignore batch_locs = map(lambda x: 'index_sharded/%s_%d.avro' % (key, x), batches) shards = map(get_avro, batch_locs) diff --git a/snapshot/norm_util.py b/snapshot/norm_util.py index 6757007f..0bc9f10f 100644 --- a/snapshot/norm_util.py +++ b/snapshot/norm_util.py @@ -14,7 +14,7 @@ T = typing.TypeVar('T') -def normalize_value(key: str, value: T) -> T: +def normalize_value(key: str, value: typing.Optional[T]) -> typing.Optional[T]: """Normalize a record value. Normalize a record value so that it can be used to generate bins of haul keys, rounding or From 400643a0be28057b3796487d08a4bf6cc889716e Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:38:59 +0000 Subject: [PATCH 09/36] Viz updates for expanded types checks. 
--- afscgapviz/afscgapviz.py | 12 ++++++------ afscgapviz/build_database.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/afscgapviz/afscgapviz.py b/afscgapviz/afscgapviz.py index 3a8519ba..22058117 100644 --- a/afscgapviz/afscgapviz.py +++ b/afscgapviz/afscgapviz.py @@ -265,7 +265,7 @@ def render_page(): with conn_generator() as con: return flask.render_template( 'viz.html', - displays=get_display_info(con, state)['state'], + displays=get_display_info(con, state)['state'], # type: ignore get_species_select_content=get_species_select_content ) @@ -387,7 +387,7 @@ def download_geohashes(): else: base_sql = sql_util.get_sql('query') query_sql = base_sql % (geohash_size + 1, species_filter[0]) - query_args = (year, survey, species_filter[1]) + query_args = (year, survey, species_filter[1]) # type: ignore output_io = io.StringIO() writer = csv.DictWriter( @@ -416,7 +416,7 @@ def download_geohashes(): writer.writerows(results_dict_final) full_filename_pieces = comparison_filename_pieces + filename_pieces - filename_spaces = '_'.join(full_filename_pieces) + filename_spaces = '_'.join(full_filename_pieces) # type: ignore filename = filename_spaces.replace(' ', '_') if FILENAME_REGEX.match(filename) is None: @@ -556,7 +556,7 @@ def try_float(target: str) -> float: species_filter[0], geohash_size + 1 ) - query_args = (year, survey, species_filter[1]) + query_args = (year, survey, species_filter[1]) # type: ignore with conn_generator() as connection: cursor = connection.cursor() @@ -586,7 +586,7 @@ def try_float(target: str) -> float: max_temp, first_cpue, second_cpue - ) = result_float + ) = result_float # type: ignore ret_object = { 'cpue': { @@ -602,7 +602,7 @@ def try_float(target: str) -> float: } if is_comparison: - ret_object['cpue']['second'] = { + ret_object['cpue']['second'] = { # type: ignore 'name': other_species_filter[1], 'year': other_year, 'value': second_cpue diff --git a/afscgapviz/build_database.py b/afscgapviz/build_database.py 
index b83ebda6..8923d4a6 100644 --- a/afscgapviz/build_database.py +++ b/afscgapviz/build_database.py @@ -327,8 +327,8 @@ def download_main(args): for year in years: for survey in SURVEYS: - with connection as cursor: - download_and_persist_year(survey, year, cursor, geohash_size) + with connection as cursor: # type: ignore + download_and_persist_year(survey, year, cursor, geohash_size) # type: ignore print('Completed %d for %s.' % (year, survey)) time.sleep(SLEEP_TIME) From 4ea9b4b092ea9463544966fe2bbc95db5ac20409 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:44:04 +0000 Subject: [PATCH 10/36] Additional type fixes #114. --- afscgapviz/afscgapviz.py | 2 +- snapshot/combine_shards.py | 2 +- snapshot/test_norm_util.py | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/afscgapviz/afscgapviz.py b/afscgapviz/afscgapviz.py index 22058117..fb6dbb27 100644 --- a/afscgapviz/afscgapviz.py +++ b/afscgapviz/afscgapviz.py @@ -14,7 +14,7 @@ import sqlite3 import typing -import flask +import flask # type: ignore import data_util import model diff --git a/snapshot/combine_shards.py b/snapshot/combine_shards.py index 9d42f6f0..310de248 100644 --- a/snapshot/combine_shards.py +++ b/snapshot/combine_shards.py @@ -13,7 +13,7 @@ import sys import typing -import boto3 +import boto3 # type: ignore import fastavro import norm_util diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py index 376524ad..1484081d 100644 --- a/snapshot/test_norm_util.py +++ b/snapshot/test_norm_util.py @@ -24,7 +24,7 @@ def test_normalize_record_known_none(self): self.assertEqual(normalized, None) def test_normalize_record_pass_float(self): - normalized = float(norm_util.normalize_value('depth_m', 1.23)) + normalized = self._force_float(norm_util.normalize_value('depth_m', 1.23)) self.assertAlmostEqual(normalized, 1.23) def test_normalize_record_pass_datetime(self): @@ -32,18 +32,18 @@ def test_normalize_record_pass_datetime(self): 
self.assertEqual(normalized, '2025-01-13') def test_normalize_record_round_float_same_up(self): - normalized_1 = float(norm_util.normalize_value('depth_m', 1.237)) - normalized_2 = float(norm_util.normalize_value('depth_m', 1.236)) + normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.237)) + normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.236)) self.assertAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_float_same_down(self): - normalized_1 = float(norm_util.normalize_value('depth_m', 1.231)) - normalized_2 = float(norm_util.normalize_value('depth_m', 1.229)) + normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.231)) + normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.229)) self.assertAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_float_different(self): - normalized_1 = float(norm_util.normalize_value('depth_m', 1.236)) - normalized_2 = float(norm_util.normalize_value('depth_m', 1.234)) + normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.236)) + normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.234)) self.assertNotAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_datetime_valid_same(self): @@ -65,3 +65,7 @@ def test_normalize_record_round_datetime_invalid_different(self): normalized_1 = norm_util.normalize_value('date_time', 'test') normalized_2 = norm_util.normalize_value('date_time', 'other') self.assertNotEqual(normalized_1, normalized_2) + + def _force_float(self, value) -> float: + assert value is not None + return float(value) From 3368e5ca7fa0103ca455f55079ff241a44e50c3b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:45:49 +0000 Subject: [PATCH 11/36] Style fixes for #114. 
--- snapshot/test_combine_shards.py | 2 +- snapshot/test_norm_util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index d00cdd75..4aee0c86 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -22,7 +22,7 @@ def test_unchanged(self): def test_none(self): normalized = combine_shards.normalize_record('depth_m', {'value': None}) self.assertEqual(normalized['value'], None) - + def test_changed(self): normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) self.assertAlmostEqual(float(normalized['value']), 1.24) diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py index 1484081d..a9d18e5d 100644 --- a/snapshot/test_norm_util.py +++ b/snapshot/test_norm_util.py @@ -18,7 +18,7 @@ class NormUtilTests(unittest.TestCase): def test_normalize_record_unknown_none(self): normalized = norm_util.normalize_value('test attr', 'test val') self.assertEqual(normalized, 'test val') - + def test_normalize_record_known_none(self): normalized = norm_util.normalize_value('depth_m', None) self.assertEqual(normalized, None) @@ -40,7 +40,7 @@ def test_normalize_record_round_float_same_down(self): normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.231)) normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.229)) self.assertAlmostEqual(normalized_1, normalized_2) - + def test_normalize_record_round_float_different(self): normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.236)) normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.234)) From efefb1a5775e06aed90b2da7bbf9aa0610398796 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:36:02 +0000 Subject: [PATCH 12/36] Add generate_indicies. 
--- snapshot/const.py | 2 + snapshot/execute_all.sh | 6 - snapshot/generate_indicies.py | 457 ++++++++++++++++++++++++++++++++++ 3 files changed, 459 insertions(+), 6 deletions(-) create mode 100644 snapshot/generate_indicies.py diff --git a/snapshot/const.py b/snapshot/const.py index 8078c15b..619c8bbe 100644 --- a/snapshot/const.py +++ b/snapshot/const.py @@ -26,3 +26,5 @@ } REQUIRES_DATE_ROUND = {'date_time'} + +ZEROABLE_FIELDS = ['cpue_kgkm2', 'cpue_nokm2', 'weight_kg', 'count'] diff --git a/snapshot/execute_all.sh b/snapshot/execute_all.sh index 00847db4..b3081ff9 100644 --- a/snapshot/execute_all.sh +++ b/snapshot/execute_all.sh @@ -15,11 +15,5 @@ bash combine_shards.sh echo "== Write main ==" echo "== Write main ==" >> status.txt python3 write_main_index.py $BUCKET_NAME -echo "== Move index ==" -echo "== Move index ==" >> status.txt -python3 move_afscgap.py index -echo "== Move joined ==" -echo "== Move joined ==" >> status.txt -python3 move_afscgap.py joined echo "Done." >> status.txt diff --git a/snapshot/generate_indicies.py b/snapshot/generate_indicies.py new file mode 100644 index 00000000..230965c0 --- /dev/null +++ b/snapshot/generate_indicies.py @@ -0,0 +1,457 @@ +""" +Script to generate sharded indicies which indicate in which hauls values can be found. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import itertools +import os +import sys +import typing + +import boto3 # type: ignore +import coiled # type: ignore +import dask # type: ignore +import dask.bag # type: ignore + +import const +import norm_util + +USAGE_STR = 'python render_flat.py [bucket] [keys] [terminate]' +NUM_ARGS = 3 + +REQUIRES_FLAT = { + 'performance', + 'cruise', + 'cruisejoin', + 'hauljoin', + 'haul' +} + +IGNORE_ZEROS = { + 'species_code', + 'scientific_name', + 'common_name' +} + +T = typing.TypeVar('T') + + +def build_index_record(record: dict, key: str, year: int, survey: str, haul: int) -> dict: + """Build an index record. + + Args: + record: The record to index. + key: The key (attribute name) being indexed. + year: The year of the haul represented by the record. + survey: The survey in which the haul took place. + haul: The ID of the haul which produced the data to index. + + Returns: + Dictionary describing the index record which can be combined through a reduce operation. + """ + value = record[key] + key_pieces = [year, survey, haul] + key_pieces_str = map(lambda x: str(x), key_pieces) + key_output = '\t'.join(key_pieces_str) + return { + 'value': value, + 'keys': set([key_output]) + } + + +def is_non_zero(target: dict) -> bool: + """Determine if the record is a zeroed record. + + Determine if the record is a zeroed record, potentially indicating absence according to the + ZEROABLE_FIELDS. + + Args: + target: The record to check. + + Returns: + False if the record is zeroed and true otherwise. 
+ """ + + def is_field_non_zero(field: str) -> bool: + value = target[field] + return (value is not None) and (value > 0) + + fields = const.ZEROABLE_FIELDS + flags = map(is_field_non_zero, fields) + flags_positive = filter(lambda x: x is True, flags) + num_flags_positive = sum(map(lambda x: 1, flags_positive)) + return num_flags_positive > 0 + + +def process_file(bucket: str, year: int, survey: str, haul: int, key: str) -> typing.List[dict]: + """Process a single flattened joined file remotely. + + Args: + bucket: The name of the bucket where the file can be found. + year: The year of the haul represented in the file. + survey: The survey in which the haul took place for the file. + haul: The ID of the haul which produced the data to process. + key: The key (attribute name) being indexed. + + Returns: + Dictionary with index records from the given file. + """ + import io + import os + + import botocore # type: ignore + import boto3 # type: ignore + import fastavro + + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def get_avro(full_loc: str) -> typing.Optional[typing.List[dict]]: + """Get all the records from a file within S3. + + Args: + full_loc: The location (path) within the S3 bucket. + + Returns: + List of records found at the given location or None if there is an error in reading like + the file is not found. + """ + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + return list(fastavro.reader(target_buffer)) # type: ignore + + def check_file_exists(full_loc: str) -> bool: + """Check that a file exists in S3. + + Args: + full_loc: The location (path) within the S3 bucket. + + Returns: + True if the file is found and false otherwise. 
+ """ + try: + s3_client.head_object(bucket, full_loc) + return True + except botocore.exceptions.ClientError as e: + error_code = e.response['Error']['Code'] + error_code_cast = int(error_code) + if error_code_cast == 404: + return False + else: + raise RuntimeError('Unexpected S3 head code: %d' % error_code) + + def infer_index_record(record: dict) -> dict: + """Build an index record. + + Args: + record: The record to index. + + Returns: + Dictionary describing the index record which can be combined through a reduce operation. + """ + return build_index_record(record, key, year, survey, haul) + + template_vals = (year, survey, haul) + flat_loc = 'joined/%d_%s_%d.avro' % template_vals + + if not check_file_exists(flat_loc): + return [] + + flat_records_all = get_avro(flat_loc) + flat_records = filter(lambda x: x is not None, flat_records_all) # type: ignore + flat_records_allowed = get_flat_records_allowed(flat_records, key) + + index_records = map(infer_index_record, flat_records_allowed) + + return list(index_records) + + +def build_output_record(target: dict) -> dict: + """Convert an index record to a JSON serializable dictionary which can be written to S3. + + Args: + target: The index record to be converted for serialization. + + Returns: + A JSON-serializable version of target. + """ + + def process_key(key_str: str) -> dict: + """Parse a key into a dictionary which will be saved as an object in JSON. + + Args: + key_str: The string description of the key to parse. + + Returns: + The key string interpreted as a dictionary with separated fields. + """ + key_pieces = key_str.split('\t') + year = int(key_pieces[0]) + survey = key_pieces[1] + haul = int(key_pieces[2]) + return {'year': year, 'survey': survey, 'haul': haul} + + return { + 'value': target['value'], + 'keys': [process_key(x) for x in target['keys']] + } + + +def get_observations_meta(bucket: str) -> typing.Iterable[dict]: + """Get keys for all available joined data inside a bucket. 
+ + Args: + bucket: The bucket at which the data are to be found. + + Returns: + Records of all data available within the bucket inside the "joined" directory. + """ + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def make_haul_metadata_record(path: str) -> dict: + """Create a key object (dict) given a path to data for that haul. + + Args: + path: The path within the s3 bucket. + + Returns: + Dictionary representing a haul key for the given path. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='joined/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + return map(make_haul_metadata_record, keys) + + +def write_sample(key: str, bucket: str, sample: typing.Iterable[dict]) -> typing.Optional[int]: + """Write an index shard. + + Args: + key: The key (attribute name) being indexed. + bucket: The bucket in which the shard should be written. + sample: The contents of the shared to be written. + + Returns: + Random index number given to the shard. Client code should ensure that there were no + collisions. 
+ """ + import io + import os + import random + + import boto3 + import fastavro + + INDEX_SCHEMA = { + 'doc': 'Index from a value to an observations flat file.', + 'name': 'Index', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'value', 'type': ['string', 'long', 'double', 'null']}, + {'name': 'keys', 'type': { + 'type': 'array', + 'items': { + 'name': 'Key', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'haul', 'type': 'long'} + ] + } + }} + ] + } + + sample_realized = list(sample) + if len(sample_realized) == 0: + return None + + batch = random.randint(0, 1000000) + + access_key = os.environ.get('AWS_ACCESS_KEY', '') + access_secret = os.environ.get('AWS_ACCESS_SECRET', '') + + target_buffer = io.BytesIO() + fastavro.writer( + target_buffer, + INDEX_SCHEMA, + sample_realized + ) + target_buffer.seek(0) + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + output_loc = 'index_sharded/%s_%d.avro' % (key, batch) + s3_client.upload_fileobj(target_buffer, bucket, output_loc) + return batch + + +def normalize_value(target: dict, key: str) -> typing.Optional[T]: + """Normalize a value for indexing. + + Args: + target: Record from which a normalized value for indexing is requested. + key: The name of the attribute being indexed. + + Returns: + Normalized value to use for indexing. + """ + value = target['value'] + return norm_util.normalize_value(key, value) + + +def combine_records(a: dict, b: dict) -> dict: + """Combine index records. + + Args: + a: The first index record to combine. + b: The second index record to combine. + + Returns: + New index record indicating the cobination of the two records. 
+ """ + assert a['value'] == b['value'] + return {'value': a['value'], 'keys': a['keys'].union(b['keys'])} + + +def get_flat_records_allowed(records: typing.Iterable[dict], key: str) -> typing.Iterable[dict]: + """Get records which are permitted to be indexed. + + Args: + records: Candidate records. + key: The name of the attribute being indexed. + + Returns: + Records which can be indexed, excluding zero catch records in some circumstances. + """ + if key in IGNORE_ZEROS: + return filter(is_non_zero, records) + else: + return records + + +def main(): + """Entry point for the shareded index generation script.""" + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + keys = sys.argv[2].split(',') + terminate = sys.argv[3].lower() in ['y', 'yes', 't', 'true', '1'] + hauls_meta = list(get_observations_meta(bucket)) + + access_key = os.environ.get('AWS_ACCESS_KEY', '') + access_secret = os.environ.get('AWS_ACCESS_SECRET', '') + cluster = coiled.Cluster( + name='DseProcessAfscgap', + n_workers=100, + worker_vm_types=['m7a.medium'], + scheduler_vm_types=['m7a.medium'], + environ={ + 'AWS_ACCESS_KEY': access_key, + 'AWS_ACCESS_SECRET': access_secret + } + ) + client = cluster.get_client() + + def execute_for_key(key: str): + """Execute sharded index generation for a single attribute. + + Args: + key: The name of the attribute to be indexed. + """ + hauls_meta_realized = dask.bag.from_sequence(hauls_meta) + index_records_nest = hauls_meta_realized.map( + lambda x: process_file( + bucket, + x['year'], + x['survey'], + x['haul'], + key + ) + ) + index_records = index_records_nest.flatten() + + def key_record(target: dict): + """Get a normalized value for indexing. + + Args: + target: Record from which a normalized value for indexing is requested. + key: The name of the attribute being indexed. + + Returns: + Normalized value to use for indexing. 
+ """ + return normalize_value(target, key) + + if key in REQUIRES_FLAT: + index_records_output = index_records.map(build_output_record) + else: + index_records_grouped_nest = index_records.foldby( + key=key_record, + binop=combine_records + ) + index_records_grouped = index_records_grouped_nest.map(lambda x: x[1]) + index_records_output = index_records_grouped.map(build_output_record) + + repartitioned = index_records_output.repartition(npartitions=20) + incidies_future = repartitioned.map_partitions( + lambda x: write_sample(key, bucket, x) + ) + + indicies_all = incidies_future.compute(scheduler=client) + indicies = filter(lambda x: x is not None, indicies_all) + indicies_strs = list(map(lambda x: str(x), indicies)) + assert len(indicies_strs) == len(set(indicies_strs)) + + loc = os.path.join('index_shards', key + '.txt') + with open(loc, 'w') as f: + f.write('\n'.join(indicies_strs)) + + for key in keys: + print('Executing for %s...' % key) + execute_for_key(key) + + if terminate: + cluster.close(force_shutdown=True) + + +if __name__ == '__main__': + main() From 29158049dda540aafbd2d03283aa42768fcf19e9 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:36:42 +0000 Subject: [PATCH 13/36] Move some values to const for #114. 
--- snapshot/const.py | 14 ++++++++++++++ snapshot/generate_indicies.py | 18 ++---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/snapshot/const.py b/snapshot/const.py index 619c8bbe..2e9218fa 100644 --- a/snapshot/const.py +++ b/snapshot/const.py @@ -28,3 +28,17 @@ REQUIRES_DATE_ROUND = {'date_time'} ZEROABLE_FIELDS = ['cpue_kgkm2', 'cpue_nokm2', 'weight_kg', 'count'] + +REQUIRES_FLAT = { + 'performance', + 'cruise', + 'cruisejoin', + 'hauljoin', + 'haul' +} + +IGNORE_ZEROS = { + 'species_code', + 'scientific_name', + 'common_name' +} diff --git a/snapshot/generate_indicies.py b/snapshot/generate_indicies.py index 230965c0..115ef0e5 100644 --- a/snapshot/generate_indicies.py +++ b/snapshot/generate_indicies.py @@ -23,20 +23,6 @@ USAGE_STR = 'python render_flat.py [bucket] [keys] [terminate]' NUM_ARGS = 3 -REQUIRES_FLAT = { - 'performance', - 'cruise', - 'cruisejoin', - 'hauljoin', - 'haul' -} - -IGNORE_ZEROS = { - 'species_code', - 'scientific_name', - 'common_name' -} - T = typing.TypeVar('T') @@ -360,7 +346,7 @@ def get_flat_records_allowed(records: typing.Iterable[dict], key: str) -> typing Returns: Records which can be indexed, excluding zero catch records in some circumstances. """ - if key in IGNORE_ZEROS: + if key in const.IGNORE_ZEROS: return filter(is_non_zero, records) else: return records @@ -421,7 +407,7 @@ def key_record(target: dict): """ return normalize_value(target, key) - if key in REQUIRES_FLAT: + if key in const.REQUIRES_FLAT: index_records_output = index_records.map(build_output_record) else: index_records_grouped_nest = index_records.foldby( From 6ad249b6eaa836efd1ab86753e95b3ac58e32627 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:40:25 +0000 Subject: [PATCH 14/36] Add check for presence only index. 
--- afscgap/flat_index_util.py | 3 +++ afscgap/test/test_flat_index_util.py | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index dbaad69a..0a8cf7f7 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -497,6 +497,9 @@ def make_filters(field: str, param: afscgap.param.Param, if param.get_is_ignorable(): return [] + if presence_only and field in PRESENCE_ONLY_FIELDS: + return [] + filter_type = param.get_filter_type() if filter_type == 'empty': return [] diff --git a/afscgap/test/test_flat_index_util.py b/afscgap/test/test_flat_index_util.py index b97c7ed6..902ac655 100644 --- a/afscgap/test/test_flat_index_util.py +++ b/afscgap/test/test_flat_index_util.py @@ -365,6 +365,18 @@ def test_string_false(self): filters = afscgap.flat_index_util.make_filters('common_name', param, True) self.assertEqual(len(filters), 1) self.assertFalse(filters[0].get_matches('other')) + + def test_string_true_presence_only(self): + param = afscgap.param.StrEqualsParam('test') + filters = afscgap.flat_index_util.make_filters('common_name', param, False) + self.assertEqual(len(filters), 1) + self.assertTrue(filters[0].get_matches('test')) + + def test_string_false_presence_only(self): + param = afscgap.param.StrEqualsParam('test') + filters = afscgap.flat_index_util.make_filters('common_name', param, False) + self.assertEqual(len(filters), 1) + self.assertTrue(filters[0].get_matches('other')) def test_int_true(self): param = afscgap.param.IntEqualsParam(1) From cdd9916fb8dbb159af39e0612896f2583f9f8f72 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:42:48 +0000 Subject: [PATCH 15/36] Additional fixes for #115. 
--- afscgap/flat_index_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index 0a8cf7f7..ba35a000 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -457,6 +457,7 @@ def get_matches(self, value: MATCH_TARGET) -> bool: FIELD_DATA_TYPE_OVERRIDES = {'date_time': 'datetime'} +# These fields, when indexed, ignore zero values. If not presence only, these need to be included. PRESENCE_ONLY_FIELDS = {'species_code', 'common_name', 'scientific_name'} @@ -497,7 +498,7 @@ def make_filters(field: str, param: afscgap.param.Param, if param.get_is_ignorable(): return [] - if presence_only and field in PRESENCE_ONLY_FIELDS: + if (not presence_only) and (field in PRESENCE_ONLY_FIELDS): return [] filter_type = param.get_filter_type() From d3043e5f3060a5b6a696559b16c37ca9f5d5529b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:43:35 +0000 Subject: [PATCH 16/36] Add additional explanation for #115. --- afscgap/flat_index_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index ba35a000..c0d7dc80 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -498,6 +498,8 @@ def make_filters(field: str, param: afscgap.param.Param, if param.get_is_ignorable(): return [] + # If the field index is presence only and this isn't a presence only request, the index must be + # ignored (cannot be used to pre-filter results). if (not presence_only) and (field in PRESENCE_ONLY_FIELDS): return [] From 6ca70a6ec45329758bbda95e32de9beb56515db5 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:45:14 +0000 Subject: [PATCH 17/36] Update tests for #115. 
--- afscgap/test/test_flat_index_util.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/afscgap/test/test_flat_index_util.py b/afscgap/test/test_flat_index_util.py index 902ac655..f23ed24b 100644 --- a/afscgap/test/test_flat_index_util.py +++ b/afscgap/test/test_flat_index_util.py @@ -366,17 +366,10 @@ def test_string_false(self): self.assertEqual(len(filters), 1) self.assertFalse(filters[0].get_matches('other')) - def test_string_true_presence_only(self): + def test_presence_only(self): param = afscgap.param.StrEqualsParam('test') filters = afscgap.flat_index_util.make_filters('common_name', param, False) - self.assertEqual(len(filters), 1) - self.assertTrue(filters[0].get_matches('test')) - - def test_string_false_presence_only(self): - param = afscgap.param.StrEqualsParam('test') - filters = afscgap.flat_index_util.make_filters('common_name', param, False) - self.assertEqual(len(filters), 1) - self.assertTrue(filters[0].get_matches('other')) + self.assertEqual(len(filters), 0) def test_int_true(self): param = afscgap.param.IntEqualsParam(1) From f540475726651c48b0d657d3c1b5645e0ba707fe Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 00:23:33 +0000 Subject: [PATCH 18/36] Update coverage targets. --- CONTRIBUTING.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fa559f8..09214712 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,9 +14,9 @@ Thank you for your contribution. We appreciate the community's help in any capac In order to ensure the conceptual integrity and readability of our code, we have a few guidelines for Python code under the `afscgap` library itself: - Please try to follow the conventions laid out by the project in existing code. In cases of ambiguity, please refer to the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) where possible. 
- - Tests are encouraged and we aim for 80% coverage where feasible. - - Type hints are encouraged and we aim for 80% coverage where feasible. - - Docstrings are encouraged and we aim for 80% coverage. Please use the [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) to ensure that our automated documentation system can use your work. + - Tests are encouraged. + - Type hints are encouraged. + - Docstrings are encouraged. Please use the [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) to ensure that our automated documentation system can use your work. - Please check that you have no mypy errors when contributing. - Please check that you have no linting (pycodestyle, pyflakes) errors when contributing. - As contributors may be periodic, please do not re-write history / squash commits for ease of fast forward. @@ -27,6 +27,8 @@ The `afscgap` library itself requires a very high rigor. For other sections incl Of course, **do not worry if you aren't sure that you met all of our the guidelines!** We encourage pull requests and are happy to work through any necessary outstanding tasks with you. +Previous versions of this guide indicated specific coverage targets but those are removed for the `2.x` release as the codebase spans more modalities where different approaches may be more appropriate in different areas. +

From 0539c913bd55a2fdd8d308520c291c3550716a1 Mon Sep 17 00:00:00 2001
From: A Samuel Pottinger
Date: Sat, 4 Jan 2025 00:41:59 +0000
Subject: [PATCH 19/36] Expand tests for ignorable re #111.

---
 afscgap/flat_index_util.py | 41 +++++++++++++++++++++-----
 afscgap/test/test_flat_index_util.py | 44 ++++++++++++++++++++++++++++
 snapshot/test_generate_indicies.py | 13 ++++++++
 3 files changed, 90 insertions(+), 8 deletions(-)
 create mode 100644 snapshot/test_generate_indicies.py

diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py
index c0d7dc80..2972616e 100644
--- a/afscgap/flat_index_util.py
+++ b/afscgap/flat_index_util.py
@@ -480,6 +480,38 @@ def decorate_filter(field: str, original: IndexFilter) -> IndexFilter:
     return UnitConversionIndexFilter(original, user_units, system_units)
 
 
+def determine_if_ignorable(field: str, param: afscgap.param.Param, presence_only: bool) -> bool:
+    """Determine if a field parameter is ignored for pre-filtering.
+
+    Determine if a field parameter is ignored for pre-filtering, turning it into a noop because
+    pre-filtering isn't possible or precomputed indices are not available.
+
+    Args:
+        field: The name of the field for which filters should be made.
+        param: The parameter to apply for the field.
+        presence_only: Flag indicating if the query is for presence so zero inference records can be
+            excluded.
+
+    Returns:
+        True if the parameter should be ignored for pre-filtering and false otherwise.
+    """
+    if param.get_is_ignorable():
+        return True
+
+    # If the field index is presence only and this isn't a presence only request, the index must be
+    # ignored (cannot be used to pre-filter results).
+ zero_inference_required = not presence_only + field_index_excludes_zeros = field in PRESENCE_ONLY_FIELDS + if zero_inference_required and field_index_excludes_zeros: + return True + + filter_type = param.get_filter_type() + if filter_type == 'empty': + return True + + return False + + def make_filters(field: str, param: afscgap.param.Param, presence_only: bool) -> typing.Iterable[IndexFilter]: """Make filters for a field describing a backend-agnostic parameter. @@ -495,17 +527,10 @@ def make_filters(field: str, param: afscgap.param.Param, be approximated such that all matching results are included in results but some results may included may not match, requiring re-evaluation locally. """ - if param.get_is_ignorable(): - return [] - - # If the field index is presence only and this isn't a presence only request, the index must be - # ignored (cannot be used to pre-filter results). - if (not presence_only) and (field in PRESENCE_ONLY_FIELDS): + if determine_if_ignorable(field, param, presence_only): return [] filter_type = param.get_filter_type() - if filter_type == 'empty': - return [] if field in FIELD_DATA_TYPE_OVERRIDES: data_type = FIELD_DATA_TYPE_OVERRIDES[field] diff --git a/afscgap/test/test_flat_index_util.py b/afscgap/test/test_flat_index_util.py index f23ed24b..3062e0ce 100644 --- a/afscgap/test/test_flat_index_util.py +++ b/afscgap/test/test_flat_index_util.py @@ -347,6 +347,50 @@ def test_decorate_filter_active_none(self): self.assertFalse(decorated.get_matches(None)) +class DetermineIfIgnorableTests(unittest.TestCase): + + def test_explicit_ignorable_require_zero(self): + param = self._make_test_param(True, 'int') + ignorable = afscgap.flat_index_util.determine_if_ignorable('test', param, False) + self.assertTrue(ignorable) + + def test_explicit_ignorable_presence_only(self): + param = self._make_test_param(True, 'int') + ignorable = afscgap.flat_index_util.determine_if_ignorable('test', param, True) + self.assertTrue(ignorable) + + def 
test_require_zero_supported(self): + param = self._make_test_param(False, 'int') + ignorable = afscgap.flat_index_util.determine_if_ignorable('count', param, False) + self.assertFalse(ignorable) + + def test_require_zero_unsupported(self): + param = self._make_test_param(False, 'str') + ignorable = afscgap.flat_index_util.determine_if_ignorable('species_code', param, False) + self.assertTrue(ignorable) + + def test_presence_only_unsupported(self): + param = self._make_test_param(False, 'str') + ignorable = afscgap.flat_index_util.determine_if_ignorable('species_code', param, True) + self.assertFalse(ignorable) + + def test_empty(self): + param = afscgap.param.EmptyParam() + ignorable = afscgap.flat_index_util.determine_if_ignorable('count', param, True) + self.assertTrue(ignorable) + + def test_plain_not_ignorable(self): + param = afscgap.param.IntRangeParam(1, None) + ignorable = afscgap.flat_index_util.determine_if_ignorable('count', param, True) + self.assertFalse(ignorable) + + def _make_test_param(self, ignorable, filter_type): + param = unittest.mock.MagicMock() + param.get_is_ignorable = unittest.mock.MagicMock(return_value=ignorable) + param.get_filter_type = unittest.mock.MagicMock(return_value=filter_type) + return param + + class MakeFilterTests(unittest.TestCase): def test_empty(self): diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py new file mode 100644 index 00000000..5271fb01 --- /dev/null +++ b/snapshot/test_generate_indicies.py @@ -0,0 +1,13 @@ +""" +Tests for generating sharded indicies. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import unittest +import unittest.mock + +import generate_indicies From 8a408200da4af1d9fff1062021c8e0104bbc42cc Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 01:13:18 +0000 Subject: [PATCH 20/36] Add tests for generate_indicies. --- snapshot/generate_indicies.py | 4 +- snapshot/test_combine_shards.py | 2 +- snapshot/test_generate_indicies.py | 187 +++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 2 deletions(-) diff --git a/snapshot/generate_indicies.py b/snapshot/generate_indicies.py index 115ef0e5..908cd630 100644 --- a/snapshot/generate_indicies.py +++ b/snapshot/generate_indicies.py @@ -332,7 +332,9 @@ def combine_records(a: dict, b: dict) -> dict: Returns: New index record indicating the cobination of the two records. """ - assert a['value'] == b['value'] + if a['value'] != b['value']: + raise RuntimeError('Tried combining keys for incompatible values.') + return {'value': a['value'], 'keys': a['keys'].union(b['keys'])} diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index 4aee0c86..f165bed4 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -13,7 +13,7 @@ import combine_shards -class NormUtilTests(unittest.TestCase): +class NormTests(unittest.TestCase): def test_unchanged(self): normalized = combine_shards.normalize_record('test attr', {'value': 'test val'}) diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 5271fb01..53ace4ad 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -10,4 +10,191 @@ import unittest import unittest.mock +import const import generate_indicies + + +class BuildIndexRecordTests(unittest.TestCase): + + def setUp(self): + self._record = {'testkey': 'testvalue'} + + def test_build_index_record_value(self): + record = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + self.assertEqual(record['value'], 'testvalue') + + 
def test_build_index_record_same(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertEqual(key_1, key_2) + + def test_build_index_record_different_year(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2026, 'GOA', 123) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertNotEqual(key_1, key_2) + + def test_build_index_record_different_survey(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'Other', 123) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertNotEqual(key_1, key_2) + + def test_build_index_record_different_haul(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 124) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertNotEqual(key_1, key_2) + + +class NormTests(unittest.TestCase): + + def test_unchanged(self): + normalized = generate_indicies.normalize_value( + {'value': 'test val'}, + 'test attr' + ) + self.assertEqual(normalized, 'test val') + + def test_none(self): + normalized = generate_indicies.normalize_value( + {'value': None}, + 'depth_m' + ) + self.assertEqual(normalized, None) + + def test_changed(self): + normalized = generate_indicies.normalize_value( + {'value': 1.236}, + 'depth_m' + ) + self.assertEqual(normalized, '1.24') + + def test_rounded_float_same(self): + normalized_1 = generate_indicies.normalize_value( + {'value': 1.236}, + 
'depth_m' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': 1.237}, + 'depth_m' + ) + self.assertEqual(normalized_1, normalized_2) + + def test_rounded_float_different(self): + normalized_1 = generate_indicies.normalize_value( + {'value': 1.234}, + 'depth_m' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': 1.236}, + 'depth_m' + ) + self.assertNotEqual(normalized_1, normalized_2) + + def test_rounded_datetime_same(self): + normalized_1 = generate_indicies.normalize_value( + {'value': '2025-12-31T13:25:50Z'}, + 'date_time' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': '2025-12-31T14:25:50Z'}, + 'date_time' + ) + self.assertEqual(normalized_1, normalized_2) + + def test_rounded_datetime_different(self): + normalized_1 = generate_indicies.normalize_value( + {'value': '2025-12-31T13:25:50Z'}, + 'date_time' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': '2025-12-30T14:25:50Z'}, + 'date_time' + ) + self.assertNotEqual(normalized_1, normalized_2) + + +class IsNonZeroTests(unittest.TestCase): + + def setUp(self): + self._target = {} + for field in const.ZEROABLE_FIELDS: + self._target[field] = 123 + + def test_is_non_zero_not_zeroable_zero(self): + self._target['other'] = 0 + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_not_zeroable_none(self): + self._target['other'] = None + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_zero_partial(self): + self._target['count'] = 0 + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_none_partial(self): + self._target['count'] = None + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_zero_all(self): + for field in const.ZEROABLE_FIELDS: + self._target[field] = 0 + + self.assertFalse(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_none_all(self): + for 
field in const.ZEROABLE_FIELDS: + self._target[field] = None + + self.assertFalse(generate_indicies.is_non_zero(self._target)) + + +class BuildOutputRecordTests(unittest.TestCase): + + def setUp(self): + target = {'value': 'test value', 'keys': ['2025\tGOA\t123', '2025\tGOA\t124']} + self._output_record = generate_indicies.build_output_record(target) + + def test_build_output_record(self): + self.assertEqual(self._output_record['value'], 'test value') + + def test_build_key_meta(self): + key = self._output_record['keys'][0] + self.assertEqual(key['year'], 2025) + self.assertEqual(key['survey'], 'GOA') + self.assertEqual(key['haul'], 124) + + def test_build_key_meta(self): + key_1 = self._output_record['keys'][0] + self.assertEqual(key_1['haul'], 123) + + key_2 = self._output_record['keys'][1] + self.assertEqual(key_2['haul'], 124) + + +class CombineTests(unittest.TestCase): + + def setUp(self): + self._base = {'value': 1, 'keys': {'a'}} + self._compatible = {'value': 1, 'keys': {'b'}} + self._incompatible = {'value': 2, 'keys': {'c'}} + + def test_combine_compatible(self): + combined = generate_indicies.combine_records(self._base, self._compatible) + self.assertEqual(combined['value'], 1) + + keys = combined['keys'] + self.assertEqual(len(keys), 2) + self.assertTrue('a' in keys) + self.assertTrue('b' in keys) + + def test_combine_incompatible(self): + with self.assertRaises(RuntimeError): + combined = generate_indicies.combine_records(self._base, self._incompatible) From 131c09e1f3636bee828810a499704c9691a66a7b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 01:17:09 +0000 Subject: [PATCH 21/36] Add additional bash scripts for #114. 
--- snapshot/get_all_years.sh | 90 +++++++++++++++++++++++++++++++++++++++ snapshot/index_data.sh | 77 +++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 snapshot/get_all_years.sh create mode 100644 snapshot/index_data.sh diff --git a/snapshot/get_all_years.sh b/snapshot/get_all_years.sh new file mode 100644 index 00000000..d278d15d --- /dev/null +++ b/snapshot/get_all_years.sh @@ -0,0 +1,90 @@ +echo "-- Getting species --" +python request_source.py species $BUCKET_NAME species +echo "-- Getting catch --" +python request_source.py catch $BUCKET_NAME catch +echo "-- Getting 1982 --" +python request_source.py haul $BUCKET_NAME haul 1982 +echo "-- Getting 1983 --" +python request_source.py haul $BUCKET_NAME haul 1983 +echo "-- Getting 1984 --" +python request_source.py haul $BUCKET_NAME haul 1984 +echo "-- Getting 1985 --" +python request_source.py haul $BUCKET_NAME haul 1985 +echo "-- Getting 1986 --" +python request_source.py haul $BUCKET_NAME haul 1986 +echo "-- Getting 1987 --" +python request_source.py haul $BUCKET_NAME haul 1987 +echo "-- Getting 1988 --" +python request_source.py haul $BUCKET_NAME haul 1988 +echo "-- Getting 1989 --" +python request_source.py haul $BUCKET_NAME haul 1989 +echo "-- Getting 1990 --" +python request_source.py haul $BUCKET_NAME haul 1990 +echo "-- Getting 1991 --" +python request_source.py haul $BUCKET_NAME haul 1991 +echo "-- Getting 1992 --" +python request_source.py haul $BUCKET_NAME haul 1992 +echo "-- Getting 1993 --" +python request_source.py haul $BUCKET_NAME haul 1993 +echo "-- Getting 1994 --" +python request_source.py haul $BUCKET_NAME haul 1994 +echo "-- Getting 1995 --" +python request_source.py haul $BUCKET_NAME haul 1995 +echo "-- Getting 1996 --" +python request_source.py haul $BUCKET_NAME haul 1996 +echo "-- Getting 1997 --" +python request_source.py haul $BUCKET_NAME haul 1997 +echo "-- Getting 1998 --" +python request_source.py haul $BUCKET_NAME haul 1998 +echo "-- Getting 1999 
--" +python request_source.py haul $BUCKET_NAME haul 1999 +echo "-- Getting 2000 --" +python request_source.py haul $BUCKET_NAME haul 2000 +echo "-- Getting 2001 --" +python request_source.py haul $BUCKET_NAME haul 2001 +echo "-- Getting 2002 --" +python request_source.py haul $BUCKET_NAME haul 2002 +echo "-- Getting 2003 --" +python request_source.py haul $BUCKET_NAME haul 2003 +echo "-- Getting 2004 --" +python request_source.py haul $BUCKET_NAME haul 2004 +echo "-- Getting 2005 --" +python request_source.py haul $BUCKET_NAME haul 2005 +echo "-- Getting 2006 --" +python request_source.py haul $BUCKET_NAME haul 2006 +echo "-- Getting 2007 --" +python request_source.py haul $BUCKET_NAME haul 2007 +echo "-- Getting 2008 --" +python request_source.py haul $BUCKET_NAME haul 2008 +echo "-- Getting 2009 --" +python request_source.py haul $BUCKET_NAME haul 2009 +echo "-- Getting 2010 --" +python request_source.py haul $BUCKET_NAME haul 2010 +echo "-- Getting 2011 --" +python request_source.py haul $BUCKET_NAME haul 2011 +echo "-- Getting 2012 --" +python request_source.py haul $BUCKET_NAME haul 2012 +echo "-- Getting 2013 --" +python request_source.py haul $BUCKET_NAME haul 2013 +echo "-- Getting 2014 --" +python request_source.py haul $BUCKET_NAME haul 2014 +echo "-- Getting 2015 --" +python request_source.py haul $BUCKET_NAME haul 2015 +echo "-- Getting 2016 --" +python request_source.py haul $BUCKET_NAME haul 2016 +echo "-- Getting 2017 --" +python request_source.py haul $BUCKET_NAME haul 2017 +echo "-- Getting 2018 --" +python request_source.py haul $BUCKET_NAME haul 2018 +echo "-- Getting 2019 --" +python request_source.py haul $BUCKET_NAME haul 2019 +echo "-- Getting 2020 --" +python request_source.py haul $BUCKET_NAME haul 2020 +echo "-- Getting 2021 --" +python request_source.py haul $BUCKET_NAME haul 2021 +echo "-- Getting 2022 --" +python request_source.py haul $BUCKET_NAME haul 2022 +echo "-- Getting 2023 --" +python request_source.py haul $BUCKET_NAME haul 
2023 +echo "-- Getting 2024 --" +python request_source.py haul $BUCKET_NAME haul 2024 diff --git a/snapshot/index_data.sh b/snapshot/index_data.sh new file mode 100644 index 00000000..7bc72fa4 --- /dev/null +++ b/snapshot/index_data.sh @@ -0,0 +1,77 @@ +mkdir index_shards +echo "area_swept_km2" +python generate_indicies.py $BUCKET_NAME area_swept_km2 n +echo "bottom_temperature_c" +python generate_indicies.py $BUCKET_NAME bottom_temperature_c n +echo "common_name" +python generate_indicies.py $BUCKET_NAME common_name n +echo "count" +python generate_indicies.py $BUCKET_NAME count n +echo "cpue_kgkm2" +python generate_indicies.py $BUCKET_NAME cpue_kgkm2 n +echo "cpue_nokm2" +python generate_indicies.py $BUCKET_NAME cpue_nokm2 n +echo "cruise" +python generate_indicies.py $BUCKET_NAME cruise n +echo "cruisejoin" +python generate_indicies.py $BUCKET_NAME cruisejoin n +echo "date_time" +python generate_indicies.py $BUCKET_NAME date_time n +echo "depth_m" +python generate_indicies.py $BUCKET_NAME depth_m n +echo "distance_fished_km" +python generate_indicies.py $BUCKET_NAME distance_fished_km n +echo "duration_hr" +python generate_indicies.py $BUCKET_NAME duration_hr n +echo "haul" +python generate_indicies.py $BUCKET_NAME haul n +echo "hauljoin" +python generate_indicies.py $BUCKET_NAME hauljoin n +echo "id_rank" +python generate_indicies.py $BUCKET_NAME id_rank n +echo "latitude_dd_end" +python generate_indicies.py $BUCKET_NAME latitude_dd_end n +echo "latitude_dd_start" +python generate_indicies.py $BUCKET_NAME latitude_dd_start n +echo "longitude_dd_end" +python generate_indicies.py $BUCKET_NAME longitude_dd_end n +echo "longitude_dd_start" +python generate_indicies.py $BUCKET_NAME longitude_dd_start n +echo "net_height_m" +python generate_indicies.py $BUCKET_NAME net_height_m n +echo "net_width_m" +python generate_indicies.py $BUCKET_NAME net_width_m n +echo "performance" +python generate_indicies.py $BUCKET_NAME performance n +echo "requirements" +python 
generate_indicies.py $BUCKET_NAME requirements n +echo "scientific_name" +python generate_indicies.py $BUCKET_NAME scientific_name n +echo "species_code" +python generate_indicies.py $BUCKET_NAME species_code n +echo "srvy" +python generate_indicies.py $BUCKET_NAME srvy n +echo "station" +python generate_indicies.py $BUCKET_NAME station n +echo "stratum" +python generate_indicies.py $BUCKET_NAME stratum n +echo "surface_temperature_c" +python generate_indicies.py $BUCKET_NAME surface_temperature_c n +echo "survey" +python generate_indicies.py $BUCKET_NAME survey n +echo "survey_definition_id" +python generate_indicies.py $BUCKET_NAME survey_definition_id n +echo "survey_name" +python generate_indicies.py $BUCKET_NAME survey_name n +echo "taxon_confidence" +python generate_indicies.py $BUCKET_NAME taxon_confidence n +#echo "variables" +#python generate_indicies.py $BUCKET_NAME variables n +echo "vessel_id" +python generate_indicies.py $BUCKET_NAME vessel_id n +echo "vessel_name" +python generate_indicies.py $BUCKET_NAME vessel_name n +echo "weight_kg" +python generate_indicies.py $BUCKET_NAME weight_kg n +echo "year" +python generate_indicies.py $BUCKET_NAME year y From b8f8ed5dda0e60324f169cc88eab09037ad9a789 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 01:19:44 +0000 Subject: [PATCH 22/36] Type fixes for tests in #114. 
--- snapshot/test_generate_indicies.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 53ace4ad..0d1cd7af 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -132,7 +132,7 @@ def test_is_non_zero_not_zeroable_zero(self): self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_not_zeroable_none(self): - self._target['other'] = None + self._target['other'] = None # type: ignore self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_zeroable_zero_partial(self): @@ -140,7 +140,7 @@ def test_is_non_zero_zeroable_zero_partial(self): self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_zeroable_none_partial(self): - self._target['count'] = None + self._target['count'] = None # type: ignore self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_zeroable_zero_all(self): @@ -151,7 +151,7 @@ def test_is_non_zero_zeroable_zero_all(self): def test_is_non_zero_zeroable_none_all(self): for field in const.ZEROABLE_FIELDS: - self._target[field] = None + self._target[field] = None # type: ignore self.assertFalse(generate_indicies.is_non_zero(self._target)) From e51a38c4dcc6b81eb6efd772ec6ba6bd3854150b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:10:18 +0000 Subject: [PATCH 23/36] Partial implementation of render_flat. --- snapshot/render_flat.py | 574 +++++++++++++++++++++++++++++ snapshot/test_generate_indicies.py | 4 +- 2 files changed, 576 insertions(+), 2 deletions(-) create mode 100644 snapshot/render_flat.py diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py new file mode 100644 index 00000000..5a82fc7c --- /dev/null +++ b/snapshot/render_flat.py @@ -0,0 +1,574 @@ +""" +Script to build joined flat Avro files. 
+
+Script to build joined flat Avro files by joining across the species list, the hauls dataset, and
+the catch dataset. Catches and species without haul matches will be excluded.
+
+(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center
+for Data Science and the Environment at UC Berkeley.
+
+This file is part of afscgap released under the BSD 3-Clause License. See
+LICENSE.md.
+"""
+import copy
+import csv
+import io
+import itertools
+import functools
+import os
+import sys
+import typing
+
+import boto3
+import coiled
+import fastavro
+
+USAGE_STR = 'python render_flat.py [bucket] [filenames]'
+NUM_ARGS = 2
+
+
+OBSERVATION_SCHEMA = {
+    'doc': 'Description of an observation joined across haul, catch, species.',
+    'name': 'Observation',
+    'namespace': 'edu.dse.afscgap',
+    'type': 'record',
+    'fields': [
+        {'name': 'year', 'type': ['int', 'null']},
+        {'name': 'srvy', 'type': ['string', 'null']},
+        {'name': 'survey', 'type': ['string', 'null']},
+        {'name': 'survey_name', 'type': ['string', 'null']},
+        {'name': 'survey_definition_id', 'type': ['long', 'null']},
+        {'name': 'cruise', 'type': ['long', 'null']},
+        {'name': 'cruisejoin', 'type': ['long', 'null']},
+        {'name': 'hauljoin', 'type': ['long', 'null']},
+        {'name': 'haul', 'type': ['long', 'null']},
+        {'name': 'stratum', 'type': ['long', 'null']},
+        {'name': 'station', 'type': ['string', 'null']},
+        {'name': 'vessel_id', 'type': ['long', 'null']},
+        {'name': 'vessel_name', 'type': ['string', 'null']},
+        {'name': 'date_time', 'type': ['string', 'null']},
+        {'name': 'latitude_dd_start', 'type': ['double', 'null']},
+        {'name': 'longitude_dd_start', 'type': ['double', 'null']},
+        {'name': 'latitude_dd_end', 'type': ['double', 'null']},
+        {'name': 'longitude_dd_end', 'type': ['double', 'null']},
+        {'name': 'bottom_temperature_c', 'type': ['double', 'null']},
+        {'name': 'surface_temperature_c', 'type': ['double', 'null']},
+        {'name': 'depth_m', 'type': ['double', 'null']},
+        {'name': 'distance_fished_km', 'type': 
['double', 'null']}, + {'name': 'duration_hr', 'type': ['double', 'null']}, + {'name': 'net_width_m', 'type': ['double', 'null']}, + {'name': 'net_height_m', 'type': ['double', 'null']}, + {'name': 'area_swept_km2', 'type': ['double', 'null']}, + {'name': 'performance', 'type': ['float', 'null']}, + {'name': 'species_code', 'type': ['long', 'null']}, + {'name': 'cpue_kgkm2', 'type': ['double', 'null']}, + {'name': 'cpue_nokm2', 'type': ['double', 'null']}, + {'name': 'count', 'type': ['long', 'null']}, + {'name': 'weight_kg', 'type': ['double', 'null']}, + {'name': 'taxon_confidence', 'type': ['string', 'null']}, + {'name': 'scientific_name', 'type': ['string', 'null']}, + {'name': 'common_name', 'type': ['string', 'null']}, + {'name': 'id_rank', 'type': ['string', 'null']}, + {'name': 'worms', 'type': ['long', 'null']}, + {'name': 'itis', 'type': ['long', 'null']}, + {'name': 'complete', 'type': ['boolean', 'null']} + ] +} + +SPECIES_DICT = typing.Dict[str, dict] + + +def make_zero_record(species: dict, haul_record: dict) -> dict: + """Make a zero catch record meaning that a species was not found. + + Args: + species: Information about the species not found. + haul_record: Informatino about the haul for which no specimens were found. + + Returns: + Complete output record indicating the given species not found for the given haul. 
+    """
+    haul_copy = copy.deepcopy(haul_record)
+    haul_copy['species_code'] = species['species_code']
+    haul_copy['cpue_kgkm2'] = 0
+    haul_copy['cpue_nokm2'] = 0
+    haul_copy['count'] = 0
+    haul_copy['weight_kg'] = 0
+    haul_copy['taxon_confidence'] = None
+    haul_copy['scientific_name'] = species['scientific_name']
+    haul_copy['common_name'] = species['common_name']
+    haul_copy['id_rank'] = species['id_rank']
+    haul_copy['worms'] = species['worms']
+    haul_copy['itis'] = species['itis']
+    haul_copy['complete'] = True
+    return haul_copy
+
+
+def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT) -> dict:
+    """Add information about a species found within a catch.
+
+    Args:
+        target: Record describing a catch within haul.
+        species_by_code: Dictionary mapping from species code found in a catch to information about
+            that species.
+
+    Returns:
+        Catch record with species information added.
+    """
+    species_code = target['species_code']
+
+    if species_code not in species_by_code:
+        target['complete'] = False
+        return target
+
+    species_record = species_by_code[species_code]
+    target.update(species_record)
+    return target
+
+
+def make_get_avro(bucket: str, s3_client) -> typing.Callable[[str], typing.List[dict]]:
+    """Build a function which gets a file from a bucket using the given S3 client.
+
+    Args:
+        bucket: The name of the bucket where files should be found.
+        s3_client: The S3 client to use in getting those files.
+
+    Returns:
+        Function which takes a full path and returns a list of parsed dictionaries from the Avro
+        file at that path.
+    """
+
+    def get_avro(full_loc: str) -> typing.List[dict]:
+        """Get all records from an Avro file.
+
+        Args:
+            full_loc: The full path to the avro file to be read.
+
+        Returns:
+            All records within that Avro file parsed as dictionaries.
+        """
+        target_buffer = io.BytesIO()
+        s3_client.download_fileobj(bucket, full_loc, target_buffer)
+        target_buffer.seek(0)
+        return list(fastavro.reader(target_buffer))
+
+    return get_avro
+
+
+def append_catch_haul(catch_record: dict, haul_record: dict) -> dict:
+    """Combine information between a catch record and a haul record.
+
+    Args:
+        catch_record: The catch record to combine with haul information.
+        haul_record: The haul information for the given catch record.
+
+    Returns:
+        Combined catch and haul information.
+    """
+    catch_record.update(haul_record)
+    return catch_record
+
+
+def complete_record(target: dict) -> dict:
+    """Fill in any missing fields with None to fit edu.dse.afscgap.Observation Avro format.
+
+    Args:
+        target: Record to finish. This record may or may not be modified in place.
+
+    Returns:
+        Record with any missing fields set to None.
+    """
+    keys = map(lambda x: x['name'], OBSERVATION_SCHEMA['fields'])
+    keys_realized = list(keys)
+    values = map(lambda x: target.get(x, None), keys_realized)
+    return dict(zip(keys_realized, values))
+
+
+def mark_incomplete(target: dict) -> dict:
+    """Mark a record as incomplete.
+
+    Args:
+        target: Record on which the complete attribute should be changed. This may or may not be
+            modified in-place.
+
+    Returns:
+        Record with the complete flag updated.
+    """
+    target['complete'] = False
+    return target
+
+
+def mark_complete(target: dict) -> dict:
+    """Mark a record as complete.
+
+    Args:
+        target: Record on which the complete attribute should be changed. This may or may not be
+            modified in-place.
+
+    Returns:
+        Record with the complete flag updated.
+    """
+    target['complete'] = True
+    return target
+
+
+def combine_catch_and_haul(haul_record: dict,
+        catch_records: typing.Optional[typing.List[dict]]) -> typing.Iterable[dict]:
+    """Combine catch information with information about the haul in which that catch happened.
+
+    Args:
+        haul_record: Information about the haul in which the catch took place.
+ catch_records: The catch records to be joined with haul information. + + Returns: + Updated catch records or, if no catch records provided, a single record with haul + information marked incomplete. + """ + if catch_records is None: + catch_records_out = map(mark_incomplete, [haul_record]) + else: + catch_no_species = map( + lambda x: append_catch_haul(x, haul_record), + catch_records + ) + catch_with_species = map(append_species, catch_no_species) + catch_records_out = map(mark_complete, catch_with_species) + + return catch_records_out + + +def make_zero_catch_records(catch_records_out_realized: typing.List[dict], + species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: + """Generate zero catch records for species not found in catches for a haul. + + Args: + catch_records_out_realized: All catch records for a haul. + species_by_code: Mapping from species code to information about the species such that all + formally tracked species are in this dictionary. + + Returns: + Inferred zero catch records. + """ + species_codes_found = set(map( + lambda x: x.get('species_code', None), + catch_records_out_realized + )) + species_codes_all = set(species_by_code.keys()) + speices_codes_missing = species_codes_all - species_codes_found + speices_missing = map(lambda x: species_by_code[x], speices_codes_missing) + catch_records_zero = map( + lambda x: make_zero_record(x, haul_record), + speices_missing + ) + return catch_records_zero + + +def get_url_for_catches_in_haul(haul: int) -> str: + """Get the URL where the catches associated with a haul may be found. + + Args: + haul: The ID of the desired haul. + + Returns: + Path to where the catches associated with the given haul may be found if there is any data + available. + """ + return 'catch/%d.avro' % haul + + +def get_meta_url_for_haul(year: int, survey: str, haul: int) -> str: + """Get the URL for a haul's metadata given the haul location. + + Args: + year: The year in which the haul took place like 2025. 
+        survey: The survey name like "Gulf of Alaska" that the haul was part of.
+        haul: The haul ID for the desired haul.
+
+    Returns:
+        String path where the Avro file with haul metadata is expected.
+    """
+    template_vals = (year, survey, haul)
+    return 'haul/%d_%s_%d.avro' % template_vals
+
+
+def process_haul(bucket: str, year: int, survey: str, haul: int,
+        species_by_code: SPECIES_DICT) -> dict:
+    """Distributed task to process a single haul.
+
+    Distributed task to process a single haul, joining across species and catch datasets for that
+    haul and writing out the joined file to S3.
+
+    Args:
+        bucket: The name of the bucket where catch, haul, and species information can be found.
+        year: The year of the haul to be processed.
+        survey: The survey name like "Gulf of Alaska" in which the haul to be processed was found.
+        haul: The haul ID to be processed.
+        species_by_code: Information about all species formally tracked indexed by species code.
+
+    Returns:
+        Diagnostic information about the file written.
+    """
+
+    import copy
+    import io
+    import os
+
+    import botocore
+    import boto3
+    import fastavro
+
+    access_key = os.environ['AWS_ACCESS_KEY']
+    access_secret = os.environ['AWS_ACCESS_SECRET']
+
+    s3_client = boto3.client(
+        's3',
+        aws_access_key_id=access_key,
+        aws_secret_access_key=access_secret
+    )
+
+    get_avro = make_get_avro(bucket, s3_client)
+
+    def check_file_exists(full_loc: str) -> bool:
+        """Check that a file exists in S3.
+
+        Args:
+            full_loc: The location (path) within the S3 bucket.
+
+        Returns:
+            True if the file is found and false otherwise.
+        """
+        try:
+            s3_client.head_object(Bucket=bucket, Key=full_loc)
+            return True
+        except botocore.exceptions.ClientError as e:
+            error_code = e.response['Error']['Code']
+            error_code_cast = int(error_code)
+            if error_code_cast == 404:
+                return False
+            else:
+                raise RuntimeError('Unexpected S3 head code: %d' % error_code_cast)
+
+    def append_species(target: dict) -> dict:
+        """Add information about the species found in a catch.
+
+        Args:
+            target: Catch information to which species information should be added. This may or
+                may not be modified in-place.
+
+        Returns:
+            Record with species information added.
+        """
+        return append_species_from_species_list(target, species_by_code)
+
+    def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO:
+        """Convert an iterable of dictionaries to Avro bytes.
+
+        Args:
+            records: The dictionaries to convert.
+
+        Returns:
+            Bytes with Avro payload.
+        """
+        records_complete = map(complete_record, records)
+        target_buffer = io.BytesIO()
+        fastavro.writer(target_buffer, OBSERVATION_SCHEMA, records_complete)
+        target_buffer.seek(0)
+        return target_buffer
+
+    def get_haul_record(year: int, survey: str, haul: int) -> typing.Optional[dict]:
+        """Get the record for a haul given the haul location.
+
+        Args:
+            year: The year in which the haul took place like 2025.
+            survey: The survey name like "Gulf of Alaska" that the haul was part of.
+            haul: The haul ID for the desired haul.
+
+        Returns:
+            Dictionary record describing the haul or None if the haul was not found.
+        """
+        haul_loc = get_meta_url_for_haul(year, survey, haul)
+
+        if not check_file_exists(haul_loc):
+            return None
+
+        haul_records = get_avro(haul_loc)
+        assert len(haul_records) == 1
+        haul_record = haul_records[0]
+        return haul_record
+
+    def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]:
+        """Get the catch records associated with a haul.
+
+        Args:
+            haul: The ID of the haul for which catch records should be returned.
+ + Returns: + All catch records associated with a haul. Either None or empty list if no data could be + found. + """ + catch_loc = get_url_for_catches_in_haul(haul) + if check_file_exists(catch_loc): + return get_avro(catch_loc) + else: + return None + + haul_record = get_haul_record(haul_loc) + + if haul_record is None: + return { + 'complete': 0, + 'incomplete': 0, + 'zero': 0, + 'path': None + } + + catch_records = list(get_catch_records(haul)) + catch_records_out = combine_catch_and_haul(haul_reord, catch_records) + catch_records_out_realized = list(catch_records_out) + catch_records_zero = make_zero_catch_records(catch_records_out_realized, species_by_code) + + # Combine regular records with zero catch inferred records + catch_records_all = itertools.chain( + catch_records_out_realized, + catch_records_zero + ) + + # Upload to S3 + catch_with_species_avro = convert_to_avro(catch_records_all) + output_loc = 'joined/%d_%s_%d.avro' % template_vals + s3_client.upload_fileobj(catch_with_species_avro, bucket, output_loc) + + # Write out diagnostic information + outputs_dicts = map( + lambda x: { + 'complete': 1 if x['complete'] else 0, + 'incomplete': 0 if x['complete'] else 1, + 'zero': 1 if x.get('count', 0) == 0 else 0 + }, + catch_records_out_realized + ) + output_dict = functools.reduce( + lambda a, b: { + 'complete': a['complete'] + b['complete'], + 'incomplete': a['incomplete'] + b['incomplete'], + 'zero': a['zero'] + b['zero'] + }, + outputs_dicts + ) + output_dict['loc'] = output_loc + return output_dict + + +def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: + """Get metadata for all available hauls. + + Args: + bucket: The bucket where hauls inforamtion can be found. + + Returns: + Iterable over hauls metadata required to find that haul within S3. 
+ """ + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def make_haul_metadata_record(path): + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='haul/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + return map(make_haul_metadata_record, keys) + + +def get_all_species(bucket): + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + get_avro = make_get_avro(bucket, s3_client) + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='species/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + records_nest = map(get_avro, keys) + records_flat = itertools.chain(*records_nest) + records_tuples = map(lambda x: (x['species_code'], x), records_flat) + return dict(records_tuples) + + +def main(): + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + file_paths_loc = sys.argv[2] + hauls_meta = get_hauls_meta(bucket) + + cluster = coiled.Cluster( + name='DseProcessAfscgap', + n_workers=10, + worker_vm_types=['m7a.medium'], + scheduler_vm_types=['m7a.medium'], + environ={ + 
'AWS_ACCESS_KEY': os.environ.get('AWS_ACCESS_KEY', ''), + 'AWS_ACCESS_SECRET': os.environ.get('AWS_ACCESS_SECRET', ''), + 'SOURCE_DATA_LOC': os.environ.get('SOURCE_DATA_LOC', '') + } + ) + cluster.adapt(minimum=10, maximum=500) + client = cluster.get_client() + + hauls_meta_realized = list(hauls_meta) + species_by_code = get_all_species(bucket) + + written_paths_future = client.map( + lambda x: process_haul( + bucket, + x['year'], + x['survey'], + x['haul'], + species_by_code + ), + hauls_meta_realized + ) + written_paths = map(lambda x: x.result(), written_paths_future) + + with open(file_paths_loc, 'w') as f: + writer = csv.DictWriter(f, fieldnames=[ + 'loc', + 'complete', + 'incomplete', + 'zero' + ]) + writer.writeheader() + writer.writerows(written_paths) + + cluster.close(force_shutdown=True) + + +if __name__ == '__main__': + main() diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 0d1cd7af..6e26468e 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -165,13 +165,13 @@ def setUp(self): def test_build_output_record(self): self.assertEqual(self._output_record['value'], 'test value') - def test_build_key_meta(self): + def test_build_key_meta_check_first(self): key = self._output_record['keys'][0] self.assertEqual(key['year'], 2025) self.assertEqual(key['survey'], 'GOA') self.assertEqual(key['haul'], 124) - def test_build_key_meta(self): + def test_build_key_meta_check_hauls(self): key_1 = self._output_record['keys'][0] self.assertEqual(key_1['haul'], 123) From bf49a8beebfde9ec2bfb5d5a6e60c8432217e769 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:12:47 +0000 Subject: [PATCH 24/36] Docstring complete on render_flat. 
--- snapshot/render_flat.py | 42 +++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index 5a82fc7c..7b472995 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -457,6 +457,26 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: return output_dict +def make_haul_metadata_record(path: str) -> dict: + """Parse a path into a metadata record. + + Args: + path: The path to be parsed as a metadata record. + + Returns: + Dictionary describing metadata for a haul. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: """Get metadata for all available hauls. @@ -475,17 +495,6 @@ def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: aws_secret_access_key=access_secret ) - def make_haul_metadata_record(path): - filename_with_path = path.split('/')[-1] - filename = filename_with_path.split('.')[0] - components = filename.split('_') - return { - 'path': path, - 'year': int(components[0]), - 'survey': components[1], - 'haul': int(components[2]) - } - paginator = s3_client.get_paginator('list_objects_v2') iterator = paginator.paginate(Bucket=bucket, Prefix='haul/') pages = filter(lambda x: 'Contents' in x, iterator) @@ -495,7 +504,15 @@ def make_haul_metadata_record(path): return map(make_haul_metadata_record, keys) -def get_all_species(bucket): +def get_all_species(bucket: str) -> SPECIES_DICT: + """Get information about all species formally tracked. + + Args: + bucket: The S3 bucket name where species information can be found. + + Returns: + Dictionary mapping from species code to information about that species. 
+ """ access_key = os.environ['AWS_ACCESS_KEY'] access_secret = os.environ['AWS_ACCESS_SECRET'] @@ -520,6 +537,7 @@ def get_all_species(bucket): def main(): + """Entry point for the join script.""" if len(sys.argv) != NUM_ARGS + 1: print(USAGE_STR) sys.exit(1) From 51db277210e6044fb6d4772e9ada04aa7cc53607 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:13:30 +0000 Subject: [PATCH 25/36] Fix unit tests for #114. --- snapshot/test_generate_indicies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 6e26468e..1b425b13 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -169,7 +169,7 @@ def test_build_key_meta_check_first(self): key = self._output_record['keys'][0] self.assertEqual(key['year'], 2025) self.assertEqual(key['survey'], 'GOA') - self.assertEqual(key['haul'], 124) + self.assertEqual(key['haul'], 123) def test_build_key_meta_check_hauls(self): key_1 = self._output_record['keys'][0] From 65b5ce97f36115bd630a780b329daaf1bc364fd7 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:23:23 +0000 Subject: [PATCH 26/36] Through pyflakes for render_flat. --- snapshot/render_flat.py | 74 ++++++++++++++++++------------ snapshot/test_generate_indicies.py | 2 +- 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index 7b472995..ece94df1 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -16,6 +16,7 @@ import functools import os import sys +import typing import boto3 import coiled @@ -81,11 +82,13 @@ def make_zero_record(species: dict, haul_record: dict) -> dict: Args: species: Information about the species not found. - haul_record: Informatino about the haul for which no specimens were found. + haul_record: Information about the haul for which no specimens were found. 
Returns: Complete output record indicating the given species not found for the given haul. """ + import copy + haul_copy = copy.deepcopy(haul_record) haul_copy['species_code'] = species['species_code'] haul_copy['cpue_kgkm2'] = 0 @@ -124,7 +127,7 @@ def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT return target -def make_get_avro(bucket: str, s3_client) -> typling.Callable[[str], typing.List[dict]]: +def make_get_avro(bucket: str, s3_client) -> typing.Callable[[str], typing.List[dict]]: """Build a function which gets a file from a bucket using the given S3 client. Args: @@ -153,7 +156,7 @@ def get_avro(full_loc: str) -> typing.List[dict]: return get_avro -def append_catch_haul(catch_record: dict, haul_record: dict) -> dic: +def append_catch_haul(catch_record: dict, haul_record: dict) -> dict: """Combine information between a catch record and a haul record. Args: @@ -197,7 +200,7 @@ def mark_incomplete(target: dict) -> dict: def mark_complete(target: dict) -> dict: - """Mark a record as complete. + """Mark a record as complete. Args: target: Record on which the complete attribute should be changed. This may or may not be @@ -211,12 +214,14 @@ def mark_complete(target: dict) -> dict: def combine_catch_and_haul(haul_record: dict, - catch_records: typing.Optional[typing.List[dict]]) -> typing.Iterable[dict]: + catch_records: typing.Optional[typing.List[dict]], + species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: """Combine catch information with information about the haul in which that catch happened. Args: haul_record: Information about the haul in which the catch took place. catch_records: The catch records to be joined with haul information. + species_by_code: Information about all tracked species indexed by species code. 
Returns: Updated catch records or, if no catch records provided, a single record with haul @@ -229,20 +234,24 @@ def combine_catch_and_haul(haul_record: dict, lambda x: append_catch_haul(x, haul_record), catch_records ) - catch_with_species = map(append_species, catch_no_species) + catch_with_species = map( + lambda x: append_species_from_species_list(x, species_by_code), + catch_no_species + ) catch_records_out = map(mark_complete, catch_with_species) return catch_records_out def make_zero_catch_records(catch_records_out_realized: typing.List[dict], - species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: + species_by_code: SPECIES_DICT, haul_record: dict) -> typing.Iterable[dict]: """Generate zero catch records for species not found in catches for a haul. Args: catch_records_out_realized: All catch records for a haul. species_by_code: Mapping from species code to information about the species such that all formally tracked species are in this dictionary. + haul_record: Base record to use in generating zero catch records. Returns: Inferred zero catch records. @@ -261,7 +270,7 @@ def make_zero_catch_records(catch_records_out_realized: typing.List[dict], return catch_records_zero -def get_url_for_catches_in_haul(haul: int) -> str: +def get_path_for_catches_in_haul(haul: int) -> str: """Get the URL where the catches associated with a haul may be found. Args: @@ -274,7 +283,7 @@ def get_url_for_catches_in_haul(haul: int) -> str: return 'catch/%d.avro' % haul -def get_meta_url_for_haul(year: int, survey: str, haul: int) -> str: +def get_meta_path_for_haul(year: int, survey: str, haul: int) -> str: """Get the URL for a haul's metadata given the haul location. Args: @@ -286,7 +295,22 @@ def get_meta_url_for_haul(year: int, survey: str, haul: int) -> str: String path where the Avro file with haul metadata is expected. 
""" template_vals = (year, survey, haul) - return = 'haul/%d_%s_%d.avro' % template_vals + return 'haul/%d_%s_%d.avro' % template_vals + + +def get_joined_path(year: int, survey: str, haul: int) -> str: + """Get that path at which joined data is expected to be written for a haul. + + Args: + year: The year in which the haul occurred like 2025. + survey: The name of the survey like "Gulf of Alaska" in which the haul took place. + haul: The ID of the haul. + + Returns: + STring path where the Avro file with joined haul data is expected. + """ + template_vals = (year, survey, haul) + return 'joined/%d_%s_%d.avro' % template_vals def process_haul(bucket: str, year: int, survey: str, haul: int, @@ -307,7 +331,6 @@ def process_haul(bucket: str, year: int, survey: str, haul: int, Diagnostic information about the file written. """ - import copy import io import os @@ -346,18 +369,6 @@ def check_file_exists(full_loc: str) -> bool: else: raise RuntimeError('Unexpected S3 head code: %d' % error_code) - def append_species(target: dict) -> dict: - """Add information about the speices found in a catch. - - Args: - target: Catch information to which species information should be added. This may or - may not be modified in-place. - - Returns: - Record with species information added. - """ - return append_species_from_species_list(target, species_by_code) - def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO: """Convert an iterable of dictionaries to Avro bytes. @@ -384,7 +395,7 @@ def get_haul_record(year: int, survey: str, haul: int) -> typing.Optional[dict]: Returns: Dictionary record describing the haul or None if the haul was not found. """ - haul_loc = get_meta_url_for_haul(year, survey, haul) + haul_loc = get_meta_path_for_haul(year, survey, haul) if not check_file_exists(haul_loc): return None @@ -404,14 +415,13 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: All catch records associated with a haul. 
Either None or empty list if no data could be found. """ - catch_loc = get_url_for_catches_in_haul(haul) + catch_loc = get_path_for_catches_in_haul(haul) if check_file_exists(catch_loc): return get_avro(catch_loc) else: return None - haul_record = get_haul_record(haul_loc) - + haul_record = get_haul_record(year, survey, haul) if haul_record is None: return { 'complete': 0, @@ -421,9 +431,13 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: } catch_records = list(get_catch_records(haul)) - catch_records_out = combine_catch_and_haul(haul_reord, catch_records) + catch_records_out = combine_catch_and_haul(haul_record, catch_records, species_by_code) catch_records_out_realized = list(catch_records_out) - catch_records_zero = make_zero_catch_records(catch_records_out_realized, species_by_code) + catch_records_zero = make_zero_catch_records( + catch_records_out_realized, + species_by_code, + haul_record + ) # Combine regular records with zero catch inferred records catch_records_all = itertools.chain( @@ -433,7 +447,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: # Upload to S3 catch_with_species_avro = convert_to_avro(catch_records_all) - output_loc = 'joined/%d_%s_%d.avro' % template_vals + output_loc = get_joined_path(year, survey, haul) s3_client.upload_fileobj(catch_with_species_avro, bucket, output_loc) # Write out diagnostic information diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 1b425b13..01dcc06b 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -197,4 +197,4 @@ def test_combine_compatible(self): def test_combine_incompatible(self): with self.assertRaises(RuntimeError): - combined = generate_indicies.combine_records(self._base, self._incompatible) + generate_indicies.combine_records(self._base, self._incompatible) From 6f2aaa9bc3a38c31e30b57ef9ca6d176dc9ccb0d Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: 
Sat, 4 Jan 2025 02:24:10 +0000 Subject: [PATCH 27/36] Style fixes in flat_index_util. --- afscgap/flat_index_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index 2972616e..6278a67d 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -482,10 +482,10 @@ def decorate_filter(field: str, original: IndexFilter) -> IndexFilter: def determine_if_ignorable(field: str, param: afscgap.param.Param, presence_only: bool) -> bool: """Determine if a field parameter is ignored for pre-filtering. - + Determine if a field parameter is ignored for pre-filtering, turning it into a noop because pre-filtering isn't possible or precomputed indicies are not available. - + Args: field: The name of the field for which filters should be made. param: The parameter to apply for the field. @@ -493,7 +493,7 @@ def determine_if_ignorable(field: str, param: afscgap.param.Param, presence_only excluded. Returns: - True if + True if ignorable and false otherwise. """ if param.get_is_ignorable(): return True From 2426f187e351ab8a0dbbd2d773b60b5767b66a4b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:24:52 +0000 Subject: [PATCH 28/36] Additional style fixes on #114. 
--- snapshot/render_flat.py | 6 +++--- snapshot/test_generate_indicies.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index ece94df1..299f072f 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -152,7 +152,7 @@ def get_avro(full_loc: str) -> typing.List[dict]: s3_client.download_fileobj(bucket, full_loc, target_buffer) target_buffer.seek(0) return list(fastavro.reader(target_buffer)) - + return get_avro @@ -239,7 +239,7 @@ def combine_catch_and_haul(haul_record: dict, catch_no_species ) catch_records_out = map(mark_complete, catch_with_species) - + return catch_records_out @@ -404,7 +404,7 @@ def get_haul_record(year: int, survey: str, haul: int) -> typing.Optional[dict]: assert len(haul_records) == 1 haul_record = haul_records[0] return haul_record - + def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: """Get the catch records associated with a haul. diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 01dcc06b..72968f4e 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -142,7 +142,7 @@ def test_is_non_zero_zeroable_zero_partial(self): def test_is_non_zero_zeroable_none_partial(self): self._target['count'] = None # type: ignore self.assertTrue(generate_indicies.is_non_zero(self._target)) - + def test_is_non_zero_zeroable_zero_all(self): for field in const.ZEROABLE_FIELDS: self._target[field] = 0 From 43b19dec2a7d6cb03b30447fab448c8e3ae0f46b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:32:56 +0000 Subject: [PATCH 29/36] Through mypy in render_flat. 
--- snapshot/render_flat.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index 299f072f..ff7371de 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -18,8 +18,8 @@ import sys import typing -import boto3 -import coiled +import boto3 # type: ignore +import coiled # type: ignore import fastavro USAGE_STR = 'python render_flat.py [bucket] [filenames]' @@ -151,7 +151,7 @@ def get_avro(full_loc: str) -> typing.List[dict]: target_buffer = io.BytesIO() s3_client.download_fileobj(bucket, full_loc, target_buffer) target_buffer.seek(0) - return list(fastavro.reader(target_buffer)) + return list(fastavro.reader(target_buffer)) # type: ignore return get_avro @@ -179,7 +179,7 @@ def complete_record(target: dict) -> dict: Returns: Record with any missing fields set to None. """ - keys = map(lambda x: x['name'], OBSERVATION_SCHEMA['fields']) + keys = map(lambda x: x['name'], OBSERVATION_SCHEMA['fields']) # type: ignore keys_realized = list(keys) values = map(lambda x: target.get(x, None), keys_realized) return dict(zip(keys_realized, values)) @@ -334,7 +334,7 @@ def process_haul(bucket: str, year: int, survey: str, haul: int, import io import os - import botocore + import botocore # type: ignore import boto3 import fastavro @@ -422,7 +422,9 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: return None haul_record = get_haul_record(year, survey, haul) - if haul_record is None: + catch_records_maybe = get_catch_records(haul) + + if haul_record is None or catch_records_maybe is None: return { 'complete': 0, 'incomplete': 0, @@ -430,7 +432,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: 'path': None } - catch_records = list(get_catch_records(haul)) + catch_records = list(catch_records_maybe) catch_records_out = combine_catch_and_haul(haul_record, catch_records, species_by_code) catch_records_out_realized = 
list(catch_records_out) catch_records_zero = make_zero_catch_records( @@ -459,7 +461,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: }, catch_records_out_realized ) - output_dict = functools.reduce( + output_dict: typing.Dict[str, int] = functools.reduce( lambda a, b: { 'complete': a['complete'] + b['complete'], 'incomplete': a['incomplete'] + b['incomplete'], @@ -467,8 +469,10 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: }, outputs_dicts ) - output_dict['loc'] = output_loc - return output_dict + + output_dict_with_loc: typing.Dict[str, typing.Union[str, int]] = output_dict # type: ignore + output_dict_with_loc['loc'] = output_loc + return output_dict_with_loc def make_haul_metadata_record(path: str) -> dict: From 2d081ca2f73615d034b868b798c0e6149e9b5489 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 19:37:04 +0000 Subject: [PATCH 30/36] Add tests for render flat. --- snapshot/render_flat.py | 106 +++++++++--------- snapshot/test_render_flat.py | 207 +++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 53 deletions(-) create mode 100644 snapshot/test_render_flat.py diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index ff7371de..c1806d41 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -105,6 +105,33 @@ def make_zero_record(species: dict, haul_record: dict) -> dict: return haul_copy +def make_zero_catch_records(catch_records_out_realized: typing.List[dict], + species_by_code: SPECIES_DICT, haul_record: dict) -> typing.Iterable[dict]: + """Generate zero catch records for species not found in catches for a haul. + + Args: + catch_records_out_realized: All catch records for a haul. + species_by_code: Mapping from species code to information about the species such that all + formally tracked species are in this dictionary. + haul_record: Base record to use in generating zero catch records. 
+ + Returns: + Inferred zero catch records. + """ + species_codes_found = set(map( + lambda x: x.get('species_code', None), + catch_records_out_realized + )) + species_codes_all = set(species_by_code.keys()) + speices_codes_missing = species_codes_all - species_codes_found + speices_missing = map(lambda x: species_by_code[x], speices_codes_missing) + catch_records_zero = map( + lambda x: make_zero_record(x, haul_record), + speices_missing + ) + return catch_records_zero + + def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT) -> dict: """Add information about a species found within a catch. @@ -119,8 +146,9 @@ def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT species_code = target['species_code'] if species_code not in species_by_code: - target['complete'] = False - return target + return mark_incomplete(target) + else: + target = mark_complete(target) species_record = species_by_code[species_code] target.update(species_record) @@ -213,7 +241,7 @@ def mark_complete(target: dict) -> dict: return target -def combine_catch_and_haul(haul_record: dict, +def execute_full_join(haul_record: dict, catch_records: typing.Optional[typing.List[dict]], species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: """Combine catch information with information about the haul in which that catch happened. @@ -234,42 +262,14 @@ def combine_catch_and_haul(haul_record: dict, lambda x: append_catch_haul(x, haul_record), catch_records ) - catch_with_species = map( + catch_records_out = map( lambda x: append_species_from_species_list(x, species_by_code), catch_no_species ) - catch_records_out = map(mark_complete, catch_with_species) return catch_records_out -def make_zero_catch_records(catch_records_out_realized: typing.List[dict], - species_by_code: SPECIES_DICT, haul_record: dict) -> typing.Iterable[dict]: - """Generate zero catch records for species not found in catches for a haul. 
- - Args: - catch_records_out_realized: All catch records for a haul. - species_by_code: Mapping from species code to information about the species such that all - formally tracked species are in this dictionary. - haul_record: Base record to use in generating zero catch records. - - Returns: - Inferred zero catch records. - """ - species_codes_found = set(map( - lambda x: x.get('species_code', None), - catch_records_out_realized - )) - species_codes_all = set(species_by_code.keys()) - speices_codes_missing = species_codes_all - species_codes_found - speices_missing = map(lambda x: species_by_code[x], speices_codes_missing) - catch_records_zero = map( - lambda x: make_zero_record(x, haul_record), - speices_missing - ) - return catch_records_zero - - def get_path_for_catches_in_haul(haul: int) -> str: """Get the URL where the catches associated with a haul may be found. @@ -313,6 +313,26 @@ def get_joined_path(year: int, survey: str, haul: int) -> str: return 'joined/%d_%s_%d.avro' % template_vals +def make_haul_metadata_record(path: str) -> dict: + """Parse a path into a metadata record. + + Args: + path: The path to be parsed as a metadata record. + + Returns: + Dictionary describing metadata for a haul. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + def process_haul(bucket: str, year: int, survey: str, haul: int, species_by_code: SPECIES_DICT) -> dict: """Distributed task to process a single haul. 
@@ -433,7 +453,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: } catch_records = list(catch_records_maybe) - catch_records_out = combine_catch_and_haul(haul_record, catch_records, species_by_code) + catch_records_out = execute_full_join(haul_record, catch_records, species_by_code) catch_records_out_realized = list(catch_records_out) catch_records_zero = make_zero_catch_records( catch_records_out_realized, @@ -475,26 +495,6 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: return output_dict_with_loc -def make_haul_metadata_record(path: str) -> dict: - """Parse a path into a metadata record. - - Args: - path: The path to be parsed as a metadata record. - - Returns: - Dictionary describing metadata for a haul. - """ - filename_with_path = path.split('/')[-1] - filename = filename_with_path.split('.')[0] - components = filename.split('_') - return { - 'path': path, - 'year': int(components[0]), - 'survey': components[1], - 'haul': int(components[2]) - } - - def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: """Get metadata for all available hauls. diff --git a/snapshot/test_render_flat.py b/snapshot/test_render_flat.py new file mode 100644 index 00000000..d8955cfb --- /dev/null +++ b/snapshot/test_render_flat.py @@ -0,0 +1,207 @@ +""" +Tests for script to join data to build flat files. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import unittest +import unittest.mock + +import render_flat + + +class ZeroRecordTests(unittest.TestCase): + + def setUp(self): + self._species = { + 'species_code': 1, + 'scientific_name': 'test_science', + 'common_name': 'test_common', + 'id_rank': 2, + 'worms': False, + 'itis': 3 + } + self._species_by_code = { + 1: self._species + } + self._haul_record = {'haul_field': 123} + self._zero_record_sample = render_flat.make_zero_record( + self._species, + self._haul_record + ) + + def test_make_zero_record_haul(self): + self.assertEqual(self._zero_record_sample['haul_field'], 123) + + def test_make_zero_record_zero(self): + self.assertEqual(self._zero_record_sample['count'], 0) + + def test_make_zero_record_species(self): + self.assertEqual(self._zero_record_sample['scientific_name'], 'test_science') + + def test_make_zero_catch_records_no_infer(self): + output_records_iter = render_flat.make_zero_catch_records( + [{'species_code': 1, 'haul_field': 456}], + self._species_by_code, + self._haul_record + ) + output_records = list(output_records_iter) + self.assertEqual(len(output_records), 0) + + def test_make_zero_catch_records_infer(self): + output_records_iter = render_flat.make_zero_catch_records( + [{'species_code': 2, 'haul_field': 456}], + self._species_by_code, + self._haul_record + ) + output_records = list(output_records_iter) + self.assertEqual(len(output_records), 1) + self.assertEqual(output_records[0]['haul_field'], 123) + + +class JoinTests(unittest.TestCase): + + def setUp(self): + self._known = {'species_code': 1, 'catch_field': 12} + self._unknown = {'species_code': 2, 'catch_field': 23} + self._catch_records = [self._known, self._unknown] + self._species_by_code = { + 1: {'species_field': 34} + } + self._haul_record = {'haul_field': 45} + + def test_append_complete_catch_field(self): + result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + self.assertEqual(result['catch_field'], 12) + + def 
test_append_complete_species_field(self): + result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + self.assertEqual(result['species_field'], 34) + + def test_append_complete_flag(self): + result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + self.assertTrue(result['complete']) + + def test_append_incomplete_catch_field(self): + result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + self.assertEqual(result['catch_field'], 23) + + def test_append_incomplete_species_field(self): + result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + self.assertFalse('species_field' in result) + + def test_append_incomplete_flag(self): + result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + self.assertFalse(result['complete']) + + def test_full_join_execution(self): + full_join = self._execute_full_join() + count = sum(map(lambda x: 1, full_join)) + self.assertEqual(count, 2) + + def test_full_join_no_catch(self): + result = render_flat.execute_full_join( + self._haul_record, + None, + self._species_by_code + ) + + result_realized = list(result) + self.assertEqual(len(result_realized), 1) + + result_individual = result_realized[0] + self.assertFalse(result_individual['complete']) + + def test_full_join_no_species(self): + target = self._get_species_code_from_join(2) + self.assertFalse(target['complete']) + self.assertEqual(target['catch_field'], 23) + self.assertEqual(target['haul_field'], 45) + + def test_full_join_success(self): + target = self._get_species_code_from_join(1) + self.assertTrue(target['complete']) + self.assertEqual(target['catch_field'], 12) + self.assertEqual(target['species_code'], 1) + self.assertEqual(target['haul_field'], 45) + + def _execute_full_join(self): + return render_flat.execute_full_join( + self._haul_record, + self._catch_records, + self._species_by_code + ) + 
+ def _get_species_code_from_join(self, species_code): + full_join = self._execute_full_join() + full_join_tuple = map(lambda x: (x['species_code'], x), full_join) + full_join_dict = dict(full_join_tuple) + return full_join_dict[species_code] + + +class MakeAvroTests(unittest.TestCase): + + def test_make_get_avro(self): + client = unittest.mock.MagicMock() + result = render_flat.make_get_avro('bucket', client) + self.assertIsNotNone(result) + + +class CombineCatchHaulTests(unittest.TestCase): + + def test_append_catch_haul(self): + catch = {'catch_field': 12} + haul = {'haul_field': 34} + combined = render_flat.append_catch_haul(catch, haul) + self.assertEqual(combined['catch_field'], 12) + self.assertEqual(combined['haul_field'], 34) + + +class CompleteRecordTests(unittest.TestCase): + + def setUp(self): + self._start_record = {'unknown': 12, 'count': 34} + self._completed = render_flat.complete_record(self._start_record) + + def test_complete_record_pass_through(self): + self.assertEqual(self._completed['count'], 34) + + def test_complete_record_unknown_field(self): + self.assertFalse('unknown' in self._completed) + + def test_complete_record_missing_field(self): + self.assertIsNone(self._completed['species_code']) + + def test_mark_complete(self): + result = render_flat.mark_complete(self._start_record) + self.assertTrue(result['complete']) + + def test_mark_incomplete(self): + result = render_flat.mark_incomplete(self._start_record) + self.assertFalse(result['complete']) + + +class PathTests(unittest.TestCase): + + def test_get_path_for_catches_in_haul(self): + path = render_flat.get_path_for_catches_in_haul(123) + self.assertEqual(path, 'catch/123.avro') + + def test_get_meta_path_for_haul(self): + path = render_flat.get_meta_path_for_haul(2025, 'Gulf of Alaska', 123) + self.assertEqual(path, 'haul/2025_Gulf of Alaska_123.avro') + + def test_get_joined_path(self): + path = render_flat.get_joined_path(2025, 'Gulf of Alaska', 123) + self.assertEqual(path, 
'joined/2025_Gulf of Alaska_123.avro') + + def test_make_haul_metadata_record(self): + path = render_flat.get_joined_path(2025, 'Gulf of Alaska', 123) + metadata = render_flat.make_haul_metadata_record(path) + self.assertEqual(metadata['path'], path) + self.assertEqual(metadata['year'], 2025) + self.assertEqual(metadata['survey'], 'Gulf of Alaska') + self.assertEqual(metadata['haul'], 123) From f576e0f2eb18a8a1d0db5c5417a0b044ff1950f9 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 19:41:28 +0000 Subject: [PATCH 31/36] Test fixes for mypy as part of #114. --- snapshot/test_render_flat.py | 38 ++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/snapshot/test_render_flat.py b/snapshot/test_render_flat.py index d8955cfb..21c88728 100644 --- a/snapshot/test_render_flat.py +++ b/snapshot/test_render_flat.py @@ -45,7 +45,7 @@ def test_make_zero_record_species(self): def test_make_zero_catch_records_no_infer(self): output_records_iter = render_flat.make_zero_catch_records( [{'species_code': 1, 'haul_field': 456}], - self._species_by_code, + self._species_by_code, # type: ignore self._haul_record ) output_records = list(output_records_iter) @@ -54,7 +54,7 @@ def test_make_zero_catch_records_no_infer(self): def test_make_zero_catch_records_infer(self): output_records_iter = render_flat.make_zero_catch_records( [{'species_code': 2, 'haul_field': 456}], - self._species_by_code, + self._species_by_code, # type: ignore self._haul_record ) output_records = list(output_records_iter) @@ -74,27 +74,45 @@ def setUp(self): self._haul_record = {'haul_field': 45} def test_append_complete_catch_field(self): - result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._known, + self._species_by_code # type: ignore + ) self.assertEqual(result['catch_field'], 12) def test_append_complete_species_field(self): - result = 
render_flat.append_species_from_species_list(self._known, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._known, + self._species_by_code # type: ignore + ) self.assertEqual(result['species_field'], 34) def test_append_complete_flag(self): - result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._known, + self._species_by_code # type: ignore + ) self.assertTrue(result['complete']) def test_append_incomplete_catch_field(self): - result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._unknown, + self._species_by_code # type: ignore + ) self.assertEqual(result['catch_field'], 23) def test_append_incomplete_species_field(self): - result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._unknown, + self._species_by_code # type: ignore + ) self.assertFalse('species_field' in result) def test_append_incomplete_flag(self): - result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._unknown, + self._species_by_code # type: ignore + ) self.assertFalse(result['complete']) def test_full_join_execution(self): @@ -106,7 +124,7 @@ def test_full_join_no_catch(self): result = render_flat.execute_full_join( self._haul_record, None, - self._species_by_code + self._species_by_code # type: ignore ) result_realized = list(result) @@ -132,7 +150,7 @@ def _execute_full_join(self): return render_flat.execute_full_join( self._haul_record, self._catch_records, - self._species_by_code + self._species_by_code # type: ignore ) def _get_species_code_from_join(self, species_code): From 7adeee9586f62277722c7940a3d6e09680624382 Mon Sep 17 00:00:00 2001 From: A Samuel 
Pottinger Date: Sat, 4 Jan 2025 19:52:39 +0000 Subject: [PATCH 32/36] Add initial request_source implementation. --- snapshot/request_source.py | 212 +++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 snapshot/request_source.py diff --git a/snapshot/request_source.py b/snapshot/request_source.py new file mode 100644 index 00000000..b034fd06 --- /dev/null +++ b/snapshot/request_source.py @@ -0,0 +1,212 @@ +""" +Scripts to request haul, catch, and species data from upstream APIs. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import io +import itertools +import os +import sys +import time +import typing + +import boto3 +import fastavro +import requests +import toolz.itertoolz + +MIN_ARGS = 3 +MAX_ARGS = 4 +USAGE_STR = 'python request_source.py [type] [bucket] [location] [year]' +DOMAIN = 'https://apps-st.fisheries.noaa.gov' +ENDPOINTS = { + 'haul': '/ods/foss/afsc_groundfish_survey_haul/', + 'catch': '/ods/foss/afsc_groundfish_survey_catch/', + 'species': '/ods/foss/afsc_groundfish_survey_species/' +} + +HAUL_SCHEMA = { + 'doc': 'Description of a haul', + 'name': 'Haul', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'srvy', 'type': 'string'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'survey_name', 'type': 'string'}, + {'name': 'survey_definition_id', 'type': ['long', 'null']}, + {'name': 'cruise', 'type': ['long', 'null']}, + {'name': 'cruisejoin', 'type': 'long'}, + {'name': 'hauljoin', 'type': 'long'}, + {'name': 'haul', 'type': ['long', 'null']}, + {'name': 'stratum', 'type': ['long', 'null']}, + {'name': 'station', 'type': ['string', 'null']}, + {'name': 'vessel_id', 'type': ['long', 'null']}, + {'name': 'vessel_name', 'type': ['string', 'null']}, + {'name': 
'date_time', 'type': 'string'}, + {'name': 'latitude_dd_start', 'type': ['double', 'null']}, + {'name': 'longitude_dd_start', 'type': ['double', 'null']}, + {'name': 'latitude_dd_end', 'type': ['double', 'null']}, + {'name': 'longitude_dd_end', 'type': ['double', 'null']}, + {'name': 'bottom_temperature_c', 'type': ['double', 'null']}, + {'name': 'surface_temperature_c', 'type': ['double', 'null']}, + {'name': 'depth_m', 'type': ['double', 'null']}, + {'name': 'distance_fished_km', 'type': ['double', 'null']}, + {'name': 'duration_hr', 'type': ['double', 'null']}, + {'name': 'net_width_m', 'type': ['double', 'null']}, + {'name': 'net_height_m', 'type': ['double', 'null']}, + {'name': 'area_swept_km2', 'type': ['double', 'null']}, + {'name': 'performance', 'type': ['float', 'null']} + ] +} + +CATCH_SCHEMA = { + 'doc': 'Description of a catch', + 'name': 'Catch', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'hauljoin', 'type': 'long'}, + {'name': 'species_code', 'type': 'long'}, + {'name': 'cpue_kgkm2', 'type': ['double', 'null']}, + {'name': 'cpue_nokm2', 'type': ['double', 'null']}, + {'name': 'count', 'type': ['long', 'null']}, + {'name': 'weight_kg', 'type': ['double', 'null']}, + {'name': 'taxon_confidence', 'type': ['string', 'null']} + ] +} + +SPECIES_SCHEMA = { + 'doc': 'Description of a species', + 'name': 'Species', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'species_code', 'type': 'long'}, + {'name': 'scientific_name', 'type': ['string', 'null']}, + {'name': 'common_name', 'type': ['string', 'null']}, + {'name': 'id_rank', 'type': ['string', 'null']}, + {'name': 'worms', 'type': ['long', 'null']}, + {'name': 'itis', 'type': ['long', 'null']} + ] +} + +SCHEMAS = { + 'haul': HAUL_SCHEMA, + 'catch': CATCH_SCHEMA, + 'species': SPECIES_SCHEMA +} + + +def get_api_request_url(type_name: str, year: int, offset: int) -> str: + endpoint = ENDPOINTS[type_name] + + if year: + params = 
'?offset=%d&limit=10000&q={"year":%d}' % (offset, year) + else: + params = '?offset=%d&limit=10000' % offset + + full_url = DOMAIN + endpoint + params + return full_url + + +def dump_to_s3(year: int, bucket: str, loc: str, type_name: str): + offset = 0 + done = False + + s3_client = boto3.client( + 's3', + aws_access_key_id=os.environ['AWS_ACCESS_KEY'], + aws_secret_access_key=os.environ['AWS_ACCESS_SECRET'] + ) + + def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO: + target_buffer = io.BytesIO() + fastavro.writer(target_buffer, SCHEMAS[type_name], records) + target_buffer.seek(0) + return target_buffer + + def append_in_bucket(key: str, records: typing.List[dict]): + sample_record = records[0] + + if type_name == 'haul': + template_vals = ( + year, + sample_record['survey'], + sample_record['hauljoin'] + ) + full_loc = loc + '/%d_%s_%d.avro' % template_vals + elif type_name == 'catch': + full_loc = loc + '/%d.avro' % sample_record['hauljoin'] + elif type_name == 'species': + full_loc = loc + '/%d.avro' % sample_record['species_code'] + + try: + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + prior_records = fastavro.reader(target_buffer) + except s3_client.exceptions.ClientError: + prior_records = [] + + records_avro = convert_to_avro(itertools.chain(prior_records, records)) + s3_client.upload_fileobj(records_avro, bucket, full_loc) + + def write_response(parsed: dict): + items = parsed['items'] + key_name = 'species_code' if type_name == 'species' else 'hauljoin' + by_key = toolz.itertoolz.groupby(lambda x: x[key_name], items) + for key_tuple in by_key.items(): + key = key_tuple[0] + records = key_tuple[1] + append_in_bucket(key, records) + + def execute_request(offset: int): + full_url = get_api_request_url(type_name, year, offset) + response = requests.get(full_url) + return response + + while not done: + if offset % 100000 == 0: + print('Offset: %d' % offset) + + response = 
execute_request(offset) + status_code = response.status_code + + if status_code == 200: + parsed = response.json() + write_response(parsed) + offset += 10000 + done = len(parsed['items']) == 0 + if done: + print('Ending gracefully...') + else: + template_vals = (offset, status_code) + print('Offset of %d with status %d. Waiting...' % template_vals) + time.sleep(1) + + +def main(): + if len(sys.argv) < MIN_ARGS + 1 or len(sys.argv) > MAX_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + type_name = sys.argv[1] + bucket = sys.argv[2] + loc = sys.argv[3] + + if len(sys.argv) > 4: + year = int(sys.argv[4]) + else: + year = None + + dump_to_s3(year, bucket, loc, type_name) + + +if __name__ == '__main__': + main() From 294325d184ef18cdafa898316ada7a6ae913b154 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 19:58:16 +0000 Subject: [PATCH 33/36] Type fixes for request source. Type fixes for newly added request source script as part of #114. --- snapshot/request_source.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/snapshot/request_source.py b/snapshot/request_source.py index b034fd06..6b81468f 100644 --- a/snapshot/request_source.py +++ b/snapshot/request_source.py @@ -14,10 +14,10 @@ import time import typing -import boto3 +import boto3 # type: ignore import fastavro import requests -import toolz.itertoolz +import toolz.itertoolz # type: ignore MIN_ARGS = 3 MAX_ARGS = 4 @@ -103,7 +103,7 @@ } -def get_api_request_url(type_name: str, year: int, offset: int) -> str: +def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int) -> str: endpoint = ENDPOINTS[type_name] if year: @@ -115,7 +115,7 @@ def get_api_request_url(type_name: str, year: int, offset: int) -> str: return full_url -def dump_to_s3(year: int, bucket: str, loc: str, type_name: str): +def dump_to_s3(year: typing.Optional[int], bucket: str, loc: str, type_name: str): offset = 0 done = False @@ -135,6 +135,7 @@ def 
append_in_bucket(key: str, records: typing.List[dict]): sample_record = records[0] if type_name == 'haul': + assert year is not None template_vals = ( year, sample_record['survey'], @@ -146,14 +147,16 @@ def append_in_bucket(key: str, records: typing.List[dict]): elif type_name == 'species': full_loc = loc + '/%d.avro' % sample_record['species_code'] - try: - target_buffer = io.BytesIO() - s3_client.download_fileobj(bucket, full_loc, target_buffer) - target_buffer.seek(0) - prior_records = fastavro.reader(target_buffer) - except s3_client.exceptions.ClientError: - prior_records = [] + def read_prior_records() -> typing.Iterable[dict]: + try: + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + return fastavro.reader(target_buffer) # type: ignore + except s3_client.exceptions.ClientError: + return [] + prior_records = read_prior_records() records_avro = convert_to_avro(itertools.chain(prior_records, records)) s3_client.upload_fileobj(records_avro, bucket, full_loc) From a46ca215be0e04ff2361e1d3b55242db366a6148 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 20:49:27 +0000 Subject: [PATCH 34/36] Add docstring for request source. 
--- snapshot/request_source.py | 85 +++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/snapshot/request_source.py b/snapshot/request_source.py index 6b81468f..56d50a9d 100644 --- a/snapshot/request_source.py +++ b/snapshot/request_source.py @@ -28,6 +28,7 @@ 'catch': '/ods/foss/afsc_groundfish_survey_catch/', 'species': '/ods/foss/afsc_groundfish_survey_species/' } +YEAR_ENDPOINTS = {'catch'} HAUL_SCHEMA = { 'doc': 'Description of a haul', @@ -102,20 +103,57 @@ 'species': SPECIES_SCHEMA } +DEFAULT_LIMIT = 100000 -def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int) -> str: + +def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int, + limit: int = DEFAULT_LIMIT) -> str: + """Get the URL where an API endpoint can be found for a given set of records. + + Get the URL where an API endpoint can be found for a given set of records, raising an exception + if an invalid request is provided such as when year is provided but not supported by the endpoint. + + Args: + type_name: The type of record requested like "catch" for catch records. + year: The year like 2025 for which records are requested. If None, will request without a + year filter. Ignored by some endpoints. + offset: The offset into this year / type combination. + limit: The maximum number of records to return. + + Returns: + String URL where the requested records can be found. 
+ """ endpoint = ENDPOINTS[type_name] if year: - params = '?offset=%d&limit=10000&q={"year":%d}' % (offset, year) + if type_name not in YEAR_ENDPOINTS: + raise RuntimeError('Provided a year filter to an endpoint that does not support it.') + + params = '?offset=%d&limit=%d&q={"year":%d}' % (offset, limit, year) else: - params = '?offset=%d&limit=10000' % offset + if type_name in YEAR_ENDPOINTS: + raise RuntimeError('Did not provide a year filter to an endpoint that supports it.') + + params = '?offset=%d&limit=%d' % (offset, limit) full_url = DOMAIN + endpoint + params return full_url def dump_to_s3(year: typing.Optional[int], bucket: str, loc: str, type_name: str): + """Dump a set of records to an S3 bucket for later processing / joining. + + Dump a set of records to an S3 bucket. These may be saved for later processing such as joining + across record types. This will perform pagination until all records saved, making multiple API + requests. Raises an exception if year is provided but not supported. + + Args: + year: The year for which records should be dumped. This is ignored by some endpoints and + None may be passed. + bucket: The name of the bucket within S3 in which they should be dumped. + loc: The location within the bucket where they should be written. + type_name: The type of record to dump like "catch" for catch records. + """ offset = 0 done = False @@ -126,12 +164,26 @@ def dump_to_s3(year: typing.Optional[int], bucket: str, loc: str, type_name: str ) def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO: + """Convert a set of records to Avro. + + Args: + records: The records to convert to Avro. + + Returns: + The provided records as binary. + """ target_buffer = io.BytesIO() fastavro.writer(target_buffer, SCHEMAS[type_name], records) target_buffer.seek(0) return target_buffer def append_in_bucket(key: str, records: typing.List[dict]): + """Append to a file within an S3 bucket, making the file if it does not exist. 
+ + Args: + key: The path to the file to be appended. + records: The records to be appended as Avro. + """ sample_record = records[0] if type_name == 'haul': @@ -148,6 +200,12 @@ def append_in_bucket(key: str, records: typing.List[dict]): full_loc = loc + '/%d.avro' % sample_record['species_code'] def read_prior_records() -> typing.Iterable[dict]: + """Get the records already at the target file. + + Returns: + Iterable over records if prior contents found or an empty iterable if the file does + not exist. + """ try: target_buffer = io.BytesIO() s3_client.download_fileobj(bucket, full_loc, target_buffer) @@ -161,6 +219,11 @@ def read_prior_records() -> typing.Iterable[dict]: s3_client.upload_fileobj(records_avro, bucket, full_loc) def write_response(parsed: dict): + """Write the result of an API call to S3. + + Args: + parsed: The record returned from the API. + """ items = parsed['items'] key_name = 'species_code' if type_name == 'species' else 'hauljoin' by_key = toolz.itertoolz.groupby(lambda x: x[key_name], items) @@ -170,12 +233,21 @@ def write_response(parsed: dict): append_in_bucket(key, records) def execute_request(offset: int): - full_url = get_api_request_url(type_name, year, offset) + """Execute a single request for records given an offset into the result set. + + Args: + offset: The number of records to skip at the start of the result set. Used for + pagination. + + Returns: + Unparsed response from a requests-like object. 
+ """ + full_url = get_api_request_url(type_name, year, offset, limit=DEFAULT_LIMIT) response = requests.get(full_url) return response while not done: - if offset % 100000 == 0: + if offset % DEFAULT_LIMIT == 0: print('Offset: %d' % offset) response = execute_request(offset) @@ -184,7 +256,7 @@ def execute_request(offset: int): if status_code == 200: parsed = response.json() write_response(parsed) - offset += 10000 + offset += DEFAULT_LIMIT done = len(parsed['items']) == 0 if done: print('Ending gracefully...') @@ -195,6 +267,7 @@ def execute_request(offset: int): def main(): + """Entrypoint to the request source script.""" if len(sys.argv) < MIN_ARGS + 1 or len(sys.argv) > MAX_ARGS + 1: print(USAGE_STR) sys.exit(1) From 426d9d6d69002c8c32358942b0f360390685e346 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 20:57:14 +0000 Subject: [PATCH 35/36] Fixes for #114. --- snapshot/request_source.py | 6 ++--- snapshot/test_request_source.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 snapshot/test_request_source.py diff --git a/snapshot/request_source.py b/snapshot/request_source.py index 56d50a9d..a2f96dd5 100644 --- a/snapshot/request_source.py +++ b/snapshot/request_source.py @@ -103,7 +103,7 @@ 'species': SPECIES_SCHEMA } -DEFAULT_LIMIT = 100000 +DEFAULT_LIMIT = 10000 def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int, @@ -234,7 +234,7 @@ def write_response(parsed: dict): def execute_request(offset: int): """Execute a single request for records given an offset into the result set. - + Args: offset: The number of records to skip at the start of the result set. Used for pagination. 
@@ -247,7 +247,7 @@ def execute_request(offset: int): return response while not done: - if offset % DEFAULT_LIMIT == 0: + if offset % (DEFAULT_LIMIT * 10) == 0: print('Offset: %d' % offset) response = execute_request(offset) diff --git a/snapshot/test_request_source.py b/snapshot/test_request_source.py new file mode 100644 index 00000000..b461980d --- /dev/null +++ b/snapshot/test_request_source.py @@ -0,0 +1,46 @@ +""" +Tests for requesting upstream source data. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import request_source + + +class ApiUrlGenerationTests(unittest.TestCase): + + def test_haul(self): + url = request_source.get_api_request_url('haul', None, 12, 34) + self.assertTrue('afsc_groundfish_survey_haul' in url) + self.assertTrue('offset=12' in url) + self.assertTrue('limit=34' in url) + + def test_catch(self): + url = request_source.get_api_request_url('catch', 2025, 12, 34) + self.assertTrue('afsc_groundfish_survey_catch' in url) + self.assertTrue('offset=12' in url) + self.assertTrue('limit=34' in url) + self.assertTrue('q={"year":2025}' in url) + + def test_species(self): + url = request_source.get_api_request_url('species', None, 12, 34) + self.assertTrue('afsc_groundfish_survey_species' in url) + self.assertTrue('offset=12' in url) + self.assertTrue('limit=34' in url) + + def test_year_provide_not_support(self): + with self.assertRaises(RuntimeError): + request_source.get_api_request_url('haul', 2025, 12, 34) + + with self.assertRaises(RuntimeError): + request_source.get_api_request_url('species', 2025, 12, 34) + + def test_year_not_provided_supported(self): + with self.assertRaises(RuntimeError): + request_source.get_api_request_url('catch', None, 12, 34) From d03b171e9190a0f63ebc2281e4111c143ba0aaca Mon Sep 17 00:00:00 
2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 21:05:49 +0000 Subject: [PATCH 36/36] Add write main index. --- snapshot/test_write_main_index.py | 30 ++++++++++ snapshot/write_main_index.py | 96 +++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 snapshot/test_write_main_index.py create mode 100644 snapshot/write_main_index.py diff --git a/snapshot/test_write_main_index.py b/snapshot/test_write_main_index.py new file mode 100644 index 00000000..fb0cd3e7 --- /dev/null +++ b/snapshot/test_write_main_index.py @@ -0,0 +1,30 @@ +""" +Tests for the main index generation script. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import write_main_index + +EXAMPLE_URL = 'joined/2025_Gulf of Alaska_123.avro' + + +class HaulMetadataRecordTests(unittest.TestCase): + + def setUp(self): + self._record = write_main_index.make_haul_metadata_record(EXAMPLE_URL) + + def test_year(self): + self.assertEqual(self._record['year'], 2025) + + def test_survey(self): + self.assertEqual(self._record['survey'], 'Gulf of Alaska') + + def test_haul(self): + self.assertEqual(self._record['haul'], 123) diff --git a/snapshot/write_main_index.py b/snapshot/write_main_index.py new file mode 100644 index 00000000..45beae44 --- /dev/null +++ b/snapshot/write_main_index.py @@ -0,0 +1,96 @@ +""" +Scripts to write the "main" index which includes all hauls without filtering. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import io +import itertools +import os +import sys + +import boto3 # type: ignore +import fastavro + +KEY_SCHEMA = { + 'doc': 'Key to an observation flat file.', + 'name': 'Key', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'haul', 'type': 'long'} + ] +} + +NUM_ARGS = 1 +USAGE_STR = 'python write_main_index.py [bucket]' + + +def make_haul_metadata_record(path: str) -> dict: + """Interpret a path and parse metadata information about a haul found at that path. + + Args: + path: The path to parse as a metadata record. + + Returns: + Dictionary with metadata about the haul that may be found at the provided path. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + +def main(): + """Entrypoint into the main index writing script.""" + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='joined/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + metadata_records = map(make_haul_metadata_record, keys) + + write_buffer = io.BytesIO() + fastavro.writer( + write_buffer, + KEY_SCHEMA, + metadata_records + ) + write_buffer.seek(0) + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + output_loc = 'index/main.avro' + 
s3_client.upload_fileobj(write_buffer, bucket, output_loc) + + +if __name__ == '__main__': + main()