From c8446adbdf9cb749a8672813cc04761b17aff0a7 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 20:18:46 +0000 Subject: [PATCH 01/36] Start snapshot directory. Start snapshot directory as part of the v2 within #111. --- README.md | 2 +- snapshot/README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 snapshot/README.md diff --git a/README.md b/README.md index 4d59e08f..5c2f1dde 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ In addition to Github-provided [Github Actions](https://docs.github.com/en/actio - [sftp-action](https://github.com/Creepios/sftp-action) under the [MIT License](https://github.com/Creepios/sftp-action/blob/master/LICENSE) from Niklas Creepios. - [ssh-action](https://github.com/appleboy/ssh-action) under the [MIT License](https://github.com/appleboy/ssh-action/blob/master/LICENSE) from Bo-Yi Wu. -Next, the visualization tool has additional dependencies as documented in the [visualization readme](https://github.com/SchmidtDSE/afscgap/blob/main/afscgapviz/README.md). +Next, the visualization tool has additional dependencies as documented in the [visualization readme](https://github.com/SchmidtDSE/afscgap/blob/main/afscgapviz/README.md). Similarly, the community flat files snapshot updater has additional dependencies as documented in the [snapshot readme](https://github.com/SchmidtDSE/afscgap/blob/main/snapshot/README.md). Finally, note that the website uses assets from [The Noun Project](thenounproject.com/) under the NounPro plan. If used outside of https://pyafscgap.org, they may be subject to a [different license](https://thenounproject.com/pricing/#icons). diff --git a/snapshot/README.md b/snapshot/README.md new file mode 100644 index 00000000..e380e84a --- /dev/null +++ b/snapshot/README.md @@ -0,0 +1,2 @@ +# Snapshot Updater +Scripts to update the community Avro flat files as described at [data.pyafscgap.org](https://data.pyafscgap.org/). 
From 3e8a8701c57c81131c0f951e2f225981e50592c7 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 21:44:26 +0000 Subject: [PATCH 02/36] Additional README edits. --- README.md | 5 ++-- snapshot/README.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c2f1dde..d90a5ebe 100644 --- a/README.md +++ b/README.md @@ -133,9 +133,10 @@ at UC Berkeley](https://dse.berkeley.edu) where [Kevin Koy](https://github.com/k
## Open Source -We are happy to be part of the open source community. +We are happy to be part of the open source community. We use the following: -At this time, the only open source dependency used by this microlibrary is [Requests](https://docs.python-requests.org/en/latest/index.html) which is available under the [Apache v2 License](https://github.com/psf/requests/blob/main/LICENSE) from [Kenneth Reitz and other contributors](https://github.com/psf/requests/graphs/contributors). + - [Requests](https://docs.python-requests.org/en/latest/index.html) which is available under the [Apache v2 License](https://github.com/psf/requests/blob/main/LICENSE) from [Kenneth Reitz and other contributors](https://github.com/psf/requests/graphs/contributors). + - [fastavro](https://fastavro.readthedocs.io/en/latest/) by Miki Tebeka and Contributors under the [MIT License](https://github.com/fastavro/fastavro/blob/master/LICENSE). In addition to Github-provided [Github Actions](https://docs.github.com/en/actions), our build and documentation systems also use the following but are not distributed with or linked to the project itself: diff --git a/snapshot/README.md b/snapshot/README.md index e380e84a..d2543c41 100644 --- a/snapshot/README.md +++ b/snapshot/README.md @@ -1,2 +1,65 @@ # Snapshot Updater Scripts to update the community Avro flat files as described at [data.pyafscgap.org](https://data.pyafscgap.org/). + +## Purpose +Due to API limitations that prevent filtering joined data prior to downloading locally, community flat files in [Avro format](https://avro.apache.org/) offer pre-joined data with indices which can be used by `pyafscgap` to avoid downloading all catch data or specifying individual hauls. This directory contains scripts used to update those resources which are available at [data.pyafscgap.org](https://data.pyafscgap.org/). + +## Usage +The updater can be executed with individual scripts or in its entirety through bash. 
Note that some of these steps use environment variables specified in local setup. + +### Python library +These community files are used by default when interacting with the `pyafscgap` library. See [pyafscgap.org](https://pyafscgap.org/) for instructions. These Avro files will be requested and iterated by the client without the user needing to understand the underlying file format. Only the `pyafscgap` interface is intended to be maintained across major versions for backwards compatibility. + +### Prebuilt payloads +Prebuilt Avro files are available via HTTPS through [data.pyafscgap.org](https://data.pyafscgap.org/). There are two subdirectories of files. + +First, [index](https://data.pyafscgap.org/index) contains "index data files" which indicate where catch data can be found. These indices include filenames that can be found in `joined`. Each file maps from a value for the filename's variable to a set of joined flat files where those data can be found. Each value refers to a specific haul where floating point values are rounded to two decimal places. Note that, due to this rounding, more precise filters will have to further sub-filter after collecting relevant data from the `joined` subdirectory. + +Second, [joined](https://data.pyafscgap.org/joined) includes all catch data joined against the species list and hauls table to create a single "flat" file which fully describes all information available for each catch. Each record is a single catch and each file is a single haul where a haul takes place within a specific year and survey. + +Note that, while provided as a service to the community, these Avro files and directory structure may be changed in the future. These files exist to serve the `pyafscgap` functionality as the NOAA APIs change over time. Therefore, for a long term stable interface with documentation and further type annotation, please consider using the `pyafscgap` library instead. 
+ +### Manual execution +In order to build the Avro files yourself by requesting, joining, and indexing original upstream API data, you can simply execute `bash execute_all.sh` after local setup. These will build these files on S3 but they may be deployed to an SFTP server trivially. + +## Local setup +Local environment setup varies depending on how these files are used. + +### Python library setup +Simply install `pyafscgap` normally to have the library automatically use the flat files for queries. + +### Prebuilt payloads environment +These files may be used by any programming language or environment supporting Avro. For more information, see the official [Avro documentation](https://avro.apache.org/docs/) though [fastavro](https://fastavro.readthedocs.io/en/latest/) is recommended for use in Python. + +### Environment for manual execution +To perform manual execution, these scripts expect to use [AWS S3](https://aws.amazon.com/s3/) prior to deployment to a simple SFTP server. In order to use these scripts, the following environment variables need to be set after installing dependencies (optionally within a virtual environment) via `pip install -r requirements.txt`: + + - `AWS_ACCESS_KEY`: This is the access key used to upload completed payloads to AWS S3 or to request those data as part of distributed indexing and processing. + - `AWS_ACCESS_SECRET`: This is the secret associated with the access key used to upload completed payloads to AWS S3 or to request those data as part of distributed indexing and processing. + - `BUCKET_NAME`: This is the name of the bucket where completed uploads should be uploaded or requested within S3. + +These may be set within `.bashrc` files or similar through `export` commands. Finally, these scripts expect [Coiled](https://www.coiled.io/) to perform distributed tasks. + +## Testing +Unit tests can be executed by running `nose2` within the `snapshot` directory. 
+ +## Deployment +Files generated in S3 can be trivially deployed to an SFTP server or accessed directly from AWS. + +## Development +These scripts follow the same development guidelines as the overall `pyafscgap` project. Note that style and type checks are enforced through CI / CD systems. See [contributors documentation](https://github.com/SchmidtDSE/afscgap/blob/main/CONTRIBUTING.md). + +## Open source +The snapshots updater uses the following open source packages: + + - [bokeh](https://docs.bokeh.org/en/latest/) from Bokeh Contributors and NumFOCUS under the [BSD License](https://github.com/bokeh/demo.bokeh.org/blob/main/LICENSE.txt). + - [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) under the [Apache v2 License](https://github.com/boto/boto3/blob/develop/LICENSE). + - [dask](https://www.dask.org/) from Anaconda and Contributors under the [BSD License](https://github.com/dask/dask/blob/main/LICENSE.txt). + - [fastavro](https://fastavro.readthedocs.io/en/latest/) by Miki Tebeka and Contributors under the [MIT License](https://github.com/fastavro/fastavro/blob/master/LICENSE). + - [requests](https://docs.python-requests.org/en/latest/index.html) which is available under the [Apache v2 License](https://github.com/psf/requests/blob/main/LICENSE) from [Kenneth Reitz and other contributors](https://github.com/psf/requests/graphs/contributors). + - [toolz](https://toolz.readthedocs.io/en/latest/) under a [BSD License](https://github.com/pytoolz/toolz/blob/master/LICENSE.txt). + +We thank these projects for their contribution. Note that we also use [coiled](https://www.coiled.io/). + +## License +Code to generate these flat files is released alongside the rest of the pyafscgap project under the [BSD License](https://github.com/SchmidtDSE/afscgap/blob/main/LICENSE.md). See [data.pyafscgap.org](https://data.pyafscgap.org/) for further license details regarding prebuilt files. 
From 6aec7cb0de47b4e0cdf617fff45f56c2c01644af Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:23:45 +0000 Subject: [PATCH 03/36] Add combine shards. --- .github/workflows/build.yml | 4 +- afscgap/test/test_convert.py | 9 +++ snapshot/combine_shards.py | 127 ++++++++++++++++++++++++++++++++ snapshot/const.py | 28 +++++++ snapshot/norm_util.py | 39 ++++++++++ snapshot/test_combine_shards.py | 60 +++++++++++++++ snapshot/test_norm_util.py | 67 +++++++++++++++++ 7 files changed, 333 insertions(+), 1 deletion(-) create mode 100644 snapshot/combine_shards.py create mode 100644 snapshot/const.py create mode 100644 snapshot/norm_util.py create mode 100644 snapshot/test_combine_shards.py create mode 100644 snapshot/test_norm_util.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8e18e3f9..daa6318e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,8 +23,10 @@ jobs: run: nose2 - name: Unit test app run: nose2 --start-dir=afscgapviz + - name: Unit test snapshot + run: nose2 --start-dir=snapshot - name: Check types - run: mypy **/*.py + run: mypy **/*.py --check-untyped-defs - name: Check errors run: pyflakes **/*.py - name: Check style diff --git a/afscgap/test/test_convert.py b/afscgap/test/test_convert.py index 075f1961..08c2ac66 100644 --- a/afscgap/test/test_convert.py +++ b/afscgap/test/test_convert.py @@ -1,3 +1,12 @@ +""" +Tests for unit conversion. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" import unittest import unittest.mock diff --git a/snapshot/combine_shards.py b/snapshot/combine_shards.py new file mode 100644 index 00000000..19fd6319 --- /dev/null +++ b/snapshot/combine_shards.py @@ -0,0 +1,127 @@ +""" +Script to combine sharded indicies into a single index usable by the pyafscgap library. 
+ +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import io +import itertools +import os +import sys +import typing + +import boto3 +import fastavro + +import norm_util + +INDEX_SCHEMA = { + 'doc': 'Index from a value to an observations flat file.', + 'name': 'Index', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'value', 'type': ['string', 'long', 'double', 'null']}, + {'name': 'keys', 'type': { + 'type': 'array', + 'items': { + 'name': 'Key', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'haul', 'type': 'long'} + ] + } + }} + ] +} + +NUM_ARGS = 2 +USAGE_STR = 'python combine_shards.py [bucket] [key]' + + +def normalize_record(key: str, target: dict) -> dict: + """Normalize a record value. + + Normalize a record value so that it can be used to generate bins of haul keys, rounding or + truncating in an expected way. + + Args: + key: The property key for which a value should be normalized. + target: The record whose value should be updated. + + Returns: + The record after its value attribute has been normalized if required or target unmodified + if no changes made. 
+ """ + value = target['value'] + normalized = norm_util.normalize_value(key, value) + target['value'] = normalized + return target + + +def main(): + """Entry point for the shard combination script.""" + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + key = sys.argv[2] + + filename = key + '.txt' + loc = os.path.join('index_shards', filename) + with open(loc) as f: + batches = [int(x.strip()) for x in f] + + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def get_avro(full_loc: str) -> typing.Iterable[dict]: + """Get the contents of an Avro file as parsed dictionaries. + + Args: + full_loc: The location where the avro file can be found within the S3 bucket. + + Returns: + List of parsed Avro records with each element being one parsed Avro record. + """ + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + return list(fastavro.reader(target_buffer)) + + batch_locs = map(lambda x: 'index_sharded/%s_%d.avro' % (key, x), batches) + shards = map(get_avro, batch_locs) + combined = itertools.chain(*shards) + normalized = map(lambda x: normalize_record(key, x), combined) + + write_buffer = io.BytesIO() + fastavro.writer( + write_buffer, + INDEX_SCHEMA, + normalized + ) + write_buffer.seek(0) + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + output_loc = 'index/%s.avro' % key + s3_client.upload_fileobj(write_buffer, bucket, output_loc) + + +if __name__ == '__main__': + main() diff --git a/snapshot/const.py b/snapshot/const.py new file mode 100644 index 00000000..8078c15b --- /dev/null +++ b/snapshot/const.py @@ -0,0 +1,28 @@ +""" +Shared constants for flat file snapshot scripts. 
+ +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +REQUIRES_ROUNDING = { + 'latitude_dd_start', + 'longitude_dd_start', + 'latitude_dd_end', + 'longitude_dd_end', + 'bottom_temperature_c', + 'surface_temperature_c', + 'depth_m', + 'distance_fished_km', + 'duration_hr', + 'net_width_m', + 'net_height_m', + 'area_swept_km2', + 'cpue_kgkm2', + 'cpue_nokm2', + 'weight_kg', +} + +REQUIRES_DATE_ROUND = {'date_time'} diff --git a/snapshot/norm_util.py b/snapshot/norm_util.py new file mode 100644 index 00000000..6757007f --- /dev/null +++ b/snapshot/norm_util.py @@ -0,0 +1,39 @@ +""" +Logic to consistently normalize values for indicies. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import typing + +import const + +T = typing.TypeVar('T') + + +def normalize_value(key: str, value: T) -> T: + """Normalize a record value. + + Normalize a record value so that it can be used to generate bins of haul keys, rounding or + truncating in an expected way. + + Args: + key: The property key for which a value should be normalized. + target: The record whose value should be updated. + + Returns: + The record after its value attribute has been normalized if required or target unmodified + if no changes made. 
+ """ + if value is None: + return None + else: + if key in const.REQUIRES_ROUNDING: + return '%.2f' % value # type: ignore + elif key in const.REQUIRES_DATE_ROUND: + return value.split('T')[0] # type: ignore + else: + return value diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py new file mode 100644 index 00000000..375bf0c2 --- /dev/null +++ b/snapshot/test_combine_shards.py @@ -0,0 +1,60 @@ +""" +Tests for scripts to combine index shards. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import combine_shards + + +class NormUtilTests(unittest.TestCase): + + def test_unchanged(self): + normalized = combine_shards.normalize_record('test attr', {'value': 'test val'}) + self.assertEqual(normalized['value'], 'test val') + + def test_none(self): + normalized = combine_shards.normalize_record('depth_m', {'value': None}) + self.assertEqual(normalized['value'], None) + + def test_changed(self): + normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) + self.assertAlmostEqual(normalized, 1.24) + + def test_rounded_float_same(self): + normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) + normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.237}) + self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + + def test_rounded_float_different(self): + normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.234}) + normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.236}) + self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) + + def test_rounded_datetime_same(self): + normalized_1 = combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-31T13:25:50Z'} + ) + normalized_2 = 
combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-31T14:25:50Z'} + ) + self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + + def test_rounded_datetime_different(self): + normalized_1 = combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-31T13:25:50Z'} + ) + normalized_2 = combine_shards.normalize_record( + 'date_time', + {'value': '2025-12-30T14:25:50Z'} + ) + self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py new file mode 100644 index 00000000..e63414a5 --- /dev/null +++ b/snapshot/test_norm_util.py @@ -0,0 +1,67 @@ +""" +Tests for normalization utilities. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import norm_util + + +class NormUtilTests(unittest.TestCase): + + def test_normalize_record_unknown_none(self): + normalized = norm_util.normalize_value('test attr', 'test val') + self.assertEqual(normalized, 'test val') + + def test_normalize_record_known_none(self): + normalized = norm_util.normalize_value('depth_m', None) + self.assertEqual(normalized, None) + + def test_normalize_record_pass_float(self): + normalized = norm_util.normalize_value('depth_m', 1.23) + self.assertAlmostEqual(normalized, 1.23) + + def test_normalize_record_pass_datetime(self): + normalized = norm_util.normalize_value('date_time', '2025-01-13') + self.assertEqual(normalized, '2025-01-13') + + def test_normalize_record_round_float_same_up(self): + normalized_1 = norm_util.normalize_value('depth_m', 1.237) + normalized_2 = norm_util.normalize_value('depth_m', 1.236) + self.assertAlmostEqual(normalized_1, normalized_2) + + def test_normalize_record_round_float_same_down(self): + normalized_1 = 
norm_util.normalize_value('depth_m', 1.231) + normalized_2 = norm_util.normalize_value('depth_m', 1.229) + self.assertAlmostEqual(normalized_1, normalized_2) + + def test_normalize_record_round_float_different(self): + normalized_1 = norm_util.normalize_value('depth_m', 1.236) + normalized_2 = norm_util.normalize_value('depth_m', 1.234) + self.assertNotAlmostEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_valid_same(self): + normalized_1 = norm_util.normalize_value('date_time', '2025-01-13T12:25:50Z') + normalized_2 = norm_util.normalize_value('date_time', '2025-01-13T13:25:50Z') + self.assertEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_valid_different(self): + normalized_1 = norm_util.normalize_value('date_time', '2025-01-13T12:25:50Z') + normalized_2 = norm_util.normalize_value('date_time', '2025-01-14T13:25:50Z') + self.assertNotEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_invalid_same(self): + normalized_1 = norm_util.normalize_value('date_time', 'test') + normalized_2 = norm_util.normalize_value('date_time', 'test') + self.assertEqual(normalized_1, normalized_2) + + def test_normalize_record_round_datetime_invalid_different(self): + normalized_1 = norm_util.normalize_value('date_time', 'test') + normalized_2 = norm_util.normalize_value('date_time', 'other') + self.assertNotEqual(normalized_1, normalized_2) From 01d46648aa092191856191e1616e0f58984d9448 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:27:12 +0000 Subject: [PATCH 04/36] Fixes for #114 initial code. 
--- .github/workflows/build.yml | 2 ++ snapshot/requirements.txt | 6 ++++++ snapshot/test_combine_shards.py | 8 ++++---- snapshot/test_norm_util.py | 14 +++++++------- 4 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 snapshot/requirements.txt diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index daa6318e..d2c3ea95 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,6 +17,8 @@ jobs: run: pip install -e .[dev] - name: Install dev dependencies for app run: pip install -r afscgapviz/requirements.txt + - name: Install dev dependencies for snapshot + run: pip install -r snapshot/requirements.txt - name: Install afscgap run: pip install . - name: Unit tests main diff --git a/snapshot/requirements.txt b/snapshot/requirements.txt new file mode 100644 index 00000000..530277ea --- /dev/null +++ b/snapshot/requirements.txt @@ -0,0 +1,6 @@ +bokeh!=3.0.*,>=2.4.2 +boto3==1.35.54 +coiled==1.59.0 +fastavro==1.9.7 +requests==2.32.3 +toolz==1.0.0 diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index 375bf0c2..e88ed30b 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -30,12 +30,12 @@ def test_changed(self): def test_rounded_float_same(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.237}) - self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertAlmostEqual(float(normalized_1['value']), float(normalized_2['value'])) def test_rounded_float_different(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.234}) normalized_2 = combine_shards.normalize_record('depth_m', {'value': 1.236}) - self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertNotAlmostEqual(float(normalized_1['value']), float(normalized_2['value'])) def test_rounded_datetime_same(self): normalized_1 = 
combine_shards.normalize_record( @@ -46,7 +46,7 @@ def test_rounded_datetime_same(self): 'date_time', {'value': '2025-12-31T14:25:50Z'} ) - self.assertAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertEqual(normalized_1['value'], normalized_2['value']) def test_rounded_datetime_different(self): normalized_1 = combine_shards.normalize_record( @@ -57,4 +57,4 @@ def test_rounded_datetime_different(self): 'date_time', {'value': '2025-12-30T14:25:50Z'} ) - self.assertNotAlmostEqual(normalized_1['value'], normalized_2['value']) + self.assertNotEqual(normalized_1['value'], normalized_2['value']) diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py index e63414a5..376524ad 100644 --- a/snapshot/test_norm_util.py +++ b/snapshot/test_norm_util.py @@ -24,7 +24,7 @@ def test_normalize_record_known_none(self): self.assertEqual(normalized, None) def test_normalize_record_pass_float(self): - normalized = norm_util.normalize_value('depth_m', 1.23) + normalized = float(norm_util.normalize_value('depth_m', 1.23)) self.assertAlmostEqual(normalized, 1.23) def test_normalize_record_pass_datetime(self): @@ -32,18 +32,18 @@ def test_normalize_record_pass_datetime(self): self.assertEqual(normalized, '2025-01-13') def test_normalize_record_round_float_same_up(self): - normalized_1 = norm_util.normalize_value('depth_m', 1.237) - normalized_2 = norm_util.normalize_value('depth_m', 1.236) + normalized_1 = float(norm_util.normalize_value('depth_m', 1.237)) + normalized_2 = float(norm_util.normalize_value('depth_m', 1.236)) self.assertAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_float_same_down(self): - normalized_1 = norm_util.normalize_value('depth_m', 1.231) - normalized_2 = norm_util.normalize_value('depth_m', 1.229) + normalized_1 = float(norm_util.normalize_value('depth_m', 1.231)) + normalized_2 = float(norm_util.normalize_value('depth_m', 1.229)) self.assertAlmostEqual(normalized_1, normalized_2) def 
test_normalize_record_round_float_different(self): - normalized_1 = norm_util.normalize_value('depth_m', 1.236) - normalized_2 = norm_util.normalize_value('depth_m', 1.234) + normalized_1 = float(norm_util.normalize_value('depth_m', 1.236)) + normalized_2 = float(norm_util.normalize_value('depth_m', 1.234)) self.assertNotAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_datetime_valid_same(self): From 8312d7a23a4667c6f45b368fea1342803e6dcd2c Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:28:59 +0000 Subject: [PATCH 05/36] Fix test_changed in test_combine_shards. --- snapshot/test_combine_shards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index e88ed30b..7969c3a4 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -25,7 +25,7 @@ def test_none(self): def test_changed(self): normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) - self.assertAlmostEqual(normalized, 1.24) + self.assertAlmostEqual(float(normalized), 1.24) def test_rounded_float_same(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) From d5d2373e8b76b433906f6190ac91a63f124bd22d Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:30:30 +0000 Subject: [PATCH 06/36] Additional bash scripts. 
--- snapshot/combine_shards.sh | 76 ++++++++++++++++++++++++++++++++++++++ snapshot/execute_all.sh | 25 +++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 snapshot/combine_shards.sh create mode 100644 snapshot/execute_all.sh diff --git a/snapshot/combine_shards.sh b/snapshot/combine_shards.sh new file mode 100644 index 00000000..1f5402b3 --- /dev/null +++ b/snapshot/combine_shards.sh @@ -0,0 +1,76 @@ +echo "area_swept_km2" +python combine_shards.py $BUCKET_NAME area_swept_km2 +echo "bottom_temperature_c" +python combine_shards.py $BUCKET_NAME bottom_temperature_c +echo "common_name" +python combine_shards.py $BUCKET_NAME common_name +echo "count" +python combine_shards.py $BUCKET_NAME count +echo "cpue_kgkm2" +python combine_shards.py $BUCKET_NAME cpue_kgkm2 +echo "cpue_nokm2" +python combine_shards.py $BUCKET_NAME cpue_nokm2 +echo "cruise" +python combine_shards.py $BUCKET_NAME cruise +echo "cruisejoin" +python combine_shards.py $BUCKET_NAME cruisejoin +echo "date_time" +python combine_shards.py $BUCKET_NAME date_time +echo "depth_m" +python combine_shards.py $BUCKET_NAME depth_m +echo "distance_fished_km" +python combine_shards.py $BUCKET_NAME distance_fished_km +echo "duration_hr" +python combine_shards.py $BUCKET_NAME duration_hr +echo "haul" +python combine_shards.py $BUCKET_NAME haul +echo "hauljoin" +python combine_shards.py $BUCKET_NAME hauljoin +echo "id_rank" +python combine_shards.py $BUCKET_NAME id_rank +echo "latitude_dd_end" +python combine_shards.py $BUCKET_NAME latitude_dd_end +echo "latitude_dd_start" +python combine_shards.py $BUCKET_NAME latitude_dd_start +echo "longitude_dd_end" +python combine_shards.py $BUCKET_NAME longitude_dd_end +echo "longitude_dd_start" +python combine_shards.py $BUCKET_NAME longitude_dd_start +echo "net_height_m" +python combine_shards.py $BUCKET_NAME net_height_m +echo "net_width_m" +python combine_shards.py $BUCKET_NAME net_width_m +echo "performance" +python combine_shards.py $BUCKET_NAME performance 
+echo "requirements" +python combine_shards.py $BUCKET_NAME requirements +echo "scientific_name" +python combine_shards.py $BUCKET_NAME scientific_name +echo "species_code" +python combine_shards.py $BUCKET_NAME species_code +echo "srvy" +python combine_shards.py $BUCKET_NAME srvy +echo "station" +python combine_shards.py $BUCKET_NAME station +echo "stratum" +python combine_shards.py $BUCKET_NAME stratum +echo "surface_temperature_c" +python combine_shards.py $BUCKET_NAME surface_temperature_c +echo "survey" +python combine_shards.py $BUCKET_NAME survey +echo "survey_definition_id" +python combine_shards.py $BUCKET_NAME survey_definition_id +echo "survey_name" +python combine_shards.py $BUCKET_NAME survey_name +echo "taxon_confidence" +python combine_shards.py $BUCKET_NAME taxon_confidence +echo "variables" +python combine_shards.py $BUCKET_NAME variables +echo "vessel_id" +python combine_shards.py $BUCKET_NAME vessel_id +echo "vessel_name" +python combine_shards.py $BUCKET_NAME vessel_name +echo "weight_kg" +python combine_shards.py $BUCKET_NAME weight_kg +echo "year" +python combine_shards.py $BUCKET_NAME year diff --git a/snapshot/execute_all.sh b/snapshot/execute_all.sh new file mode 100644 index 00000000..00847db4 --- /dev/null +++ b/snapshot/execute_all.sh @@ -0,0 +1,25 @@ +echo "Starting..." 
>> status.txt + +echo "== Get all years ==" +echo "== Get all years ==" >> status.txt +bash get_all_years.sh +echo "== Render flat ==" +echo "== Render flat ==" >> status.txt +python3 render_flat.py $BUCKET_NAME written_paths.csv +echo "== Index data ==" +echo "== Index data ==" >> status.txt +bash index_data.sh +echo "== Combine shards ==" +echo "== Combine shards ==" >> status.txt +bash combine_shards.sh +echo "== Write main ==" +echo "== Write main ==" >> status.txt +python3 write_main_index.py $BUCKET_NAME +echo "== Move index ==" +echo "== Move index ==" >> status.txt +python3 move_afscgap.py index +echo "== Move joined ==" +echo "== Move joined ==" >> status.txt +python3 move_afscgap.py joined + +echo "Done." >> status.txt From 7c02f93382fa59b7882c465f4fef14a89a010873 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:32:11 +0000 Subject: [PATCH 07/36] Fix in test for #114. --- snapshot/test_combine_shards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index 7969c3a4..d00cdd75 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -25,7 +25,7 @@ def test_none(self): def test_changed(self): normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) - self.assertAlmostEqual(float(normalized), 1.24) + self.assertAlmostEqual(float(normalized['value']), 1.24) def test_rounded_float_same(self): normalized_1 = combine_shards.normalize_record('depth_m', {'value': 1.236}) From 559166c8ddc86582ac1b18425c7f5a5bb98cb5ae Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:35:08 +0000 Subject: [PATCH 08/36] Type fixes for #114. 
--- snapshot/combine_shards.py | 2 +- snapshot/norm_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshot/combine_shards.py b/snapshot/combine_shards.py index 19fd6319..9d42f6f0 100644 --- a/snapshot/combine_shards.py +++ b/snapshot/combine_shards.py @@ -99,7 +99,7 @@ def get_avro(full_loc: str) -> typing.Iterable[dict]: target_buffer = io.BytesIO() s3_client.download_fileobj(bucket, full_loc, target_buffer) target_buffer.seek(0) - return list(fastavro.reader(target_buffer)) + return list(fastavro.reader(target_buffer)) # type: ignore batch_locs = map(lambda x: 'index_sharded/%s_%d.avro' % (key, x), batches) shards = map(get_avro, batch_locs) diff --git a/snapshot/norm_util.py b/snapshot/norm_util.py index 6757007f..0bc9f10f 100644 --- a/snapshot/norm_util.py +++ b/snapshot/norm_util.py @@ -14,7 +14,7 @@ T = typing.TypeVar('T') -def normalize_value(key: str, value: T) -> T: +def normalize_value(key: str, value: typing.Optional[T]) -> typing.Optional[T]: """Normalize a record value. Normalize a record value so that it can be used to generate bins of haul keys, rounding or From 400643a0be28057b3796487d08a4bf6cc889716e Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:38:59 +0000 Subject: [PATCH 09/36] Viz updates for expanded types checks. 
--- afscgapviz/afscgapviz.py | 12 ++++++------ afscgapviz/build_database.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/afscgapviz/afscgapviz.py b/afscgapviz/afscgapviz.py index 3a8519ba..22058117 100644 --- a/afscgapviz/afscgapviz.py +++ b/afscgapviz/afscgapviz.py @@ -265,7 +265,7 @@ def render_page(): with conn_generator() as con: return flask.render_template( 'viz.html', - displays=get_display_info(con, state)['state'], + displays=get_display_info(con, state)['state'], # type: ignore get_species_select_content=get_species_select_content ) @@ -387,7 +387,7 @@ def download_geohashes(): else: base_sql = sql_util.get_sql('query') query_sql = base_sql % (geohash_size + 1, species_filter[0]) - query_args = (year, survey, species_filter[1]) + query_args = (year, survey, species_filter[1]) # type: ignore output_io = io.StringIO() writer = csv.DictWriter( @@ -416,7 +416,7 @@ def download_geohashes(): writer.writerows(results_dict_final) full_filename_pieces = comparison_filename_pieces + filename_pieces - filename_spaces = '_'.join(full_filename_pieces) + filename_spaces = '_'.join(full_filename_pieces) # type: ignore filename = filename_spaces.replace(' ', '_') if FILENAME_REGEX.match(filename) is None: @@ -556,7 +556,7 @@ def try_float(target: str) -> float: species_filter[0], geohash_size + 1 ) - query_args = (year, survey, species_filter[1]) + query_args = (year, survey, species_filter[1]) # type: ignore with conn_generator() as connection: cursor = connection.cursor() @@ -586,7 +586,7 @@ def try_float(target: str) -> float: max_temp, first_cpue, second_cpue - ) = result_float + ) = result_float # type: ignore ret_object = { 'cpue': { @@ -602,7 +602,7 @@ def try_float(target: str) -> float: } if is_comparison: - ret_object['cpue']['second'] = { + ret_object['cpue']['second'] = { # type: ignore 'name': other_species_filter[1], 'year': other_year, 'value': second_cpue diff --git a/afscgapviz/build_database.py b/afscgapviz/build_database.py 
index b83ebda6..8923d4a6 100644 --- a/afscgapviz/build_database.py +++ b/afscgapviz/build_database.py @@ -327,8 +327,8 @@ def download_main(args): for year in years: for survey in SURVEYS: - with connection as cursor: - download_and_persist_year(survey, year, cursor, geohash_size) + with connection as cursor: # type: ignore + download_and_persist_year(survey, year, cursor, geohash_size) # type: ignore print('Completed %d for %s.' % (year, survey)) time.sleep(SLEEP_TIME) From 4ea9b4b092ea9463544966fe2bbc95db5ac20409 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:44:04 +0000 Subject: [PATCH 10/36] Additional type fixes #114. --- afscgapviz/afscgapviz.py | 2 +- snapshot/combine_shards.py | 2 +- snapshot/test_norm_util.py | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/afscgapviz/afscgapviz.py b/afscgapviz/afscgapviz.py index 22058117..fb6dbb27 100644 --- a/afscgapviz/afscgapviz.py +++ b/afscgapviz/afscgapviz.py @@ -14,7 +14,7 @@ import sqlite3 import typing -import flask +import flask # type: ignore import data_util import model diff --git a/snapshot/combine_shards.py b/snapshot/combine_shards.py index 9d42f6f0..310de248 100644 --- a/snapshot/combine_shards.py +++ b/snapshot/combine_shards.py @@ -13,7 +13,7 @@ import sys import typing -import boto3 +import boto3 # type: ignore import fastavro import norm_util diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py index 376524ad..1484081d 100644 --- a/snapshot/test_norm_util.py +++ b/snapshot/test_norm_util.py @@ -24,7 +24,7 @@ def test_normalize_record_known_none(self): self.assertEqual(normalized, None) def test_normalize_record_pass_float(self): - normalized = float(norm_util.normalize_value('depth_m', 1.23)) + normalized = self._force_float(norm_util.normalize_value('depth_m', 1.23)) self.assertAlmostEqual(normalized, 1.23) def test_normalize_record_pass_datetime(self): @@ -32,18 +32,18 @@ def test_normalize_record_pass_datetime(self): 
self.assertEqual(normalized, '2025-01-13') def test_normalize_record_round_float_same_up(self): - normalized_1 = float(norm_util.normalize_value('depth_m', 1.237)) - normalized_2 = float(norm_util.normalize_value('depth_m', 1.236)) + normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.237)) + normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.236)) self.assertAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_float_same_down(self): - normalized_1 = float(norm_util.normalize_value('depth_m', 1.231)) - normalized_2 = float(norm_util.normalize_value('depth_m', 1.229)) + normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.231)) + normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.229)) self.assertAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_float_different(self): - normalized_1 = float(norm_util.normalize_value('depth_m', 1.236)) - normalized_2 = float(norm_util.normalize_value('depth_m', 1.234)) + normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.236)) + normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.234)) self.assertNotAlmostEqual(normalized_1, normalized_2) def test_normalize_record_round_datetime_valid_same(self): @@ -65,3 +65,7 @@ def test_normalize_record_round_datetime_invalid_different(self): normalized_1 = norm_util.normalize_value('date_time', 'test') normalized_2 = norm_util.normalize_value('date_time', 'other') self.assertNotEqual(normalized_1, normalized_2) + + def _force_float(self, value) -> float: + assert value is not None + return float(value) From 3368e5ca7fa0103ca455f55079ff241a44e50c3b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 22:45:49 +0000 Subject: [PATCH 11/36] Style fixes for #114. 
--- snapshot/test_combine_shards.py | 2 +- snapshot/test_norm_util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index d00cdd75..4aee0c86 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -22,7 +22,7 @@ def test_unchanged(self): def test_none(self): normalized = combine_shards.normalize_record('depth_m', {'value': None}) self.assertEqual(normalized['value'], None) - + def test_changed(self): normalized = combine_shards.normalize_record('depth_m', {'value': 1.236}) self.assertAlmostEqual(float(normalized['value']), 1.24) diff --git a/snapshot/test_norm_util.py b/snapshot/test_norm_util.py index 1484081d..a9d18e5d 100644 --- a/snapshot/test_norm_util.py +++ b/snapshot/test_norm_util.py @@ -18,7 +18,7 @@ class NormUtilTests(unittest.TestCase): def test_normalize_record_unknown_none(self): normalized = norm_util.normalize_value('test attr', 'test val') self.assertEqual(normalized, 'test val') - + def test_normalize_record_known_none(self): normalized = norm_util.normalize_value('depth_m', None) self.assertEqual(normalized, None) @@ -40,7 +40,7 @@ def test_normalize_record_round_float_same_down(self): normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.231)) normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.229)) self.assertAlmostEqual(normalized_1, normalized_2) - + def test_normalize_record_round_float_different(self): normalized_1 = self._force_float(norm_util.normalize_value('depth_m', 1.236)) normalized_2 = self._force_float(norm_util.normalize_value('depth_m', 1.234)) From efefb1a5775e06aed90b2da7bbf9aa0610398796 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:36:02 +0000 Subject: [PATCH 12/36] Add generate_indicies. 
--- snapshot/const.py | 2 + snapshot/execute_all.sh | 6 - snapshot/generate_indicies.py | 457 ++++++++++++++++++++++++++++++++++ 3 files changed, 459 insertions(+), 6 deletions(-) create mode 100644 snapshot/generate_indicies.py diff --git a/snapshot/const.py b/snapshot/const.py index 8078c15b..619c8bbe 100644 --- a/snapshot/const.py +++ b/snapshot/const.py @@ -26,3 +26,5 @@ } REQUIRES_DATE_ROUND = {'date_time'} + +ZEROABLE_FIELDS = ['cpue_kgkm2', 'cpue_nokm2', 'weight_kg', 'count'] diff --git a/snapshot/execute_all.sh b/snapshot/execute_all.sh index 00847db4..b3081ff9 100644 --- a/snapshot/execute_all.sh +++ b/snapshot/execute_all.sh @@ -15,11 +15,5 @@ bash combine_shards.sh echo "== Write main ==" echo "== Write main ==" >> status.txt python3 write_main_index.py $BUCKET_NAME -echo "== Move index ==" -echo "== Move index ==" >> status.txt -python3 move_afscgap.py index -echo "== Move joined ==" -echo "== Move joined ==" >> status.txt -python3 move_afscgap.py joined echo "Done." >> status.txt diff --git a/snapshot/generate_indicies.py b/snapshot/generate_indicies.py new file mode 100644 index 00000000..230965c0 --- /dev/null +++ b/snapshot/generate_indicies.py @@ -0,0 +1,457 @@ +""" +Script to generate sharded indicies which indicate in which hauls values can be found. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import itertools +import os +import sys +import typing + +import boto3 # type: ignore +import coiled # type: ignore +import dask # type: ignore +import dask.bag # type: ignore + +import const +import norm_util + +USAGE_STR = 'python render_flat.py [bucket] [keys] [terminate]' +NUM_ARGS = 3 + +REQUIRES_FLAT = { + 'performance', + 'cruise', + 'cruisejoin', + 'hauljoin', + 'haul' +} + +IGNORE_ZEROS = { + 'species_code', + 'scientific_name', + 'common_name' +} + +T = typing.TypeVar('T') + + +def build_index_record(record: dict, key: str, year: int, survey: str, haul: int) -> dict: + """Build an index record. + + Args: + record: The record to index. + key: The key (attribute name) being indexed. + year: The year of the haul represented by the record. + survey: The survey in which the haul took place. + haul: The ID of the haul which produced the data to index. + + Returns: + Dictionary describing the index record which can be combined through a reduce operation. + """ + value = record[key] + key_pieces = [year, survey, haul] + key_pieces_str = map(lambda x: str(x), key_pieces) + key_output = '\t'.join(key_pieces_str) + return { + 'value': value, + 'keys': set([key_output]) + } + + +def is_non_zero(target: dict) -> bool: + """Determine if the record is a zeroed record. + + Determine if the record is a zeroed record, potentially indicating absence according to the + ZEROABLE_FIELDS. + + Args: + target: The record to check. + + Returns: + False if the record is zeroed and true otherwise. 
+ """ + + def is_field_non_zero(field: str) -> bool: + value = target[field] + return (value is not None) and (value > 0) + + fields = const.ZEROABLE_FIELDS + flags = map(is_field_non_zero, fields) + flags_positive = filter(lambda x: x is True, flags) + num_flags_positive = sum(map(lambda x: 1, flags_positive)) + return num_flags_positive > 0 + + +def process_file(bucket: str, year: int, survey: str, haul: int, key: str) -> typing.List[dict]: + """Process a single flattened joined file remotely. + + Args: + bucket: The name of the bucket where the file can be found. + year: The year of the haul represented in the file. + survey: The survey in which the haul took place for the file. + haul: The ID of the haul which produced the data to process. + key: The key (attribute name) being indexed. + + Returns: + Dictionary with index records from the given file. + """ + import io + import os + + import botocore # type: ignore + import boto3 # type: ignore + import fastavro + + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def get_avro(full_loc: str) -> typing.Optional[typing.List[dict]]: + """Get all the records from a file within S3. + + Args: + full_loc: The location (path) within the S3 bucket. + + Returns: + List of records found at the given location or None if there is an error in reading like + the file is not found. + """ + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + return list(fastavro.reader(target_buffer)) # type: ignore + + def check_file_exists(full_loc: str) -> bool: + """Check that a file exists in S3. + + Args: + full_loc: The location (path) within the S3 bucket. + + Returns: + True if the file is found and false otherwise. 
+ """ + try: + s3_client.head_object(bucket, full_loc) + return True + except botocore.exceptions.ClientError as e: + error_code = e.response['Error']['Code'] + error_code_cast = int(error_code) + if error_code_cast == 404: + return False + else: + raise RuntimeError('Unexpected S3 head code: %d' % error_code) + + def infer_index_record(record: dict) -> dict: + """Build an index record. + + Args: + record: The record to index. + + Returns: + Dictionary describing the index record which can be combined through a reduce operation. + """ + return build_index_record(record, key, year, survey, haul) + + template_vals = (year, survey, haul) + flat_loc = 'joined/%d_%s_%d.avro' % template_vals + + if not check_file_exists(flat_loc): + return [] + + flat_records_all = get_avro(flat_loc) + flat_records = filter(lambda x: x is not None, flat_records_all) # type: ignore + flat_records_allowed = get_flat_records_allowed(flat_records, key) + + index_records = map(infer_index_record, flat_records_allowed) + + return list(index_records) + + +def build_output_record(target: dict) -> dict: + """Convert an index record to a JSON serializable dictionary which can be written to S3. + + Args: + target: The index record to be converted for serialization. + + Returns: + A JSON-serializable version of target. + """ + + def process_key(key_str: str) -> dict: + """Parse a key into a dictionary which will be saved as an object in JSON. + + Args: + key_str: The string description of the key to parse. + + Returns: + The key string interpreted as a dictionary with separated fields. + """ + key_pieces = key_str.split('\t') + year = int(key_pieces[0]) + survey = key_pieces[1] + haul = int(key_pieces[2]) + return {'year': year, 'survey': survey, 'haul': haul} + + return { + 'value': target['value'], + 'keys': [process_key(x) for x in target['keys']] + } + + +def get_observations_meta(bucket: str) -> typing.Iterable[dict]: + """Get keys for all available joined data inside a bucket. 
+ + Args: + bucket: The bucket at which the data are to be found. + + Returns: + Records of all data available within the bucket inside the "joined" directory. + """ + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def make_haul_metadata_record(path: str) -> dict: + """Create a key object (dict) given a path to data for that haul. + + Args: + path: The path within the s3 bucket. + + Returns: + Dictionary representing a haul key for the given path. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='joined/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + return map(make_haul_metadata_record, keys) + + +def write_sample(key: str, bucket: str, sample: typing.Iterable[dict]) -> typing.Optional[int]: + """Write an index shard. + + Args: + key: The key (attribute name) being indexed. + bucket: The bucket in which the shard should be written. + sample: The contents of the shared to be written. + + Returns: + Random index number given to the shard. Client code should ensure that there were no + collisions. 
+ """ + import io + import os + import random + + import boto3 + import fastavro + + INDEX_SCHEMA = { + 'doc': 'Index from a value to an observations flat file.', + 'name': 'Index', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'value', 'type': ['string', 'long', 'double', 'null']}, + {'name': 'keys', 'type': { + 'type': 'array', + 'items': { + 'name': 'Key', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'haul', 'type': 'long'} + ] + } + }} + ] + } + + sample_realized = list(sample) + if len(sample_realized) == 0: + return None + + batch = random.randint(0, 1000000) + + access_key = os.environ.get('AWS_ACCESS_KEY', '') + access_secret = os.environ.get('AWS_ACCESS_SECRET', '') + + target_buffer = io.BytesIO() + fastavro.writer( + target_buffer, + INDEX_SCHEMA, + sample_realized + ) + target_buffer.seek(0) + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + output_loc = 'index_sharded/%s_%d.avro' % (key, batch) + s3_client.upload_fileobj(target_buffer, bucket, output_loc) + return batch + + +def normalize_value(target: dict, key: str) -> typing.Optional[T]: + """Normalize a value for indexing. + + Args: + target: Record from which a normalized value for indexing is requested. + key: The name of the attribute being indexed. + + Returns: + Normalized value to use for indexing. + """ + value = target['value'] + return norm_util.normalize_value(key, value) + + +def combine_records(a: dict, b: dict) -> dict: + """Combine index records. + + Args: + a: The first index record to combine. + b: The second index record to combine. + + Returns: + New index record indicating the cobination of the two records. 
+ """ + assert a['value'] == b['value'] + return {'value': a['value'], 'keys': a['keys'].union(b['keys'])} + + +def get_flat_records_allowed(records: typing.Iterable[dict], key: str) -> typing.Iterable[dict]: + """Get records which are permitted to be indexed. + + Args: + records: Candidate records. + key: The name of the attribute being indexed. + + Returns: + Records which can be indexed, excluding zero catch records in some circumstances. + """ + if key in IGNORE_ZEROS: + return filter(is_non_zero, records) + else: + return records + + +def main(): + """Entry point for the shareded index generation script.""" + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + keys = sys.argv[2].split(',') + terminate = sys.argv[3].lower() in ['y', 'yes', 't', 'true', '1'] + hauls_meta = list(get_observations_meta(bucket)) + + access_key = os.environ.get('AWS_ACCESS_KEY', '') + access_secret = os.environ.get('AWS_ACCESS_SECRET', '') + cluster = coiled.Cluster( + name='DseProcessAfscgap', + n_workers=100, + worker_vm_types=['m7a.medium'], + scheduler_vm_types=['m7a.medium'], + environ={ + 'AWS_ACCESS_KEY': access_key, + 'AWS_ACCESS_SECRET': access_secret + } + ) + client = cluster.get_client() + + def execute_for_key(key: str): + """Execute sharded index generation for a single attribute. + + Args: + key: The name of the attribute to be indexed. + """ + hauls_meta_realized = dask.bag.from_sequence(hauls_meta) + index_records_nest = hauls_meta_realized.map( + lambda x: process_file( + bucket, + x['year'], + x['survey'], + x['haul'], + key + ) + ) + index_records = index_records_nest.flatten() + + def key_record(target: dict): + """Get a normalized value for indexing. + + Args: + target: Record from which a normalized value for indexing is requested. + key: The name of the attribute being indexed. + + Returns: + Normalized value to use for indexing. 
+ """ + return normalize_value(target, key) + + if key in REQUIRES_FLAT: + index_records_output = index_records.map(build_output_record) + else: + index_records_grouped_nest = index_records.foldby( + key=key_record, + binop=combine_records + ) + index_records_grouped = index_records_grouped_nest.map(lambda x: x[1]) + index_records_output = index_records_grouped.map(build_output_record) + + repartitioned = index_records_output.repartition(npartitions=20) + incidies_future = repartitioned.map_partitions( + lambda x: write_sample(key, bucket, x) + ) + + indicies_all = incidies_future.compute(scheduler=client) + indicies = filter(lambda x: x is not None, indicies_all) + indicies_strs = list(map(lambda x: str(x), indicies)) + assert len(indicies_strs) == len(set(indicies_strs)) + + loc = os.path.join('index_shards', key + '.txt') + with open(loc, 'w') as f: + f.write('\n'.join(indicies_strs)) + + for key in keys: + print('Executing for %s...' % key) + execute_for_key(key) + + if terminate: + cluster.close(force_shutdown=True) + + +if __name__ == '__main__': + main() From 29158049dda540aafbd2d03283aa42768fcf19e9 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:36:42 +0000 Subject: [PATCH 13/36] Move some values to const for #114. 
--- snapshot/const.py | 14 ++++++++++++++ snapshot/generate_indicies.py | 18 ++---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/snapshot/const.py b/snapshot/const.py index 619c8bbe..2e9218fa 100644 --- a/snapshot/const.py +++ b/snapshot/const.py @@ -28,3 +28,17 @@ REQUIRES_DATE_ROUND = {'date_time'} ZEROABLE_FIELDS = ['cpue_kgkm2', 'cpue_nokm2', 'weight_kg', 'count'] + +REQUIRES_FLAT = { + 'performance', + 'cruise', + 'cruisejoin', + 'hauljoin', + 'haul' +} + +IGNORE_ZEROS = { + 'species_code', + 'scientific_name', + 'common_name' +} diff --git a/snapshot/generate_indicies.py b/snapshot/generate_indicies.py index 230965c0..115ef0e5 100644 --- a/snapshot/generate_indicies.py +++ b/snapshot/generate_indicies.py @@ -23,20 +23,6 @@ USAGE_STR = 'python render_flat.py [bucket] [keys] [terminate]' NUM_ARGS = 3 -REQUIRES_FLAT = { - 'performance', - 'cruise', - 'cruisejoin', - 'hauljoin', - 'haul' -} - -IGNORE_ZEROS = { - 'species_code', - 'scientific_name', - 'common_name' -} - T = typing.TypeVar('T') @@ -360,7 +346,7 @@ def get_flat_records_allowed(records: typing.Iterable[dict], key: str) -> typing Returns: Records which can be indexed, excluding zero catch records in some circumstances. """ - if key in IGNORE_ZEROS: + if key in const.IGNORE_ZEROS: return filter(is_non_zero, records) else: return records @@ -421,7 +407,7 @@ def key_record(target: dict): """ return normalize_value(target, key) - if key in REQUIRES_FLAT: + if key in const.REQUIRES_FLAT: index_records_output = index_records.map(build_output_record) else: index_records_grouped_nest = index_records.foldby( From 6ad249b6eaa836efd1ab86753e95b3ac58e32627 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:40:25 +0000 Subject: [PATCH 14/36] Add check for presence only index. 
--- afscgap/flat_index_util.py | 3 +++ afscgap/test/test_flat_index_util.py | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index dbaad69a..0a8cf7f7 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -497,6 +497,9 @@ def make_filters(field: str, param: afscgap.param.Param, if param.get_is_ignorable(): return [] + if presence_only and field in PRESENCE_ONLY_FIELDS: + return [] + filter_type = param.get_filter_type() if filter_type == 'empty': return [] diff --git a/afscgap/test/test_flat_index_util.py b/afscgap/test/test_flat_index_util.py index b97c7ed6..902ac655 100644 --- a/afscgap/test/test_flat_index_util.py +++ b/afscgap/test/test_flat_index_util.py @@ -365,6 +365,18 @@ def test_string_false(self): filters = afscgap.flat_index_util.make_filters('common_name', param, True) self.assertEqual(len(filters), 1) self.assertFalse(filters[0].get_matches('other')) + + def test_string_true_presence_only(self): + param = afscgap.param.StrEqualsParam('test') + filters = afscgap.flat_index_util.make_filters('common_name', param, False) + self.assertEqual(len(filters), 1) + self.assertTrue(filters[0].get_matches('test')) + + def test_string_false_presence_only(self): + param = afscgap.param.StrEqualsParam('test') + filters = afscgap.flat_index_util.make_filters('common_name', param, False) + self.assertEqual(len(filters), 1) + self.assertTrue(filters[0].get_matches('other')) def test_int_true(self): param = afscgap.param.IntEqualsParam(1) From cdd9916fb8dbb159af39e0612896f2583f9f8f72 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:42:48 +0000 Subject: [PATCH 15/36] Additional fixes for #115. 
--- afscgap/flat_index_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index 0a8cf7f7..ba35a000 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -457,6 +457,7 @@ def get_matches(self, value: MATCH_TARGET) -> bool: FIELD_DATA_TYPE_OVERRIDES = {'date_time': 'datetime'} +# These fields, when indexed, ignore zero values. If not presence only, these need to be included. PRESENCE_ONLY_FIELDS = {'species_code', 'common_name', 'scientific_name'} @@ -497,7 +498,7 @@ def make_filters(field: str, param: afscgap.param.Param, if param.get_is_ignorable(): return [] - if presence_only and field in PRESENCE_ONLY_FIELDS: + if (not presence_only) and (field in PRESENCE_ONLY_FIELDS): return [] filter_type = param.get_filter_type() From d3043e5f3060a5b6a696559b16c37ca9f5d5529b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:43:35 +0000 Subject: [PATCH 16/36] Add additional explanation for #115. --- afscgap/flat_index_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index ba35a000..c0d7dc80 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -498,6 +498,8 @@ def make_filters(field: str, param: afscgap.param.Param, if param.get_is_ignorable(): return [] + # If the field index is presence only and this isn't a presence only request, the index must be + # ignored (cannot be used to pre-filter results). if (not presence_only) and (field in PRESENCE_ONLY_FIELDS): return [] From 6ca70a6ec45329758bbda95e32de9beb56515db5 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Fri, 3 Jan 2025 23:45:14 +0000 Subject: [PATCH 17/36] Update tests for #115. 
--- afscgap/test/test_flat_index_util.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/afscgap/test/test_flat_index_util.py b/afscgap/test/test_flat_index_util.py index 902ac655..f23ed24b 100644 --- a/afscgap/test/test_flat_index_util.py +++ b/afscgap/test/test_flat_index_util.py @@ -366,17 +366,10 @@ def test_string_false(self): self.assertEqual(len(filters), 1) self.assertFalse(filters[0].get_matches('other')) - def test_string_true_presence_only(self): + def test_presence_only(self): param = afscgap.param.StrEqualsParam('test') filters = afscgap.flat_index_util.make_filters('common_name', param, False) - self.assertEqual(len(filters), 1) - self.assertTrue(filters[0].get_matches('test')) - - def test_string_false_presence_only(self): - param = afscgap.param.StrEqualsParam('test') - filters = afscgap.flat_index_util.make_filters('common_name', param, False) - self.assertEqual(len(filters), 1) - self.assertTrue(filters[0].get_matches('other')) + self.assertEqual(len(filters), 0) def test_int_true(self): param = afscgap.param.IntEqualsParam(1) From f540475726651c48b0d657d3c1b5645e0ba707fe Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 00:23:33 +0000 Subject: [PATCH 18/36] Update coverage targets. --- CONTRIBUTING.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fa559f8..09214712 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,9 +14,9 @@ Thank you for your contribution. We appreciate the community's help in any capac In order to ensure the conceptual integrity and readability of our code, we have a few guidelines for Python code under the `afscgap` library itself: - Please try to follow the conventions laid out by the project in existing code. In cases of ambiguity, please refer to the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) where possible. 
- - Tests are encouraged and we aim for 80% coverage where feasible. - - Type hints are encouraged and we aim for 80% coverage where feasible. - - Docstrings are encouraged and we aim for 80% coverage. Please use the [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) to ensure that our automated documentation system can use your work. + - Tests are encouraged. + - Type hints are encouraged. + - Docstrings are encouraged. Please use the [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) to ensure that our automated documentation system can use your work. - Please check that you have no mypy errors when contributing. - Please check that you have no linting (pycodestyle, pyflakes) errors when contributing. - As contributors may be periodic, please do not re-write history / squash commits for ease of fast forward. @@ -27,6 +27,8 @@ The `afscgap` library itself requires a very high rigor. For other sections incl Of course, **do not worry if you aren't sure that you met all of our the guidelines!** We encourage pull requests and are happy to work through any necessary outstanding tasks with you. +Previous versions of this guide indicated specific coverage targets but those are removed for the `2.x` release as the codebase spans more modalities where different approaches may be more appropriate in different areas. +

From 0539c913bd55a2fdd8d308520c291c3550716a1 Mon Sep 17 00:00:00 2001
From: A Samuel Pottinger
Date: Sat, 4 Jan 2025 00:41:59 +0000
Subject: [PATCH 19/36] Expand tests for ignorable re #111.

---
 afscgap/flat_index_util.py | 41 +++++++++++++++++++++-----
 afscgap/test/test_flat_index_util.py | 44 ++++++++++++++++++++++++++++
 snapshot/test_generate_indicies.py | 13 ++++++++
 3 files changed, 90 insertions(+), 8 deletions(-)
 create mode 100644 snapshot/test_generate_indicies.py

diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py
index c0d7dc80..2972616e 100644
--- a/afscgap/flat_index_util.py
+++ b/afscgap/flat_index_util.py
@@ -480,6 +480,38 @@ def decorate_filter(field: str, original: IndexFilter) -> IndexFilter:
     return UnitConversionIndexFilter(original, user_units, system_units)
 
 
+def determine_if_ignorable(field: str, param: afscgap.param.Param, presence_only: bool) -> bool:
+    """Determine if a field parameter is ignored for pre-filtering.
+
+    Determine if a field parameter is ignored for pre-filtering, turning it into a noop because
+    pre-filtering isn't possible or precomputed indices are not available.
+
+    Args:
+        field: The name of the field for which filters should be made.
+        param: The parameter to apply for the field.
+        presence_only: Flag indicating if the query is for presence so zero inference records can be
+            excluded.
+
+    Returns:
+        True if the parameter should be ignored for pre-filtering and false otherwise.
+    """
+    if param.get_is_ignorable():
+        return True
+
+    # If the field index is presence only and this isn't a presence only request, the index must be
+    # ignored (cannot be used to pre-filter results).
+ zero_inference_required = not presence_only + field_index_excludes_zeros = field in PRESENCE_ONLY_FIELDS + if zero_inference_required and field_index_excludes_zeros: + return True + + filter_type = param.get_filter_type() + if filter_type == 'empty': + return True + + return False + + def make_filters(field: str, param: afscgap.param.Param, presence_only: bool) -> typing.Iterable[IndexFilter]: """Make filters for a field describing a backend-agnostic parameter. @@ -495,17 +527,10 @@ def make_filters(field: str, param: afscgap.param.Param, be approximated such that all matching results are included in results but some results may included may not match, requiring re-evaluation locally. """ - if param.get_is_ignorable(): - return [] - - # If the field index is presence only and this isn't a presence only request, the index must be - # ignored (cannot be used to pre-filter results). - if (not presence_only) and (field in PRESENCE_ONLY_FIELDS): + if determine_if_ignorable(field, param, presence_only): return [] filter_type = param.get_filter_type() - if filter_type == 'empty': - return [] if field in FIELD_DATA_TYPE_OVERRIDES: data_type = FIELD_DATA_TYPE_OVERRIDES[field] diff --git a/afscgap/test/test_flat_index_util.py b/afscgap/test/test_flat_index_util.py index f23ed24b..3062e0ce 100644 --- a/afscgap/test/test_flat_index_util.py +++ b/afscgap/test/test_flat_index_util.py @@ -347,6 +347,50 @@ def test_decorate_filter_active_none(self): self.assertFalse(decorated.get_matches(None)) +class DetermineIfIgnorableTests(unittest.TestCase): + + def test_explicit_ignorable_require_zero(self): + param = self._make_test_param(True, 'int') + ignorable = afscgap.flat_index_util.determine_if_ignorable('test', param, False) + self.assertTrue(ignorable) + + def test_explicit_ignorable_presence_only(self): + param = self._make_test_param(True, 'int') + ignorable = afscgap.flat_index_util.determine_if_ignorable('test', param, True) + self.assertTrue(ignorable) + + def 
test_require_zero_supported(self): + param = self._make_test_param(False, 'int') + ignorable = afscgap.flat_index_util.determine_if_ignorable('count', param, False) + self.assertFalse(ignorable) + + def test_require_zero_unsupported(self): + param = self._make_test_param(False, 'str') + ignorable = afscgap.flat_index_util.determine_if_ignorable('species_code', param, False) + self.assertTrue(ignorable) + + def test_presence_only_unsupported(self): + param = self._make_test_param(False, 'str') + ignorable = afscgap.flat_index_util.determine_if_ignorable('species_code', param, True) + self.assertFalse(ignorable) + + def test_empty(self): + param = afscgap.param.EmptyParam() + ignorable = afscgap.flat_index_util.determine_if_ignorable('count', param, True) + self.assertTrue(ignorable) + + def test_plain_not_ignorable(self): + param = afscgap.param.IntRangeParam(1, None) + ignorable = afscgap.flat_index_util.determine_if_ignorable('count', param, True) + self.assertFalse(ignorable) + + def _make_test_param(self, ignorable, filter_type): + param = unittest.mock.MagicMock() + param.get_is_ignorable = unittest.mock.MagicMock(return_value=ignorable) + param.get_filter_type = unittest.mock.MagicMock(return_value=filter_type) + return param + + class MakeFilterTests(unittest.TestCase): def test_empty(self): diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py new file mode 100644 index 00000000..5271fb01 --- /dev/null +++ b/snapshot/test_generate_indicies.py @@ -0,0 +1,13 @@ +""" +Tests for generating sharded indicies. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import unittest +import unittest.mock + +import generate_indicies From 8a408200da4af1d9fff1062021c8e0104bbc42cc Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 01:13:18 +0000 Subject: [PATCH 20/36] Add tests for generate_indicies. --- snapshot/generate_indicies.py | 4 +- snapshot/test_combine_shards.py | 2 +- snapshot/test_generate_indicies.py | 187 +++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 2 deletions(-) diff --git a/snapshot/generate_indicies.py b/snapshot/generate_indicies.py index 115ef0e5..908cd630 100644 --- a/snapshot/generate_indicies.py +++ b/snapshot/generate_indicies.py @@ -332,7 +332,9 @@ def combine_records(a: dict, b: dict) -> dict: Returns: New index record indicating the cobination of the two records. """ - assert a['value'] == b['value'] + if a['value'] != b['value']: + raise RuntimeError('Tried combining keys for incompatible values.') + return {'value': a['value'], 'keys': a['keys'].union(b['keys'])} diff --git a/snapshot/test_combine_shards.py b/snapshot/test_combine_shards.py index 4aee0c86..f165bed4 100644 --- a/snapshot/test_combine_shards.py +++ b/snapshot/test_combine_shards.py @@ -13,7 +13,7 @@ import combine_shards -class NormUtilTests(unittest.TestCase): +class NormTests(unittest.TestCase): def test_unchanged(self): normalized = combine_shards.normalize_record('test attr', {'value': 'test val'}) diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 5271fb01..53ace4ad 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -10,4 +10,191 @@ import unittest import unittest.mock +import const import generate_indicies + + +class BuildIndexRecordTests(unittest.TestCase): + + def setUp(self): + self._record = {'testkey': 'testvalue'} + + def test_build_index_record_value(self): + record = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + self.assertEqual(record['value'], 'testvalue') + + 
def test_build_index_record_same(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertEqual(key_1, key_2) + + def test_build_index_record_different_year(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2026, 'GOA', 123) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertNotEqual(key_1, key_2) + + def test_build_index_record_different_survey(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'Other', 123) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertNotEqual(key_1, key_2) + + def test_build_index_record_different_haul(self): + record_1 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 123) + record_2 = generate_indicies.build_index_record(self._record, 'testkey', 2025, 'GOA', 124) + key_1 = list(record_1['keys'])[0] + key_2 = list(record_2['keys'])[0] + self.assertNotEqual(key_1, key_2) + + +class NormTests(unittest.TestCase): + + def test_unchanged(self): + normalized = generate_indicies.normalize_value( + {'value': 'test val'}, + 'test attr' + ) + self.assertEqual(normalized, 'test val') + + def test_none(self): + normalized = generate_indicies.normalize_value( + {'value': None}, + 'depth_m' + ) + self.assertEqual(normalized, None) + + def test_changed(self): + normalized = generate_indicies.normalize_value( + {'value': 1.236}, + 'depth_m' + ) + self.assertEqual(normalized, '1.24') + + def test_rounded_float_same(self): + normalized_1 = generate_indicies.normalize_value( + {'value': 1.236}, + 
'depth_m' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': 1.237}, + 'depth_m' + ) + self.assertEqual(normalized_1, normalized_2) + + def test_rounded_float_different(self): + normalized_1 = generate_indicies.normalize_value( + {'value': 1.234}, + 'depth_m' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': 1.236}, + 'depth_m' + ) + self.assertNotEqual(normalized_1, normalized_2) + + def test_rounded_datetime_same(self): + normalized_1 = generate_indicies.normalize_value( + {'value': '2025-12-31T13:25:50Z'}, + 'date_time' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': '2025-12-31T14:25:50Z'}, + 'date_time' + ) + self.assertEqual(normalized_1, normalized_2) + + def test_rounded_datetime_different(self): + normalized_1 = generate_indicies.normalize_value( + {'value': '2025-12-31T13:25:50Z'}, + 'date_time' + ) + normalized_2 = generate_indicies.normalize_value( + {'value': '2025-12-30T14:25:50Z'}, + 'date_time' + ) + self.assertNotEqual(normalized_1, normalized_2) + + +class IsNonZeroTests(unittest.TestCase): + + def setUp(self): + self._target = {} + for field in const.ZEROABLE_FIELDS: + self._target[field] = 123 + + def test_is_non_zero_not_zeroable_zero(self): + self._target['other'] = 0 + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_not_zeroable_none(self): + self._target['other'] = None + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_zero_partial(self): + self._target['count'] = 0 + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_none_partial(self): + self._target['count'] = None + self.assertTrue(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_zero_all(self): + for field in const.ZEROABLE_FIELDS: + self._target[field] = 0 + + self.assertFalse(generate_indicies.is_non_zero(self._target)) + + def test_is_non_zero_zeroable_none_all(self): + for 
field in const.ZEROABLE_FIELDS: + self._target[field] = None + + self.assertFalse(generate_indicies.is_non_zero(self._target)) + + +class BuildOutputRecordTests(unittest.TestCase): + + def setUp(self): + target = {'value': 'test value', 'keys': ['2025\tGOA\t123', '2025\tGOA\t124']} + self._output_record = generate_indicies.build_output_record(target) + + def test_build_output_record(self): + self.assertEqual(self._output_record['value'], 'test value') + + def test_build_key_meta(self): + key = self._output_record['keys'][0] + self.assertEqual(key['year'], 2025) + self.assertEqual(key['survey'], 'GOA') + self.assertEqual(key['haul'], 124) + + def test_build_key_meta(self): + key_1 = self._output_record['keys'][0] + self.assertEqual(key_1['haul'], 123) + + key_2 = self._output_record['keys'][1] + self.assertEqual(key_2['haul'], 124) + + +class CombineTests(unittest.TestCase): + + def setUp(self): + self._base = {'value': 1, 'keys': {'a'}} + self._compatible = {'value': 1, 'keys': {'b'}} + self._incompatible = {'value': 2, 'keys': {'c'}} + + def test_combine_compatible(self): + combined = generate_indicies.combine_records(self._base, self._compatible) + self.assertEqual(combined['value'], 1) + + keys = combined['keys'] + self.assertEqual(len(keys), 2) + self.assertTrue('a' in keys) + self.assertTrue('b' in keys) + + def test_combine_incompatible(self): + with self.assertRaises(RuntimeError): + combined = generate_indicies.combine_records(self._base, self._incompatible) From 131c09e1f3636bee828810a499704c9691a66a7b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 01:17:09 +0000 Subject: [PATCH 21/36] Add additional bash scripts for #114. 
--- snapshot/get_all_years.sh | 90 +++++++++++++++++++++++++++++++++++++++ snapshot/index_data.sh | 77 +++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 snapshot/get_all_years.sh create mode 100644 snapshot/index_data.sh diff --git a/snapshot/get_all_years.sh b/snapshot/get_all_years.sh new file mode 100644 index 00000000..d278d15d --- /dev/null +++ b/snapshot/get_all_years.sh @@ -0,0 +1,90 @@ +echo "-- Getting species --" +python request_source.py species $BUCKET_NAME species +echo "-- Getting catch --" +python request_source.py catch $BUCKET_NAME catch +echo "-- Getting 1982 --" +python request_source.py haul $BUCKET_NAME haul 1982 +echo "-- Getting 1983 --" +python request_source.py haul $BUCKET_NAME haul 1983 +echo "-- Getting 1984 --" +python request_source.py haul $BUCKET_NAME haul 1984 +echo "-- Getting 1985 --" +python request_source.py haul $BUCKET_NAME haul 1985 +echo "-- Getting 1986 --" +python request_source.py haul $BUCKET_NAME haul 1986 +echo "-- Getting 1987 --" +python request_source.py haul $BUCKET_NAME haul 1987 +echo "-- Getting 1988 --" +python request_source.py haul $BUCKET_NAME haul 1988 +echo "-- Getting 1989 --" +python request_source.py haul $BUCKET_NAME haul 1989 +echo "-- Getting 1990 --" +python request_source.py haul $BUCKET_NAME haul 1990 +echo "-- Getting 1991 --" +python request_source.py haul $BUCKET_NAME haul 1991 +echo "-- Getting 1992 --" +python request_source.py haul $BUCKET_NAME haul 1992 +echo "-- Getting 1993 --" +python request_source.py haul $BUCKET_NAME haul 1993 +echo "-- Getting 1994 --" +python request_source.py haul $BUCKET_NAME haul 1994 +echo "-- Getting 1995 --" +python request_source.py haul $BUCKET_NAME haul 1995 +echo "-- Getting 1996 --" +python request_source.py haul $BUCKET_NAME haul 1996 +echo "-- Getting 1997 --" +python request_source.py haul $BUCKET_NAME haul 1997 +echo "-- Getting 1998 --" +python request_source.py haul $BUCKET_NAME haul 1998 +echo "-- Getting 1999 
--" +python request_source.py haul $BUCKET_NAME haul 1999 +echo "-- Getting 2000 --" +python request_source.py haul $BUCKET_NAME haul 2000 +echo "-- Getting 2001 --" +python request_source.py haul $BUCKET_NAME haul 2001 +echo "-- Getting 2002 --" +python request_source.py haul $BUCKET_NAME haul 2002 +echo "-- Getting 2003 --" +python request_source.py haul $BUCKET_NAME haul 2003 +echo "-- Getting 2004 --" +python request_source.py haul $BUCKET_NAME haul 2004 +echo "-- Getting 2005 --" +python request_source.py haul $BUCKET_NAME haul 2005 +echo "-- Getting 2006 --" +python request_source.py haul $BUCKET_NAME haul 2006 +echo "-- Getting 2007 --" +python request_source.py haul $BUCKET_NAME haul 2007 +echo "-- Getting 2008 --" +python request_source.py haul $BUCKET_NAME haul 2008 +echo "-- Getting 2009 --" +python request_source.py haul $BUCKET_NAME haul 2009 +echo "-- Getting 2010 --" +python request_source.py haul $BUCKET_NAME haul 2010 +echo "-- Getting 2011 --" +python request_source.py haul $BUCKET_NAME haul 2011 +echo "-- Getting 2012 --" +python request_source.py haul $BUCKET_NAME haul 2012 +echo "-- Getting 2013 --" +python request_source.py haul $BUCKET_NAME haul 2013 +echo "-- Getting 2014 --" +python request_source.py haul $BUCKET_NAME haul 2014 +echo "-- Getting 2015 --" +python request_source.py haul $BUCKET_NAME haul 2015 +echo "-- Getting 2016 --" +python request_source.py haul $BUCKET_NAME haul 2016 +echo "-- Getting 2017 --" +python request_source.py haul $BUCKET_NAME haul 2017 +echo "-- Getting 2018 --" +python request_source.py haul $BUCKET_NAME haul 2018 +echo "-- Getting 2019 --" +python request_source.py haul $BUCKET_NAME haul 2019 +echo "-- Getting 2020 --" +python request_source.py haul $BUCKET_NAME haul 2020 +echo "-- Getting 2021 --" +python request_source.py haul $BUCKET_NAME haul 2021 +echo "-- Getting 2022 --" +python request_source.py haul $BUCKET_NAME haul 2022 +echo "-- Getting 2023 --" +python request_source.py haul $BUCKET_NAME haul 
2023 +echo "-- Getting 2024 --" +python request_source.py haul $BUCKET_NAME haul 2024 diff --git a/snapshot/index_data.sh b/snapshot/index_data.sh new file mode 100644 index 00000000..7bc72fa4 --- /dev/null +++ b/snapshot/index_data.sh @@ -0,0 +1,77 @@ +mkdir index_shards +echo "area_swept_km2" +python generate_indicies.py $BUCKET_NAME area_swept_km2 n +echo "bottom_temperature_c" +python generate_indicies.py $BUCKET_NAME bottom_temperature_c n +echo "common_name" +python generate_indicies.py $BUCKET_NAME common_name n +echo "count" +python generate_indicies.py $BUCKET_NAME count n +echo "cpue_kgkm2" +python generate_indicies.py $BUCKET_NAME cpue_kgkm2 n +echo "cpue_nokm2" +python generate_indicies.py $BUCKET_NAME cpue_nokm2 n +echo "cruise" +python generate_indicies.py $BUCKET_NAME cruise n +echo "cruisejoin" +python generate_indicies.py $BUCKET_NAME cruisejoin n +echo "date_time" +python generate_indicies.py $BUCKET_NAME date_time n +echo "depth_m" +python generate_indicies.py $BUCKET_NAME depth_m n +echo "distance_fished_km" +python generate_indicies.py $BUCKET_NAME distance_fished_km n +echo "duration_hr" +python generate_indicies.py $BUCKET_NAME duration_hr n +echo "haul" +python generate_indicies.py $BUCKET_NAME haul n +echo "hauljoin" +python generate_indicies.py $BUCKET_NAME hauljoin n +echo "id_rank" +python generate_indicies.py $BUCKET_NAME id_rank n +echo "latitude_dd_end" +python generate_indicies.py $BUCKET_NAME latitude_dd_end n +echo "latitude_dd_start" +python generate_indicies.py $BUCKET_NAME latitude_dd_start n +echo "longitude_dd_end" +python generate_indicies.py $BUCKET_NAME longitude_dd_end n +echo "longitude_dd_start" +python generate_indicies.py $BUCKET_NAME longitude_dd_start n +echo "net_height_m" +python generate_indicies.py $BUCKET_NAME net_height_m n +echo "net_width_m" +python generate_indicies.py $BUCKET_NAME net_width_m n +echo "performance" +python generate_indicies.py $BUCKET_NAME performance n +echo "requirements" +python 
generate_indicies.py $BUCKET_NAME requirements n +echo "scientific_name" +python generate_indicies.py $BUCKET_NAME scientific_name n +echo "species_code" +python generate_indicies.py $BUCKET_NAME species_code n +echo "srvy" +python generate_indicies.py $BUCKET_NAME srvy n +echo "station" +python generate_indicies.py $BUCKET_NAME station n +echo "stratum" +python generate_indicies.py $BUCKET_NAME stratum n +echo "surface_temperature_c" +python generate_indicies.py $BUCKET_NAME surface_temperature_c n +echo "survey" +python generate_indicies.py $BUCKET_NAME survey n +echo "survey_definition_id" +python generate_indicies.py $BUCKET_NAME survey_definition_id n +echo "survey_name" +python generate_indicies.py $BUCKET_NAME survey_name n +echo "taxon_confidence" +python generate_indicies.py $BUCKET_NAME taxon_confidence n +#echo "variables" +#python generate_indicies.py $BUCKET_NAME variables n +echo "vessel_id" +python generate_indicies.py $BUCKET_NAME vessel_id n +echo "vessel_name" +python generate_indicies.py $BUCKET_NAME vessel_name n +echo "weight_kg" +python generate_indicies.py $BUCKET_NAME weight_kg n +echo "year" +python generate_indicies.py $BUCKET_NAME year y From b8f8ed5dda0e60324f169cc88eab09037ad9a789 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 01:19:44 +0000 Subject: [PATCH 22/36] Type fixes for tests in #114. 
--- snapshot/test_generate_indicies.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 53ace4ad..0d1cd7af 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -132,7 +132,7 @@ def test_is_non_zero_not_zeroable_zero(self): self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_not_zeroable_none(self): - self._target['other'] = None + self._target['other'] = None # type: ignore self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_zeroable_zero_partial(self): @@ -140,7 +140,7 @@ def test_is_non_zero_zeroable_zero_partial(self): self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_zeroable_none_partial(self): - self._target['count'] = None + self._target['count'] = None # type: ignore self.assertTrue(generate_indicies.is_non_zero(self._target)) def test_is_non_zero_zeroable_zero_all(self): @@ -151,7 +151,7 @@ def test_is_non_zero_zeroable_zero_all(self): def test_is_non_zero_zeroable_none_all(self): for field in const.ZEROABLE_FIELDS: - self._target[field] = None + self._target[field] = None # type: ignore self.assertFalse(generate_indicies.is_non_zero(self._target)) From e51a38c4dcc6b81eb6efd772ec6ba6bd3854150b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:10:18 +0000 Subject: [PATCH 23/36] Partial implementation of render_flat. --- snapshot/render_flat.py | 574 +++++++++++++++++++++++++++++ snapshot/test_generate_indicies.py | 4 +- 2 files changed, 576 insertions(+), 2 deletions(-) create mode 100644 snapshot/render_flat.py diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py new file mode 100644 index 00000000..5a82fc7c --- /dev/null +++ b/snapshot/render_flat.py @@ -0,0 +1,574 @@ +""" +Script to build joined flat Avro files. 
+
+Script to build joined flat Avro files by joining across the species list, the hauls dataset, and
+the catch dataset. Catches and species without haul matches will be excluded.
+
+(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center
+for Data Science and the Environment at UC Berkeley.
+
+This file is part of afscgap released under the BSD 3-Clause License. See
+LICENSE.md.
+"""
+import copy
+import csv
+import io
+import itertools
+import functools
+import os
+import sys
+import typing
+
+import boto3
+import coiled
+import fastavro
+
+USAGE_STR = 'python render_flat.py [bucket] [filenames]'
+NUM_ARGS = 2
+
+
+OBSERVATION_SCHEMA = {
+    'doc': 'Description of an observation joined across haul, catch, species.',
+    'name': 'Observation',
+    'namespace': 'edu.dse.afscgap',
+    'type': 'record',
+    'fields': [
+        {'name': 'year', 'type': ['int', 'null']},
+        {'name': 'srvy', 'type': ['string', 'null']},
+        {'name': 'survey', 'type': ['string', 'null']},
+        {'name': 'survey_name', 'type': ['string', 'null']},
+        {'name': 'survey_definition_id', 'type': ['long', 'null']},
+        {'name': 'cruise', 'type': ['long', 'null']},
+        {'name': 'cruisejoin', 'type': ['long', 'null']},
+        {'name': 'hauljoin', 'type': ['long', 'null']},
+        {'name': 'haul', 'type': ['long', 'null']},
+        {'name': 'stratum', 'type': ['long', 'null']},
+        {'name': 'station', 'type': ['string', 'null']},
+        {'name': 'vessel_id', 'type': ['long', 'null']},
+        {'name': 'vessel_name', 'type': ['string', 'null']},
+        {'name': 'date_time', 'type': ['string', 'null']},
+        {'name': 'latitude_dd_start', 'type': ['double', 'null']},
+        {'name': 'longitude_dd_start', 'type': ['double', 'null']},
+        {'name': 'latitude_dd_end', 'type': ['double', 'null']},
+        {'name': 'longitude_dd_end', 'type': ['double', 'null']},
+        {'name': 'bottom_temperature_c', 'type': ['double', 'null']},
+        {'name': 'surface_temperature_c', 'type': ['double', 'null']},
+        {'name': 'depth_m', 'type': ['double', 'null']},
+        {'name': 'distance_fished_km', 'type': 
['double', 'null']}, + {'name': 'duration_hr', 'type': ['double', 'null']}, + {'name': 'net_width_m', 'type': ['double', 'null']}, + {'name': 'net_height_m', 'type': ['double', 'null']}, + {'name': 'area_swept_km2', 'type': ['double', 'null']}, + {'name': 'performance', 'type': ['float', 'null']}, + {'name': 'species_code', 'type': ['long', 'null']}, + {'name': 'cpue_kgkm2', 'type': ['double', 'null']}, + {'name': 'cpue_nokm2', 'type': ['double', 'null']}, + {'name': 'count', 'type': ['long', 'null']}, + {'name': 'weight_kg', 'type': ['double', 'null']}, + {'name': 'taxon_confidence', 'type': ['string', 'null']}, + {'name': 'scientific_name', 'type': ['string', 'null']}, + {'name': 'common_name', 'type': ['string', 'null']}, + {'name': 'id_rank', 'type': ['string', 'null']}, + {'name': 'worms', 'type': ['long', 'null']}, + {'name': 'itis', 'type': ['long', 'null']}, + {'name': 'complete', 'type': ['boolean', 'null']} + ] +} + +SPECIES_DICT = typing.Dict[str, dict] + + +def make_zero_record(species: dict, haul_record: dict) -> dict: + """Make a zero catch record meaning that a species was not found. + + Args: + species: Information about the species not found. + haul_record: Informatino about the haul for which no specimens were found. + + Returns: + Complete output record indicating the given species not found for the given haul. 
+    """
+    haul_copy = copy.deepcopy(haul_record)
+    haul_copy['species_code'] = species['species_code']
+    haul_copy['cpue_kgkm2'] = 0
+    haul_copy['cpue_nokm2'] = 0
+    haul_copy['count'] = 0
+    haul_copy['weight_kg'] = 0
+    haul_copy['taxon_confidence'] = None
+    haul_copy['scientific_name'] = species['scientific_name']
+    haul_copy['common_name'] = species['common_name']
+    haul_copy['id_rank'] = species['id_rank']
+    haul_copy['worms'] = species['worms']
+    haul_copy['itis'] = species['itis']
+    haul_copy['complete'] = True
+    return haul_copy
+
+
+def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT) -> dict:
+    """Add information about a species found within a catch.
+
+    Args:
+        target: Record describing a catch within haul.
+        species_by_code: Dictionary mapping from species code found in a catch to information about
+            that species.
+
+    Returns:
+        Catch record with species information added.
+    """
+    species_code = target['species_code']
+
+    if species_code not in species_by_code:
+        target['complete'] = False
+        return target
+
+    species_record = species_by_code[species_code]
+    target.update(species_record)
+    return target
+
+
+def make_get_avro(bucket: str, s3_client) -> typing.Callable[[str], typing.List[dict]]:
+    """Build a function which gets a file from a bucket using the given S3 client.
+
+    Args:
+        bucket: The name of the bucket where files should be found.
+        s3_client: The S3 client to use in getting those files.
+
+    Returns:
+        Function which takes a full path and returns a list of parsed dictionaries from the Avro
+        file at that path.
+    """
+
+    def get_avro(full_loc: str) -> typing.List[dict]:
+        """Get all records from an Avro file.
+
+        Args:
+            full_loc: The full path to the avro file to be read.
+
+        Returns:
+            All records within that Avro file parsed as dictionaries.
+        """
+        target_buffer = io.BytesIO()
+        s3_client.download_fileobj(bucket, full_loc, target_buffer)
+        target_buffer.seek(0)
+        return list(fastavro.reader(target_buffer))
+
+    return get_avro
+
+
+def append_catch_haul(catch_record: dict, haul_record: dict) -> dict:
+    """Combine information between a catch record and a haul record.
+
+    Args:
+        catch_record: The catch record to combine with haul information.
+        haul_record: The haul information for the given catch record.
+
+    Returns:
+        Combined catch and haul information.
+    """
+    catch_record.update(haul_record)
+    return catch_record
+
+
+def complete_record(target: dict) -> dict:
+    """Fill in any missing fields with None to fit edu.dse.afscgap.Observation Avro format.
+
+    Args:
+        target: Record to finish. This record may or may not be modified in place.
+
+    Returns:
+        Record with any missing fields set to None.
+    """
+    keys = map(lambda x: x['name'], OBSERVATION_SCHEMA['fields'])
+    keys_realized = list(keys)
+    values = map(lambda x: target.get(x, None), keys_realized)
+    return dict(zip(keys_realized, values))
+
+
+def mark_incomplete(target: dict) -> dict:
+    """Mark a record as incomplete.
+
+    Args:
+        target: Record on which the complete attribute should be changed. This may or may not be
+            modified in-place.
+
+    Returns:
+        Record with the complete flag updated.
+    """
+    target['complete'] = False
+    return target
+
+
+def mark_complete(target: dict) -> dict:
+    """Mark a record as complete.
+
+    Args:
+        target: Record on which the complete attribute should be changed. This may or may not be
+            modified in-place.
+
+    Returns:
+        Record with the complete flag updated.
+    """
+    target['complete'] = True
+    return target
+
+
+def combine_catch_and_haul(haul_record: dict,
+        catch_records: typing.Optional[typing.List[dict]]) -> typing.Iterable[dict]:
+    """Combine catch information with information about the haul in which that catch happened.
+
+    Args:
+        haul_record: Information about the haul in which the catch took place.
+ catch_records: The catch records to be joined with haul information. + + Returns: + Updated catch records or, if no catch records provided, a single record with haul + information marked incomplete. + """ + if catch_records is None: + catch_records_out = map(mark_incomplete, [haul_record]) + else: + catch_no_species = map( + lambda x: append_catch_haul(x, haul_record), + catch_records + ) + catch_with_species = map(append_species, catch_no_species) + catch_records_out = map(mark_complete, catch_with_species) + + return catch_records_out + + +def make_zero_catch_records(catch_records_out_realized: typing.List[dict], + species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: + """Generate zero catch records for species not found in catches for a haul. + + Args: + catch_records_out_realized: All catch records for a haul. + species_by_code: Mapping from species code to information about the species such that all + formally tracked species are in this dictionary. + + Returns: + Inferred zero catch records. + """ + species_codes_found = set(map( + lambda x: x.get('species_code', None), + catch_records_out_realized + )) + species_codes_all = set(species_by_code.keys()) + speices_codes_missing = species_codes_all - species_codes_found + speices_missing = map(lambda x: species_by_code[x], speices_codes_missing) + catch_records_zero = map( + lambda x: make_zero_record(x, haul_record), + speices_missing + ) + return catch_records_zero + + +def get_url_for_catches_in_haul(haul: int) -> str: + """Get the URL where the catches associated with a haul may be found. + + Args: + haul: The ID of the desired haul. + + Returns: + Path to where the catches associated with the given haul may be found if there is any data + available. + """ + return 'catch/%d.avro' % haul + + +def get_meta_url_for_haul(year: int, survey: str, haul: int) -> str: + """Get the URL for a haul's metadata given the haul location. + + Args: + year: The year in which the haul took place like 2025. 
+        survey: The survey name like "Gulf of Alaska" that the haul was part of.
+        haul: The haul ID for the desired haul.
+
+    Returns:
+        String path where the Avro file with haul metadata is expected.
+    """
+    template_vals = (year, survey, haul)
+    return 'haul/%d_%s_%d.avro' % template_vals
+
+
+def process_haul(bucket: str, year: int, survey: str, haul: int,
+        species_by_code: SPECIES_DICT) -> dict:
+    """Distributed task to process a single haul.
+
+    Distributed task to process a single haul, joining across species and catch datasets for that
+    haul and writing out the joined file to S3.
+
+    Args:
+        bucket: The name of the bucket where catch, haul, and species information can be found.
+        year: The year of the haul to be processed.
+        survey: The survey name like "Gulf of Alaska" in which the haul to be processed was found.
+        haul: The haul ID to be processed.
+        species_by_code: Information about all species formally tracked indexed by species code.
+
+    Returns:
+        Diagnostic information about the file written.
+    """
+
+    import copy
+    import io
+    import os
+
+    import botocore
+    import boto3
+    import fastavro
+
+    access_key = os.environ['AWS_ACCESS_KEY']
+    access_secret = os.environ['AWS_ACCESS_SECRET']
+
+    s3_client = boto3.client(
+        's3',
+        aws_access_key_id=access_key,
+        aws_secret_access_key=access_secret
+    )
+
+    get_avro = make_get_avro(bucket, s3_client)
+
+    def check_file_exists(full_loc: str) -> bool:
+        """Check that a file exists in S3.
+
+        Args:
+            full_loc: The location (path) within the S3 bucket.
+
+        Returns:
+            True if the file is found and false otherwise.
+        """
+        try:
+            s3_client.head_object(Bucket=bucket, Key=full_loc)
+            return True
+        except botocore.exceptions.ClientError as e:
+            error_code = e.response['Error']['Code']
+            error_code_cast = int(error_code)
+            if error_code_cast == 404:
+                return False
+            else:
+                raise RuntimeError('Unexpected S3 head code: %d' % error_code_cast)
+
+    def append_species(target: dict) -> dict:
+        """Add information about the species found in a catch.
+
+        Args:
+            target: Catch information to which species information should be added. This may or
+                may not be modified in-place.
+
+        Returns:
+            Record with species information added.
+        """
+        return append_species_from_species_list(target, species_by_code)
+
+    def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO:
+        """Convert an iterable of dictionaries to Avro bytes.
+
+        Args:
+            records: The dictionaries to convert.
+
+        Returns:
+            Bytes with Avro payload.
+        """
+        records_complete = map(complete_record, records)
+        target_buffer = io.BytesIO()
+        fastavro.writer(target_buffer, OBSERVATION_SCHEMA, records_complete)
+        target_buffer.seek(0)
+        return target_buffer
+
+    def get_haul_record(year: int, survey: str, haul: int) -> typing.Optional[dict]:
+        """Get the record for a haul given the haul location.
+
+        Args:
+            year: The year in which the haul took place like 2025.
+            survey: The survey name like "Gulf of Alaska" that the haul was part of.
+            haul: The haul ID for the desired haul.
+
+        Returns:
+            Dictionary record describing the haul or None if the haul was not found.
+        """
+        haul_loc = get_meta_url_for_haul(year, survey, haul)
+
+        if not check_file_exists(haul_loc):
+            return None
+
+        haul_records = get_avro(haul_loc)
+        assert len(haul_records) == 1
+        haul_record = haul_records[0]
+        return haul_record
+
+    def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]:
+        """Get the catch records associated with a haul.
+
+        Args:
+            haul: The ID of the haul for which catch records should be returned.
+ + Returns: + All catch records associated with a haul. Either None or empty list if no data could be + found. + """ + catch_loc = get_url_for_catches_in_haul(haul) + if check_file_exists(catch_loc): + return get_avro(catch_loc) + else: + return None + + haul_record = get_haul_record(haul_loc) + + if haul_record is None: + return { + 'complete': 0, + 'incomplete': 0, + 'zero': 0, + 'path': None + } + + catch_records = list(get_catch_records(haul)) + catch_records_out = combine_catch_and_haul(haul_reord, catch_records) + catch_records_out_realized = list(catch_records_out) + catch_records_zero = make_zero_catch_records(catch_records_out_realized, species_by_code) + + # Combine regular records with zero catch inferred records + catch_records_all = itertools.chain( + catch_records_out_realized, + catch_records_zero + ) + + # Upload to S3 + catch_with_species_avro = convert_to_avro(catch_records_all) + output_loc = 'joined/%d_%s_%d.avro' % template_vals + s3_client.upload_fileobj(catch_with_species_avro, bucket, output_loc) + + # Write out diagnostic information + outputs_dicts = map( + lambda x: { + 'complete': 1 if x['complete'] else 0, + 'incomplete': 0 if x['complete'] else 1, + 'zero': 1 if x.get('count', 0) == 0 else 0 + }, + catch_records_out_realized + ) + output_dict = functools.reduce( + lambda a, b: { + 'complete': a['complete'] + b['complete'], + 'incomplete': a['incomplete'] + b['incomplete'], + 'zero': a['zero'] + b['zero'] + }, + outputs_dicts + ) + output_dict['loc'] = output_loc + return output_dict + + +def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: + """Get metadata for all available hauls. + + Args: + bucket: The bucket where hauls inforamtion can be found. + + Returns: + Iterable over hauls metadata required to find that haul within S3. 
+ """ + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + def make_haul_metadata_record(path): + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='haul/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + return map(make_haul_metadata_record, keys) + + +def get_all_species(bucket): + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + get_avro = make_get_avro(bucket, s3_client) + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='species/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + records_nest = map(get_avro, keys) + records_flat = itertools.chain(*records_nest) + records_tuples = map(lambda x: (x['species_code'], x), records_flat) + return dict(records_tuples) + + +def main(): + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + file_paths_loc = sys.argv[2] + hauls_meta = get_hauls_meta(bucket) + + cluster = coiled.Cluster( + name='DseProcessAfscgap', + n_workers=10, + worker_vm_types=['m7a.medium'], + scheduler_vm_types=['m7a.medium'], + environ={ + 
'AWS_ACCESS_KEY': os.environ.get('AWS_ACCESS_KEY', ''), + 'AWS_ACCESS_SECRET': os.environ.get('AWS_ACCESS_SECRET', ''), + 'SOURCE_DATA_LOC': os.environ.get('SOURCE_DATA_LOC', '') + } + ) + cluster.adapt(minimum=10, maximum=500) + client = cluster.get_client() + + hauls_meta_realized = list(hauls_meta) + species_by_code = get_all_species(bucket) + + written_paths_future = client.map( + lambda x: process_haul( + bucket, + x['year'], + x['survey'], + x['haul'], + species_by_code + ), + hauls_meta_realized + ) + written_paths = map(lambda x: x.result(), written_paths_future) + + with open(file_paths_loc, 'w') as f: + writer = csv.DictWriter(f, fieldnames=[ + 'loc', + 'complete', + 'incomplete', + 'zero' + ]) + writer.writeheader() + writer.writerows(written_paths) + + cluster.close(force_shutdown=True) + + +if __name__ == '__main__': + main() diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 0d1cd7af..6e26468e 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -165,13 +165,13 @@ def setUp(self): def test_build_output_record(self): self.assertEqual(self._output_record['value'], 'test value') - def test_build_key_meta(self): + def test_build_key_meta_check_first(self): key = self._output_record['keys'][0] self.assertEqual(key['year'], 2025) self.assertEqual(key['survey'], 'GOA') self.assertEqual(key['haul'], 124) - def test_build_key_meta(self): + def test_build_key_meta_check_hauls(self): key_1 = self._output_record['keys'][0] self.assertEqual(key_1['haul'], 123) From bf49a8beebfde9ec2bfb5d5a6e60c8432217e769 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:12:47 +0000 Subject: [PATCH 24/36] Docstring complete on render_flat. 
--- snapshot/render_flat.py | 42 +++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index 5a82fc7c..7b472995 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -457,6 +457,26 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: return output_dict +def make_haul_metadata_record(path: str) -> dict: + """Parse a path into a metadata record. + + Args: + path: The path to be parsed as a metadata record. + + Returns: + Dictionary describing metadata for a haul. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: """Get metadata for all available hauls. @@ -475,17 +495,6 @@ def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: aws_secret_access_key=access_secret ) - def make_haul_metadata_record(path): - filename_with_path = path.split('/')[-1] - filename = filename_with_path.split('.')[0] - components = filename.split('_') - return { - 'path': path, - 'year': int(components[0]), - 'survey': components[1], - 'haul': int(components[2]) - } - paginator = s3_client.get_paginator('list_objects_v2') iterator = paginator.paginate(Bucket=bucket, Prefix='haul/') pages = filter(lambda x: 'Contents' in x, iterator) @@ -495,7 +504,15 @@ def make_haul_metadata_record(path): return map(make_haul_metadata_record, keys) -def get_all_species(bucket): +def get_all_species(bucket: str) -> SPECIES_DICT: + """Get information about all species formally tracked. + + Args: + bucket: The S3 bucket name where species information can be found. + + Returns: + Dictionary mapping from species code to information about that species. 
+ """ access_key = os.environ['AWS_ACCESS_KEY'] access_secret = os.environ['AWS_ACCESS_SECRET'] @@ -520,6 +537,7 @@ def get_all_species(bucket): def main(): + """Entry point for the join script.""" if len(sys.argv) != NUM_ARGS + 1: print(USAGE_STR) sys.exit(1) From 51db277210e6044fb6d4772e9ada04aa7cc53607 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:13:30 +0000 Subject: [PATCH 25/36] Fix unit tests for #114. --- snapshot/test_generate_indicies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 6e26468e..1b425b13 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -169,7 +169,7 @@ def test_build_key_meta_check_first(self): key = self._output_record['keys'][0] self.assertEqual(key['year'], 2025) self.assertEqual(key['survey'], 'GOA') - self.assertEqual(key['haul'], 124) + self.assertEqual(key['haul'], 123) def test_build_key_meta_check_hauls(self): key_1 = self._output_record['keys'][0] From 65b5ce97f36115bd630a780b329daaf1bc364fd7 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:23:23 +0000 Subject: [PATCH 26/36] Through pyflakes for render_flat. --- snapshot/render_flat.py | 74 ++++++++++++++++++------------ snapshot/test_generate_indicies.py | 2 +- 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index 7b472995..ece94df1 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -16,6 +16,7 @@ import functools import os import sys +import typing import boto3 import coiled @@ -81,11 +82,13 @@ def make_zero_record(species: dict, haul_record: dict) -> dict: Args: species: Information about the species not found. - haul_record: Informatino about the haul for which no specimens were found. + haul_record: Information about the haul for which no specimens were found. 
Returns: Complete output record indicating the given species not found for the given haul. """ + import copy + haul_copy = copy.deepcopy(haul_record) haul_copy['species_code'] = species['species_code'] haul_copy['cpue_kgkm2'] = 0 @@ -124,7 +127,7 @@ def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT return target -def make_get_avro(bucket: str, s3_client) -> typling.Callable[[str], typing.List[dict]]: +def make_get_avro(bucket: str, s3_client) -> typing.Callable[[str], typing.List[dict]]: """Build a function which gets a file from a bucket using the given S3 client. Args: @@ -153,7 +156,7 @@ def get_avro(full_loc: str) -> typing.List[dict]: return get_avro -def append_catch_haul(catch_record: dict, haul_record: dict) -> dic: +def append_catch_haul(catch_record: dict, haul_record: dict) -> dict: """Combine information between a catch record and a haul record. Args: @@ -197,7 +200,7 @@ def mark_incomplete(target: dict) -> dict: def mark_complete(target: dict) -> dict: - """Mark a record as complete. + """Mark a record as complete. Args: target: Record on which the complete attribute should be changed. This may or may not be @@ -211,12 +214,14 @@ def mark_complete(target: dict) -> dict: def combine_catch_and_haul(haul_record: dict, - catch_records: typing.Optional[typing.List[dict]]) -> typing.Iterable[dict]: + catch_records: typing.Optional[typing.List[dict]], + species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: """Combine catch information with information about the haul in which that catch happened. Args: haul_record: Information about the haul in which the catch took place. catch_records: The catch records to be joined with haul information. + species_by_code: Information about all tracked species indexed by species code. 
Returns: Updated catch records or, if no catch records provided, a single record with haul @@ -229,20 +234,24 @@ def combine_catch_and_haul(haul_record: dict, lambda x: append_catch_haul(x, haul_record), catch_records ) - catch_with_species = map(append_species, catch_no_species) + catch_with_species = map( + lambda x: append_species_from_species_list(x, species_by_code), + catch_no_species + ) catch_records_out = map(mark_complete, catch_with_species) return catch_records_out def make_zero_catch_records(catch_records_out_realized: typing.List[dict], - species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: + species_by_code: SPECIES_DICT, haul_record: dict) -> typing.Iterable[dict]: """Generate zero catch records for species not found in catches for a haul. Args: catch_records_out_realized: All catch records for a haul. species_by_code: Mapping from species code to information about the species such that all formally tracked species are in this dictionary. + haul_record: Base record to use in generating zero catch records. Returns: Inferred zero catch records. @@ -261,7 +270,7 @@ def make_zero_catch_records(catch_records_out_realized: typing.List[dict], return catch_records_zero -def get_url_for_catches_in_haul(haul: int) -> str: +def get_path_for_catches_in_haul(haul: int) -> str: """Get the URL where the catches associated with a haul may be found. Args: @@ -274,7 +283,7 @@ def get_url_for_catches_in_haul(haul: int) -> str: return 'catch/%d.avro' % haul -def get_meta_url_for_haul(year: int, survey: str, haul: int) -> str: +def get_meta_path_for_haul(year: int, survey: str, haul: int) -> str: """Get the URL for a haul's metadata given the haul location. Args: @@ -286,7 +295,22 @@ def get_meta_url_for_haul(year: int, survey: str, haul: int) -> str: String path where the Avro file with haul metadata is expected. 
""" template_vals = (year, survey, haul) - return = 'haul/%d_%s_%d.avro' % template_vals + return 'haul/%d_%s_%d.avro' % template_vals + + +def get_joined_path(year: int, survey: str, haul: int) -> str: + """Get that path at which joined data is expected to be written for a haul. + + Args: + year: The year in which the haul occurred like 2025. + survey: The name of the survey like "Gulf of Alaska" in which the haul took place. + haul: The ID of the haul. + + Returns: + STring path where the Avro file with joined haul data is expected. + """ + template_vals = (year, survey, haul) + return 'joined/%d_%s_%d.avro' % template_vals def process_haul(bucket: str, year: int, survey: str, haul: int, @@ -307,7 +331,6 @@ def process_haul(bucket: str, year: int, survey: str, haul: int, Diagnostic information about the file written. """ - import copy import io import os @@ -346,18 +369,6 @@ def check_file_exists(full_loc: str) -> bool: else: raise RuntimeError('Unexpected S3 head code: %d' % error_code) - def append_species(target: dict) -> dict: - """Add information about the speices found in a catch. - - Args: - target: Catch information to which species information should be added. This may or - may not be modified in-place. - - Returns: - Record with species information added. - """ - return append_species_from_species_list(target, species_by_code) - def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO: """Convert an iterable of dictionaries to Avro bytes. @@ -384,7 +395,7 @@ def get_haul_record(year: int, survey: str, haul: int) -> typing.Optional[dict]: Returns: Dictionary record describing the haul or None if the haul was not found. """ - haul_loc = get_meta_url_for_haul(year, survey, haul) + haul_loc = get_meta_path_for_haul(year, survey, haul) if not check_file_exists(haul_loc): return None @@ -404,14 +415,13 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: All catch records associated with a haul. 
Either None or empty list if no data could be found. """ - catch_loc = get_url_for_catches_in_haul(haul) + catch_loc = get_path_for_catches_in_haul(haul) if check_file_exists(catch_loc): return get_avro(catch_loc) else: return None - haul_record = get_haul_record(haul_loc) - + haul_record = get_haul_record(year, survey, haul) if haul_record is None: return { 'complete': 0, @@ -421,9 +431,13 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: } catch_records = list(get_catch_records(haul)) - catch_records_out = combine_catch_and_haul(haul_reord, catch_records) + catch_records_out = combine_catch_and_haul(haul_record, catch_records, species_by_code) catch_records_out_realized = list(catch_records_out) - catch_records_zero = make_zero_catch_records(catch_records_out_realized, species_by_code) + catch_records_zero = make_zero_catch_records( + catch_records_out_realized, + species_by_code, + haul_record + ) # Combine regular records with zero catch inferred records catch_records_all = itertools.chain( @@ -433,7 +447,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: # Upload to S3 catch_with_species_avro = convert_to_avro(catch_records_all) - output_loc = 'joined/%d_%s_%d.avro' % template_vals + output_loc = get_joined_path(year, survey, haul) s3_client.upload_fileobj(catch_with_species_avro, bucket, output_loc) # Write out diagnostic information diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 1b425b13..01dcc06b 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -197,4 +197,4 @@ def test_combine_compatible(self): def test_combine_incompatible(self): with self.assertRaises(RuntimeError): - combined = generate_indicies.combine_records(self._base, self._incompatible) + generate_indicies.combine_records(self._base, self._incompatible) From 6f2aaa9bc3a38c31e30b57ef9ca6d176dc9ccb0d Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: 
Sat, 4 Jan 2025 02:24:10 +0000 Subject: [PATCH 27/36] Style fixes in flat_index_util. --- afscgap/flat_index_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/afscgap/flat_index_util.py b/afscgap/flat_index_util.py index 2972616e..6278a67d 100644 --- a/afscgap/flat_index_util.py +++ b/afscgap/flat_index_util.py @@ -482,10 +482,10 @@ def decorate_filter(field: str, original: IndexFilter) -> IndexFilter: def determine_if_ignorable(field: str, param: afscgap.param.Param, presence_only: bool) -> bool: """Determine if a field parameter is ignored for pre-filtering. - + Determine if a field parameter is ignored for pre-filtering, turning it into a noop because pre-filtering isn't possible or precomputed indicies are not available. - + Args: field: The name of the field for which filters should be made. param: The parameter to apply for the field. @@ -493,7 +493,7 @@ def determine_if_ignorable(field: str, param: afscgap.param.Param, presence_only excluded. Returns: - True if + True if ignorable and false otherwise. """ if param.get_is_ignorable(): return True From 2426f187e351ab8a0dbbd2d773b60b5767b66a4b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:24:52 +0000 Subject: [PATCH 28/36] Additional style fixes on #114. 
--- snapshot/render_flat.py | 6 +++--- snapshot/test_generate_indicies.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index ece94df1..299f072f 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -152,7 +152,7 @@ def get_avro(full_loc: str) -> typing.List[dict]: s3_client.download_fileobj(bucket, full_loc, target_buffer) target_buffer.seek(0) return list(fastavro.reader(target_buffer)) - + return get_avro @@ -239,7 +239,7 @@ def combine_catch_and_haul(haul_record: dict, catch_no_species ) catch_records_out = map(mark_complete, catch_with_species) - + return catch_records_out @@ -404,7 +404,7 @@ def get_haul_record(year: int, survey: str, haul: int) -> typing.Optional[dict]: assert len(haul_records) == 1 haul_record = haul_records[0] return haul_record - + def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: """Get the catch records associated with a haul. diff --git a/snapshot/test_generate_indicies.py b/snapshot/test_generate_indicies.py index 01dcc06b..72968f4e 100644 --- a/snapshot/test_generate_indicies.py +++ b/snapshot/test_generate_indicies.py @@ -142,7 +142,7 @@ def test_is_non_zero_zeroable_zero_partial(self): def test_is_non_zero_zeroable_none_partial(self): self._target['count'] = None # type: ignore self.assertTrue(generate_indicies.is_non_zero(self._target)) - + def test_is_non_zero_zeroable_zero_all(self): for field in const.ZEROABLE_FIELDS: self._target[field] = 0 From 43b19dec2a7d6cb03b30447fab448c8e3ae0f46b Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 02:32:56 +0000 Subject: [PATCH 29/36] Through mypy in render_flat. 
--- snapshot/render_flat.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index 299f072f..ff7371de 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -18,8 +18,8 @@ import sys import typing -import boto3 -import coiled +import boto3 # type: ignore +import coiled # type: ignore import fastavro USAGE_STR = 'python render_flat.py [bucket] [filenames]' @@ -151,7 +151,7 @@ def get_avro(full_loc: str) -> typing.List[dict]: target_buffer = io.BytesIO() s3_client.download_fileobj(bucket, full_loc, target_buffer) target_buffer.seek(0) - return list(fastavro.reader(target_buffer)) + return list(fastavro.reader(target_buffer)) # type: ignore return get_avro @@ -179,7 +179,7 @@ def complete_record(target: dict) -> dict: Returns: Record with any missing fields set to None. """ - keys = map(lambda x: x['name'], OBSERVATION_SCHEMA['fields']) + keys = map(lambda x: x['name'], OBSERVATION_SCHEMA['fields']) # type: ignore keys_realized = list(keys) values = map(lambda x: target.get(x, None), keys_realized) return dict(zip(keys_realized, values)) @@ -334,7 +334,7 @@ def process_haul(bucket: str, year: int, survey: str, haul: int, import io import os - import botocore + import botocore # type: ignore import boto3 import fastavro @@ -422,7 +422,9 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: return None haul_record = get_haul_record(year, survey, haul) - if haul_record is None: + catch_records_maybe = get_catch_records(haul) + + if haul_record is None or catch_records_maybe is None: return { 'complete': 0, 'incomplete': 0, @@ -430,7 +432,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: 'path': None } - catch_records = list(get_catch_records(haul)) + catch_records = list(catch_records_maybe) catch_records_out = combine_catch_and_haul(haul_record, catch_records, species_by_code) catch_records_out_realized = 
list(catch_records_out) catch_records_zero = make_zero_catch_records( @@ -459,7 +461,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: }, catch_records_out_realized ) - output_dict = functools.reduce( + output_dict: typing.Dict[str, int] = functools.reduce( lambda a, b: { 'complete': a['complete'] + b['complete'], 'incomplete': a['incomplete'] + b['incomplete'], @@ -467,8 +469,10 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: }, outputs_dicts ) - output_dict['loc'] = output_loc - return output_dict + + output_dict_with_loc: typing.Dict[str, typing.Union[str, int]] = output_dict # type: ignore + output_dict_with_loc['loc'] = output_loc + return output_dict_with_loc def make_haul_metadata_record(path: str) -> dict: From 2d081ca2f73615d034b868b798c0e6149e9b5489 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 19:37:04 +0000 Subject: [PATCH 30/36] Add tests for render flat. --- snapshot/render_flat.py | 106 +++++++++--------- snapshot/test_render_flat.py | 207 +++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 53 deletions(-) create mode 100644 snapshot/test_render_flat.py diff --git a/snapshot/render_flat.py b/snapshot/render_flat.py index ff7371de..c1806d41 100644 --- a/snapshot/render_flat.py +++ b/snapshot/render_flat.py @@ -105,6 +105,33 @@ def make_zero_record(species: dict, haul_record: dict) -> dict: return haul_copy +def make_zero_catch_records(catch_records_out_realized: typing.List[dict], + species_by_code: SPECIES_DICT, haul_record: dict) -> typing.Iterable[dict]: + """Generate zero catch records for species not found in catches for a haul. + + Args: + catch_records_out_realized: All catch records for a haul. + species_by_code: Mapping from species code to information about the species such that all + formally tracked species are in this dictionary. + haul_record: Base record to use in generating zero catch records. 
+ + Returns: + Inferred zero catch records. + """ + species_codes_found = set(map( + lambda x: x.get('species_code', None), + catch_records_out_realized + )) + species_codes_all = set(species_by_code.keys()) + speices_codes_missing = species_codes_all - species_codes_found + speices_missing = map(lambda x: species_by_code[x], speices_codes_missing) + catch_records_zero = map( + lambda x: make_zero_record(x, haul_record), + speices_missing + ) + return catch_records_zero + + def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT) -> dict: """Add information about a species found within a catch. @@ -119,8 +146,9 @@ def append_species_from_species_list(target: dict, species_by_code: SPECIES_DICT species_code = target['species_code'] if species_code not in species_by_code: - target['complete'] = False - return target + return mark_incomplete(target) + else: + target = mark_complete(target) species_record = species_by_code[species_code] target.update(species_record) @@ -213,7 +241,7 @@ def mark_complete(target: dict) -> dict: return target -def combine_catch_and_haul(haul_record: dict, +def execute_full_join(haul_record: dict, catch_records: typing.Optional[typing.List[dict]], species_by_code: SPECIES_DICT) -> typing.Iterable[dict]: """Combine catch information with information about the haul in which that catch happened. @@ -234,42 +262,14 @@ def combine_catch_and_haul(haul_record: dict, lambda x: append_catch_haul(x, haul_record), catch_records ) - catch_with_species = map( + catch_records_out = map( lambda x: append_species_from_species_list(x, species_by_code), catch_no_species ) - catch_records_out = map(mark_complete, catch_with_species) return catch_records_out -def make_zero_catch_records(catch_records_out_realized: typing.List[dict], - species_by_code: SPECIES_DICT, haul_record: dict) -> typing.Iterable[dict]: - """Generate zero catch records for species not found in catches for a haul. 
- - Args: - catch_records_out_realized: All catch records for a haul. - species_by_code: Mapping from species code to information about the species such that all - formally tracked species are in this dictionary. - haul_record: Base record to use in generating zero catch records. - - Returns: - Inferred zero catch records. - """ - species_codes_found = set(map( - lambda x: x.get('species_code', None), - catch_records_out_realized - )) - species_codes_all = set(species_by_code.keys()) - speices_codes_missing = species_codes_all - species_codes_found - speices_missing = map(lambda x: species_by_code[x], speices_codes_missing) - catch_records_zero = map( - lambda x: make_zero_record(x, haul_record), - speices_missing - ) - return catch_records_zero - - def get_path_for_catches_in_haul(haul: int) -> str: """Get the URL where the catches associated with a haul may be found. @@ -313,6 +313,26 @@ def get_joined_path(year: int, survey: str, haul: int) -> str: return 'joined/%d_%s_%d.avro' % template_vals +def make_haul_metadata_record(path: str) -> dict: + """Parse a path into a metadata record. + + Args: + path: The path to be parsed as a metadata record. + + Returns: + Dictionary describing metadata for a haul. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'path': path, + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + def process_haul(bucket: str, year: int, survey: str, haul: int, species_by_code: SPECIES_DICT) -> dict: """Distributed task to process a single haul. 
@@ -433,7 +453,7 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: } catch_records = list(catch_records_maybe) - catch_records_out = combine_catch_and_haul(haul_record, catch_records, species_by_code) + catch_records_out = execute_full_join(haul_record, catch_records, species_by_code) catch_records_out_realized = list(catch_records_out) catch_records_zero = make_zero_catch_records( catch_records_out_realized, @@ -475,26 +495,6 @@ def get_catch_records(haul: int) -> typing.Optional[typing.Iterable[dict]]: return output_dict_with_loc -def make_haul_metadata_record(path: str) -> dict: - """Parse a path into a metadata record. - - Args: - path: The path to be parsed as a metadata record. - - Returns: - Dictionary describing metadata for a haul. - """ - filename_with_path = path.split('/')[-1] - filename = filename_with_path.split('.')[0] - components = filename.split('_') - return { - 'path': path, - 'year': int(components[0]), - 'survey': components[1], - 'haul': int(components[2]) - } - - def get_hauls_meta(bucket: str) -> typing.Iterable[dict]: """Get metadata for all available hauls. diff --git a/snapshot/test_render_flat.py b/snapshot/test_render_flat.py new file mode 100644 index 00000000..d8955cfb --- /dev/null +++ b/snapshot/test_render_flat.py @@ -0,0 +1,207 @@ +""" +Tests for script to join data to build flat files. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import unittest +import unittest.mock + +import render_flat + + +class ZeroRecordTests(unittest.TestCase): + + def setUp(self): + self._species = { + 'species_code': 1, + 'scientific_name': 'test_science', + 'common_name': 'test_common', + 'id_rank': 2, + 'worms': False, + 'itis': 3 + } + self._species_by_code = { + 1: self._species + } + self._haul_record = {'haul_field': 123} + self._zero_record_sample = render_flat.make_zero_record( + self._species, + self._haul_record + ) + + def test_make_zero_record_haul(self): + self.assertEqual(self._zero_record_sample['haul_field'], 123) + + def test_make_zero_record_zero(self): + self.assertEqual(self._zero_record_sample['count'], 0) + + def test_make_zero_record_species(self): + self.assertEqual(self._zero_record_sample['scientific_name'], 'test_science') + + def test_make_zero_catch_records_no_infer(self): + output_records_iter = render_flat.make_zero_catch_records( + [{'species_code': 1, 'haul_field': 456}], + self._species_by_code, + self._haul_record + ) + output_records = list(output_records_iter) + self.assertEqual(len(output_records), 0) + + def test_make_zero_catch_records_infer(self): + output_records_iter = render_flat.make_zero_catch_records( + [{'species_code': 2, 'haul_field': 456}], + self._species_by_code, + self._haul_record + ) + output_records = list(output_records_iter) + self.assertEqual(len(output_records), 1) + self.assertEqual(output_records[0]['haul_field'], 123) + + +class JoinTests(unittest.TestCase): + + def setUp(self): + self._known = {'species_code': 1, 'catch_field': 12} + self._unknown = {'species_code': 2, 'catch_field': 23} + self._catch_records = [self._known, self._unknown] + self._species_by_code = { + 1: {'species_field': 34} + } + self._haul_record = {'haul_field': 45} + + def test_append_complete_catch_field(self): + result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + self.assertEqual(result['catch_field'], 12) + + def 
test_append_complete_species_field(self): + result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + self.assertEqual(result['species_field'], 34) + + def test_append_complete_flag(self): + result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + self.assertTrue(result['complete']) + + def test_append_incomplete_catch_field(self): + result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + self.assertEqual(result['catch_field'], 23) + + def test_append_incomplete_species_field(self): + result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + self.assertFalse('species_field' in result) + + def test_append_incomplete_flag(self): + result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + self.assertFalse(result['complete']) + + def test_full_join_execution(self): + full_join = self._execute_full_join() + count = sum(map(lambda x: 1, full_join)) + self.assertEqual(count, 2) + + def test_full_join_no_catch(self): + result = render_flat.execute_full_join( + self._haul_record, + None, + self._species_by_code + ) + + result_realized = list(result) + self.assertEqual(len(result_realized), 1) + + result_individual = result_realized[0] + self.assertFalse(result_individual['complete']) + + def test_full_join_no_species(self): + target = self._get_species_code_from_join(2) + self.assertFalse(target['complete']) + self.assertEqual(target['catch_field'], 23) + self.assertEqual(target['haul_field'], 45) + + def test_full_join_success(self): + target = self._get_species_code_from_join(1) + self.assertTrue(target['complete']) + self.assertEqual(target['catch_field'], 12) + self.assertEqual(target['species_code'], 1) + self.assertEqual(target['haul_field'], 45) + + def _execute_full_join(self): + return render_flat.execute_full_join( + self._haul_record, + self._catch_records, + self._species_by_code + ) + 
+ def _get_species_code_from_join(self, species_code): + full_join = self._execute_full_join() + full_join_tuple = map(lambda x: (x['species_code'], x), full_join) + full_join_dict = dict(full_join_tuple) + return full_join_dict[species_code] + + +class MakeAvroTests(unittest.TestCase): + + def test_make_get_avro(self): + client = unittest.mock.MagicMock() + result = render_flat.make_get_avro('bucket', client) + self.assertIsNotNone(result) + + +class CombineCatchHaulTests(unittest.TestCase): + + def test_append_catch_haul(self): + catch = {'catch_field': 12} + haul = {'haul_field': 34} + combined = render_flat.append_catch_haul(catch, haul) + self.assertEqual(combined['catch_field'], 12) + self.assertEqual(combined['haul_field'], 34) + + +class CompleteRecordTests(unittest.TestCase): + + def setUp(self): + self._start_record = {'unknown': 12, 'count': 34} + self._completed = render_flat.complete_record(self._start_record) + + def test_complete_record_pass_through(self): + self.assertEqual(self._completed['count'], 34) + + def test_complete_record_unknown_field(self): + self.assertFalse('unknown' in self._completed) + + def test_complete_record_missing_field(self): + self.assertIsNone(self._completed['species_code']) + + def test_mark_complete(self): + result = render_flat.mark_complete(self._start_record) + self.assertTrue(result['complete']) + + def test_mark_incomplete(self): + result = render_flat.mark_incomplete(self._start_record) + self.assertFalse(result['complete']) + + +class PathTests(unittest.TestCase): + + def test_get_path_for_catches_in_haul(self): + path = render_flat.get_path_for_catches_in_haul(123) + self.assertEqual(path, 'catch/123.avro') + + def test_get_meta_path_for_haul(self): + path = render_flat.get_meta_path_for_haul(2025, 'Gulf of Alaska', 123) + self.assertEqual(path, 'haul/2025_Gulf of Alaska_123.avro') + + def test_get_joined_path(self): + path = render_flat.get_joined_path(2025, 'Gulf of Alaska', 123) + self.assertEqual(path, 
'joined/2025_Gulf of Alaska_123.avro') + + def test_make_haul_metadata_record(self): + path = render_flat.get_joined_path(2025, 'Gulf of Alaska', 123) + metadata = render_flat.make_haul_metadata_record(path) + self.assertEqual(metadata['path'], path) + self.assertEqual(metadata['year'], 2025) + self.assertEqual(metadata['survey'], 'Gulf of Alaska') + self.assertEqual(metadata['haul'], 123) From f576e0f2eb18a8a1d0db5c5417a0b044ff1950f9 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 19:41:28 +0000 Subject: [PATCH 31/36] Test fixes for mypy as part of #114. --- snapshot/test_render_flat.py | 38 ++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/snapshot/test_render_flat.py b/snapshot/test_render_flat.py index d8955cfb..21c88728 100644 --- a/snapshot/test_render_flat.py +++ b/snapshot/test_render_flat.py @@ -45,7 +45,7 @@ def test_make_zero_record_species(self): def test_make_zero_catch_records_no_infer(self): output_records_iter = render_flat.make_zero_catch_records( [{'species_code': 1, 'haul_field': 456}], - self._species_by_code, + self._species_by_code, # type: ignore self._haul_record ) output_records = list(output_records_iter) @@ -54,7 +54,7 @@ def test_make_zero_catch_records_no_infer(self): def test_make_zero_catch_records_infer(self): output_records_iter = render_flat.make_zero_catch_records( [{'species_code': 2, 'haul_field': 456}], - self._species_by_code, + self._species_by_code, # type: ignore self._haul_record ) output_records = list(output_records_iter) @@ -74,27 +74,45 @@ def setUp(self): self._haul_record = {'haul_field': 45} def test_append_complete_catch_field(self): - result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._known, + self._species_by_code # type: ignore + ) self.assertEqual(result['catch_field'], 12) def test_append_complete_species_field(self): - result = 
render_flat.append_species_from_species_list(self._known, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._known, + self._species_by_code # type: ignore + ) self.assertEqual(result['species_field'], 34) def test_append_complete_flag(self): - result = render_flat.append_species_from_species_list(self._known, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._known, + self._species_by_code # type: ignore + ) self.assertTrue(result['complete']) def test_append_incomplete_catch_field(self): - result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._unknown, + self._species_by_code # type: ignore + ) self.assertEqual(result['catch_field'], 23) def test_append_incomplete_species_field(self): - result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._unknown, + self._species_by_code # type: ignore + ) self.assertFalse('species_field' in result) def test_append_incomplete_flag(self): - result = render_flat.append_species_from_species_list(self._unknown, self._species_by_code) + result = render_flat.append_species_from_species_list( + self._unknown, + self._species_by_code # type: ignore + ) self.assertFalse(result['complete']) def test_full_join_execution(self): @@ -106,7 +124,7 @@ def test_full_join_no_catch(self): result = render_flat.execute_full_join( self._haul_record, None, - self._species_by_code + self._species_by_code # type: ignore ) result_realized = list(result) @@ -132,7 +150,7 @@ def _execute_full_join(self): return render_flat.execute_full_join( self._haul_record, self._catch_records, - self._species_by_code + self._species_by_code # type: ignore ) def _get_species_code_from_join(self, species_code): From 7adeee9586f62277722c7940a3d6e09680624382 Mon Sep 17 00:00:00 2001 From: A Samuel 
Pottinger Date: Sat, 4 Jan 2025 19:52:39 +0000 Subject: [PATCH 32/36] Add initial request_source implementation. --- snapshot/request_source.py | 212 +++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 snapshot/request_source.py diff --git a/snapshot/request_source.py b/snapshot/request_source.py new file mode 100644 index 00000000..b034fd06 --- /dev/null +++ b/snapshot/request_source.py @@ -0,0 +1,212 @@ +""" +Scripts to request haul, catch, and species data from upstream APIs. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import io +import itertools +import os +import sys +import time +import typing + +import boto3 +import fastavro +import requests +import toolz.itertoolz + +MIN_ARGS = 3 +MAX_ARGS = 4 +USAGE_STR = 'python request_source.py [type] [bucket] [location] [year]' +DOMAIN = 'https://apps-st.fisheries.noaa.gov' +ENDPOINTS = { + 'haul': '/ods/foss/afsc_groundfish_survey_haul/', + 'catch': '/ods/foss/afsc_groundfish_survey_catch/', + 'species': '/ods/foss/afsc_groundfish_survey_species/' +} + +HAUL_SCHEMA = { + 'doc': 'Description of a haul', + 'name': 'Haul', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'srvy', 'type': 'string'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'survey_name', 'type': 'string'}, + {'name': 'survey_definition_id', 'type': ['long', 'null']}, + {'name': 'cruise', 'type': ['long', 'null']}, + {'name': 'cruisejoin', 'type': 'long'}, + {'name': 'hauljoin', 'type': 'long'}, + {'name': 'haul', 'type': ['long', 'null']}, + {'name': 'stratum', 'type': ['long', 'null']}, + {'name': 'station', 'type': ['string', 'null']}, + {'name': 'vessel_id', 'type': ['long', 'null']}, + {'name': 'vessel_name', 'type': ['string', 'null']}, + {'name': 
'date_time', 'type': 'string'}, + {'name': 'latitude_dd_start', 'type': ['double', 'null']}, + {'name': 'longitude_dd_start', 'type': ['double', 'null']}, + {'name': 'latitude_dd_end', 'type': ['double', 'null']}, + {'name': 'longitude_dd_end', 'type': ['double', 'null']}, + {'name': 'bottom_temperature_c', 'type': ['double', 'null']}, + {'name': 'surface_temperature_c', 'type': ['double', 'null']}, + {'name': 'depth_m', 'type': ['double', 'null']}, + {'name': 'distance_fished_km', 'type': ['double', 'null']}, + {'name': 'duration_hr', 'type': ['double', 'null']}, + {'name': 'net_width_m', 'type': ['double', 'null']}, + {'name': 'net_height_m', 'type': ['double', 'null']}, + {'name': 'area_swept_km2', 'type': ['double', 'null']}, + {'name': 'performance', 'type': ['float', 'null']} + ] +} + +CATCH_SCHEMA = { + 'doc': 'Description of a catch', + 'name': 'Catch', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'hauljoin', 'type': 'long'}, + {'name': 'species_code', 'type': 'long'}, + {'name': 'cpue_kgkm2', 'type': ['double', 'null']}, + {'name': 'cpue_nokm2', 'type': ['double', 'null']}, + {'name': 'count', 'type': ['long', 'null']}, + {'name': 'weight_kg', 'type': ['double', 'null']}, + {'name': 'taxon_confidence', 'type': ['string', 'null']} + ] +} + +SPECIES_SCHEMA = { + 'doc': 'Description of a species', + 'name': 'Species', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'species_code', 'type': 'long'}, + {'name': 'scientific_name', 'type': ['string', 'null']}, + {'name': 'common_name', 'type': ['string', 'null']}, + {'name': 'id_rank', 'type': ['string', 'null']}, + {'name': 'worms', 'type': ['long', 'null']}, + {'name': 'itis', 'type': ['long', 'null']} + ] +} + +SCHEMAS = { + 'haul': HAUL_SCHEMA, + 'catch': CATCH_SCHEMA, + 'species': SPECIES_SCHEMA +} + + +def get_api_request_url(type_name: str, year: int, offset: int) -> str: + endpoint = ENDPOINTS[type_name] + + if year: + params = 
'?offset=%d&limit=10000&q={"year":%d}' % (offset, year) + else: + params = '?offset=%d&limit=10000' % offset + + full_url = DOMAIN + endpoint + params + return full_url + + +def dump_to_s3(year: int, bucket: str, loc: str, type_name: str): + offset = 0 + done = False + + s3_client = boto3.client( + 's3', + aws_access_key_id=os.environ['AWS_ACCESS_KEY'], + aws_secret_access_key=os.environ['AWS_ACCESS_SECRET'] + ) + + def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO: + target_buffer = io.BytesIO() + fastavro.writer(target_buffer, SCHEMAS[type_name], records) + target_buffer.seek(0) + return target_buffer + + def append_in_bucket(key: str, records: typing.List[dict]): + sample_record = records[0] + + if type_name == 'haul': + template_vals = ( + year, + sample_record['survey'], + sample_record['hauljoin'] + ) + full_loc = loc + '/%d_%s_%d.avro' % template_vals + elif type_name == 'catch': + full_loc = loc + '/%d.avro' % sample_record['hauljoin'] + elif type_name == 'species': + full_loc = loc + '/%d.avro' % sample_record['species_code'] + + try: + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + prior_records = fastavro.reader(target_buffer) + except s3_client.exceptions.ClientError: + prior_records = [] + + records_avro = convert_to_avro(itertools.chain(prior_records, records)) + s3_client.upload_fileobj(records_avro, bucket, full_loc) + + def write_response(parsed: dict): + items = parsed['items'] + key_name = 'species_code' if type_name == 'species' else 'hauljoin' + by_key = toolz.itertoolz.groupby(lambda x: x[key_name], items) + for key_tuple in by_key.items(): + key = key_tuple[0] + records = key_tuple[1] + append_in_bucket(key, records) + + def execute_request(offset: int): + full_url = get_api_request_url(type_name, year, offset) + response = requests.get(full_url) + return response + + while not done: + if offset % 100000 == 0: + print('Offset: %d' % offset) + + response = 
execute_request(offset) + status_code = response.status_code + + if status_code == 200: + parsed = response.json() + write_response(parsed) + offset += 10000 + done = len(parsed['items']) == 0 + if done: + print('Ending gracefully...') + else: + template_vals = (offset, status_code) + print('Offset of %d with status %d. Waiting...' % template_vals) + time.sleep(1) + + +def main(): + if len(sys.argv) < MIN_ARGS + 1 or len(sys.argv) > MAX_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + type_name = sys.argv[1] + bucket = sys.argv[2] + loc = sys.argv[3] + + if len(sys.argv) > 4: + year = int(sys.argv[4]) + else: + year = None + + dump_to_s3(year, bucket, loc, type_name) + + +if __name__ == '__main__': + main() From 294325d184ef18cdafa898316ada7a6ae913b154 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 19:58:16 +0000 Subject: [PATCH 33/36] Type fixes for request source. Type fixes for newly added request source script as part of #114. --- snapshot/request_source.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/snapshot/request_source.py b/snapshot/request_source.py index b034fd06..6b81468f 100644 --- a/snapshot/request_source.py +++ b/snapshot/request_source.py @@ -14,10 +14,10 @@ import time import typing -import boto3 +import boto3 # type: ignore import fastavro import requests -import toolz.itertoolz +import toolz.itertoolz # type: ignore MIN_ARGS = 3 MAX_ARGS = 4 @@ -103,7 +103,7 @@ } -def get_api_request_url(type_name: str, year: int, offset: int) -> str: +def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int) -> str: endpoint = ENDPOINTS[type_name] if year: @@ -115,7 +115,7 @@ def get_api_request_url(type_name: str, year: int, offset: int) -> str: return full_url -def dump_to_s3(year: int, bucket: str, loc: str, type_name: str): +def dump_to_s3(year: typing.Optional[int], bucket: str, loc: str, type_name: str): offset = 0 done = False @@ -135,6 +135,7 @@ def 
append_in_bucket(key: str, records: typing.List[dict]): sample_record = records[0] if type_name == 'haul': + assert year is not None template_vals = ( year, sample_record['survey'], @@ -146,14 +147,16 @@ def append_in_bucket(key: str, records: typing.List[dict]): elif type_name == 'species': full_loc = loc + '/%d.avro' % sample_record['species_code'] - try: - target_buffer = io.BytesIO() - s3_client.download_fileobj(bucket, full_loc, target_buffer) - target_buffer.seek(0) - prior_records = fastavro.reader(target_buffer) - except s3_client.exceptions.ClientError: - prior_records = [] + def read_prior_records() -> typing.Iterable[dict]: + try: + target_buffer = io.BytesIO() + s3_client.download_fileobj(bucket, full_loc, target_buffer) + target_buffer.seek(0) + return fastavro.reader(target_buffer) # type: ignore + except s3_client.exceptions.ClientError: + return [] + prior_records = read_prior_records() records_avro = convert_to_avro(itertools.chain(prior_records, records)) s3_client.upload_fileobj(records_avro, bucket, full_loc) From a46ca215be0e04ff2361e1d3b55242db366a6148 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 20:49:27 +0000 Subject: [PATCH 34/36] Add docstring for request source. 
--- snapshot/request_source.py | 85 +++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/snapshot/request_source.py b/snapshot/request_source.py index 6b81468f..56d50a9d 100644 --- a/snapshot/request_source.py +++ b/snapshot/request_source.py @@ -28,6 +28,7 @@ 'catch': '/ods/foss/afsc_groundfish_survey_catch/', 'species': '/ods/foss/afsc_groundfish_survey_species/' } +YEAR_ENDPOINTS = {'catch'} HAUL_SCHEMA = { 'doc': 'Description of a haul', @@ -102,20 +103,57 @@ 'species': SPECIES_SCHEMA } +DEFAULT_LIMIT = 100000 -def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int) -> str: + +def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int, + limit: int = DEFAULT_LIMIT) -> str: + """Get the URL where an API endpoint can be found for a given set of records. + + Get the URL where an API endpoint can be found for a given set of records, raising an exception + if an invalid request is provided such as when year is provided but not supported by the endpoint. + + Args: + type_name: The type of record requested like "catch" for catch records. + year: The year like 2025 for which records are requested. If None, will request without a + year filter. Ignored by some endpoints. + offset: The offset into this year / type combination. + limit: The maximum number of records to return. + + Returns: + String URL where the requested records can be found. 
+ """ endpoint = ENDPOINTS[type_name] if year: - params = '?offset=%d&limit=10000&q={"year":%d}' % (offset, year) + if type_name not in YEAR_ENDPOINTS: + raise RuntimeError('Provided a year filter to an endpoint that does not support it.') + + params = '?offset=%d&limit=%d&q={"year":%d}' % (offset, limit, year) else: - params = '?offset=%d&limit=10000' % offset + if type_name in YEAR_ENDPOINTS: + raise RuntimeError('Did not provide a year filter to an endpoint that supports it.') + + params = '?offset=%d&limit=%d' % (offset, limit) full_url = DOMAIN + endpoint + params return full_url def dump_to_s3(year: typing.Optional[int], bucket: str, loc: str, type_name: str): + """Dump a set of records to an S3 bucket for later processing / joining. + + Dump a set of records to an S3 bucket. These may be saved for later processing such as joining + across record types. This will perform pagination until all records saved, making multiple API + requests. Raises an exception if year is provided but not supported. + + Args: + year: The year for which records should be dumped. This is ignored by some endpoints and + None may be passed. + bucket: The name of the bucket within S3 in which they should be dumped. + loc: The location within the bucket where they should be written. + type_name: The type of record to dump like "catch" for catch records. + """ offset = 0 done = False @@ -126,12 +164,26 @@ def dump_to_s3(year: typing.Optional[int], bucket: str, loc: str, type_name: str ) def convert_to_avro(records: typing.Iterable[dict]) -> io.BytesIO: + """Convert a set of records to Avro. + + Args: + records: The records to convert to Avro. + + Returns: + The provided records as binary. + """ target_buffer = io.BytesIO() fastavro.writer(target_buffer, SCHEMAS[type_name], records) target_buffer.seek(0) return target_buffer def append_in_bucket(key: str, records: typing.List[dict]): + """Append to a file within an S3 bucket, making the file if it does not exist. 
+ + Args: + key: The path to the file to be appended. + records: The records to be appended as Avro. + """ sample_record = records[0] if type_name == 'haul': @@ -148,6 +200,12 @@ def append_in_bucket(key: str, records: typing.List[dict]): full_loc = loc + '/%d.avro' % sample_record['species_code'] def read_prior_records() -> typing.Iterable[dict]: + """Get the records already at the target file. + + Returns: + Iterable over records if prior contents found or an empty iterable if the file does + not exist. + """ try: target_buffer = io.BytesIO() s3_client.download_fileobj(bucket, full_loc, target_buffer) @@ -161,6 +219,11 @@ def read_prior_records() -> typing.Iterable[dict]: s3_client.upload_fileobj(records_avro, bucket, full_loc) def write_response(parsed: dict): + """Write the result of an API call to S3. + + Args: + parsed: The record returned from the API. + """ items = parsed['items'] key_name = 'species_code' if type_name == 'species' else 'hauljoin' by_key = toolz.itertoolz.groupby(lambda x: x[key_name], items) @@ -170,12 +233,21 @@ def write_response(parsed: dict): append_in_bucket(key, records) def execute_request(offset: int): - full_url = get_api_request_url(type_name, year, offset) + """Execute a single request for records given an offset into the result set. + + Args: + offset: The number of records to skip at the start of the result set. Used for + pagination. + + Returns: + Unparsed response from a requests-like object. 
+ """ + full_url = get_api_request_url(type_name, year, offset, limit=DEFAULT_LIMIT) response = requests.get(full_url) return response while not done: - if offset % 100000 == 0: + if offset % DEFAULT_LIMIT == 0: print('Offset: %d' % offset) response = execute_request(offset) @@ -184,7 +256,7 @@ def execute_request(offset: int): if status_code == 200: parsed = response.json() write_response(parsed) - offset += 10000 + offset += DEFAULT_LIMIT done = len(parsed['items']) == 0 if done: print('Ending gracefully...') @@ -195,6 +267,7 @@ def execute_request(offset: int): def main(): + """Entrypoint to the request source script.""" if len(sys.argv) < MIN_ARGS + 1 or len(sys.argv) > MAX_ARGS + 1: print(USAGE_STR) sys.exit(1) From 426d9d6d69002c8c32358942b0f360390685e346 Mon Sep 17 00:00:00 2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 20:57:14 +0000 Subject: [PATCH 35/36] Fixes for #114. --- snapshot/request_source.py | 6 ++--- snapshot/test_request_source.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 snapshot/test_request_source.py diff --git a/snapshot/request_source.py b/snapshot/request_source.py index 56d50a9d..a2f96dd5 100644 --- a/snapshot/request_source.py +++ b/snapshot/request_source.py @@ -103,7 +103,7 @@ 'species': SPECIES_SCHEMA } -DEFAULT_LIMIT = 100000 +DEFAULT_LIMIT = 10000 def get_api_request_url(type_name: str, year: typing.Optional[int], offset: int, @@ -234,7 +234,7 @@ def write_response(parsed: dict): def execute_request(offset: int): """Execute a single request for records given an offset into the result set. - + Args: offset: The number of records to skip at the start of the result set. Used for pagination. 
@@ -247,7 +247,7 @@ def execute_request(offset: int): return response while not done: - if offset % DEFAULT_LIMIT == 0: + if offset % (DEFAULT_LIMIT * 10) == 0: print('Offset: %d' % offset) response = execute_request(offset) diff --git a/snapshot/test_request_source.py b/snapshot/test_request_source.py new file mode 100644 index 00000000..b461980d --- /dev/null +++ b/snapshot/test_request_source.py @@ -0,0 +1,46 @@ +""" +Tests for requesting upstream source data. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import request_source + + +class ApiUrlGenerationTests(unittest.TestCase): + + def test_haul(self): + url = request_source.get_api_request_url('haul', None, 12, 34) + self.assertTrue('afsc_groundfish_survey_haul' in url) + self.assertTrue('offset=12' in url) + self.assertTrue('limit=34' in url) + + def test_catch(self): + url = request_source.get_api_request_url('catch', 2025, 12, 34) + self.assertTrue('afsc_groundfish_survey_catch' in url) + self.assertTrue('offset=12' in url) + self.assertTrue('limit=34' in url) + self.assertTrue('q={"year":2025}' in url) + + def test_species(self): + url = request_source.get_api_request_url('species', None, 12, 34) + self.assertTrue('afsc_groundfish_survey_species' in url) + self.assertTrue('offset=12' in url) + self.assertTrue('limit=34' in url) + + def test_year_provide_not_support(self): + with self.assertRaises(RuntimeError): + request_source.get_api_request_url('haul', 2025, 12, 34) + + with self.assertRaises(RuntimeError): + request_source.get_api_request_url('species', 2025, 12, 34) + + def test_year_not_provided_supported(self): + with self.assertRaises(RuntimeError): + request_source.get_api_request_url('catch', None, 12, 34) From d03b171e9190a0f63ebc2281e4111c143ba0aaca Mon Sep 17 00:00:00 
2001 From: A Samuel Pottinger Date: Sat, 4 Jan 2025 21:05:49 +0000 Subject: [PATCH 36/36] Add write main index. --- snapshot/test_write_main_index.py | 30 ++++++++++ snapshot/write_main_index.py | 96 +++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 snapshot/test_write_main_index.py create mode 100644 snapshot/write_main_index.py diff --git a/snapshot/test_write_main_index.py b/snapshot/test_write_main_index.py new file mode 100644 index 00000000..fb0cd3e7 --- /dev/null +++ b/snapshot/test_write_main_index.py @@ -0,0 +1,30 @@ +""" +Tests for the main index generation script. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. +""" +import unittest +import unittest.mock + +import write_main_index + +EXAMPLE_URL = 'joined/2025_Gulf of Alaska_123.avro' + + +class HaulMetadataRecordTests(unittest.TestCase): + + def setUp(self): + self._record = write_main_index.make_haul_metadata_record(EXAMPLE_URL) + + def test_year(self): + self.assertEqual(self._record['year'], 2025) + + def test_survey(self): + self.assertEqual(self._record['survey'], 'Gulf of Alaska') + + def test_haul(self): + self.assertEqual(self._record['haul'], 123) diff --git a/snapshot/write_main_index.py b/snapshot/write_main_index.py new file mode 100644 index 00000000..45beae44 --- /dev/null +++ b/snapshot/write_main_index.py @@ -0,0 +1,96 @@ +""" +Scripts to write the "main" index which includes all hauls without filtering. + +(c) 2025 Regents of University of California / The Eric and Wendy Schmidt Center +for Data Science and the Environment at UC Berkeley. + +This file is part of afscgap released under the BSD 3-Clause License. See +LICENSE.md. 
+""" +import io +import itertools +import os +import sys + +import boto3 # type: ignore +import fastavro + +KEY_SCHEMA = { + 'doc': 'Key to an observation flat file.', + 'name': 'Key', + 'namespace': 'edu.dse.afscgap', + 'type': 'record', + 'fields': [ + {'name': 'year', 'type': 'int'}, + {'name': 'survey', 'type': 'string'}, + {'name': 'haul', 'type': 'long'} + ] +} + +NUM_ARGS = 1 +USAGE_STR = 'python write_main_index.py [bucket]' + + +def make_haul_metadata_record(path: str) -> dict: + """Interpret a path and parse metadata information about a haul found at that path. + + Args: + path: The path to parse as a metadata record. + + Returns: + Dictionary with metadata about the haul that may be found at the provided path. + """ + filename_with_path = path.split('/')[-1] + filename = filename_with_path.split('.')[0] + components = filename.split('_') + return { + 'year': int(components[0]), + 'survey': components[1], + 'haul': int(components[2]) + } + + +def main(): + """Entrypoint into the main index writing script.""" + if len(sys.argv) != NUM_ARGS + 1: + print(USAGE_STR) + sys.exit(1) + + bucket = sys.argv[1] + + access_key = os.environ['AWS_ACCESS_KEY'] + access_secret = os.environ['AWS_ACCESS_SECRET'] + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + + paginator = s3_client.get_paginator('list_objects_v2') + iterator = paginator.paginate(Bucket=bucket, Prefix='joined/') + pages = filter(lambda x: 'Contents' in x, iterator) + contents = map(lambda x: x['Contents'], pages) + contents_flat = itertools.chain(*contents) + keys = map(lambda x: x['Key'], contents_flat) + metadata_records = map(make_haul_metadata_record, keys) + + write_buffer = io.BytesIO() + fastavro.writer( + write_buffer, + KEY_SCHEMA, + metadata_records + ) + write_buffer.seek(0) + + s3_client = boto3.client( + 's3', + aws_access_key_id=access_key, + aws_secret_access_key=access_secret + ) + output_loc = 'index/main.avro' + 
s3_client.upload_fileobj(write_buffer, bucket, output_loc) + + +if __name__ == '__main__': + main()