maize-loss-climate-experiment/gaussian_process.py at main · SchmidtDSE/maize-loss-climate-experiment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
"""Optional tasks for gaussian process.

License:
    BSD
"""
import csv
import itertools
import json
import random

import luigi
import numpy
import sklearn.gaussian_process

import const
import normalize_tasks
import training_tasks

# Column names used as model inputs throughout this module (read from CSV rows
# and parsed as floats before fitting / predicting).
# NOTE(review): the meaning of the second positional argument (True) is defined
# in training_tasks.get_input_attrs -- confirm there.
INPUT_ATTRS = training_tasks.get_input_attrs('all attrs', True)
# Denominator of the per-unit sampling probability: ResampleIndividualizeTask
# compares uniform random draws against 1 / SAMPLE_RATE.
SAMPLE_RATE = 500000
# Kernel names swept by SummarizeAllGaussianProcessModelTask. Each name is
# looked up in the table built by BuildGaussianProcessModelTask._get_kernel.
SWEEP_KERNELS = [
    'default',
    'matern_rough',
    'matern_mid',
    'matern_smooth',
    'matern_very_smooth',
    'matern_rough_white',
    'matern_mid_white',
    'matern_smooth_white',
    'matern_very_smooth_white'
]


def assign_year(year):
    """Map a year onto the dataset split it belongs to.

    Args:
        year (int): Year to classify.

    Returns:
        str: 'train' for any year before 2013. From 2013 onward the split
            alternates: 'test' for even years and 'valid' for odd years.
    """
    # Everything prior to the 2013 cut-off is reserved for training.
    if year < 2013:
        return 'train'

    # Held-out years alternate between the test and validation sets.
    if year % 2 == 0:
        return 'test'
    return 'valid'


def transform_row(target):
    """Annotate a row in place with its dataset split and optional year offset.

    Args:
        target (dict): Row data; must hold a 'year' entry convertible to int.

    Returns:
        dict: The same dictionary object, now carrying 'setAssign' and, when
            const.INCLUDE_YEAR_IN_MODEL is truthy, 'effectiveYear' (the year
            minus 2007).
    """
    parsed_year = int(target['year'])

    # The year offset is only exposed when the model is configured to use it.
    if const.INCLUDE_YEAR_IN_MODEL:
        target.update(effectiveYear=parsed_year - 2007)

    target.update(setAssign=assign_year(parsed_year))
    return target


class ResampleIndividualizeTask(luigi.Task):
    """Task that resamples summarized training data into individual samples.

    Reads the normalized historic training frame, keeps only the rows assigned
    to the requested split (see transform_row / assign_year), and expands each
    remaining row into zero or more individual yield observations. The number
    of observations per row is drawn at random from the row's sample weight
    and each observation's yield is drawn from a Gaussian described by the
    row's mean and standard deviation.
    """

    # Name of the split to keep; compared against each row's 'setAssign'.
    target = luigi.Parameter()

    def requires(self):
        """Specify the dependency on normalized historic training data.

        Returns:
            NormalizeHistoricTrainingFrameTask: Task that provides normalized training data.
        """
        return normalize_tasks.NormalizeHistoricTrainingFrameTask()

    def output(self):
        """Specify the output file location for individualized samples.

        Returns:
            LocalTarget: Target for CSV file containing individualized samples
                for the requested split.
        """
        path = const.get_file_location('sample_individual_%s.csv' % self.target)
        return luigi.LocalTarget(path)

    def run(self):
        """Execute the resampling and individualization process.

        Reads normalized data, annotates each row with its split, keeps the
        rows whose split matches ``self.target``, expands each into individual
        Gaussian samples, and streams the samples to the output CSV.
        """
        with self.input().open() as f_in:
            rows = csv.DictReader(f_in)
            transformed_rows = map(transform_row, rows)
            allowed_rows = (
                row for row in transformed_rows
                if row['setAssign'] == self.target
            )

            # from_iterable keeps the pipeline lazy; unpacking with * would
            # force every input row to be read and expanded before any output
            # is written.
            expanded_rows = itertools.chain.from_iterable(
                map(self._expand_rows, allowed_rows)
            )

            output_attrs = INPUT_ATTRS + ['yieldValue', 'geohash', 'year']
            with self.output().open('w') as f_out:
                # extrasaction='ignore' drops any keys not listed as columns.
                writer = csv.DictWriter(f_out, fieldnames=output_attrs, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(expanded_rows)

    def _expand_rows(self, target):
        """Expand a single row into multiple samples using Gaussian sampling.

        Args:
            target (dict): Row holding 'yieldMean', 'yieldStd', the sample
                weight under const.SAMPLE_WEIGHT_ATTR, and every attribute
                named in INPUT_ATTRS.

        Returns:
            map: Iterator of dictionaries, one per drawn sample, each holding
                the row's input attributes, its 'geohash' and 'year' when the
                row has them, and a 'yieldValue' drawn from the Gaussian.
        """
        mean = float(target['yieldMean'])
        std = float(target['yieldStd'])
        sample_weight = int(target[const.SAMPLE_WEIGHT_ATTR])

        # One uniform draw per unit of sample weight; each unit is kept with
        # probability 1 / SAMPLE_RATE, so the count of kept units follows a
        # Binomial(sample_weight, 1 / SAMPLE_RATE) distribution.
        random_array = numpy.random.random(sample_weight)
        threshold = 1 / SAMPLE_RATE
        num_samples = numpy.sum(random_array < threshold)

        # The output header declares geohash and year columns, so copy them
        # through when the source row has them; otherwise they would always be
        # written blank.
        passthrough = {
            attr: target[attr] for attr in ('geohash', 'year') if attr in target
        }

        def make_sample(_index):
            value = random.gauss(mu=mean, sigma=std)
            ret_dict = {attr: target[attr] for attr in INPUT_ATTRS}
            ret_dict.update(passthrough)
            ret_dict['yieldValue'] = value
            return ret_dict

        return map(make_sample, range(num_samples))


class BuildGaussianProcessModelTask(luigi.Task):
    """Task that fits a Gaussian Process regressor and evaluates it on a split.

    The model is fit on the individualized 'train' samples using the requested
    kernel. It is then asked for a predicted mean and standard deviation for
    every summary row assigned to the requested split, and the predictions are
    written alongside the actual values.
    """

    kernel = luigi.Parameter()
    target = luigi.Parameter()

    def requires(self):
        """Declare the training samples and the summary frame as inputs.

        Returns:
            dict: 'train' maps to the ResampleIndividualizeTask for the train
                split; 'summary' maps to NormalizeHistoricTrainingFrameTask.
        """
        train_task = ResampleIndividualizeTask(target='train')
        summary_task = normalize_tasks.NormalizeHistoricTrainingFrameTask()
        return {'train': train_task, 'summary': summary_task}

    def output(self):
        """Locate the evaluation output for this kernel / split combination.

        Returns:
            LocalTarget: Target for the CSV of predicted versus actual values.
        """
        path = const.get_file_location(
            'gaussian_process_%s_eval_%s.csv' % (self.kernel, self.target)
        )
        return luigi.LocalTarget(path)

    def run(self):
        """Fit the model on the training samples and write split predictions.

        Every row of the 'train' input is used for fitting. Rows of the
        'summary' input are annotated with transform_row and only those whose
        'setAssign' equals ``self.target`` are predicted and written out.
        """
        # Gather the training matrix and response vector.
        train_inputs = []
        train_outputs = []
        with self.input()['train'].open() as f_in:
            for record in csv.DictReader(f_in):
                train_inputs.append([float(record[attr]) for attr in INPUT_ATTRS])
                train_outputs.append(float(record['yieldValue']))

        # Fit the regressor with the requested kernel.
        model = sklearn.gaussian_process.GaussianProcessRegressor(
            kernel=self._get_kernel(self.kernel),
            copy_X_train=False,
            normalize_y=True
        )
        model.fit(train_inputs, train_outputs)

        weight_attr = const.SAMPLE_WEIGHT_ATTR
        eval_fields = [
            'year',
            'setAssign',
            'predictedMean',
            'actualMean',
            'predictedStd',
            'actualStd',
            weight_attr
        ]

        # Predict for the summary rows belonging to the requested split.
        with self.input()['summary'].open() as f_in:
            candidates = []
            for record in csv.DictReader(f_in):
                record = transform_row(record)
                if record['setAssign'] == self.target:
                    candidates.append({
                        'year': int(record['year']),
                        'setAssign': record['setAssign'],
                        'inputs': [float(record[attr]) for attr in INPUT_ATTRS],
                        'output': {
                            'mean': float(record['yieldMean']),
                            'std': float(record['yieldStd'])
                        },
                        weight_attr: float(record[weight_attr])
                    })

            predicted_means, predicted_stds = model.predict(
                [item['inputs'] for item in candidates],
                return_std=True
            )

            # Pair each candidate with its prediction, in input order.
            paired = zip(
                candidates,
                predicted_means.tolist(),
                predicted_stds.tolist()
            )
            eval_rows = [
                {
                    'year': item['year'],
                    'setAssign': item['setAssign'],
                    'predictedMean': mean_hat,
                    'actualMean': item['output']['mean'],
                    'predictedStd': std_hat,
                    'actualStd': item['output']['std'],
                    weight_attr: float(item[weight_attr])
                }
                for item, mean_hat, std_hat in paired
            ]

            with self.output().open('w') as f_out:
                writer = csv.DictWriter(f_out, fieldnames=eval_fields)
                writer.writeheader()
                writer.writerows(eval_rows)

    def _get_kernel(self, name):
        """Look up the kernel object registered under a configuration name.

        Args:
            name (str): Kernel configuration name, e.g. one of SWEEP_KERNELS.

        Returns:
            kernel: None for 'default', a Matern kernel for the 'matern_*'
                names, or a Matern kernel summed with a WhiteKernel for the
                '*_white' names.

        Raises:
            KeyError: If the provided kernel name is unknown.
        """
        kernels = sklearn.gaussian_process.kernels
        noise = kernels.WhiteKernel()

        # Matern nu value for each named roughness level.
        nu_by_label = {
            'matern_rough': 1,
            'matern_mid': 1.5,
            'matern_smooth': 2,
            'matern_very_smooth': 2.5
        }

        strategies = {'default': None}
        for label, nu in nu_by_label.items():
            base = kernels.Matern(nu=nu)
            strategies[label] = base
            strategies[label + '_white'] = base + noise

        return strategies[name]


class SummarizeGaussianProcessModelTask(luigi.Task):
    """Task that reduces one model's evaluation CSV to mean absolute errors."""

    kernel = luigi.Parameter()
    target = luigi.Parameter()

    def requires(self):
        """Declare the evaluation task whose output is summarized.

        Returns:
            BuildGaussianProcessModelTask: Task for the same kernel and split.
        """
        return BuildGaussianProcessModelTask(kernel=self.kernel, target=self.target)

    def output(self):
        """Locate the summary output for this kernel / split combination.

        Returns:
            LocalTarget: Target for JSON file containing model summary metrics.
        """
        path = const.get_file_location(
            'gaussian_process_%s_summary_%s.json' % (self.kernel, self.target)
        )
        return luigi.LocalTarget(path)

    def run(self):
        """Compute unweighted MAE for predicted mean and std, then write JSON.

        Raises:
            ZeroDivisionError: If the evaluation file contains no data rows.
        """
        with self.input().open('r') as f_in:
            records = list(csv.DictReader(f_in))

        def absolute_error(record, suffix):
            # Columns are named predicted<suffix> / actual<suffix>.
            predicted = float(record['predicted' + suffix])
            actual = float(record['actual' + suffix])
            return abs(predicted - actual)

        mean_errors = [absolute_error(record, 'Mean') for record in records]
        std_errors = [absolute_error(record, 'Std') for record in records]

        summary = {
            'mean_mae': sum(mean_errors) / len(mean_errors),
            'std_mae': sum(std_errors) / len(std_errors),
            'kernel': self.kernel,
            'target': self.target
        }

        with self.output().open('w') as f_out:
            json.dump(summary, f_out, indent=2)


class SummarizeAllGaussianProcessModelTask(luigi.Task):
    """Task that merges the per-kernel validation summaries into one document."""

    def requires(self):
        """Declare one summary task per kernel in the sweep.

        Returns:
            dict: Kernel name mapped to its SummarizeGaussianProcessModelTask,
                each evaluated on the 'valid' split.
        """
        return {
            kernel_name: SummarizeGaussianProcessModelTask(
                kernel=kernel_name,
                target='valid'
            )
            for kernel_name in SWEEP_KERNELS
        }

    def output(self):
        """Locate the combined sweep summary.

        Returns:
            LocalTarget: Target for JSON file keyed by kernel name.
        """
        path = const.get_file_location('gaussian_process_all_summary_valid.json')
        return luigi.LocalTarget(path)

    def run(self):
        """Load each kernel's summary JSON and write them as a single object."""
        summaries = self.input()

        combined = {}
        for kernel_name in SWEEP_KERNELS:
            with summaries[kernel_name].open() as f:
                combined[kernel_name] = json.load(f)

        with self.output().open('w') as f_out:
            json.dump(combined, f_out, indent=2)