clustering/samplecomparison.py at main · lawrencefchan/clustering · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
'''Notes:
This compares the different resampling methods available.

A working dendrogram is also included, as well as a broken
clustervar.

Refactoring to do:
* It would make sense to move all of the resampling
into this file and import kicluster.py as necessary
* kicluster.cleandata should go to kidata
* dendrogram/clustervar should go to kicluster maybe

'''
# %%

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.signal as signal
import scipy.cluster.hierarchy as hac

import kicluster


def posttrim_savgol(data, window=31):
    '''
    Applies scipy sav-gol filter then trims output
    '''
    timestamps = data.index
    battnames = data.columns

    # Savitzky-Golay
    filtered_data = signal.savgol_filter(data, window, 3, axis=0)
    filtered_data = pd.DataFrame(filtered_data,
                                 index=timestamps,
                                 columns=battnames)

    # trim refresh edges to isolate profile
    trim = filtered_data.iloc[210:-140]
    return trim


def resamplecomparison(data, factor=10, save_plot=False):
    '''
    Compare different resample methods
    factor: downsample factor-must be a factor of 1620
    '''

    data = data.iloc[:, [0]]

    n_time = factor * 0.5
    # number of samples to take
    n_samples = len(data.index) // factor

    # filter then trim
    pt_savgol = posttrim_savgol(data)

    # Pandas resample
    pd_resample = data.resample(f'{n_time}Min').mean()

    # Scipy resample
    scipy_resample = pd.DataFrame(signal.resample(data, n_samples),
                                  index=data.index[::factor])

    # Scipy decimate
    decimated = np.transpose(signal.decimate(data.T, factor, ftype='fir'))
    decimated = pd.DataFrame(decimated, index=data.index[::factor])

    fig, ax = plt.subplots()  # figsize=(30, 8))

    compare = {'original': data,
               'filter then trim': pt_savgol,
               'pandas resample': pd_resample,
               'scipy resample': scipy_resample,
               'scipy decimate': decimated
               }

    start = '2019-04-05, 23:00'
    stop = '2019-04-06, 9:30'

    for key in compare.keys():
        alpha = 1
        if key == 'original':
            alpha = 0.6
        compare[key][start:stop].plot(ax=ax, alpha=alpha)

    ax.legend(compare.keys(), loc='best')
    ax.set_ylim((2.15, 2.4))
    # ax.set_xlim((1554465200.0, 1554507070.0))

    if save_plot:
        plt.savefig(f'charts/resample_comparison', transparent=True, bbox_inches='tight')
        plt.close(fig)

    plt.show()


# ---- Evaluate metrics, dendograms
def dendrogram(data, p=30, save_plot=False):
    # metrics = [(myMetric, 0.018), ('correlation', 0.018), ('euclidean', 0.6)]
    # for metric, scale in metrics:

    method = ['single', 'complete', 'average', 'weighted', 'centroid', 'ward']
    for i in method:
        D = hac.linkage(data.T, method=i, metric='euclidean')

        fig, ax = plt.subplots(figsize=(10, 8))
        hac.dendrogram(D, p=p, truncate_mode='lastp')

        plt.title(f'Cluster method: {i}')
        ax.tick_params(axis='both', which='major', labelsize=10)
        plt.xlabel('Cluster number')
        plt.ylabel('Distance')
        # plt.ylim(0, 5)
        ax.xaxis.set_ticklabels([])

        if save_plot:
            plt.savefig(f'charts/dendrogram-{i} method', transparent=True, bbox_inches='tight')
            plt.close(fig)

        plt.show()

def clustervar(data):
    '''Compute cluster size variance based on method
    NOTE: This is broken due to standardising dataframes to have time index
    '''
    filter_window = range(51, 252, 50)
    metrics = ['euclidean']
    method = ['single', 'complete', 'centroid', 'ward']
    for i in filter_window:
        filtered_data = posttrim_savgol(data, i)

        for j in method:
            for k in metrics:
                sm = kicluster.cluster_summary(filtered_data, nclusters, j, k)
                print(f"{i}, {j}, {k}, Std: {sm['No. Elements'].std():.2f}")

                # filtered_data.loc[:, sm['Battery position'][0]].plot(legend=False)
                # plt.title(f"window: {i}, metric: {j}, method: {k}")
                # plt.show()


# ========== Input parameters ==========
data = kicluster.cleandata('30sec data.csv')
nclusters = 12   # dendogram cutoff
# ========== ================ ==========

dendrogram(data, p=120)
resamplecomparison(data, factor=20)
clustervar(data)