Skip to content

Commit 514a1c4

Browse files
authored
Merge pull request #142 from mexxexx/fix_mstump_ed_nan_inf
Fixed #117 and #130
2 parents 00ac39d + 45ebd16 commit 514a1c4

File tree

8 files changed

+254
-118
lines changed

8 files changed

+254
-118
lines changed

stumpy/mstump.py

Lines changed: 41 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def _multi_mass(Q, T, m, M_T, Σ_T):
5959
return D
6060

6161

62-
def _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T):
62+
def _get_first_mstump_profile(start, T_A, T_B, m, excl_zone, M_T, Σ_T):
6363
"""
6464
Multi-dimensional wrapper to compute the multi-dimensional matrix profile
6565
and multi-dimensional matrix profile index for a given window within the
@@ -72,10 +72,13 @@ def _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T):
7272
The window index to calculate the first matrix profile, matrix profile
7373
index, left matrix profile index, and right matrix profile index for.
7474
75-
T : ndarray
75+
T_A : ndarray
7676
The time series or sequence for which the matrix profile index will
7777
be returned
7878
79+
T_B : ndarray
80+
The time series or sequence that contains your query subsequences
81+
7982
m : int
8083
Window size
8184
@@ -98,8 +101,8 @@ def _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T):
98101
equal to `start`
99102
"""
100103

101-
d, n = T.shape
102-
D = _multi_mass(T[:, start : start + m], T, m, M_T, Σ_T)
104+
d, n = T_A.shape
105+
D = _multi_mass(T_B[:, start : start + m], T_A, m, M_T, Σ_T)
103106

104107
zone_start = max(0, start - excl_zone)
105108
zone_stop = min(n - m + 1, start + excl_zone)
@@ -159,22 +162,7 @@ def _get_multi_QT(start, T, m):
159162

160163
@njit(parallel=True, fastmath=True)
161164
def _mstump(
162-
T,
163-
m,
164-
P,
165-
I,
166-
D,
167-
D_prime,
168-
range_stop,
169-
excl_zone,
170-
M_T,
171-
Σ_T,
172-
QT,
173-
QT_first,
174-
μ_Q,
175-
σ_Q,
176-
k,
177-
range_start=1,
165+
T, m, range_stop, excl_zone, M_T, Σ_T, QT, QT_first, μ_Q, σ_Q, k, range_start=1
178166
):
179167
"""
180168
A Numba JIT-compiled version of mSTOMP, a variant of mSTAMP, for parallel
@@ -190,18 +178,6 @@ def _mstump(
190178
m : int
191179
Window size
192180
193-
P : ndarray
194-
The output multi-dimensional matrix profile
195-
196-
I : ndarray
197-
The output multi-dimensional matrix profile index
198-
199-
D : ndarray
200-
Storage for the distance profile
201-
202-
D_prime : ndarray
203-
Storage for the cumulative sum of the distance profile
204-
205181
range_stop : int
206182
The index value along T for which to stop the matrix profile
207183
calculation. This parameter is here for consistency with the
@@ -260,8 +236,12 @@ def _mstump(
260236
QT_even = QT.copy()
261237
d = T.shape[0]
262238

239+
P = np.empty((d, range_stop - range_start))
240+
I = np.empty((d, range_stop - range_start))
241+
D = np.empty((d, k))
242+
D_prime = np.empty(k)
243+
263244
for idx in range(range_start, range_stop):
264-
D[:, :] = 0.0
265245
for i in range(d):
266246
# Numba's prange requires incrementing a range by 1 so replace
267247
# `for j in range(k-1,0,-1)` with its incrementing compliment
@@ -311,10 +291,11 @@ def _mstump(
311291
D_prime = D_prime + D[i]
312292

313293
min_index = np.argmin(D_prime)
314-
I[i, idx] = min_index
315-
P[i, idx] = D_prime[min_index] / (i + 1)
316-
if np.isinf(P[i, idx]): # pragma nocover
317-
I[i, idx] = -1
294+
pos = idx - range_start
295+
I[i, pos] = min_index
296+
P[i, pos] = D_prime[min_index] / (i + 1)
297+
if np.isinf(P[i, pos]): # pragma nocover
298+
I[i, pos] = -1
318299

319300
return P, I
320301

@@ -359,55 +340,45 @@ def mstump(T, m):
359340
See mSTAMP Algorithm
360341
"""
361342

362-
T = np.asarray(core.transpose_dataframe(T))
343+
T_A = np.asarray(core.transpose_dataframe(T)).copy()
344+
T_B = T_A.copy()
345+
346+
T_A[np.isinf(T_A)] = np.nan
347+
T_B[np.isinf(T_B)] = np.nan
363348

364-
core.check_dtype(T)
365-
core.check_nan(T)
366-
if T.ndim <= 1: # pragma: no cover
367-
err = f"T is {T.ndim}-dimensional and must be greater than 1-dimensional"
349+
core.check_dtype(T_A)
350+
if T_A.ndim <= 1: # pragma: no cover
351+
err = f"T is {T_A.ndim}-dimensional and must be at least 1-dimensional"
368352
raise ValueError(f"{err}")
369353

370354
core.check_window_size(m)
371355

372-
d = T.shape[0]
373-
n = T.shape[1]
356+
d = T_A.shape[0]
357+
n = T_A.shape[1]
374358
k = n - m + 1
375359
excl_zone = int(np.ceil(m / 4)) # See Definition 3 and Figure 3
376360

377-
M_T, Σ_T = core.compute_mean_std(T, m)
378-
μ_Q, σ_Q = core.compute_mean_std(T, m)
361+
M_T, Σ_T = core.compute_mean_std(T_A, m)
362+
μ_Q, σ_Q = core.compute_mean_std(T_B, m)
363+
364+
T_A[np.isnan(T_A)] = 0
379365

380-
P = np.full((d, k), np.inf, dtype="float64")
381-
D = np.zeros((d, k), dtype="float64")
382-
D_prime = np.zeros(k, dtype="float64")
383-
I = np.ones((d, k), dtype="int64") * -1
366+
P = np.empty((d, k), dtype="float64")
367+
I = np.empty((d, k), dtype="int64")
384368

385369
start = 0
386370
stop = k
387371

388372
P[:, start], I[:, start] = _get_first_mstump_profile(
389-
start, T, m, excl_zone, M_T, Σ_T
373+
start, T_A, T_B, m, excl_zone, M_T, Σ_T
390374
)
391375

392-
QT, QT_first = _get_multi_QT(start, T, m)
393-
394-
_mstump(
395-
T,
396-
m,
397-
P,
398-
I,
399-
D,
400-
D_prime,
401-
stop,
402-
excl_zone,
403-
M_T,
404-
Σ_T,
405-
QT,
406-
QT_first,
407-
μ_Q,
408-
σ_Q,
409-
k,
410-
start + 1,
376+
T_B[np.isnan(T_B)] = 0
377+
378+
QT, QT_first = _get_multi_QT(start, T_A, m)
379+
380+
P[:, start + 1 : stop], I[:, start + 1 : stop] = _mstump(
381+
T_A, m, stop, excl_zone, M_T, Σ_T, QT, QT_first, μ_Q, σ_Q, k, start + 1
411382
)
412383

413384
return P.T, I.T

stumpy/mstumped.py

Lines changed: 32 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -58,63 +58,55 @@ def mstumped(dask_client, T, m):
5858
See mSTAMP Algorithm
5959
"""
6060

61-
hosts = list(dask_client.ncores().keys())
62-
nworkers = len(hosts)
61+
T_A = np.asarray(core.transpose_dataframe(T)).copy()
62+
T_B = T_A.copy()
6363

64-
T = np.asarray(core.transpose_dataframe(T))
64+
T_A[np.isinf(T_A)] = np.nan
65+
T_B[np.isinf(T_B)] = np.nan
6566

66-
core.check_dtype(T)
67-
core.check_nan(T)
68-
if T.ndim <= 1: # pragma: no cover
69-
err = f"T is {T.ndim}-dimensional and must be greater than 1-dimensional"
67+
core.check_dtype(T_A)
68+
if T_A.ndim <= 1: # pragma: no cover
69+
err = f"T is {T_A.ndim}-dimensional and must be at least 1-dimensional"
7070
raise ValueError(f"{err}")
7171

7272
core.check_window_size(m)
7373

74-
d = T.shape[0]
75-
n = T.shape[1]
74+
d, n = T_A.shape
7675
k = n - m + 1
7776
excl_zone = int(np.ceil(m / 4)) # See Definition 3 and Figure 3
7877

79-
M_T, Σ_T = core.compute_mean_std(T, m)
80-
μ_Q, σ_Q = core.compute_mean_std(T, m)
78+
M_T, Σ_T = core.compute_mean_std(T_A, m)
79+
μ_Q, σ_Q = core.compute_mean_std(T_B, m)
80+
81+
T_A[np.isnan(T_A)] = 0
82+
83+
P = np.empty((d, k), dtype="float64")
84+
I = np.empty((d, k), dtype="int64")
85+
86+
hosts = list(dask_client.ncores().keys())
87+
nworkers = len(hosts)
88+
89+
step = 1 + k // nworkers
90+
91+
for i, start in enumerate(range(0, k, step)):
92+
P[:, start], I[:, start] = _get_first_mstump_profile(
93+
start, T_A, T_B, m, excl_zone, M_T, Σ_T
94+
)
8195

82-
P = np.full((nworkers, d, k), np.inf, dtype="float64")
83-
D = np.zeros((nworkers, d, k), dtype="float64")
84-
D_prime = np.zeros((nworkers, k), dtype="float64")
85-
I = np.ones((nworkers, d, k), dtype="int64") * -1
96+
T_B[np.isnan(T_B)] = 0
8697

8798
# Scatter data to Dask cluster
88-
T_future = dask_client.scatter(T, broadcast=True)
99+
T_A_future = dask_client.scatter(T_A, broadcast=True)
89100
M_T_future = dask_client.scatter(M_T, broadcast=True)
90101
Σ_T_future = dask_client.scatter(Σ_T, broadcast=True)
91102
μ_Q_future = dask_client.scatter(μ_Q, broadcast=True)
92103
σ_Q_future = dask_client.scatter(σ_Q, broadcast=True)
93104

94-
step = 1 + k // nworkers
95105
QT_futures = []
96106
QT_first_futures = []
97-
P_futures = []
98-
I_futures = []
99-
D_futures = []
100-
D_prime_futures = []
101107

102108
for i, start in enumerate(range(0, k, step)):
103-
P[i, :, start], I[i, :, start] = _get_first_mstump_profile(
104-
start, T, m, excl_zone, M_T, Σ_T
105-
)
106-
107-
P_future = dask_client.scatter(P[i], workers=[hosts[i]])
108-
I_future = dask_client.scatter(I[i], workers=[hosts[i]])
109-
D_future = dask_client.scatter(D[i], workers=[hosts[i]])
110-
D_prime_future = dask_client.scatter(D_prime[i], workers=[hosts[i]])
111-
112-
P_futures.append(P_future)
113-
I_futures.append(I_future)
114-
D_futures.append(D_future)
115-
D_prime_futures.append(D_prime_future)
116-
117-
QT, QT_first = _get_multi_QT(start, T, m)
109+
QT, QT_first = _get_multi_QT(start, T_A, m)
118110

119111
QT_future = dask_client.scatter(QT, workers=[hosts[i]])
120112
QT_first_future = dask_client.scatter(QT_first, workers=[hosts[i]])
@@ -129,12 +121,8 @@ def mstumped(dask_client, T, m):
129121
futures.append(
130122
dask_client.submit(
131123
_mstump,
132-
T_future,
124+
T_A_future,
133125
m,
134-
P_futures[i],
135-
I_futures[i],
136-
D_futures[i],
137-
D_prime_futures[i],
138126
stop,
139127
excl_zone,
140128
M_T_future,
@@ -150,9 +138,7 @@ def mstumped(dask_client, T, m):
150138

151139
results = dask_client.gather(futures)
152140
for i, start in enumerate(range(0, k, step)):
153-
P[i], I[i] = results[i]
154-
col_mask = P[0] > P[i]
155-
P[0, col_mask] = P[i, col_mask]
156-
I[0, col_mask] = I[i, col_mask]
141+
stop = min(k, start + step)
142+
P[:, start + 1 : stop], I[:, start + 1 : stop] = results[i]
157143

158-
return P[0].T, I[0].T
144+
return P.T, I.T

stumpy/stump.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def _get_first_stump_profile(start, T_A, T_B, m, excl_zone, M_T, Σ_T, ignore_tr
3030
be returned
3131
3232
T_B : ndarray
33-
The time series or sequence that contain your query subsequences
33+
The time series or sequence that contains your query subsequences
3434
3535
m : int
3636
Window size

test.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_st
3232
check_errs $?
3333
py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_two_subsequences_nan_A_B_join.py tests/test_stumped_two_subsequences_inf_A_B_join.py tests/test_stumped_two_subsequences_nan_inf_A_B_join.py tests/test_stumped_two_subsequences_nan_inf_A_B_join_swap.py
3434
check_errs $?
35+
py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mstumped_one_subsequence_nan_self_join.py tests/test_mstumped_one_subsequence_nan_self_join.py
36+
check_errs $?
3537
py.test -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_scrimp.py
3638
check_errs $?
3739

tests/test_mstump.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ def naive_rolling_window_dot_product(Q, T):
2626
(np.random.uniform(-1000, 1000, [3, 10]).astype(np.float64), 5),
2727
]
2828

29+
substitution_locations = [
30+
(slice(1, 3), [0, 3])
31+
] # [(slice(0, 0), 0, -1, slice(1, 3), [0, 3])]
32+
substitution_values = [np.nan, np.inf]
33+
2934

3035
@pytest.mark.parametrize("T, m", test_data)
3136
def test_multi_mass(T, m):
@@ -50,7 +55,7 @@ def test_get_first_mstump_profile(T, m):
5055
left_I = left_I[start, :]
5156

5257
M_T, Σ_T = core.compute_mean_std(T, m)
53-
right_P, right_I = _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T)
58+
right_P, right_I = _get_first_mstump_profile(start, T, T, m, excl_zone, M_T, Σ_T)
5459

5560
npt.assert_almost_equal(left_P, right_P)
5661
npt.assert_equal(left_I, right_I)
@@ -138,3 +143,43 @@ def test_constant_subsequence_self_join():
138143
right_P, right_I = mstump(T, m)
139144

140145
npt.assert_almost_equal(left_P, right_P) # ignore indices
146+
147+
148+
@pytest.mark.parametrize("T, m", test_data)
149+
@pytest.mark.parametrize("substitute", substitution_values)
150+
@pytest.mark.parametrize("substitution_locations", substitution_locations)
151+
def test_mstump_nan_inf_self_join_first_dimension(
152+
T, m, substitute, substitution_locations
153+
):
154+
excl_zone = int(np.ceil(m / 4))
155+
156+
T_sub = T.copy()
157+
158+
for substitution_location in substitution_locations:
159+
T_sub[:] = T[:]
160+
T_sub[0, substitution_location] = substitute
161+
162+
left_P, left_I = utils.naive_mstump(T_sub, m, excl_zone)
163+
right_P, right_I = mstump(T_sub, m)
164+
165+
npt.assert_almost_equal(left_P, right_P)
166+
npt.assert_almost_equal(left_I, right_I)
167+
168+
169+
@pytest.mark.parametrize("T, m", test_data)
170+
@pytest.mark.parametrize("substitute", substitution_values)
171+
@pytest.mark.parametrize("substitution_locations", substitution_locations)
172+
def test_mstump_nan_self_join_all_dimensions(T, m, substitute, substitution_locations):
173+
excl_zone = int(np.ceil(m / 4))
174+
175+
T_sub = T.copy()
176+
177+
for substitution_location in substitution_locations:
178+
T_sub[:] = T[:]
179+
T_sub[:, substitution_location] = substitute
180+
181+
left_P, left_I = utils.naive_mstump(T_sub, m, excl_zone)
182+
right_P, right_I = mstump(T_sub, m)
183+
184+
npt.assert_almost_equal(left_P, right_P)
185+
npt.assert_almost_equal(left_I, right_I)

0 commit comments

Comments
 (0)