stumpy-dev
diff --git a/stumpy/mstump.py b/stumpy/mstump.py
Lines changed: 41 additions & 70 deletions
diff --git a/stumpy/mstumped.py b/stumpy/mstumped.py
Lines changed: 32 additions & 46 deletions
diff --git a/stumpy/stump.py b/stumpy/stump.py
Lines changed: 1 addition & 1 deletion
diff --git a/test.sh b/test.sh
Lines changed: 2 additions & 0 deletions
diff --git a/tests/test_mstump.py b/tests/test_mstump.py
Lines changed: 46 additions & 1 deletion
@@ -59,7 +59,7 @@ def _multi_mass(Q, T, m, M_T, Σ_T):
     return D
 
 
-def _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T):
+def _get_first_mstump_profile(start, T_A, T_B, m, excl_zone, M_T, Σ_T):
     """
     Multi-dimensional wrapper to compute the multi-dimensional matrix profile
     and multi-dimensional matrix profile index for a given window within the
@@ -72,10 +72,13 @@ def _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T):
         The window index to calculate the first matrix profile, matrix profile
         index, left matrix profile index, and right matrix profile index for.
 
-    T : ndarray
+    T_A : ndarray
         The time series or sequence for which the matrix profile index will
         be returned
 
+    T_B : ndarray
+        The time series or sequence that contains your query subsequences
+
     m : int
         Window size
 
@@ -98,8 +101,8 @@ def _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T):
         equal to `start`
     """
 
-    d, n = T.shape
-    D = _multi_mass(T[:, start : start + m], T, m, M_T, Σ_T)
+    d, n = T_A.shape
+    D = _multi_mass(T_B[:, start : start + m], T_A, m, M_T, Σ_T)
 
     zone_start = max(0, start - excl_zone)
     zone_stop = min(n - m + 1, start + excl_zone)
@@ -159,22 +162,7 @@ def _get_multi_QT(start, T, m):
 
 @njit(parallel=True, fastmath=True)
 def _mstump(
-    T,
-    m,
-    P,
-    I,
-    D,
-    D_prime,
-    range_stop,
-    excl_zone,
-    M_T,
-    Σ_T,
-    QT,
-    QT_first,
-    μ_Q,
-    σ_Q,
-    k,
-    range_start=1,
+    T, m, range_stop, excl_zone, M_T, Σ_T, QT, QT_first, μ_Q, σ_Q, k, range_start=1
 ):
     """
     A Numba JIT-compiled version of mSTOMP, a variant of mSTAMP, for parallel
@@ -190,18 +178,6 @@ def _mstump(
     m : int
         Window size
 
-    P : ndarray
-        The output multi-dimensional matrix profile
-
-    I : ndarray
-        The output multi-dimensional matrix profile index
-
-    D : ndarray
-        Storage for the distance profile
-
-    D_prime : ndarray
-        Storage for the cumulative sum of the distance profile
-
     range_stop : int
         The index value along T for which to stop the matrix profile
         calculation. This parameter is here for consistency with the
@@ -260,8 +236,12 @@ def _mstump(
     QT_even = QT.copy()
     d = T.shape[0]
 
+    P = np.empty((d, range_stop - range_start))
+    I = np.empty((d, range_stop - range_start))
+    D = np.empty((d, k))
+    D_prime = np.empty(k)
+
     for idx in range(range_start, range_stop):
-        D[:, :] = 0.0
         for i in range(d):
             # Numba's prange requires incrementing a range by 1 so replace
             # `for j in range(k-1,0,-1)` with its incrementing compliment
@@ -311,10 +291,11 @@ def _mstump(
             D_prime = D_prime + D[i]
 
             min_index = np.argmin(D_prime)
-            I[i, idx] = min_index
-            P[i, idx] = D_prime[min_index] / (i + 1)
-            if np.isinf(P[i, idx]):  # pragma nocover
-                I[i, idx] = -1
+            pos = idx - range_start
+            I[i, pos] = min_index
+            P[i, pos] = D_prime[min_index] / (i + 1)
+            if np.isinf(P[i, pos]):  # pragma nocover
+                I[i, pos] = -1
 
     return P, I
 
@@ -359,55 +340,45 @@ def mstump(T, m):
     See mSTAMP Algorithm
     """
 
-    T = np.asarray(core.transpose_dataframe(T))
+    T_A = np.asarray(core.transpose_dataframe(T)).copy()
+    T_B = T_A.copy()
+
+    T_A[np.isinf(T_A)] = np.nan
+    T_B[np.isinf(T_B)] = np.nan
 
-    core.check_dtype(T)
-    core.check_nan(T)
-    if T.ndim <= 1:  # pragma: no cover
-        err = f"T is {T.ndim}-dimensional and must be greater than 1-dimensional"
+    core.check_dtype(T_A)
+    if T_A.ndim <= 1:  # pragma: no cover
+        err = f"T is {T_A.ndim}-dimensional and must be greater than 1-dimensional"
         raise ValueError(f"{err}")
 
     core.check_window_size(m)
 
-    d = T.shape[0]
-    n = T.shape[1]
+    d = T_A.shape[0]
+    n = T_A.shape[1]
     k = n - m + 1
     excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3
 
-    M_T, Σ_T = core.compute_mean_std(T, m)
-    μ_Q, σ_Q = core.compute_mean_std(T, m)
+    M_T, Σ_T = core.compute_mean_std(T_A, m)
+    μ_Q, σ_Q = core.compute_mean_std(T_B, m)
+
+    T_A[np.isnan(T_A)] = 0
 
-    P = np.full((d, k), np.inf, dtype="float64")
-    D = np.zeros((d, k), dtype="float64")
-    D_prime = np.zeros(k, dtype="float64")
-    I = np.ones((d, k), dtype="int64") * -1
+    P = np.empty((d, k), dtype="float64")
+    I = np.empty((d, k), dtype="int64")
 
     start = 0
     stop = k
 
     P[:, start], I[:, start] = _get_first_mstump_profile(
-        start, T, m, excl_zone, M_T, Σ_T
+        start, T_A, T_B, m, excl_zone, M_T, Σ_T
     )
 
-    QT, QT_first = _get_multi_QT(start, T, m)
-
-    _mstump(
-        T,
-        m,
-        P,
-        I,
-        D,
-        D_prime,
-        stop,
-        excl_zone,
-        M_T,
-        Σ_T,
-        QT,
-        QT_first,
-        μ_Q,
-        σ_Q,
-        k,
-        start + 1,
+    T_B[np.isnan(T_B)] = 0
+
+    QT, QT_first = _get_multi_QT(start, T_A, m)
+
+    P[:, start + 1 : stop], I[:, start + 1 : stop] = _mstump(
+        T_A, m, stop, excl_zone, M_T, Σ_T, QT, QT_first, μ_Q, σ_Q, k, start + 1
     )
 
     return P.T, I.T
@@ -58,63 +58,55 @@ def mstumped(dask_client, T, m):
     See mSTAMP Algorithm
     """
 
-    hosts = list(dask_client.ncores().keys())
-    nworkers = len(hosts)
+    T_A = np.asarray(core.transpose_dataframe(T)).copy()
+    T_B = T_A.copy()
 
-    T = np.asarray(core.transpose_dataframe(T))
+    T_A[np.isinf(T_A)] = np.nan
+    T_B[np.isinf(T_B)] = np.nan
 
-    core.check_dtype(T)
-    core.check_nan(T)
-    if T.ndim <= 1:  # pragma: no cover
-        err = f"T is {T.ndim}-dimensional and must be greater than 1-dimensional"
+    core.check_dtype(T_A)
+    if T_A.ndim <= 1:  # pragma: no cover
+        err = f"T is {T_A.ndim}-dimensional and must be greater than 1-dimensional"
         raise ValueError(f"{err}")
 
     core.check_window_size(m)
 
-    d = T.shape[0]
-    n = T.shape[1]
+    d, n = T_A.shape
     k = n - m + 1
     excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3
 
-    M_T, Σ_T = core.compute_mean_std(T, m)
-    μ_Q, σ_Q = core.compute_mean_std(T, m)
+    M_T, Σ_T = core.compute_mean_std(T_A, m)
+    μ_Q, σ_Q = core.compute_mean_std(T_B, m)
+
+    T_A[np.isnan(T_A)] = 0
+
+    P = np.empty((d, k), dtype="float64")
+    I = np.empty((d, k), dtype="int64")
+
+    hosts = list(dask_client.ncores().keys())
+    nworkers = len(hosts)
+
+    step = 1 + k // nworkers
+
+    for i, start in enumerate(range(0, k, step)):
+        P[:, start], I[:, start] = _get_first_mstump_profile(
+            start, T_A, T_B, m, excl_zone, M_T, Σ_T
+        )
 
-    P = np.full((nworkers, d, k), np.inf, dtype="float64")
-    D = np.zeros((nworkers, d, k), dtype="float64")
-    D_prime = np.zeros((nworkers, k), dtype="float64")
-    I = np.ones((nworkers, d, k), dtype="int64") * -1
+    T_B[np.isnan(T_B)] = 0
 
     # Scatter data to Dask cluster
-    T_future = dask_client.scatter(T, broadcast=True)
+    T_A_future = dask_client.scatter(T_A, broadcast=True)
     M_T_future = dask_client.scatter(M_T, broadcast=True)
     Σ_T_future = dask_client.scatter(Σ_T, broadcast=True)
     μ_Q_future = dask_client.scatter(μ_Q, broadcast=True)
     σ_Q_future = dask_client.scatter(σ_Q, broadcast=True)
 
-    step = 1 + k // nworkers
     QT_futures = []
     QT_first_futures = []
-    P_futures = []
-    I_futures = []
-    D_futures = []
-    D_prime_futures = []
 
     for i, start in enumerate(range(0, k, step)):
-        P[i, :, start], I[i, :, start] = _get_first_mstump_profile(
-            start, T, m, excl_zone, M_T, Σ_T
-        )
-
-        P_future = dask_client.scatter(P[i], workers=[hosts[i]])
-        I_future = dask_client.scatter(I[i], workers=[hosts[i]])
-        D_future = dask_client.scatter(D[i], workers=[hosts[i]])
-        D_prime_future = dask_client.scatter(D_prime[i], workers=[hosts[i]])
-
-        P_futures.append(P_future)
-        I_futures.append(I_future)
-        D_futures.append(D_future)
-        D_prime_futures.append(D_prime_future)
-
-        QT, QT_first = _get_multi_QT(start, T, m)
+        QT, QT_first = _get_multi_QT(start, T_A, m)
 
         QT_future = dask_client.scatter(QT, workers=[hosts[i]])
         QT_first_future = dask_client.scatter(QT_first, workers=[hosts[i]])
@@ -129,12 +121,8 @@ def mstumped(dask_client, T, m):
         futures.append(
             dask_client.submit(
                 _mstump,
-                T_future,
+                T_A_future,
                 m,
-                P_futures[i],
-                I_futures[i],
-                D_futures[i],
-                D_prime_futures[i],
                 stop,
                 excl_zone,
                 M_T_future,
@@ -150,9 +138,7 @@ def mstumped(dask_client, T, m):
 
     results = dask_client.gather(futures)
     for i, start in enumerate(range(0, k, step)):
-        P[i], I[i] = results[i]
-        col_mask = P[0] > P[i]
-        P[0, col_mask] = P[i, col_mask]
-        I[0, col_mask] = I[i, col_mask]
+        stop = min(k, start + step)
+        P[:, start + 1 : stop], I[:, start + 1 : stop] = results[i]
 
-    return P[0].T, I[0].T
+    return P.T, I.T
@@ -30,7 +30,7 @@ def _get_first_stump_profile(start, T_A, T_B, m, excl_zone, M_T, Σ_T, ignore_tr
         be returned
 
     T_B : ndarray
-        The time series or sequence that contain your query subsequences
+        The time series or sequence that contains your query subsequences
 
     m : int
         Window size
 
@@ -32,6 +32,8 @@ py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_st
 check_errs $?
 py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_two_subsequences_nan_A_B_join.py tests/test_stumped_two_subsequences_inf_A_B_join.py tests/test_stumped_two_subsequences_nan_inf_A_B_join.py tests/test_stumped_two_subsequences_nan_inf_A_B_join_swap.py
 check_errs $?
+py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mstumped_one_subsequence_nan_self_join.py tests/test_mstumped_one_subsequence_nan_self_join.py
+check_errs $?
 py.test -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_scrimp.py
 check_errs $?
 
 
@@ -26,6 +26,11 @@ def naive_rolling_window_dot_product(Q, T):
     (np.random.uniform(-1000, 1000, [3, 10]).astype(np.float64), 5),
 ]
 
+substitution_locations = [
+    (slice(1, 3), [0, 3])
+]  # [(slice(0, 0), 0, -1, slice(1, 3), [0, 3])]
+substitution_values = [np.nan, np.inf]
+
 
 @pytest.mark.parametrize("T, m", test_data)
 def test_multi_mass(T, m):
@@ -50,7 +55,7 @@ def test_get_first_mstump_profile(T, m):
     left_I = left_I[start, :]
 
     M_T, Σ_T = core.compute_mean_std(T, m)
-    right_P, right_I = _get_first_mstump_profile(start, T, m, excl_zone, M_T, Σ_T)
+    right_P, right_I = _get_first_mstump_profile(start, T, T, m, excl_zone, M_T, Σ_T)
 
     npt.assert_almost_equal(left_P, right_P)
     npt.assert_equal(left_I, right_I)
@@ -138,3 +143,43 @@ def test_constant_subsequence_self_join():
     right_P, right_I = mstump(T, m)
 
     npt.assert_almost_equal(left_P, right_P)  # ignore indices
+
+
+@pytest.mark.parametrize("T, m", test_data)
+@pytest.mark.parametrize("substitute", substitution_values)
+@pytest.mark.parametrize("substitution_locations", substitution_locations)
+def test_mstump_nan_inf_self_join_first_dimension(
+    T, m, substitute, substitution_locations
+):
+    excl_zone = int(np.ceil(m / 4))
+
+    T_sub = T.copy()
+
+    for substitution_location in substitution_locations:
+        T_sub[:] = T[:]
+        T_sub[0, substitution_location] = substitute
+
+        left_P, left_I = utils.naive_mstump(T_sub, m, excl_zone)
+        right_P, right_I = mstump(T_sub, m)
+
+        npt.assert_almost_equal(left_P, right_P)
+        npt.assert_almost_equal(left_I, right_I)
+
+
+@pytest.mark.parametrize("T, m", test_data)
+@pytest.mark.parametrize("substitute", substitution_values)
+@pytest.mark.parametrize("substitution_locations", substitution_locations)
+def test_mstump_nan_self_join_all_dimensions(T, m, substitute, substitution_locations):
+    excl_zone = int(np.ceil(m / 4))
+
+    T_sub = T.copy()
+
+    for substitution_location in substitution_locations:
+        T_sub[:] = T[:]
+        T_sub[:, substitution_location] = substitute
+
+        left_P, left_I = utils.naive_mstump(T_sub, m, excl_zone)
+        right_P, right_I = mstump(T_sub, m)
+
+        npt.assert_almost_equal(left_P, right_P)
+        npt.assert_almost_equal(left_I, right_I)