Optimize colorize using matmul and inplace operations#1437
Optimize colorize using matmul and in-place operations #1437 — philippjfr wants to merge 11 commits into main from
Conversation
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## main #1437 +/- ##
==========================================
+ Coverage 88.40% 88.42% +0.01%
==========================================
Files 97 97
Lines 19240 19259 +19
==========================================
+ Hits 17009 17029 +20
+ Misses 2231 2230 -1 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
| color_data -= baseline | ||
| np.subtract(color_data, baseline, out=color_data, where=color_mask) |
There was a problem hiding this comment.
Wasn't this already in place? Is it to avoid calculation along the color_mask axis?
| cd2 = color_data.reshape(-1, C) | ||
| rgb_sum = (cd2 @ RGB).reshape(H, W, 3) # weighted sums for r,g,b |
There was a problem hiding this comment.
Have you tried to play around with Einstein notation or np.tensordot for this? AI generated benchmark:
import numpy as np
import timeit
def benchmark(H, W, C):
    """Compare four equivalent ways of contracting an (H, W, C) array
    with a (C, 3) matrix, verify they agree, and print their timings."""
    color_data = np.random.rand(H, W, C)
    RGB = np.random.rand(C, 3)

    def method_matmul():
        # Collapse the spatial axes, multiply, then restore the shape.
        return (color_data.reshape(-1, C) @ RGB).reshape(H, W, 3)

    def method_einsum():
        return np.einsum('hwc,cj->hwj', color_data, RGB)

    def method_einsum_opt():
        return np.einsum('hwc,cj->hwj', color_data, RGB, optimize=True)

    def method_tensordot():
        # Contract the trailing channel axis against RGB's leading axis.
        return np.tensordot(color_data, RGB, axes=([2], [0]))  # shape: (H, W, 3)

    # All four formulations must agree before any timing is meaningful.
    reference = method_matmul()
    for candidate in (method_einsum, method_einsum_opt, method_tensordot):
        assert np.allclose(reference, candidate())

    # Time each formulation over 10 repetitions.
    time_matmul = timeit.timeit(method_matmul, number=10)
    time_einsum = timeit.timeit(method_einsum, number=10)
    time_einsum_opt = timeit.timeit(method_einsum_opt, number=10)
    time_tensordot = timeit.timeit(method_tensordot, number=10)

    print(f"H={H}, W={W}, C={C}")
    print(f"reshape+matmul: {time_matmul:.4f} s")
    print(f"einsum: {time_einsum:.4f} s")
    print(f"einsum (optimize=True): {time_einsum_opt:.4f} s")
    print(f"tensordot: {time_tensordot:.4f} s")
    print("-" * 50)
# Test different shapes
benchmark(256, 256, 64)
benchmark(512, 512, 64)
benchmark(256, 256, 256)
benchmark(128, 128, 1024)

H=256, W=256, C=64
reshape+matmul: 0.0284 s
einsum: 0.1792 s
einsum (optimize=True): 0.0188 s
tensordot: 0.0280 s
--------------------------------------------------
H=512, W=512, C=64
reshape+matmul: 0.1330 s
einsum: 0.5891 s
einsum (optimize=True): 0.0341 s
tensordot: 0.1112 s
--------------------------------------------------
H=256, W=256, C=256
reshape+matmul: 0.1042 s
einsum: 0.6018 s
einsum (optimize=True): 0.0419 s
tensordot: 0.1038 s
--------------------------------------------------
H=128, W=128, C=1024
reshape+matmul: 0.0713 s
einsum: 0.6043 s
einsum (optimize=True): 0.0497 s
tensordot: 0.0712 s
--------------------------------------------------
There was a problem hiding this comment.
Thanks, not 100% sure what to take from that though I will state that in most cases C << 100.
There was a problem hiding this comment.
That's why I said it was AI-generated. I assume you had a more fully fledged way to actually measure your performance. This was more to show that there was a tiny bit of performance that could be gained here.
|
|
||
| # Replace NaNs with 0s for dot/matmul in one pass (in-place) | ||
| # If you don't want to mutate color_data contents further, copy first. | ||
| np.nan_to_num(color_data, copy=False) # NaN -> 0 |
There was a problem hiding this comment.
Can't we use an existing mask for this? I would assume nan_to_num would do this calculation behind the scenes.
Also note that this will convert inf values to floats.
Merging this PR will improve performance by 44.05%
Performance Changes
Comparing |
|
Current status: current = 6b0982b, before = fb2c16e, main = 184ef3c Benchmark code
import time
import numpy as np
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf
N = int(10e6)  # number of random points to aggregate (10 million)
C = 20  # default number of distinct categories in the test data
class Profile:
    """Context manager that runs ``cProfile`` over its body and dumps
    the collected stats to ``output_file`` on exit."""

    def __init__(self, output_file):
        # Imported lazily so merely defining this helper costs nothing.
        import cProfile
        self.output_file = output_file
        self.profiler = cProfile.Profile()

    def __enter__(self):
        self.profiler.enable()
        return self.profiler

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Stop profiling and persist the stats; exceptions propagate.
        self.profiler.disable()
        self.profiler.dump_stats(self.output_file)
class LineProfileContext:
    """Context manager around the third-party ``line_profiler`` that
    records line-by-line timings for registered functions, then dumps
    and prints the stats on exit."""

    def __init__(self, output_file):
        # Third-party dependency, imported lazily on construction.
        from line_profiler import LineProfiler
        self.output_file = output_file
        self.profiler = LineProfiler()
        self.functions_to_profile = []

    def add_function(self, func):
        """Add a function to be profiled line-by-line"""
        self.profiler.add_function(func)
        self.functions_to_profile.append(func)
        # Returned unchanged so this can double as a decorator.
        return func

    def __enter__(self):
        self.profiler.enable_by_count()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.profiler.disable_by_count()
        self.profiler.dump_stats(self.output_file)
        self.profiler.print_stats()
def gen_data(N=N, C=C):
    """Build a DataFrame of N random (x, y) points with a categorical
    column drawn from the first C uppercase letters.

    Seeded so every call produces identical data for fair benchmarking.
    """
    np.random.seed(1)
    points = np.random.randn(int(N), 2)
    labels = [chr(65 + i) for i in range(C)]
    # Keep the RNG call order (randn first, then choice) deterministic.
    cats = np.random.choice(labels, size=N)
    frame = pd.DataFrame(points, columns=["x", "y"])
    frame["c"] = pd.Series(cats).astype("category")
    return frame
def profile(df, size=1000):
    """Aggregate ``df`` on a ``size`` x ``size`` canvas, then time a
    single ``tf.shade`` call and return the elapsed seconds.

    Parameters
    ----------
    df : DataFrame with "x", "y" and categorical "c" columns.
    size : canvas width/height in pixels.
    """
    W = H = size
    cvs = ds.Canvas(plot_width=W, plot_height=H)
    agg = cvs.points(df, x="x", y="y", agg=ds.count_cat("c"))
    tf.shade(agg)  # warm up
    pre = time.monotonic()
    # BUG FIX: the timed region was previously empty — every shade call
    # between pre and the return was commented out, so the function
    # always measured ~0 seconds. Time one shade call explicitly.
    tf.shade(agg)
    # Optional deeper profiling of the same call:
    # with LineProfileContext("line_profile.lprof") as line_profiler:
    #     line_profiler.add_function(tf._colorize)
    #     tf.shade(agg)
    # with Profile(output_file="optional.perf"):
    #     ds.transfer_functions.shade(agg)
    return time.monotonic() - pre
# Warm the pipeline once at the largest size so the timings below are stable.
df = gen_data(C=20)
profile(df, size=5000)

# Sweep category counts and canvas sizes, collecting (c, size, seconds).
results = []
for c in (1, 5, 10, 20):
    df = gen_data(C=c)
    for s in range(1000, 6000, 1000):
        timing = profile(df, size=s)
        results.append((c, s, timing))
        print(f"{c=}, {s=}, {timing=}")

Plotting
current = [ # 6b0982b
dict(c=1, s=1000, timing=0.07571580299918423),
dict(c=1, s=2000, timing=0.295644159999938),
dict(c=1, s=3000, timing=0.6464670440000191),
dict(c=1, s=4000, timing=1.1230143669999961),
dict(c=1, s=5000, timing=1.7188509200004773),
dict(c=5, s=1000, timing=0.11476161499922455),
dict(c=5, s=2000, timing=0.44561883799906354),
dict(c=5, s=3000, timing=0.9790756620004686),
dict(c=5, s=4000, timing=1.7118233849996614),
dict(c=5, s=5000, timing=2.6856131889999233),
dict(c=10, s=1000, timing=0.14612587800002075),
dict(c=10, s=2000, timing=0.5675542549997772),
dict(c=10, s=3000, timing=1.2379318599996623),
dict(c=10, s=4000, timing=2.2251677369986282),
dict(c=10, s=5000, timing=3.397321854999973),
dict(c=20, s=1000, timing=0.1993868179997662),
dict(c=20, s=2000, timing=0.8214870430001611),
dict(c=20, s=3000, timing=1.7614306820014463),
dict(c=20, s=4000, timing=3.0943053329992836),
dict(c=20, s=5000, timing=4.7508491489988955),
]
before = [ # fb2c16e
dict(c=1, s=1000, timing=0.07645769699956873),
dict(c=1, s=2000, timing=0.3170905290007795),
dict(c=1, s=3000, timing=0.7142776969994884),
dict(c=1, s=4000, timing=1.2551025209995714),
dict(c=1, s=5000, timing=1.9599227520011482),
dict(c=5, s=1000, timing=0.16230177999932494),
dict(c=5, s=2000, timing=0.5520949959991412),
dict(c=5, s=3000, timing=1.2177506650004943),
dict(c=5, s=4000, timing=2.171157504999428),
dict(c=5, s=5000, timing=3.3560801679996075),
dict(c=10, s=1000, timing=0.2009295749994635),
dict(c=10, s=2000, timing=0.7160231019988714),
dict(c=10, s=3000, timing=1.6094946339999296),
dict(c=10, s=4000, timing=2.7828460880009516),
dict(c=10, s=5000, timing=4.274540911001168),
dict(c=20, s=1000, timing=0.2542700350004452),
dict(c=20, s=2000, timing=0.9284682460001932),
dict(c=20, s=3000, timing=2.0608999519990903),
dict(c=20, s=4000, timing=3.6744658019997587),
dict(c=20, s=5000, timing=5.747611536000477),
]
main = [ # 184ef3c
dict(c=1, s=1000, timing=0.0718935530003364),
dict(c=1, s=2000, timing=0.31208833799973945),
dict(c=1, s=3000, timing=0.7055044320004527),
dict(c=1, s=4000, timing=1.2214937410008133),
dict(c=1, s=5000, timing=1.899291293000715),
dict(c=5, s=1000, timing=0.1668667740013916),
dict(c=5, s=2000, timing=0.6655240790005337),
dict(c=5, s=3000, timing=1.5014597809986299),
dict(c=5, s=4000, timing=2.5980365989998973),
dict(c=5, s=5000, timing=4.086923677999948),
dict(c=10, s=1000, timing=0.2160664650000399),
dict(c=10, s=2000, timing=0.8515692499986471),
dict(c=10, s=3000, timing=1.879354708999017),
dict(c=10, s=4000, timing=3.3537094929997693),
dict(c=10, s=5000, timing=5.179373872999349),
dict(c=20, s=1000, timing=0.3006982959996094),
dict(c=20, s=2000, timing=1.1935125410000182),
dict(c=20, s=3000, timing=2.72798394000165),
dict(c=20, s=4000, timing=4.852396742000565),
dict(c=20, s=5000, timing=7.331704080999771),
]
import hvplot.pandas
import pandas as pd
# Plot each recorded result set as grouped bars of timing vs. canvas size.
# NOTE(review): eval(x) resolves the list name ("current"/"before"/"main")
# from the enclosing scope — acceptable here because the inputs are fixed
# string literals, never untrusted data.
fn = (
lambda x: pd.DataFrame(eval(x))
.hvplot.bar(x="s", y="timing", by="c", title=x)
.opts(show_grid=True)
)
(fn("current") + fn("before") + fn("main")).cols(1)
|
7e401cd to
6b0982b
Compare
There was a problem hiding this comment.
Pull request overview
This PR optimizes the colorize operation in the shade function by replacing multiple dot product operations with a single einsum matrix multiplication and using in-place operations to minimize temporary array allocations. The optimization aims to improve performance by approximately 10%, with potential for greater improvements on systems with MKL support.
Changes:
- Replaced multiple `.dot()` operations with an optimized `einsum` matrix multiplication with path caching
- Implemented in-place operations for baseline subtraction and NaN replacement to reduce temporary array creation
- Replaced the `nansum_missing` utility function with direct NumPy operations
- Added a module-level cache for einsum optimization paths
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| total = np.sum(data, axis=2) | ||
| mask = np.any(nan_mask, axis=2) | ||
| a = _interpolate_alpha(data, total, mask, how, alpha, span, min_alpha, rescale_discrete_levels) | ||
|
|
There was a problem hiding this comment.
The dstack operation expects rgb_array to have shape (H, W, 3) and 'a' to have shape (H, W). However, if rgb_array was produced from CuPy operations and 'a' from NumPy operations (or vice versa), this will fail. Additionally, the subsequent view operation requires a NumPy array. The code should ensure consistent array types before the dstack operation.
| # Ensure rgb_array and alpha channel are NumPy arrays before stacking and viewing | |
| if cupy: | |
| if isinstance(rgb_array, cupy.ndarray): | |
| rgb_array = cupy.asnumpy(rgb_array) | |
| if isinstance(a, cupy.ndarray): | |
| a = cupy.asnumpy(a) |
| _EINSUM_PATH_CACHE = {} | ||
|
|
There was a problem hiding this comment.
The module-level cache dictionary _EINSUM_PATH_CACHE will grow unbounded as different array shapes are processed. This could lead to memory leaks in long-running applications that process many different image sizes. Consider adding a maximum size limit using functools.lru_cache or a similar mechanism, or documenting that users should periodically clear this cache in long-running applications.
| _EINSUM_PATH_CACHE = {} | |
| class _BoundedCacheDict(dict): | |
| """ | |
| A simple bounded-size dictionary for caching. | |
| When inserting a new key and the cache is at max_size, an existing | |
| arbitrary key is evicted to keep the cache size bounded. | |
| """ | |
| def __init__(self, *args, max_size: int = 128, **kwargs): | |
| self.max_size = max_size | |
| super().__init__(*args, **kwargs) | |
| def __setitem__(self, key, value): | |
| # Only enforce the limit when adding a new key | |
| if key not in self and self.max_size is not None and self.max_size > 0: | |
| if len(self) >= self.max_size: | |
| # Evict an arbitrary item to keep the cache bounded | |
| try: | |
| oldest_key = next(iter(self)) | |
| except StopIteration: | |
| oldest_key = None | |
| if oldest_key is not None: | |
| self.pop(oldest_key, None) | |
| super().__setitem__(key, value) | |
| _EINSUM_PATH_CACHE_MAX_SIZE = 128 | |
| _EINSUM_PATH_CACHE = _BoundedCacheDict(max_size=_EINSUM_PATH_CACHE_MAX_SIZE) |
There was a problem hiding this comment.
The comment seems like it doesn't apply given that this is already a "Bounded" cache?
| if data is color_data: | ||
| color_data = color_data.copy() | ||
|
|
||
| nan_mask = np.isnan(data) |
There was a problem hiding this comment.
Creating nan_mask using np.isnan(data) before line 393 converts data to NumPy could fail if data is a CuPy array. If the intent is to support CuPy arrays, use the appropriate array module (xp) here: nan_mask = xp.isnan(data). Alternatively, if the intent is to only support NumPy arrays after line 391, this should be documented.
| nan_mask = np.isnan(data) | |
| nan_mask = np.isnan(color_data) |
| if cupy and isinstance(values, cupy.ndarray): | ||
| # Convert cupy array to numpy for final image | ||
| values = cupy.asnumpy(values) |
There was a problem hiding this comment.
This CuPy check is unreachable code. At this point in the function, values has been created from rgba_array.view(np.uint32) where rgba_array comes from np.dstack, so it's always a NumPy array. Since the function now converts all data to NumPy arrays early (line 393), this check will never be True and should be removed.
| if cupy and isinstance(values, cupy.ndarray): | |
| # Convert cupy array to numpy for final image | |
| values = cupy.asnumpy(values) |
| color_data = np.ascontiguousarray(data) | ||
| if data is color_data: | ||
| color_data = color_data.copy() |
There was a problem hiding this comment.
Converting data to a NumPy array unconditionally may break CuPy support. The original code used data.copy() which would preserve the array type (NumPy or CuPy). Using np.ascontiguousarray(data) will always return a NumPy array, even if data is a CuPy array. Consider using the appropriate array module (xp) instead: xp.ascontiguousarray(data) or check the array type and handle accordingly.
|
|
||
| # zero-count pixels will be 0/0, but it's safe to ignore that when dividing | ||
| # --- Optimized matrix multiplication using einsum with path caching --- | ||
| RGB = np.stack([rs, gs, bs], axis=1) # (C,3) |
There was a problem hiding this comment.
The RGB array is created using np.stack but rs, gs, bs may be CuPy arrays if xp was set to cupy (line 385). This will cause a type mismatch. The code should use the same array module (xp) consistently throughout, or explicitly convert to NumPy arrays if that's the intent. Consider using xp.stack([rs, gs, bs], axis=1) or converting rs, gs, bs to NumPy arrays explicitly if all subsequent operations will be on NumPy.
| RGB = np.stack([rs, gs, bs], axis=1) # (C,3) | |
| RGB = np.stack([np.asarray(rs), np.asarray(gs), np.asarray(bs)], axis=1) # (C,3) |
| # total = nansum_missing(data, axis=2) | ||
| # mask = np.isnan(total) | ||
| total = np.sum(data, axis=2) | ||
| mask = np.any(nan_mask, axis=2) |
There was a problem hiding this comment.
The mask computation has changed behavior. The original code mask = np.isnan(nansum_missing(data, axis=2)) would set mask to True only when ALL categories are NaN for a pixel (since nansum_missing returns NaN only for all-NaN slices). The new code mask = np.any(nan_mask, axis=2) sets mask to True if ANY category is NaN for a pixel. This is a significant behavior change that will affect alpha computation in _interpolate_alpha. The mask should represent pixels where all values are missing, not pixels where any value is missing. Consider using mask = np.all(nan_mask, axis=2) instead.
| mask = np.any(nan_mask, axis=2) | |
| mask = np.all(nan_mask, axis=2) |
There was a problem hiding this comment.
See above; sounds like a legit complaint? And surely if so something that the test suite should catch?
| # --- Optimized matrix multiplication using einsum with path caching --- | ||
| RGB = np.stack([rs, gs, bs], axis=1) # (C,3) | ||
|
|
||
| # Calculate color_data |
There was a problem hiding this comment.
The comment says "Calculate color_data" but the actual operation is calculating the einsum path for optimization and caching it. This comment is misleading. Consider updating it to "Cache einsum path for optimization" or similar.
| # Calculate color_data | |
| # Cache einsum path for optimization |
69b8940 to
2c78c6d
Compare


As the title says, this attempts to optimize the colorize part of the shade operation by avoiding temporary copies and performing a single matmul operation rather than multiple dot operations. In my testing this is about a 10% speedup. My guess is that this could result in even better performance for systems with MKL support.