Skip to content

Commit 728d84a

Browse files
committed
feat: implement DEFAULT_FUSED_MAPPINGS refactor and fix post-rebase issues
Per @kylesayrs review:
- DEFAULT_FUSED_MAPPINGS refactored to {primary_pattern: [partner_templates]} so only the primary-owning shard fetches its partners, preventing double reads for cross-shard fused weight sets
- build_inverse_weights_map uses re.match on primary patterns with named group substitution to construct partner names exactly as Kyle suggested
- process_file_microscale_scheme: remove assert on unmatched fused sets — non-primary shards legitimately have k/v without q since only the primary shard fetches partners

Post-rebase fixes:
- __init__.py / save_utils.py: fix import path for local dev compatibility (compressed_tensors.utils.safetensors_load instead of compressed_tensors.entrypoints.convert.file_utils)
- microscale.py: fix line too long in DEFAULT_FUSED_MAPPINGS

Signed-off-by: David Zheng <dqzheng1996@gmail.com>
1 parent 1792bb7 commit 728d84a

File tree

4 files changed

+97
-100
lines changed

4 files changed

+97
-100
lines changed

src/llmcompressor/entrypoints/model_free/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,13 @@
99
Converter,
1010
exec_jobs,
1111
)
12-
from compressed_tensors.entrypoints.convert.file_utils import (
12+
from compressed_tensors.quantization import QuantizationScheme
13+
from compressed_tensors.utils.safetensors_load import (
1314
get_checkpoint_files,
1415
is_weights_file,
1516
)
16-
from compressed_tensors.quantization import QuantizationScheme
1717
from loguru import logger
1818

19-
20-
2119
from llmcompressor.entrypoints.model_free.helpers import (
2220
find_safetensors_index_file,
2321
gpu_if_available,
Lines changed: 93 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,51 @@
1+
import re
12
from collections import defaultdict
3+
24
from compressed_tensors.quantization import QuantizationScheme, QuantizationStrategy
35

46
from llmcompressor.entrypoints.model_free.helpers import (
57
MatchedNamesSet,
68
match_names_set_eager,
79
)
810

9-
10-
def match_name(name: str, pattern: str) -> bool:
11-
"""Pattern matching for tensor names. Handles 're:' prefix for regex patterns."""
12-
import re
13-
if pattern.startswith('re:'):
14-
# Regex pattern - strip 're:' prefix and match
15-
regex = pattern[3:]
16-
return re.match(regex, name) is not None
17-
else:
18-
# Glob-style pattern
19-
import fnmatch
20-
return fnmatch.fnmatch(name, pattern)
21-
22-
23-
24-
def build_inverse_weights_map(
25-
shard_name: str,
26-
weight_map: dict[str, str],
27-
model_files: dict[str, str],
28-
) -> dict[str, list[str]]:
29-
"""
30-
For a given output shard, precompute exactly which tensors need to be
31-
loaded from which source files — including fused partner tensors that
32-
live in other shards.
33-
34-
This moves fused partner discovery out of the per-process runtime and
35-
into the job-building phase, avoiding redundant re-discovery and enabling
36-
cleaner process function signatures.
37-
38-
For example, given:
39-
shard0: [q_proj.weight, ...]
40-
shard1: [k_proj.weight, v_proj.weight, ...]
41-
42-
The inverse_weights_map for shard0's job would be:
43-
{
44-
"/path/to/shard0.safetensors": ["q_proj.weight", ...],
45-
"/path/to/shard1.safetensors": ["k_proj.weight", "v_proj.weight"],
46-
}
47-
48-
:param shard_name: the shard filename this job will process and save
49-
:param weight_map: mapping of tensor name -> shard filename (from index.json)
50-
:param model_files: mapping of shard filename -> resolved absolute path
51-
:return: dict mapping resolved source file path -> list of tensor names to load
52-
"""
53-
# These are now module-level since function is in microscale.py
54-
# DEFAULT_FUSED_MAPPINGS and get_fused_names are available at module scope
55-
56-
# Tensors natively belonging to this shard
57-
native_tensors = [t for t, s in weight_map.items() if s == shard_name]
58-
59-
# Check if all fused sets are already complete within this shard
60-
_, unmatched_sets = get_fused_names(native_tensors)
61-
62-
# Start with native tensors grouped by their source file
63-
result: dict[str, list[str]] = defaultdict(list)
64-
own_resolved = model_files[shard_name]
65-
result[own_resolved] = list(native_tensors)
66-
67-
if not unmatched_sets:
68-
return dict(result)
69-
70-
# For each unmatched fused set, find partner tensors in other shards
71-
all_patterns = [p for mapping in DEFAULT_FUSED_MAPPINGS for p in mapping]
72-
73-
for unmatched in unmatched_sets:
74-
present_names = {v for v in unmatched.values() if v is not None}
75-
layer_prefixes = {name.rsplit(".", 2)[0] for name in present_names}
76-
77-
for tensor_name, tensor_shard in weight_map.items():
78-
if tensor_shard == shard_name:
79-
continue # already in native tensors
80-
resolved = model_files.get(tensor_shard)
81-
if resolved is None:
82-
continue
83-
candidate_prefix = tensor_name.rsplit(".", 2)[0]
84-
if candidate_prefix not in layer_prefixes:
85-
continue
86-
if any(match_name(tensor_name, p) for p in all_patterns):
87-
if tensor_name not in result[resolved]:
88-
result[resolved].append(tensor_name)
89-
90-
return dict(result)
91-
92-
93-
9411
__all__ = [
95-
'build_inverse_weights_map',
96-
'is_microscale_scheme',
97-
'get_fused_names',
98-
'DEFAULT_FUSED_MAPPINGS',
12+
"build_inverse_weights_map",
13+
"is_microscale_scheme",
14+
"get_fused_names",
15+
"DEFAULT_FUSED_MAPPINGS",
9916
]
10017

18+
# Mapping of primary weight pattern -> list of partner weight patterns.
# The shard that owns the primary tensor is the one responsible for fetching
# its partners, so each fused set is read exactly once even when it spans
# multiple shards (e.g. q_proj's owner fetches k_proj and v_proj).
#
# Each primary pattern captures named groups such as (?P<prefix>...) and
# (?P<attn>...), so a partner's full tensor name can be rebuilt by
# substituting the matched groups into the template:
#     partner.format(**match.groupdict())
DEFAULT_FUSED_MAPPINGS: dict[str, list[str]] = {
    # Attention q/k/v fusion: q_proj is the primary
    r"^(?P<prefix>.+?)\.(?P<attn>attn|attention|self_attn|self_attention)"
    r"\.q_proj\.weight$": [
        r"{prefix}.{attn}.k_proj.weight",
        r"{prefix}.{attn}.v_proj.weight",
    ],
    # MLA attention fusion: wq_a is the primary
    r"^(?P<prefix>.+?)\.(?P<attn>attn|attention|self_attn)\.wq_a\.weight$": [
        r"{prefix}.{attn}.wkv_a_with_mqa.weight",
    ],
    # MLP gate/up fusion: gate_proj is the primary
    r"^(?P<prefix>.+?)\.(?P<mlp>mlp|feed_forward)\.gate_proj\.weight$": [
        r"{prefix}.{mlp}.up_proj.weight",
    ],
    # MoE w1/w3 fusion: w1 is the primary
    r"^(?P<prefix>.+?)\.w1\.weight$": [
        r"{prefix}.w3.weight",
    ],
}
10146

102-
DEFAULT_FUSED_MAPPINGS = [
47+
# List-of-lists format used by get_fused_names and validate.py
48+
_DEFAULT_FUSED_MAPPINGS_LIST = [
10349
[
10450
r"re:.*(attn|attention)\.q_proj\.weight$",
10551
r"re:.*(attn|attention)\.k_proj\.weight$",
@@ -124,11 +70,65 @@ def get_fused_names(
12470
) -> tuple[list[MatchedNamesSet], list[MatchedNamesSet]]:
12571
matched = []
12672
unmatched = []
127-
for mapping in DEFAULT_FUSED_MAPPINGS:
73+
for mapping in _DEFAULT_FUSED_MAPPINGS_LIST:
12874
_matched, _unmatched = match_names_set_eager(tensor_names, mapping)
129-
13075
matched.extend(_matched)
13176
if _unmatched is not None:
13277
unmatched.append(_unmatched)
133-
13478
return matched, unmatched
79+
80+
81+
def build_inverse_weights_map(
    shard_name: str,
    weight_map: dict[str, str],
    model_files: dict[str, str],
) -> dict[str, list[str]]:
    """
    Precompute, for one output shard, which tensors must be loaded from which
    source files — including fused partner tensors that live in other shards.

    DEFAULT_FUSED_MAPPINGS is keyed by primary pattern, so only the shard
    that owns a fused set's primary tensor pulls in the partners. Partner
    shards load just their own native tensors, which keeps every fused set
    from being read twice when it spans shards.

    Example — given:
        shard0: [q_proj.weight, ...]              <- primary owner
        shard1: [k_proj.weight, v_proj.weight, ...]  <- partners

    Only shard0's map references shard1's file; shard1's map stays native-only.

    :param shard_name: the shard filename this job will process and save
    :param weight_map: tensor name -> shard filename (from safetensors.index.json)
    :param model_files: shard filename -> resolved absolute path
    :return: {resolved_file_path: [tensor_names_to_load]}
    """
    natives = [tensor for tensor, shard in weight_map.items() if shard == shard_name]

    result: dict[str, list[str]] = defaultdict(list)
    result[model_files[shard_name]] = list(natives)

    # Any native tensor matching a primary pattern pulls in its partners.
    for tensor_name in natives:
        for primary, partner_templates in DEFAULT_FUSED_MAPPINGS.items():
            hit = re.match(primary, tensor_name)
            if hit is None:
                continue

            # Rebuild each partner's full name from the matched named groups.
            groups = hit.groupdict()
            for template in partner_templates:
                partner = template.format(**groups)

                partner_shard = weight_map.get(partner)
                # Skip partners that don't exist or already load natively.
                if partner_shard is None or partner_shard == shard_name:
                    continue

                resolved = model_files.get(partner_shard)
                if resolved is None:
                    continue

                bucket = result[resolved]
                if partner not in bucket:
                    bucket.append(partner)

    return dict(result)

src/llmcompressor/entrypoints/model_free/save_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
)
1111
from compressed_tensors.config import CompressionFormat
1212
from compressed_tensors.entrypoints.convert import Converter
13-
from compressed_tensors.entrypoints.convert.file_utils import find_config_path
1413
from compressed_tensors.quantization import (
1514
QuantizationConfig,
1615
QuantizationScheme,
1716
QuantizationStatus,
1817
)
18+
from compressed_tensors.utils.safetensors_load import find_config_path
1919
from loguru import logger
2020
from pydantic import ValidationError
2121

src/llmcompressor/entrypoints/model_free/validate.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ def validate_scheme(scheme: QuantizationScheme) -> tuple[str, QuantizationScheme
3131
output_dynamic = getattr_chain(scheme, "output_activations.dynamic", True)
3232
if input_dynamic is not True or output_dynamic is not True:
3333
raise ValueError(
34-
"Model Free PTQ cannot calibrate activations. "
35-
"Please use `oneshot` instead."
34+
"Model Free PTQ cannot calibrate activations. Please use `oneshot` instead."
3635
)
3736

3837
# override with static observers

0 commit comments

Comments
 (0)