Skip to content

Commit e87b415

Browse files
committed
Support fake-quantized linear with fp32 bias
This supports fake-quantized linear with an fp32 bias.

TICO-DCO-1.0-Signed-off-by: Hyukjin Jeong <hj1.jeong@samsung.com>
1 parent e0ef203 commit e87b415

File tree

6 files changed

+173
-2
lines changed

6 files changed

+173
-2
lines changed

test/modules/op/linear.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
import torch
1616
from torch.export import Dim
17+
from torch.nn import functional as F
18+
19+
from test.utils.tag import test_without_inference
1720

1821

1922
class SimpleLinear(torch.nn.Module):
@@ -68,3 +71,29 @@ def forward(self, arg, attn_mask):
6871

6972
def get_example_inputs(self):
7073
return (torch.randn(3, 3), None)
74+
75+
76+
@test_without_inference
class FQLinearWithFp32Bias(torch.nn.Module):
    """Linear module with fake-quantized input/weight but a plain fp32 bias.

    Input and output are fake-quantized per-tensor, the weight per-channel,
    all with the int16 range [-32768, 32767]; the bias is left in fp32.
    """

    def __init__(self):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(3, 3))
        self.bias = torch.nn.Parameter(torch.ones(3))

    def forward(self, inp):
        # int16 quantization range shared by every fake-quantize call below.
        qlo, qhi = -32768, 32767
        # Per-channel parameters for the weight (channel axis 0).
        ch_scale = torch.ones(3)
        ch_zero_point = torch.zeros(3)

        q_inp = torch.fake_quantize_per_tensor_affine(inp, 1.0, 0, qlo, qhi)
        q_weight = torch.fake_quantize_per_channel_affine(
            self.weight, ch_scale, ch_zero_point, 0, qlo, qhi
        )
        # Bias intentionally stays fp32 here.
        out = F.linear(q_inp, q_weight, bias=self.bias)
        return torch.fake_quantize_per_tensor_affine(out, 1.0, 0, qlo, qhi)

    def get_example_inputs(self):
        return (torch.randn(3, 3),)

test/pt2_to_circle_test/builder.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ def __init__(self, test_name: str, nnmodule: torch.nn.Module):
4141

4242
# Get tags
4343
self.test_without_pt2: bool = is_tagged(self.nnmodule, "test_without_pt2")
44+
self.test_without_inference: bool = is_tagged(
45+
self.nnmodule, "test_without_inference"
46+
)
4447

4548
# Set tolerance
4649
self.tolerance = {}
@@ -73,11 +76,15 @@ def wrapper(s):
7376
else:
7477

7578
def wrapper(s):
76-
self._run(without_pt2=self.test_without_pt2, dynamic=dynamic)
79+
self._run(
80+
without_pt2=self.test_without_pt2,
81+
dynamic=dynamic,
82+
without_inference=self.test_without_inference,
83+
)
7784

7885
return wrapper
7986

80-
def _run(self, without_pt2=False, dynamic: bool = False):
87+
def _run(self, without_pt2=False, dynamic: bool = False, without_inference=False):
8188
dynamic_shapes = None
8289
if dynamic:
8390
assert hasattr(self.nnmodule, "get_dynamic_shapes")
@@ -120,6 +127,9 @@ def _run(self, without_pt2=False, dynamic: bool = False):
120127

121128
verify_circle(circle_model_path, opt_circle_model_path)
122129

130+
if without_inference:
131+
return
132+
123133
USE_ONERT = os.environ.get("CCEX_RUNTIME") == "onert" or dynamic
124134
if self.use_onert or USE_ONERT:
125135
circle_result = infer_circle(

test/utils/tag.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ def __init__(self, *args_, **kwargs_):
4848
return lambda x: x
4949

5050

51+
def test_without_inference(orig_class):
52+
setattr(orig_class, "__tag_test_without_inference", True)
53+
return orig_class
54+
55+
5156
def test_without_pt2(orig_class):
5257
setattr(orig_class, "__tag_test_without_pt2", True)
5358
return orig_class
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import TYPE_CHECKING
16+
17+
if TYPE_CHECKING:
18+
import torch.fx
19+
import copy
20+
21+
import torch
22+
from torch.export import ExportedProgram
23+
24+
from tico.serialize.quant_param import QPARAM_KEY, QuantParam, to_qparam_dtype
25+
from tico.utils import logging
26+
from tico.utils.graph import add_placeholder, get_torch_param_value, is_torch_param
27+
from tico.utils.passes import PassBase, PassResult
28+
from tico.utils.trace_decorators import trace_graph_diff_on_pass
29+
from tico.utils.validate_args_kwargs import LinearArgs
30+
31+
32+
@trace_graph_diff_on_pass
class QuantizeBias(PassBase):
    """
    Quantize bias.

    This pass identifies fp32 biases, quantizes them using scales of input and weights.

    This pass assumes that if bias is fp32, input and weights must have been quantized.
    """

    def __init__(self):
        super().__init__()

    def call(self, exported_program: ExportedProgram) -> PassResult:
        """Replace fp32 linear biases with integer-quantized placeholders.

        For each ``aten.linear`` node whose input and weight carry quantization
        parameters (``QPARAM_KEY``) but whose bias is still fp32, compute
        ``bias_scale = input_scale * weight_scale`` (per output channel),
        round/clamp the bias into the target integer dtype, and swap the bias
        argument for a new quantized placeholder.

        Returns:
            PassResult(False) — the pass is designed to run only once.
        """
        logger = logging.getLogger(__name__)

        graph_module = exported_program.graph_module
        graph: torch.fx.Graph = graph_module.graph
        for node in graph.nodes:
            if node.op != "call_function":
                continue
            if node.target == torch.ops.aten.linear.default:
                lin_args = LinearArgs(*node.args, **node.kwargs)
                inp = lin_args.input
                weights = lin_args.weight
                bias = lin_args.bias

                if bias is None:
                    continue

                # Only support bias is Parameter
                # TODO Is it possible that bias is not Parameter?
                if not is_torch_param(bias, exported_program):
                    continue

                bias_val: torch.Tensor = get_torch_param_value(bias, exported_program)
                # Already-quantized (non-fp32) biases need no work.
                if bias_val.dtype != torch.float32:
                    continue

                # Both producers must carry quantization parameters; otherwise
                # there are no scales to derive the bias scale from.
                if QPARAM_KEY not in inp.meta:
                    continue
                if QPARAM_KEY not in weights.meta:
                    continue

                # Map the activation dtype to the integer dtype used for the
                # bias. Unsupported activation dtypes are skipped.
                # FIX: the original pre-initialized quant_dtype to None and
                # asserted it non-None only AFTER torch.iinfo() had already
                # used it — a dead assert; the else-continue makes both
                # the init and the assert unnecessary.
                if inp.meta[QPARAM_KEY].dtype == "int16":
                    quant_dtype = torch.int64
                elif inp.meta[QPARAM_KEY].dtype == "uint8":
                    quant_dtype = torch.int32
                else:
                    continue

                type_info = torch.iinfo(quant_dtype)

                i_scale = inp.meta[QPARAM_KEY].scale
                w_scale = weights.meta[QPARAM_KEY].scale

                assert i_scale is not None
                assert w_scale is not None
                # Per-tensor input scale, per-channel weight scale (one scale
                # per output channel of the bias).
                assert len(i_scale) == 1
                assert len(w_scale) == bias_val.shape[0]

                # Broadcast: (1,) * (C,) -> per-channel bias scale.
                bias_scale = torch.tensor(i_scale) * torch.tensor(w_scale)
                q_bias = torch.round(bias_val / bias_scale)
                q_bias = torch.clamp(q_bias, min=type_info.min, max=type_info.max)
                q_bias = q_bias.to(quant_dtype)

                q_bias_node = add_placeholder(exported_program, q_bias, bias.name)

                # Attach quantization parameters so later passes/serialization
                # can emit the correct circle tensor.
                qparam = QuantParam()
                qparam.scale = bias_scale.tolist()
                qparam.zero_point = [0] * len(qparam.scale)
                qparam.dtype = to_qparam_dtype(quant_dtype)
                qparam.quantized_dimension = 0
                q_bias_node.meta[QPARAM_KEY] = qparam

                node.update_arg(2, q_bias_node)

                logger.debug(f"Bias ({bias.name}) is quantized to {q_bias_node.name}.")

            # TODO Support more ops.

        graph.eliminate_dead_code()
        graph.lint()
        graph_module.recompile()

        # Run only once.
        return PassResult(False)

tico/experimental/quantization/passes/remove_weight_dequant_op.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,9 @@ def call(self, exported_program: ExportedProgram) -> PassResult:
145145
if isinstance(dq_args, DequantizePerChannelArgs):
146146
scales = get_constant(exported_program, dq_args.scales)
147147
zero_ps = get_constant(exported_program, dq_args.zero_points)
148+
149+
# Sometimes users can give fp32 zero point. Let's update dtype here.
150+
zero_ps = zero_ps.to(torch.int64)
148151
quant_param.scale = scales.tolist()
149152
quant_param.zero_point = zero_ps.tolist()
150153
assert quant_param.zero_point is not None # To avoid mypy error

tico/utils/convert.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from tico.experimental.quantization.passes.propagate_qparam_forward import (
3131
PropagateQParamForward,
3232
)
33+
from tico.experimental.quantization.passes.quantize_bias import QuantizeBias
3334
from tico.experimental.quantization.passes.remove_weight_dequant_op import (
3435
RemoveWeightDequantOp,
3536
)
@@ -250,6 +251,7 @@ def convert_exported_module_to_circle(
250251
RemoveWeightDequantOp(),
251252
PropagateQParamForward(),
252253
PropagateQParamBackward(),
254+
QuantizeBias(),
253255
InsertQuantizeOnDtypeMismatch(),
254256
]
255257
)

0 commit comments

Comments
 (0)