Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/plugins/intel_cpu/src/nodes/conv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,19 @@ static MemoryPtr memoryViewToVector(const std::vector<T>& vec, const dnnl::engin
bool Convolution::canFuse(const NodePtr& node) const {
#if defined(OV_CPU_WITH_ACL)
if (!fusedWith.empty()) {
// Allow FakeQuantize to fuse after a single Eltwise activation
// to enable Conv -> Activation -> FakeQuantize for the ACL INT8 path.
if (fusedWith.size() == 1 &&
fusedWith[0]->getType() == Type::Eltwise &&
node->getType() == Type::FakeQuantize) {
const auto fqOutPrc = node->getOriginalOutputPrecisionAtPort(0);
const auto convInPrc = getOriginalInputPrecisionAtPort(0);
if (any_of(convInPrc, ov::element::u8, ov::element::i8) && fqOutPrc != convInPrc) {
return false;
}
return canFuseSimpleOperation(node);
}

return false;
}

Expand Down
26 changes: 15 additions & 11 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,14 @@ ACLConvolutionExecutor::ACLConvolutionExecutor(const ConvAttrs& attrs,
paddingBottom,
arm_compute::DimensionRoundingType::FLOOR);
dilation = arm_compute::Size2D(attrs.dilation[1] + 1, attrs.dilation[0] + 1);

if (attrs.postOps.size() == 1) {
if (const auto* const activation = std::any_cast<ActivationPostOp>(attrs.postOps.data())) {
// Instead of branching on postOps.size() == 1 / == 2, iterate through the vector
for (const auto& postOp : attrs.postOps ){
if (const auto* const activation = std::any_cast<ActivationPostOp>(&postOp)) {
activationLayerInfo = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
activation->alpha(),
activation->beta(),
activation->gamma());
} else if (const auto* const fq = std::any_cast<FakeQuantizePostOp>(attrs.postOps.data())) {
activation->alpha(),
activation->beta(),
activation->gamma());
} else if (const auto* const fq = std::any_cast<FakeQuantizePostOp>(&postOp)) {
fqInputScale = fq->inputScale();
fqInputShift = fq->inputShift();
fqOutputScale = fq->outputScale();
Expand Down Expand Up @@ -112,13 +112,11 @@ ACLConvolutionExecutor::ACLConvolutionExecutor(const ConvAttrs& attrs,
} else {
OPENVINO_THROW("ACLConvolutionExecutor: the executor supports FakeQuantize and Activation post ops only");
}
} else if (attrs.postOps.size() > 1) {
OPENVINO_THROW("ACLConvolutionExecutor: ACL does not support more than 1 post op");
}
}

bool ACLConvolutionExecutor::supports(const ConvConfig& config) {
VERIFY(config.attrs.postOps.size() <= 1U, UNSUPPORTED_BY_EXECUTOR);
VERIFY(config.attrs.postOps.size() <= 2U, UNSUPPORTED_BY_EXECUTOR);

const auto& srcDesc = config.descs.at(ARG_SRC);
const auto& weiDesc = config.descs.at(ARG_WEI);
Expand All @@ -128,7 +126,13 @@ bool ACLConvolutionExecutor::supports(const ConvConfig& config) {
VERIFY(srcDesc->getShape().getRank() == 4 && weiDesc->getShape().getRank() == 4, UNSUPPORTED_BY_EXECUTOR);
// isQuantized verifies whether src is u8/i8, weights is i8 and FQ is fused if dst is u8/i8
// the last requirement is due to ACL int32 accumulation that needs to be requantized by non-trivial scales
const bool hasQuantizationPostOp = std::any_cast<FakeQuantizePostOp>(config.attrs.postOps.data()) != nullptr;
const bool hasQuantizationPostOp = std::any_of(
config.attrs.postOps.begin(),
config.attrs.postOps.end(),
[](const std::any& op){
return std::any_cast<FakeQuantizePostOp>(&op) != nullptr;
}
);
const bool isQuantizedU8 = srcDesc->getPrecision() == ov::element::u8 &&
any_of(weiDesc->getPrecision(), ov::element::u8, ov::element::i8) &&
dstDesc->getPrecision() == ov::element::u8 && hasQuantizationPostOp;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include "openvino/op/convolution.hpp"
#include "openvino/op/fake_quantize.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/swish.hpp"
#include "openvino/op/relu.hpp"
#include "openvino/pass/pattern/op/block.hpp"
#include "openvino/pass/pattern/op/label.hpp"
#include "openvino/pass/pattern/op/or.hpp"
Expand All @@ -38,17 +40,20 @@ ov::intel_cpu::ConvMulAddFQBlock::ConvMulAddFQBlock(const bool require_int_fq_ou
return !type_matches(ov::element::i32)(output);
});
auto add = wrap_type<ov::op::v1::Add>({multiply, bias_const});
auto activation = wrap_type<ov::op::v4::Swish, ov::op::v0::Relu>({add});
auto activation_or_add = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{activation, add});

ov::pass::pattern::op::Predicate predicate =
require_int_fq_output ? type_matches_any({element::i8, element::u8}) : ov::pass::pattern::op::Predicate();
auto fake_quantize =
wrap_type<ov::op::v0::FakeQuantize>({add, any_input(), any_input(), any_input(), any_input()}, predicate);
wrap_type<ov::op::v0::FakeQuantize>({activation_or_add, any_input(), any_input(), any_input(), any_input()}, predicate);

m_inputs = ov::OutputVector{conv};
m_outputs = ov::OutputVector{fake_quantize};

register_anchor("convolution", conv);
register_anchor("multiply", multiply);
register_anchor("add", add);
register_anchor("activation", activation);
register_anchor("fake_quantize", fake_quantize);
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ov::intel_cpu::ConvertConvolutionBias::ConvertConvolutionBias() {
const auto conv_out = conv_mul_add_fq->get_anchor("convolution", pattern_map);
const auto mul_out = conv_mul_add_fq->get_anchor("multiply", pattern_map);
const auto add_out = conv_mul_add_fq->get_anchor("add", pattern_map);
const auto activation_out = conv_mul_add_fq->get_anchor("activation", pattern_map);
const auto fq_out = conv_mul_add_fq->get_anchor("fake_quantize", pattern_map);
if (!conv_out || !mul_out || !add_out || !fq_out) {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "openvino/op/fake_quantize.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/swish.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "openvino/pass/pattern/op/pattern.hpp"
Expand Down Expand Up @@ -56,6 +57,7 @@ ov::intel_cpu::FallbackUnsupportedLPConvToFP16::FallbackUnsupportedLPConvToFP16(
const auto conv_out = conv_mul_add_fq->get_anchor("convolution", pattern_map);
const auto mul_out = conv_mul_add_fq->get_anchor("multiply", pattern_map);
const auto add_out = conv_mul_add_fq->get_anchor("add", pattern_map);
const auto activation_out = conv_mul_add_fq->get_anchor("activation", pattern_map);
const auto fq_out = conv_mul_add_fq->get_anchor("fake_quantize", pattern_map);
if (!conv_out || !mul_out || !add_out || !fq_out) {
return false;
Expand Down Expand Up @@ -105,6 +107,14 @@ ov::intel_cpu::FallbackUnsupportedLPConvToFP16::FallbackUnsupportedLPConvToFP16(
ov::copy_runtime_info(rt_info_sources, {reshape_const, scales_reshape, scales_to_weights, conv_scaled});

ov::replace_node(mul, conv_scaled);

if (activation_out) {
const auto activation_node = activation_out->get_node_shared_ptr();
if (auto type_relaxed = std::dynamic_pointer_cast<ov::op::TypeRelaxedBase>(activation_node)){
type_relaxed->set_overridden_output_type(ov::element::f16, 0);
activation_node->validate_and_infer_types();
}
}

// Keep this matched Conv->Mul->Add->FQ pattern in FP16 end-to-end for CPU plugin selection.
if (add->get_output_element_type(0) == ov::element::f16) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -990,8 +990,6 @@ void Transformations::runLptPasses(const std::vector<ov::element::Type>& default
FuseMultiplyToFakeQuantizeTransformation);
CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);

// ConvolutionTransformation is disabled temporary until ACL issues are fixed: #1252, #1253
CPU_DISABLE_PASS_ARM(lptManager, ConvolutionTransformation);
CPU_DISABLE_PASS_ARM(lptManager, ConvolutionBackpropDataTransformation);
CPU_DISABLE_PASS_ARM(lptManager, InterpolateTransformation);
CPU_DISABLE_PASS_ARM(lptManager, GroupConvolutionTransformation);
Expand Down