Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions samples/download_public_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ SUPPORTED_MODELS=(
"clip-vit-base-patch16"
"clip-vit-base-patch32"
"ch_PP-OCRv4_rec_infer" # PaddlePaddle OCRv4 multilingual model
"PP-OCRv5_server_rec" # PaddlePaddle PP-OCRv5 recognition models
"PP-OCRv5_mobile_rec"
"pallet_defect_detection" # Custom model for pallet defect detection
"colorcls2" # Color classification model
"mars-small128" # DeepSORT person re-identification model (uses convert_mars_deepsort.py)
Expand Down Expand Up @@ -293,6 +295,11 @@ validate_models() {
fi
done

# Allow any PP-OCRv5 variant (e.g. en_PP-OCRv5_mobile_rec, latin_PP-OCRv5_mobile_rec)
if [[ "$found" == false && "$model" == *"PP-OCRv5"* ]]; then
found=true
fi

if [[ "$found" == false ]]; then
echo_color "Error: Unsupported model '$model'" "red"
echo ""
Expand Down Expand Up @@ -1131,6 +1138,85 @@ os.remove('${MODEL_NAME}.zip')
fi


# ================================= PP-OCRv5 PaddlePaddle models FP32 & FP16 - HuggingFace + paddle2onnx =================================
# Generic function to download and convert any PaddlePaddle PP-OCRv5 model.
# All PP-OCRv5 models on HuggingFace share the same structure:
# inference.json + inference.pdiparams (PaddlePaddle PIR format)
# config.json (contains character_dict for recognition models)
# HuggingFace repo naming: PaddlePaddle/<model_name> (e.g. PaddlePaddle/PP-OCRv5_server_rec)
# For language-specific variants the prefix goes before PP-OCRv5: e.g. en_PP-OCRv5_mobile_rec

export_ppocr_v5_model() {
    # Download a PaddlePaddle PP-OCRv5 model from HuggingFace and convert it to
    # OpenVINO IR (FP32 + FP16). Works for any PP-OCRv5 variant because they all
    # share the same file layout: inference.json/inference.pdiparams (PIR format)
    # plus config.json (character_dict for recognition models).
    #
    # $1 - model name, e.g. "PP-OCRv5_server_rec" or "en_PP-OCRv5_mobile_rec"
    local MODEL_NAME=$1
    local MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME"
    local DST_FILE1="$MODEL_DIR/FP32/$MODEL_NAME.xml"
    local DST_FILE2="$MODEL_DIR/FP16/$MODEL_NAME.xml"

    if [[ ! -f "$DST_FILE1" || ! -f "$DST_FILE2" ]]; then
        display_header "Downloading PaddlePaddle $MODEL_NAME model"
        echo "Downloading and converting: ${MODEL_DIR}"
        # Guard mkdir/cd: if either fails, every later step (pip install,
        # download, conversion, cleanup `rm -rf`) would run in the wrong
        # directory.
        mkdir -p "$MODEL_DIR" || handle_error $LINENO
        cd "$MODEL_DIR" || handle_error $LINENO

        # Install dependencies (needed for PaddlePaddle PIR → ONNX conversion)
        pip install --no-cache-dir paddlepaddle paddle2onnx huggingface_hub || handle_error $LINENO

        # Step 1: Download model from HuggingFace
        echo_color "[1/4] Downloading PaddlePaddle/$MODEL_NAME from HuggingFace..." "cyan"
        python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='PaddlePaddle/${MODEL_NAME}', local_dir='paddle_model')
" || handle_error $LINENO

        # Step 2: Convert PaddlePaddle PIR → ONNX via paddle2onnx
        echo_color "[2/4] Converting PaddlePaddle → ONNX..." "cyan"
        paddle2onnx \
            --model_dir paddle_model \
            --model_filename inference.json \
            --params_filename inference.pdiparams \
            --save_file model.onnx \
            --opset_version 14 || handle_error $LINENO

        # Step 3: Convert ONNX → OpenVINO IR FP32 & FP16
        echo_color "[3/4] Converting ONNX → OpenVINO IR (FP32 & FP16)..." "cyan"
        mkdir -p FP32 FP16
        ovc model.onnx --output_model "FP32/${MODEL_NAME}.xml" --compress_to_fp16=False || handle_error $LINENO
        ovc model.onnx --output_model "FP16/${MODEL_NAME}.xml" --compress_to_fp16=True || handle_error $LINENO

        # Step 4: Copy full config.json to output directories
        echo_color "[4/4] Storing model config.json..." "cyan"
        for d in FP32 FP16; do
            if [ -d "$d" ]; then
                cp paddle_model/config.json "$d/config.json" || handle_error $LINENO
            fi
        done

        # Cleanup intermediate files
        rm -f model.onnx
        rm -rf paddle_model
        cd - || handle_error $LINENO
        echo_color "[+] $MODEL_NAME model ready: $MODEL_DIR/{FP32,FP16}/" "green"
    else
        echo_color "\nModel already exists: $MODEL_DIR.\n" "yellow"
    fi
}

# Well-known PP-OCRv5 models listed in SUPPORTED_MODELS
PP_OCRV5_MODELS=("PP-OCRv5_server_rec" "PP-OCRv5_mobile_rec" "PP-OCRv5_server_det" "PP-OCRv5_mobile_det")
for MODEL_NAME in "${PP_OCRV5_MODELS[@]}"; do
if array_contains "$MODEL_NAME" "${MODELS_TO_PROCESS[@]}" || array_contains "all" "${MODELS_TO_PROCESS[@]}"; then
export_ppocr_v5_model "$MODEL_NAME"
fi
done

# Handle any other PP-OCRv5 variant passed directly (e.g. en_PP-OCRv5_mobile_rec, latin_PP-OCRv5_mobile_rec)
for MODEL_NAME in "${MODELS_TO_PROCESS[@]}"; do
if [[ "$MODEL_NAME" == *"PP-OCRv5"* ]] && ! array_contains "$MODEL_NAME" "${PP_OCRV5_MODELS[@]}"; then
export_ppocr_v5_model "$MODEL_NAME"
fi
done


# ================================= Pallet Defect Detection INT8 - Edge AI Resources =================================
if array_contains "pallet_defect_detection" "${MODELS_TO_PROCESS[@]}" || array_contains "all" "${MODELS_TO_PROCESS[@]}"; then
display_header "Downloading Pallet Defect Detection model"
Expand Down
13 changes: 7 additions & 6 deletions src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,16 +373,17 @@ static GstFlowReturn gst_gvagenai_transform_ip(GstBaseTransform *base, GstBuffer
}

GST_OBJECT_LOCK(gvagenai);
gboolean _success = TRUE;
if (gvagenai->prompt_changed) {
if (!load_effective_prompt(gvagenai)) {
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
("Could not load or validate prompt configuration"));
GST_OBJECT_UNLOCK(gvagenai);
return GST_FLOW_ERROR;
}
_success = load_effective_prompt(gvagenai);
gvagenai->prompt_changed = FALSE;
}
GST_OBJECT_UNLOCK(gvagenai);
if (!_success) {
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
("Could not load or validate prompt configuration"));
return GST_FLOW_ERROR;
}

// Get video info from pad
GstVideoInfo info;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ BlobToMetaConverter::Ptr BlobToTensorConverter::create(BlobToMetaConverter::Init
return std::make_unique<CLIPTokenConverter>(std::move(initializer));
else if (converter_name == PaddleOCRConverter::getName())
return std::make_unique<PaddleOCRConverter>(std::move(initializer));
else if (converter_name == PaddleOCRCtcConverter::getName())
return std::make_unique<PaddleOCRCtcConverter>(std::move(initializer));
else if (converter_name == DetectionAnomalyConverter::getName()) {
return std::make_unique<DetectionAnomalyConverter>(std::move(initializer));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (C) 2021-2025 Intel Corporation
* Copyright (C) 2021-2026 Intel Corporation
*
* SPDX-License-Identifier: MIT
******************************************************************************/
Expand All @@ -11,12 +11,10 @@
#include <algorithm>
#include <cmath>
#include <gst/gst.h>
#include <iostream>
#include <sstream>
#include <stdexcept>

#include <fstream>
#include <iostream>

using namespace post_processing;
using namespace InferenceBackend;

Expand Down Expand Up @@ -123,3 +121,135 @@ std::string PaddleOCRConverter::decode(const std::vector<int> &text_index) {

return char_list; // Return the decoded text
}

// ==================== PaddleOCRCtcConverter ====================

// Constructs the CTC converter and immediately loads the character dictionary
// from the model-proc output metadata carried by the initializer.
PaddleOCRCtcConverter::PaddleOCRCtcConverter(BlobToMetaConverter::Initializer initializer)
    : BlobToTensorConverter(std::move(initializer)) {
    loadVocabularyFromModelProc();
}

void PaddleOCRCtcConverter::loadVocabularyFromModelProc() {
GstStructure *s = getModelProcOutputInfo().get();
if (!s) {
GVA_WARNING("PaddleOCR CTC converter: model_proc_output_info is null — using empty vocabulary");
return;
}

const GValue *dict_value = gst_structure_get_value(s, "character_dict");
if (!dict_value || !GST_VALUE_HOLDS_ARRAY(dict_value)) {
GVA_WARNING("PaddleOCR CTC converter: character_dict not found in model_proc_output_info");
return;
}

guint n = gst_value_array_get_size(dict_value);
vocabulary.reserve(n);
for (guint i = 0; i < n; ++i) {
const GValue *item = gst_value_array_get_value(dict_value, i);
if (G_VALUE_HOLDS_STRING(item)) {
vocabulary.push_back(g_value_get_string(item));
}
}
GVA_INFO("Loaded PaddleOCR character dictionary: %zu characters from model metadata", vocabulary.size());
}

// Greedy CTC decode of one sequence.
//   data       - pointer to a [seq_len, vocab_size] row-major float tensor of
//                per-timestep class scores
//   seq_len    - number of timesteps
//   vocab_size - number of classes per timestep (vocabulary.size() + blank + padding)
// Returns the decoded text and the mean of the per-character top scores
// (0.0 when nothing was decoded).
std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *data, size_t seq_len, size_t vocab_size) {
    std::string result;
    // Accumulate confidences in double to avoid float round-off when averaging
    // (the previous std::accumulate over a float init value summed in float).
    double confidence_sum = 0.0;
    size_t char_count = 0;
    int prev_idx = 0;

    for (size_t t = 0; t < seq_len; ++t) {
        // Find index of the maximum-confidence logit within the current timestep
        const float *row = data + t * vocab_size;
        int max_idx = static_cast<int>(std::max_element(row, row + vocab_size) - row);

        // Index 0 is the CTC blank token: this timestep emits nothing. A repeat
        // of the previous index is also skipped — consecutive identical indices
        // encode a single character in CTC.
        if (max_idx == 0 || max_idx == prev_idx) {
            prev_idx = max_idx;
            continue;
        }
        prev_idx = max_idx;

        // Model indices are offset by one relative to the vocabulary (index 0 is
        // blank), so subtract 1. Out-of-range indices (e.g. the trailing padding
        // token) are ignored.
        size_t char_idx = static_cast<size_t>(max_idx - 1);
        if (char_idx >= vocabulary.size())
            continue;

        // Append the decoded character and record its confidence
        result.append(vocabulary[char_idx]);
        confidence_sum += row[max_idx];
        ++char_count;
    }

    // Overall confidence is the mean of the per-character confidences
    double confidence = (char_count > 0) ? confidence_sum / static_cast<double>(char_count) : 0.0;

    return {result, confidence};
}

// Converts PP-OCRv5 CTC output blobs into per-batch classification tensors.
// Each output blob is expected to be [batch_size, seq_len, vocab_size], where
// vocab_size == vocabulary.size() + 2 (CTC blank + padding tokens). For every
// batch element a "classification_result" tensor is produced with "label" and
// "confidence" fields; on any error an empty table is returned and the error
// is logged.
TensorsTable PaddleOCRCtcConverter::convert(const OutputBlobs &output_blobs) {
    ITT_TASK(__FUNCTION__);
    TensorsTable tensors_table;

    try {
        const size_t batch_size = getModelInputImageInfo().batch_size;
        tensors_table.resize(batch_size);

        for (const auto &blob_iter : output_blobs) {
            OutputBlob::Ptr blob = blob_iter.second;
            if (!blob) {
                throw std::invalid_argument("Output blob is empty");
            }

            const float *data = reinterpret_cast<const float *>(blob->GetData());
            if (!data) {
                throw std::invalid_argument("Output blob data is nullptr");
            }

            // Bind by const reference: no need to copy the layer name per blob.
            const std::string &layer_name = blob_iter.first;

            // Output shape: [batch_size, seq_len, vocab_size]
            // Tensor vocab_size has two additional tokens: CTC Blank token and
            // Padding token, hence its size is bigger than character vocabulary
            const auto &dims = blob->GetDims();
            const size_t vocab_size = (dims.size() == 3) ? dims[2] : 0;
            const size_t seq_len = (dims.size() >= 2) ? dims[1] : 0;
            if (vocab_size == 0 || seq_len == 0 || vocab_size != vocabulary.size() + 2)
                throw std::invalid_argument("Unexpected PaddleOCR output tensor dimensions");

            for (size_t batch_elem_index = 0; batch_elem_index < batch_size; ++batch_elem_index) {
                GVA::Tensor classification_result = createTensor();

                if (!raw_tensor_copying->enabled(RawTensorCopyingToggle::id))
                    CopyOutputBlobToGstStructure(blob, classification_result.gst_structure(),
                                                 BlobToMetaConverter::getModelName().c_str(), layer_name.c_str(),
                                                 batch_size, batch_elem_index);

                const float *item_data = data + batch_elem_index * seq_len * vocab_size;
                auto [decoded_text, confidence] = ctcDecode(item_data, seq_len, vocab_size);

                // Accept results of at least seq_minlen characters. `>=` keeps
                // single-character recognitions, which the previous strict `>`
                // against seq_minlen == 1 silently discarded.
                if (decoded_text.size() >= seq_minlen) {
                    classification_result.set_string("label", decoded_text);
                    classification_result.set_double("confidence", confidence);
                } else {
                    classification_result.set_string("label", "");
                    classification_result.set_double("confidence", 0.0);
                }

                gst_structure_set(classification_result.gst_structure(), "tensor_id", G_TYPE_INT,
                                  safe_convert<int>(batch_elem_index), "type", G_TYPE_STRING, "classification_result",
                                  NULL);
                std::vector<GstStructure *> tensors{classification_result.gst_structure()};
                tensors_table[batch_elem_index].push_back(tensors);
            }
        }
    } catch (const std::exception &e) {
        GVA_ERROR("An error occurred in PaddleOCR CTC converter: %s", e.what());
    }

    return tensors_table;
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (C) 2021-2025 Intel Corporation
* Copyright (C) 2021-2026 Intel Corporation
*
* SPDX-License-Identifier: MIT
******************************************************************************/
Expand Down Expand Up @@ -402,5 +402,30 @@ class PaddleOCRConverter : public BlobToTensorConverter {
std::string decodeOutputTensor(const float *item_data);
std::string decode(const std::vector<int> &text_index);

}; // namespace post_processing
}; // class PaddleOCRConverter

/*
PaddleOCRCtc tensor output = [B, L, N] where:
B - batch size
L - sequence length (maximum number of characters in the recognized text)
N - number of elements in the model's character set including two additional tokens:
    CTC blank token and Padding token.
*/
// Greedy CTC decoder for PaddlePaddle PP-OCRv5 recognition models. Produces a
// "classification_result" tensor per batch element with "label" and
// "confidence" fields; selected in model-proc via converter name
// "paddle_ocr_ctc".
class PaddleOCRCtcConverter : public BlobToTensorConverter {
  public:
    PaddleOCRCtcConverter(BlobToMetaConverter::Initializer initializer);
    TensorsTable convert(const OutputBlobs &output_blobs) override;

    // Converter name used to select this converter from model-proc files.
    static std::string getName() {
        return "paddle_ocr_ctc";
    }

  private:
    std::vector<std::string> vocabulary; // loaded from model_proc_output_info character_dict
    size_t seq_minlen = 1;               // minimum decoded sequence length

    // Reads the "character_dict" array from model-proc output metadata into `vocabulary`.
    void loadVocabularyFromModelProc();
    // Greedy CTC decode of one [seq_len, vocab_size] sequence; returns text and mean confidence.
    std::pair<std::string, double> ctcDecode(const float *data, size_t seq_len, size_t vocab_size);
};

} // namespace post_processing
Loading
Loading