Skip to content

Commit f5f7bf1

Browse files
committed
Updates after code review.
1 parent f0d4424 commit f5f7bf1

File tree

5 files changed

+41
-30
lines changed

5 files changed

+41
-30
lines changed

samples/download_public_models.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1148,7 +1148,7 @@ fi
11481148

11491149
export_ppocr_v5_model() {
11501150
local MODEL_NAME=$1
1151-
MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME"
1151+
local MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME"
11521152
local DST_FILE1="$MODEL_DIR/FP32/$MODEL_NAME.xml"
11531153
local DST_FILE2="$MODEL_DIR/FP16/$MODEL_NAME.xml"
11541154

src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -373,16 +373,17 @@ static GstFlowReturn gst_gvagenai_transform_ip(GstBaseTransform *base, GstBuffer
373373
}
374374

375375
GST_OBJECT_LOCK(gvagenai);
376+
gboolean _success = TRUE;
376377
if (gvagenai->prompt_changed) {
377-
if (!load_effective_prompt(gvagenai)) {
378-
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
379-
("Could not load or validate prompt configuration"));
380-
GST_OBJECT_UNLOCK(gvagenai);
381-
return GST_FLOW_ERROR;
382-
}
378+
_success = load_effective_prompt(gvagenai);
383379
gvagenai->prompt_changed = FALSE;
384380
}
385381
GST_OBJECT_UNLOCK(gvagenai);
382+
if (!_success) {
383+
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
384+
("Could not load or validate prompt configuration"));
385+
return GST_FLOW_ERROR;
386+
}
386387

387388
// Get video info from pad
388389
GstVideoInfo info;

src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.cpp

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -156,16 +156,15 @@ void PaddleOCRCtcConverter::loadVocabularyFromModelProc() {
156156

157157
std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *data, size_t seq_len, size_t vocab_size) {
158158
std::string result;
159-
double log_conf_sum = 0.0;
160-
int num_chars = 0;
159+
std::vector<float> confidences;
161160
int prev_idx = 0;
162161

163162
for (size_t t = 0; t < seq_len; ++t) {
164163
// find index of maximum confidence logit within the current sequence step
165164
const float *row = data + t * vocab_size;
166165
int max_idx = static_cast<int>(std::max_element(row, row + vocab_size) - row);
167166

168-
// Element 0 is CTC blank and indicates entire sequence should be skiped
167+
// Element 0 is CTC blank and indicates entire sequence should be skipped
169168
// If current index matches previous index, we also skip it to avoid duplicates
170169
if (max_idx == 0 || max_idx == prev_idx) {
171170
prev_idx = max_idx;
@@ -174,25 +173,22 @@ std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *dat
174173
prev_idx = max_idx;
175174

176175
// Convert element index to Vocabulary character index
177-
// Vocabulary is 1-based indexed (0 is reserved for CTC blank), so subtract 1
176+
// Vocabulary is 1-based indexed, so subtract 1
178177
size_t char_idx = static_cast<size_t>(max_idx - 1);
179178
if (char_idx >= vocabulary.size())
180179
continue;
181180

182181
// Add new character to output label
183182
result.append(vocabulary[char_idx]);
183+
confidences.push_back(row[max_idx]);
184+
}
184185

185-
// Calculate softmax probability for this character
186-
float row_max = row[max_idx];
187-
double exp_sum = 0.0;
188-
for (size_t v = 0; v < vocab_size; ++v)
189-
exp_sum += std::exp(static_cast<double>(row[v] - row_max));
190-
log_conf_sum += std::log(1.0 / exp_sum + 1e-10);
191-
++num_chars;
186+
// Return the mean of character confidences as the overall confidence score
187+
double confidence = 0.0;
188+
if (!confidences.empty()) {
189+
confidence = std::accumulate(confidences.begin(), confidences.end(), 0.0f) / confidences.size();
192190
}
193191

194-
// retunr geomean of character confidences as overall confidence score for the sequence
195-
double confidence = (num_chars > 0) ? std::exp(log_conf_sum / num_chars) : 0.0;
196192
return {result, confidence};
197193
}
198194

@@ -218,11 +214,13 @@ TensorsTable PaddleOCRCtcConverter::convert(const OutputBlobs &output_blobs) {
218214
const std::string layer_name = blob_iter.first;
219215

220216
// Output shape: [batch_size, seq_len, vocab_size]
217+
// Tensor vocab_size has two additional tokens: CTC Blank token and Padding token
218+
// hence its size is larger than the character vocabulary size
221219
const auto &dims = blob->GetDims();
222220
const size_t vocab_size = (dims.size() == 3) ? dims[2] : 0;
223221
const size_t seq_len = (dims.size() >= 2) ? dims[1] : 0;
224-
if (vocab_size != vocabulary.size() + 2) // +1 for CTC blank token, +1 for 1-based indexing
225-
throw std::invalid_argument("Unexpected vocabulary size");
222+
if (vocab_size == 0 || seq_len == 0 || vocab_size != vocabulary.size() + 2)
223+
throw std::invalid_argument("Unexpected PaddleOCR output tensor dimensions");
226224

227225
for (size_t batch_elem_index = 0; batch_elem_index < batch_size; ++batch_elem_index) {
228226
GVA::Tensor classification_result = createTensor();

src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -405,11 +405,11 @@ class PaddleOCRConverter : public BlobToTensorConverter {
405405
}; // class PaddleOCRConverter
406406

407407
/*
408-
PaddleOCRCtcConverter: standard PaddleOCRv5 CTC decoder.
409-
- Index 0 is the CTC blank token
410-
- Real characters are mapped as vocabulary[index - 1]
411-
- Character dictionary is loaded at runtime from config.json (as part of model import)
412-
- Sequence length and vocabulary size are derived from the model output tensor shape
408+
PaddleOCRCtc tensor output = [B, L, N] where:
409+
B - batch size
410+
L - sequence length (maximum number of characters in the recognized text)
411+
N - number of elements in the model's character set including two additional tokens:
412+
CTC blank token and Padding token.
413413
*/
414414
class PaddleOCRCtcConverter : public BlobToTensorConverter {
415415
public:

src/monolithic/inference_backend/image_inference/openvino/model_api_converters.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -438,15 +438,27 @@ bool isPaddleOCRModel(const std::string &model_file) {
438438
if (!loadJsonFromModelDir(model_file, "config.json", config_json))
439439
return false;
440440

441+
bool has_pp_ocr_model_name = false;
442+
bool has_ctc_label_decode = false;
443+
441444
// PaddleOCR recognition config.json contains Global.model_name matching "PP-OCR.*rec"
442445
if (config_json.contains("Global") && config_json["Global"].is_object() &&
443446
config_json["Global"].contains("model_name") && config_json["Global"]["model_name"].is_string()) {
444447
const std::string model_name = config_json["Global"]["model_name"].get<std::string>();
445-
if (std::regex_search(model_name, std::regex("PP-OCR.*rec")))
446-
return true;
448+
if (std::regex_search(model_name, std::regex(".*PP-OCR.*rec")))
449+
has_pp_ocr_model_name = true;
450+
}
451+
452+
// Also require PostProcess.name == "CTCLabelDecode"
453+
if (config_json.contains("PostProcess") && config_json["PostProcess"].is_object() &&
454+
config_json["PostProcess"].contains("name") && config_json["PostProcess"]["name"].is_string()) {
455+
const std::string pp_name = config_json["PostProcess"]["name"].get<std::string>();
456+
if (pp_name == "CTCLabelDecode") {
457+
has_ctc_label_decode = true;
458+
}
447459
}
448460

449-
return false;
461+
return has_pp_ocr_model_name && has_ctc_label_decode;
450462
}
451463

452464
// Convert PaddleOCR config.json metadata into Model API format

0 commit comments

Comments
 (0)