diff --git a/samples/download_public_models.sh b/samples/download_public_models.sh index ea070b957..733389fbf 100755 --- a/samples/download_public_models.sh +++ b/samples/download_public_models.sh @@ -125,6 +125,8 @@ SUPPORTED_MODELS=( "clip-vit-base-patch16" "clip-vit-base-patch32" "ch_PP-OCRv4_rec_infer" # PaddlePaddle OCRv4 multilingual model + "PP-OCRv5_server_rec" # PaddlePaddle PP-OCRv5 recognition models + "PP-OCRv5_mobile_rec" "pallet_defect_detection" # Custom model for pallet defect detection "colorcls2" # Color classification model "mars-small128" # DeepSORT person re-identification model (uses convert_mars_deepsort.py) @@ -293,6 +295,11 @@ validate_models() { fi done + # Allow any PP-OCRv5 variant (e.g. en_PP-OCRv5_mobile_rec, latin_PP-OCRv5_mobile_rec) + if [[ "$found" == false && "$model" == *"PP-OCRv5"* ]]; then + found=true + fi + if [[ "$found" == false ]]; then echo_color "Error: Unsupported model '$model'" "red" echo "" @@ -1131,6 +1138,85 @@ os.remove('${MODEL_NAME}.zip') fi +# ================================= PP-OCRv5 PaddlePaddle models FP32 & FP16 - HuggingFace + paddle2onnx ================================= +# Generic function to download and convert any PaddlePaddle PP-OCRv5 model. +# All PP-OCRv5 models on HuggingFace share the same structure: +# inference.json + inference.pdiparams (PaddlePaddle PIR format) +# config.json (contains character_dict for recognition models) +# HuggingFace repo naming: PaddlePaddle/ (e.g. PaddlePaddle/PP-OCRv5_server_rec) +# For language-specific variants the prefix goes before PP-OCRv5: e.g. en_PP-OCRv5_mobile_rec + +export_ppocr_v5_model() { + local MODEL_NAME=$1 + local MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME" + local DST_FILE1="$MODEL_DIR/FP32/$MODEL_NAME.xml" + local DST_FILE2="$MODEL_DIR/FP16/$MODEL_NAME.xml" + + if [[ ! -f "$DST_FILE1" || ! 
-f "$DST_FILE2" ]]; then + display_header "Downloading PaddlePaddle $MODEL_NAME model" + echo "Downloading and converting: ${MODEL_DIR}" + mkdir -p "$MODEL_DIR" + cd "$MODEL_DIR" + + # Install dependencies (needed for PaddlePaddle PIR → ONNX conversion) + pip install --no-cache-dir paddlepaddle paddle2onnx huggingface_hub || handle_error $LINENO + + # Step 1: Download model from HuggingFace + echo_color "[1/4] Downloading PaddlePaddle/$MODEL_NAME from HuggingFace..." "cyan" + python3 -c " +from huggingface_hub import snapshot_download +snapshot_download(repo_id='PaddlePaddle/${MODEL_NAME}', local_dir='paddle_model') +" || handle_error $LINENO + + # Step 2: Convert PaddlePaddle PIR → ONNX via paddle2onnx + echo_color "[2/4] Converting PaddlePaddle → ONNX..." "cyan" + paddle2onnx \ + --model_dir paddle_model \ + --model_filename inference.json \ + --params_filename inference.pdiparams \ + --save_file model.onnx \ + --opset_version 14 || handle_error $LINENO + + # Step 3: Convert ONNX → OpenVINO IR FP32 & FP16 + echo_color "[3/4] Converting ONNX → OpenVINO IR (FP32 & FP16)..." "cyan" + mkdir -p FP32 FP16 + ovc model.onnx --output_model "FP32/${MODEL_NAME}.xml" --compress_to_fp16=False || handle_error $LINENO + ovc model.onnx --output_model "FP16/${MODEL_NAME}.xml" --compress_to_fp16=True || handle_error $LINENO + + # Step 4: Copy full config.json to output directories + echo_color "[4/4] Storing model config.json..." 
"cyan" + for d in FP32 FP16; do + if [ -d "$d" ]; then + cp paddle_model/config.json "$d/config.json" || handle_error $LINENO + fi + done + + # Cleanup intermediate files + rm -f model.onnx + rm -rf paddle_model + cd - + echo_color "[+] $MODEL_NAME model ready: $MODEL_DIR/{FP32,FP16}/" "green" + else + echo_color "\nModel already exists: $MODEL_DIR.\n" "yellow" + fi +} + +# Well-known PP-OCRv5 models listed in SUPPORTED_MODELS +PP_OCRV5_MODELS=("PP-OCRv5_server_rec" "PP-OCRv5_mobile_rec" "PP-OCRv5_server_det" "PP-OCRv5_mobile_det") +for MODEL_NAME in "${PP_OCRV5_MODELS[@]}"; do + if array_contains "$MODEL_NAME" "${MODELS_TO_PROCESS[@]}" || array_contains "all" "${MODELS_TO_PROCESS[@]}"; then + export_ppocr_v5_model "$MODEL_NAME" + fi +done + +# Handle any other PP-OCRv5 variant passed directly (e.g. en_PP-OCRv5_mobile_rec, latin_PP-OCRv5_mobile_rec) +for MODEL_NAME in "${MODELS_TO_PROCESS[@]}"; do + if [[ "$MODEL_NAME" == *"PP-OCRv5"* ]] && ! array_contains "$MODEL_NAME" "${PP_OCRV5_MODELS[@]}"; then + export_ppocr_v5_model "$MODEL_NAME" + fi +done + + # ================================= Pallet Defect Detection INT8 - Edge AI Resources ================================= if array_contains "pallet_defect_detection" "${MODELS_TO_PROCESS[@]}" || array_contains "all" "${MODELS_TO_PROCESS[@]}"; then display_header "Downloading Pallet Defect Detection model" diff --git a/src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp b/src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp index 4092e7995..1b3a0bf15 100644 --- a/src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp +++ b/src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp @@ -373,16 +373,17 @@ static GstFlowReturn gst_gvagenai_transform_ip(GstBaseTransform *base, GstBuffer } GST_OBJECT_LOCK(gvagenai); + gboolean _success = TRUE; if (gvagenai->prompt_changed) { - if (!load_effective_prompt(gvagenai)) { - GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"), - ("Could not load or validate 
prompt configuration")); - GST_OBJECT_UNLOCK(gvagenai); - return GST_FLOW_ERROR; - } + _success = load_effective_prompt(gvagenai); gvagenai->prompt_changed = FALSE; } GST_OBJECT_UNLOCK(gvagenai); + if (!_success) { + GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"), + ("Could not load or validate prompt configuration")); + return GST_FLOW_ERROR; + } // Get video info from pad GstVideoInfo info; diff --git a/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/blob_to_tensor_converter.cpp b/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/blob_to_tensor_converter.cpp index cdfa88d35..b6cd7b76a 100644 --- a/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/blob_to_tensor_converter.cpp +++ b/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/blob_to_tensor_converter.cpp @@ -44,6 +44,8 @@ BlobToMetaConverter::Ptr BlobToTensorConverter::create(BlobToMetaConverter::Init return std::make_unique(std::move(initializer)); else if (converter_name == PaddleOCRConverter::getName()) return std::make_unique<PaddleOCRConverter>(std::move(initializer)); + else if (converter_name == PaddleOCRCtcConverter::getName()) + return std::make_unique<PaddleOCRCtcConverter>(std::move(initializer)); + else if (converter_name == DetectionAnomalyConverter::getName()) { return std::make_unique<DetectionAnomalyConverter>(std::move(initializer)); } diff --git a/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.cpp b/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.cpp index 49588992e..e099633ce 100644 --- a/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.cpp +++ b/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (C) 2021-2025 Intel 
Corporation + * Copyright (C) 2021-2026 Intel Corporation * * SPDX-License-Identifier: MIT ******************************************************************************/ @@ -11,12 +11,10 @@ #include #include #include +#include <numeric> #include #include -#include -#include - using namespace post_processing; using namespace InferenceBackend; @@ -123,3 +121,135 @@ std::string PaddleOCRConverter::decode(const std::vector<int> &text_index) { return char_list; // Return the decoded text } + +// ==================== PaddleOCRCtcConverter ==================== + +PaddleOCRCtcConverter::PaddleOCRCtcConverter(BlobToMetaConverter::Initializer initializer) + : BlobToTensorConverter(std::move(initializer)) { + loadVocabularyFromModelProc(); +} + +void PaddleOCRCtcConverter::loadVocabularyFromModelProc() { + GstStructure *s = getModelProcOutputInfo().get(); + if (!s) { + GVA_WARNING("PaddleOCR CTC converter: model_proc_output_info is null — using empty vocabulary"); + return; + } + + const GValue *dict_value = gst_structure_get_value(s, "character_dict"); + if (!dict_value || !GST_VALUE_HOLDS_ARRAY(dict_value)) { + GVA_WARNING("PaddleOCR CTC converter: character_dict not found in model_proc_output_info"); + return; + } + + guint n = gst_value_array_get_size(dict_value); + vocabulary.reserve(n); + for (guint i = 0; i < n; ++i) { + const GValue *item = gst_value_array_get_value(dict_value, i); + if (G_VALUE_HOLDS_STRING(item)) { + vocabulary.push_back(g_value_get_string(item)); + } + } + GVA_INFO("Loaded PaddleOCR character dictionary: %zu characters from model metadata", vocabulary.size()); +} + +std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *data, size_t seq_len, size_t vocab_size) { + std::string result; + std::vector<float> confidences; + int prev_idx = 0; + + for (size_t t = 0; t < seq_len; ++t) { + // find index of maximum confidence logit within the current sequence step + const float *row = data + t * vocab_size; + int max_idx = static_cast<int>(std::max_element(row, row + vocab_size) - 
row); + + // Element 0 is CTC blank and indicates the current step should be skipped + // If current index matches previous index, we also skip it to avoid duplicates + if (max_idx == 0 || max_idx == prev_idx) { + prev_idx = max_idx; + continue; + } + prev_idx = max_idx; + + // Convert element index to Vocabulary character index + // Vocabulary is 1-based indexed, so subtract 1 + size_t char_idx = static_cast<size_t>(max_idx - 1); + if (char_idx >= vocabulary.size()) + continue; + + // Add new character to output label + result.append(vocabulary[char_idx]); + confidences.push_back(row[max_idx]); + } + + // return mean of character confidences as overall confidence score + double confidence = 0.0; + if (!confidences.empty()) { + confidence = std::accumulate(confidences.begin(), confidences.end(), 0.0f) / confidences.size(); + } + + return {result, confidence}; +} + +TensorsTable PaddleOCRCtcConverter::convert(const OutputBlobs &output_blobs) { + ITT_TASK(__FUNCTION__); + TensorsTable tensors_table; + + try { + const size_t batch_size = getModelInputImageInfo().batch_size; + tensors_table.resize(batch_size); + + for (const auto &blob_iter : output_blobs) { + OutputBlob::Ptr blob = blob_iter.second; + if (!blob) { + throw std::invalid_argument("Output blob is empty"); + } + + const float *data = reinterpret_cast<const float *>(blob->GetData()); + if (!data) { + throw std::invalid_argument("Output blob data is nullptr"); + } + + const std::string layer_name = blob_iter.first; + + // Output shape: [batch_size, seq_len, vocab_size] + // Tensor vocab_size has two additional tokens: CTC Blank token and Padding token + // hence its size is bigger than character vocabulary + const auto &dims = blob->GetDims(); + const size_t vocab_size = (dims.size() == 3) ? dims[2] : 0; + const size_t seq_len = (dims.size() >= 2) ? 
dims[1] : 0; + if (vocab_size == 0 || seq_len == 0 || vocab_size != vocabulary.size() + 2) + throw std::invalid_argument("Unexpected PaddleOCR output tensor dimensions"); + + for (size_t batch_elem_index = 0; batch_elem_index < batch_size; ++batch_elem_index) { + GVA::Tensor classification_result = createTensor(); + + if (!raw_tensor_copying->enabled(RawTensorCopyingToggle::id)) + CopyOutputBlobToGstStructure(blob, classification_result.gst_structure(), + BlobToMetaConverter::getModelName().c_str(), layer_name.c_str(), + batch_size, batch_elem_index); + + const float *item_data = data + batch_elem_index * seq_len * vocab_size; + auto [decoded_text, confidence] = ctcDecode(item_data, seq_len, vocab_size); + + if (decoded_text.size() > seq_minlen) { + classification_result.set_string("label", decoded_text); + classification_result.set_double("confidence", confidence); + } else { + classification_result.set_string("label", ""); + classification_result.set_double("confidence", 0.0); + } + + gst_structure_set(classification_result.gst_structure(), "tensor_id", G_TYPE_INT, + safe_convert<int>(batch_elem_index), "type", G_TYPE_STRING, "classification_result", + NULL); + std::vector<GstStructure *> tensors{classification_result.gst_structure()}; + tensors_table[batch_elem_index].push_back(tensors); + } + } + } catch (const std::exception &e) { + GVA_ERROR("An error occurred in PaddleOCR CTC converter: %s", e.what()); + } + + return tensors_table; +} diff --git a/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.h b/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.h index ad5a5f1d9..7b317c731 100644 --- a/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.h +++ b/src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (C) 
2021-2025 Intel Corporation + * Copyright (C) 2021-2026 Intel Corporation * * SPDX-License-Identifier: MIT ******************************************************************************/ @@ -402,5 +402,30 @@ class PaddleOCRConverter : public BlobToTensorConverter { std::string decodeOutputTensor(const float *item_data); std::string decode(const std::vector<int> &text_index); -}; // namespace post_processing +}; // class PaddleOCRConverter + +/* +PaddleOCRCtc tensor output = [B, L, N] where: + B - batch size + L - sequence length (maximum number of characters in the recognized text) + N - number of elements in the model's character set including two additional tokens: + CTC blank token and Padding token. +*/ +class PaddleOCRCtcConverter : public BlobToTensorConverter { + public: + PaddleOCRCtcConverter(BlobToMetaConverter::Initializer initializer); + TensorsTable convert(const OutputBlobs &output_blobs) override; + + static std::string getName() { + return "paddle_ocr_ctc"; + } + + private: + std::vector<std::string> vocabulary; // loaded from model_proc_output_info character_dict + size_t seq_minlen = 1; // minimum decoded sequence length + + void loadVocabularyFromModelProc(); + std::pair<std::string, double> ctcDecode(const float *data, size_t seq_len, size_t vocab_size); +}; + } // namespace post_processing \ No newline at end of file diff --git a/src/monolithic/inference_backend/image_inference/openvino/model_api_converters.cpp b/src/monolithic/inference_backend/image_inference/openvino/model_api_converters.cpp index f9558fd9e..978ce79c9 100644 --- a/src/monolithic/inference_backend/image_inference/openvino/model_api_converters.cpp +++ b/src/monolithic/inference_backend/image_inference/openvino/model_api_converters.cpp @@ -432,6 +432,98 @@ bool isHuggingFaceModel(const std::string &model_file) { return false; } + +// Detect PaddleOCR text recognition model by checking for PaddlePaddle model name in config.json +bool isPaddleOCRModel(const std::string &model_file) { + nlohmann::json config_json; + if 
(!loadJsonFromModelDir(model_file, "config.json", config_json)) + return false; + + bool has_pp_ocr_model_name = false; + bool has_ctc_label_decode = false; + + // PaddleOCR config.json contains Global.model_name with "PP-OCR" substring + if (config_json.contains("Global") && config_json["Global"].is_object() && + config_json["Global"].contains("model_name") && config_json["Global"]["model_name"].is_string()) { + const std::string model_name = config_json["Global"]["model_name"].get<std::string>(); + if (std::regex_search(model_name, std::regex(".*PP-OCR.*rec"))) + has_pp_ocr_model_name = true; + } + + // Also check for PostProcess.name == "CTCLabelDecode" with character_dict + if (config_json.contains("PostProcess") && config_json["PostProcess"].is_object() && + config_json["PostProcess"].contains("name") && config_json["PostProcess"]["name"].is_string()) { + const std::string pp_name = config_json["PostProcess"]["name"].get<std::string>(); + if (pp_name == "CTCLabelDecode") { + has_ctc_label_decode = true; + } + } + + return has_pp_ocr_model_name && has_ctc_label_decode; +} + +// Convert PaddleOCR config.json metadata into Model API format +bool convertPaddleOCRMeta2ModelApi(const std::string &model_file, ov::AnyMap &modelConfig) { + nlohmann::json config_json; + if (!loadJsonFromModelDir(model_file, "config.json", config_json)) + return false; + + GST_INFO("Parsing PaddleOCR config file for model: %s", model_file.c_str()); + + // Set model type to paddle_ocr_ctc (standard PaddleOCR CTC convention) + modelConfig["model_type"] = ov::Any(std::string("paddle_ocr_ctc")); + + // Set default PaddleOCR standard normalization + modelConfig["mean_values"] = ov::Any(std::string("127.5, 127.5, 127.5")); + modelConfig["scale_values"] = ov::Any(std::string("127.5, 127.5, 127.5")); + + // PaddleOCR preserves aspect ratio and pads to target width + modelConfig["resize_type"] = ov::Any(std::string("fit_to_window")); + + // Extract character dictionary from PostProcess.character_dict + if 
(config_json.contains("PostProcess") && config_json["PostProcess"].is_object() && + config_json["PostProcess"].contains("character_dict") && + config_json["PostProcess"]["character_dict"].is_array()) { + std::vector<std::string> char_dict; + for (const auto &ch : config_json["PostProcess"]["character_dict"]) { + if (ch.is_string()) + char_dict.push_back(ch.get<std::string>()); + } + modelConfig["character_dict"] = ov::Any(char_dict); + GST_INFO("Extracted PaddleOCR character dictionary: %zu characters", char_dict.size()); + } + + // Parse pre-processing metadata from config file + if (config_json.contains("PreProcess") && config_json["PreProcess"].is_object() && + config_json["PreProcess"].contains("transform_ops") && config_json["PreProcess"]["transform_ops"].is_array()) { + // Extract image color space + for (const auto &op : config_json["PreProcess"]["transform_ops"]) { + if (op.is_object() && op.contains("DecodeImage") && op["DecodeImage"].is_object() && + op["DecodeImage"].contains("img_mode") && op["DecodeImage"]["img_mode"].is_string()) { + const std::string img_mode = op["DecodeImage"]["img_mode"].get<std::string>(); + if (img_mode == "RGB") { + modelConfig["reverse_input_channels"] = ov::Any(std::string("true")); + } + break; + } + } + // Extract reshape size from RecResizeImg.image_shape [C, H, W] + for (const auto &op : config_json["PreProcess"]["transform_ops"]) { + if (op.is_object() && op.contains("RecResizeImg") && op["RecResizeImg"].is_object() && + op["RecResizeImg"].contains("image_shape") && op["RecResizeImg"]["image_shape"].is_array()) { + const auto &shape = op["RecResizeImg"]["image_shape"]; + if (shape.size() == 3 && shape[1].is_number_integer() && shape[2].is_number_integer()) { + const int height = shape[1].get<int>(); + const int width = shape[2].get<int>(); + modelConfig["reshape"] = ov::Any(std::vector<int>{height, width}); + } + break; + } + } + } + + return true; +} + // Convert third-party input metadata config files into Model API format bool convertThirdPartyModelConfig(const 
std::string model_file, ov::AnyMap &modelConfig) { bool updated = false; @@ -442,6 +534,9 @@ bool convertThirdPartyModelConfig(const std::string model_file, ov::AnyMap &mode } } + + else if (isPaddleOCRModel(model_file)) + updated = convertPaddleOCRMeta2ModelApi(model_file, modelConfig); + else if (isHuggingFaceModel(model_file)) updated = convertHuggingFaceMeta2ModelApi(model_file, modelConfig); @@ -825,6 +920,21 @@ std::map get_model_info_postproc(const std::shared_ gst_structure_set_value(s, "labels", &gvalue); g_value_unset(&gvalue); } + if (element.first == "character_dict") { + std::vector<std::string> char_dict = element.second.as<std::vector<std::string>>(); + GValue gvalue = G_VALUE_INIT; + g_value_init(&gvalue, GST_TYPE_ARRAY); + for (const auto &ch : char_dict) { + GValue item = G_VALUE_INIT; + g_value_init(&item, G_TYPE_STRING); + g_value_set_string(&item, ch.c_str()); + gst_value_array_append_value(&gvalue, &item); + g_value_unset(&item); + } + gst_structure_set_value(s, "character_dict", &gvalue); + GST_INFO("[get_model_info_postproc] character_dict: %zu characters", char_dict.size()); + g_value_unset(&gvalue); + } } // restore system locale