Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions samples/download_public_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ SUPPORTED_MODELS=(
"clip-vit-base-patch16"
"clip-vit-base-patch32"
"ch_PP-OCRv4_rec_infer" # PaddlePaddle OCRv4 multilingual model
"PP-OCRv5_server_rec" # PaddlePaddle PP-OCRv5 recognition models
"PP-OCRv5_mobile_rec"
"pallet_defect_detection" # Custom model for pallet defect detection
"colorcls2" # Color classification model
"mars-small128" # DeepSORT person re-identification model (uses convert_mars_deepsort.py)
Expand Down Expand Up @@ -293,6 +295,11 @@ validate_models() {
fi
done

# Allow any PP-OCRv5 variant (e.g. en_PP-OCRv5_mobile_rec, latin_PP-OCRv5_mobile_rec)
if [[ "$found" == false && "$model" == *"PP-OCRv5"* ]]; then
found=true
fi

if [[ "$found" == false ]]; then
echo_color "Error: Unsupported model '$model'" "red"
echo ""
Expand Down Expand Up @@ -1131,6 +1138,85 @@ os.remove('${MODEL_NAME}.zip')
fi


# ================================= PP-OCRv5 PaddlePaddle models FP32 & FP16 - HuggingFace + paddle2onnx =================================
# Generic function to download and convert any PaddlePaddle PP-OCRv5 model.
# All PP-OCRv5 models on HuggingFace share the same structure:
# inference.json + inference.pdiparams (PaddlePaddle PIR format)
# config.json (contains character_dict for recognition models)
# HuggingFace repo naming: PaddlePaddle/<model_name> (e.g. PaddlePaddle/PP-OCRv5_server_rec)
# For language-specific variants the prefix goes before PP-OCRv5: e.g. en_PP-OCRv5_mobile_rec

export_ppocr_v5_model() {
    # Download a PaddlePaddle PP-OCRv5 model from HuggingFace and convert it to
    # OpenVINO IR (FP32 + FP16). Works for any PP-OCRv5 variant because they all
    # share the same file layout: inference.json/inference.pdiparams (PIR format)
    # plus config.json (character_dict for recognition models).
    #
    # $1 - model name, e.g. "PP-OCRv5_server_rec" or "en_PP-OCRv5_mobile_rec"
    local MODEL_NAME=$1
    local MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME"
    local DST_FILE1="$MODEL_DIR/FP32/$MODEL_NAME.xml"
    local DST_FILE2="$MODEL_DIR/FP16/$MODEL_NAME.xml"

    if [[ ! -f "$DST_FILE1" || ! -f "$DST_FILE2" ]]; then
        display_header "Downloading PaddlePaddle $MODEL_NAME model"
        echo "Downloading and converting: ${MODEL_DIR}"
        # Guard mkdir/cd: if either fails, every later step (pip install,
        # download, conversion, cleanup `rm -rf`) would run in the wrong
        # directory.
        mkdir -p "$MODEL_DIR" || handle_error $LINENO
        cd "$MODEL_DIR" || handle_error $LINENO

        # Install dependencies (needed for PaddlePaddle PIR → ONNX conversion)
        pip install --no-cache-dir paddlepaddle paddle2onnx huggingface_hub || handle_error $LINENO

        # Step 1: Download model from HuggingFace
        echo_color "[1/4] Downloading PaddlePaddle/$MODEL_NAME from HuggingFace..." "cyan"
        python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='PaddlePaddle/${MODEL_NAME}', local_dir='paddle_model')
" || handle_error $LINENO

        # Step 2: Convert PaddlePaddle PIR → ONNX via paddle2onnx
        echo_color "[2/4] Converting PaddlePaddle → ONNX..." "cyan"
        paddle2onnx \
            --model_dir paddle_model \
            --model_filename inference.json \
            --params_filename inference.pdiparams \
            --save_file model.onnx \
            --opset_version 14 || handle_error $LINENO

        # Step 3: Convert ONNX → OpenVINO IR FP32 & FP16
        echo_color "[3/4] Converting ONNX → OpenVINO IR (FP32 & FP16)..." "cyan"
        mkdir -p FP32 FP16
        ovc model.onnx --output_model "FP32/${MODEL_NAME}.xml" --compress_to_fp16=False || handle_error $LINENO
        ovc model.onnx --output_model "FP16/${MODEL_NAME}.xml" --compress_to_fp16=True || handle_error $LINENO

        # Step 4: Copy full config.json to output directories
        echo_color "[4/4] Storing model config.json..." "cyan"
        for d in FP32 FP16; do
            if [ -d "$d" ]; then
                cp paddle_model/config.json "$d/config.json" || handle_error $LINENO
            fi
        done

        # Cleanup intermediate files
        rm -f model.onnx
        rm -rf paddle_model
        cd - || handle_error $LINENO
        echo_color "[+] $MODEL_NAME model ready: $MODEL_DIR/{FP32,FP16}/" "green"
    else
        echo_color "\nModel already exists: $MODEL_DIR.\n" "yellow"
    fi
}

# Well-known PP-OCRv5 models listed in SUPPORTED_MODELS
PP_OCRV5_MODELS=("PP-OCRv5_server_rec" "PP-OCRv5_mobile_rec" "PP-OCRv5_server_det" "PP-OCRv5_mobile_det")
for MODEL_NAME in "${PP_OCRV5_MODELS[@]}"; do
if array_contains "$MODEL_NAME" "${MODELS_TO_PROCESS[@]}" || array_contains "all" "${MODELS_TO_PROCESS[@]}"; then
export_ppocr_v5_model "$MODEL_NAME"
fi
done

# Handle any other PP-OCRv5 variant passed directly (e.g. en_PP-OCRv5_mobile_rec, latin_PP-OCRv5_mobile_rec)
for MODEL_NAME in "${MODELS_TO_PROCESS[@]}"; do
if [[ "$MODEL_NAME" == *"PP-OCRv5"* ]] && ! array_contains "$MODEL_NAME" "${PP_OCRV5_MODELS[@]}"; then
export_ppocr_v5_model "$MODEL_NAME"
fi
done


# ================================= Pallet Defect Detection INT8 - Edge AI Resources =================================
if array_contains "pallet_defect_detection" "${MODELS_TO_PROCESS[@]}" || array_contains "all" "${MODELS_TO_PROCESS[@]}"; then
display_header "Downloading Pallet Defect Detection model"
Expand Down
13 changes: 7 additions & 6 deletions src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,16 +373,17 @@ static GstFlowReturn gst_gvagenai_transform_ip(GstBaseTransform *base, GstBuffer
}

GST_OBJECT_LOCK(gvagenai);
gboolean _success = TRUE;
if (gvagenai->prompt_changed) {
if (!load_effective_prompt(gvagenai)) {
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
("Could not load or validate prompt configuration"));
GST_OBJECT_UNLOCK(gvagenai);
return GST_FLOW_ERROR;
}
_success = load_effective_prompt(gvagenai);
gvagenai->prompt_changed = FALSE;
}
GST_OBJECT_UNLOCK(gvagenai);
if (!_success) {
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
("Could not load or validate prompt configuration"));
return GST_FLOW_ERROR;
}

// Get video info from pad
GstVideoInfo info;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ BlobToMetaConverter::Ptr BlobToTensorConverter::create(BlobToMetaConverter::Init
return std::make_unique<CLIPTokenConverter>(std::move(initializer));
else if (converter_name == PaddleOCRConverter::getName())
return std::make_unique<PaddleOCRConverter>(std::move(initializer));
else if (converter_name == PaddleOCRCtcConverter::getName())
return std::make_unique<PaddleOCRCtcConverter>(std::move(initializer));
else if (converter_name == DetectionAnomalyConverter::getName()) {
return std::make_unique<DetectionAnomalyConverter>(std::move(initializer));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (C) 2021-2025 Intel Corporation
* Copyright (C) 2021-2026 Intel Corporation
*
* SPDX-License-Identifier: MIT
******************************************************************************/
Expand All @@ -11,12 +11,10 @@
#include <algorithm>
#include <cmath>
#include <gst/gst.h>
#include <iostream>
#include <sstream>
#include <stdexcept>

#include <fstream>
#include <iostream>

using namespace post_processing;
using namespace InferenceBackend;

Expand Down Expand Up @@ -123,3 +121,135 @@ std::string PaddleOCRConverter::decode(const std::vector<int> &text_index) {

return char_list; // Return the decoded text
}

// ==================== PaddleOCRCtcConverter ====================

// Constructs the CTC converter and immediately loads the character dictionary
// from the model-proc output metadata carried by the initializer.
PaddleOCRCtcConverter::PaddleOCRCtcConverter(BlobToMetaConverter::Initializer initializer)
    : BlobToTensorConverter(std::move(initializer)) {
    loadVocabularyFromModelProc();
}

void PaddleOCRCtcConverter::loadVocabularyFromModelProc() {
GstStructure *s = getModelProcOutputInfo().get();
if (!s) {
GVA_WARNING("PaddleOCR CTC converter: model_proc_output_info is null — using empty vocabulary");
return;
}

const GValue *dict_value = gst_structure_get_value(s, "character_dict");
if (!dict_value || !GST_VALUE_HOLDS_ARRAY(dict_value)) {
GVA_WARNING("PaddleOCR CTC converter: character_dict not found in model_proc_output_info");
return;
}

guint n = gst_value_array_get_size(dict_value);
vocabulary.reserve(n);
for (guint i = 0; i < n; ++i) {
const GValue *item = gst_value_array_get_value(dict_value, i);
if (G_VALUE_HOLDS_STRING(item)) {
vocabulary.push_back(g_value_get_string(item));
}
}
GVA_INFO("Loaded PaddleOCR character dictionary: %zu characters from model metadata", vocabulary.size());
}

// Greedy CTC decode of one sequence.
//   data       - pointer to a [seq_len, vocab_size] row-major float tensor of
//                per-timestep class scores
//   seq_len    - number of timesteps
//   vocab_size - number of classes per timestep (vocabulary.size() + blank + padding)
// Returns the decoded text and the mean of the per-character top scores
// (0.0 when nothing was decoded).
std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *data, size_t seq_len, size_t vocab_size) {
    std::string result;
    // Accumulate confidences in double to avoid float round-off when averaging
    // (the previous std::accumulate over a float init value summed in float).
    double confidence_sum = 0.0;
    size_t char_count = 0;
    int prev_idx = 0;

    for (size_t t = 0; t < seq_len; ++t) {
        // Find index of the maximum-confidence logit within the current timestep
        const float *row = data + t * vocab_size;
        int max_idx = static_cast<int>(std::max_element(row, row + vocab_size) - row);

        // Index 0 is the CTC blank token: this timestep emits nothing. A repeat
        // of the previous index is also skipped — consecutive identical indices
        // encode a single character in CTC.
        if (max_idx == 0 || max_idx == prev_idx) {
            prev_idx = max_idx;
            continue;
        }
        prev_idx = max_idx;

        // Model indices are offset by one relative to the vocabulary (index 0 is
        // blank), so subtract 1. Out-of-range indices (e.g. the trailing padding
        // token) are ignored.
        size_t char_idx = static_cast<size_t>(max_idx - 1);
        if (char_idx >= vocabulary.size())
            continue;

        // Append the decoded character and record its confidence
        result.append(vocabulary[char_idx]);
        confidence_sum += row[max_idx];
        ++char_count;
    }

    // Overall confidence is the mean of the per-character confidences
    double confidence = (char_count > 0) ? confidence_sum / static_cast<double>(char_count) : 0.0;

    return {result, confidence};
}

// Converts PP-OCRv5 CTC output blobs into per-batch classification tensors.
// Each output blob is expected to be [batch_size, seq_len, vocab_size], where
// vocab_size == vocabulary.size() + 2 (CTC blank + padding tokens). For every
// batch element a "classification_result" tensor is produced with "label" and
// "confidence" fields; on any error an empty table is returned and the error
// is logged.
TensorsTable PaddleOCRCtcConverter::convert(const OutputBlobs &output_blobs) {
    ITT_TASK(__FUNCTION__);
    TensorsTable tensors_table;

    try {
        const size_t batch_size = getModelInputImageInfo().batch_size;
        tensors_table.resize(batch_size);

        for (const auto &blob_iter : output_blobs) {
            OutputBlob::Ptr blob = blob_iter.second;
            if (!blob) {
                throw std::invalid_argument("Output blob is empty");
            }

            const float *data = reinterpret_cast<const float *>(blob->GetData());
            if (!data) {
                throw std::invalid_argument("Output blob data is nullptr");
            }

            // Bind by const reference: no need to copy the layer name per blob.
            const std::string &layer_name = blob_iter.first;

            // Output shape: [batch_size, seq_len, vocab_size]
            // Tensor vocab_size has two additional tokens: CTC Blank token and
            // Padding token, hence its size is bigger than character vocabulary
            const auto &dims = blob->GetDims();
            const size_t vocab_size = (dims.size() == 3) ? dims[2] : 0;
            const size_t seq_len = (dims.size() >= 2) ? dims[1] : 0;
            if (vocab_size == 0 || seq_len == 0 || vocab_size != vocabulary.size() + 2)
                throw std::invalid_argument("Unexpected PaddleOCR output tensor dimensions");

            for (size_t batch_elem_index = 0; batch_elem_index < batch_size; ++batch_elem_index) {
                GVA::Tensor classification_result = createTensor();

                if (!raw_tensor_copying->enabled(RawTensorCopyingToggle::id))
                    CopyOutputBlobToGstStructure(blob, classification_result.gst_structure(),
                                                 BlobToMetaConverter::getModelName().c_str(), layer_name.c_str(),
                                                 batch_size, batch_elem_index);

                const float *item_data = data + batch_elem_index * seq_len * vocab_size;
                auto [decoded_text, confidence] = ctcDecode(item_data, seq_len, vocab_size);

                // Accept results of at least seq_minlen characters. `>=` keeps
                // single-character recognitions, which the previous strict `>`
                // against seq_minlen == 1 silently discarded.
                if (decoded_text.size() >= seq_minlen) {
                    classification_result.set_string("label", decoded_text);
                    classification_result.set_double("confidence", confidence);
                } else {
                    classification_result.set_string("label", "");
                    classification_result.set_double("confidence", 0.0);
                }

                gst_structure_set(classification_result.gst_structure(), "tensor_id", G_TYPE_INT,
                                  safe_convert<int>(batch_elem_index), "type", G_TYPE_STRING, "classification_result",
                                  NULL);
                std::vector<GstStructure *> tensors{classification_result.gst_structure()};
                tensors_table[batch_elem_index].push_back(tensors);
            }
        }
    } catch (const std::exception &e) {
        GVA_ERROR("An error occurred in PaddleOCR CTC converter: %s", e.what());
    }

    return tensors_table;
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (C) 2021-2025 Intel Corporation
* Copyright (C) 2021-2026 Intel Corporation
*
* SPDX-License-Identifier: MIT
******************************************************************************/
Expand Down Expand Up @@ -402,5 +402,30 @@ class PaddleOCRConverter : public BlobToTensorConverter {
std::string decodeOutputTensor(const float *item_data);
std::string decode(const std::vector<int> &text_index);

}; // namespace post_processing
}; // class PaddleOCRConverter

/*
PaddleOCRCtc tensor output = [B, L, N] where:
B - batch size
L - sequence length (maximum number of characters in the recognized text)
N - number of elements in the model's character set including two additional tokens:
    CTC blank token and Padding token.
*/
// Greedy CTC decoder for PaddlePaddle PP-OCRv5 recognition models. Produces a
// "classification_result" tensor per batch element with "label" and
// "confidence" fields; selected in model-proc via converter name
// "paddle_ocr_ctc".
class PaddleOCRCtcConverter : public BlobToTensorConverter {
  public:
    PaddleOCRCtcConverter(BlobToMetaConverter::Initializer initializer);
    TensorsTable convert(const OutputBlobs &output_blobs) override;

    // Converter name used to select this converter from model-proc files.
    static std::string getName() {
        return "paddle_ocr_ctc";
    }

  private:
    std::vector<std::string> vocabulary; // loaded from model_proc_output_info character_dict
    size_t seq_minlen = 1;               // minimum decoded sequence length

    // Reads the "character_dict" array from model-proc output metadata into `vocabulary`.
    void loadVocabularyFromModelProc();
    // Greedy CTC decode of one [seq_len, vocab_size] sequence; returns text and mean confidence.
    std::pair<std::string, double> ctcDecode(const float *data, size_t seq_len, size_t vocab_size);
};

} // namespace post_processing
Loading
Loading