Skip to content

Commit f5f7bf1

Browse files
committed
Updates after code review.
1 parent f0d4424 commit f5f7bf1

File tree

5 files changed

+41
-30
lines changed

5 files changed

+41
-30
lines changed

samples/download_public_models.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1148,7 +1148,7 @@ fi
11481148

11491149
export_ppocr_v5_model() {
11501150
local MODEL_NAME=$1
1151-
MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME"
1151+
local MODEL_DIR="$MODELS_PATH/public/$MODEL_NAME"
11521152
local DST_FILE1="$MODEL_DIR/FP32/$MODEL_NAME.xml"
11531153
local DST_FILE2="$MODEL_DIR/FP16/$MODEL_NAME.xml"
11541154

src/monolithic/gst/elements/gvagenai/gstgvagenai.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -373,16 +373,17 @@ static GstFlowReturn gst_gvagenai_transform_ip(GstBaseTransform *base, GstBuffer
373373
}
374374

375375
GST_OBJECT_LOCK(gvagenai);
376+
gboolean _success = TRUE;
376377
if (gvagenai->prompt_changed) {
377-
if (!load_effective_prompt(gvagenai)) {
378-
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
379-
("Could not load or validate prompt configuration"));
380-
GST_OBJECT_UNLOCK(gvagenai);
381-
return GST_FLOW_ERROR;
382-
}
378+
_success = load_effective_prompt(gvagenai);
383379
gvagenai->prompt_changed = FALSE;
384380
}
385381
GST_OBJECT_UNLOCK(gvagenai);
382+
if (!_success) {
383+
GST_ELEMENT_ERROR(gvagenai, RESOURCE, FAILED, ("Failed to load effective prompt"),
384+
("Could not load or validate prompt configuration"));
385+
return GST_FLOW_ERROR;
386+
}
386387

387388
// Get video info from pad
388389
GstVideoInfo info;

src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.cpp

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -156,16 +156,15 @@ void PaddleOCRCtcConverter::loadVocabularyFromModelProc() {
156156

157157
std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *data, size_t seq_len, size_t vocab_size) {
158158
std::string result;
159-
double log_conf_sum = 0.0;
160-
int num_chars = 0;
159+
std::vector<float> confidences;
161160
int prev_idx = 0;
162161

163162
for (size_t t = 0; t < seq_len; ++t) {
164163
// find index of maximum confidence logit within the current sequence step
165164
const float *row = data + t * vocab_size;
166165
int max_idx = static_cast<int>(std::max_element(row, row + vocab_size) - row);
167166

168-
// Element 0 is CTC blank and indicates entire sequence should be skiped
167+
// Element 0 is CTC blank and indicates entire sequence should be skipped
169168
// If current index matches previous index, we also skip it to avoid duplicates
170169
if (max_idx == 0 || max_idx == prev_idx) {
171170
prev_idx = max_idx;
@@ -174,25 +173,22 @@ std::pair<std::string, double> PaddleOCRCtcConverter::ctcDecode(const float *dat
174173
prev_idx = max_idx;
175174

176175
// Convert element index to Vocabulary character index
177-
// Vocabulary is 1-based indexed (0 is reserved for CTC blank), so subtract 1
176+
// Vocabulary is 1-based indexed, so subtract 1
178177
size_t char_idx = static_cast<size_t>(max_idx - 1);
179178
if (char_idx >= vocabulary.size())
180179
continue;
181180

182181
// Add new character to output label
183182
result.append(vocabulary[char_idx]);
183+
confidences.push_back(row[max_idx]);
184+
}
184185

185-
// Calculate softmax probability for this character
186-
float row_max = row[max_idx];
187-
double exp_sum = 0.0;
188-
for (size_t v = 0; v < vocab_size; ++v)
189-
exp_sum += std::exp(static_cast<double>(row[v] - row_max));
190-
log_conf_sum += std::log(1.0 / exp_sum + 1e-10);
191-
++num_chars;
186+
// Return the mean of character confidences as the overall confidence score
187+
double confidence = 0.0;
188+
if (!confidences.empty()) {
189+
confidence = std::accumulate(confidences.begin(), confidences.end(), 0.0f) / confidences.size();
192190
}
193191

194-
// retunr geomean of character confidences as overall confidence score for the sequence
195-
double confidence = (num_chars > 0) ? std::exp(log_conf_sum / num_chars) : 0.0;
196192
return {result, confidence};
197193
}
198194

@@ -218,11 +214,13 @@ TensorsTable PaddleOCRCtcConverter::convert(const OutputBlobs &output_blobs) {
218214
const std::string layer_name = blob_iter.first;
219215

220216
// Output shape: [batch_size, seq_len, vocab_size]
217+
// Tensor vocab_size has two additional tokens: CTC Blank token and Padding token
218+
// hence its size is larger than the character vocabulary size
221219
const auto &dims = blob->GetDims();
222220
const size_t vocab_size = (dims.size() == 3) ? dims[2] : 0;
223221
const size_t seq_len = (dims.size() >= 2) ? dims[1] : 0;
224-
if (vocab_size != vocabulary.size() + 2) // +1 for CTC blank token, +1 for 1-based indexing
225-
throw std::invalid_argument("Unexpected vocabulary size");
222+
if (vocab_size == 0 || seq_len == 0 || vocab_size != vocabulary.size() + 2)
223+
throw std::invalid_argument("Unexpected PaddleOCR output tensor dimensions");
226224

227225
for (size_t batch_elem_index = 0; batch_elem_index < batch_size; ++batch_elem_index) {
228226
GVA::Tensor classification_result = createTensor();

src/monolithic/gst/inference_elements/common/post_processor/converters/to_tensor/paddle_ocr.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -405,11 +405,11 @@ class PaddleOCRConverter : public BlobToTensorConverter {
405405
}; // class PaddleOCRConverter
406406

407407
/*
408-
PaddleOCRCtcConverter: standard PaddleOCRv5 CTC decoder.
409-
- Index 0 is the CTC blank token
410-
- Real characters are mapped as vocabulary[index - 1]
411-
- Character dictionary is loaded at runtime from config.json (as part of model import)
412-
- Sequence length and vocabulary size are derived from the model output tensor shape
408+
PaddleOCRCtc tensor output = [B, L, N] where:
409+
B - batch size
410+
L - sequence length (maximum number of characters in the recognized text)
411+
N - number of elements in the model's character set including two additional tokens:
412+
CTC blank token and Padding token.
413413
*/
414414
class PaddleOCRCtcConverter : public BlobToTensorConverter {
415415
public:

src/monolithic/inference_backend/image_inference/openvino/model_api_converters.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -438,15 +438,27 @@ bool isPaddleOCRModel(const std::string &model_file) {
438438
if (!loadJsonFromModelDir(model_file, "config.json", config_json))
439439
return false;
440440

441+
bool has_pp_ocr_model_name = false;
442+
bool has_ctc_label_decode = false;
443+
441444
// PaddleOCR recognition config.json contains Global.model_name matching "PP-OCR.*rec"
442445
if (config_json.contains("Global") && config_json["Global"].is_object() &&
443446
config_json["Global"].contains("model_name") && config_json["Global"]["model_name"].is_string()) {
444447
const std::string model_name = config_json["Global"]["model_name"].get<std::string>();
445-
if (std::regex_search(model_name, std::regex("PP-OCR.*rec")))
446-
return true;
448+
if (std::regex_search(model_name, std::regex(".*PP-OCR.*rec")))
449+
has_pp_ocr_model_name = true;
450+
}
451+
452+
// Also require PostProcess.name == "CTCLabelDecode"
453+
if (config_json.contains("PostProcess") && config_json["PostProcess"].is_object() &&
454+
config_json["PostProcess"].contains("name") && config_json["PostProcess"]["name"].is_string()) {
455+
const std::string pp_name = config_json["PostProcess"]["name"].get<std::string>();
456+
if (pp_name == "CTCLabelDecode") {
457+
has_ctc_label_decode = true;
458+
}
447459
}
448460

449-
return false;
461+
return has_pp_ocr_model_name && has_ctc_label_decode;
450462
}
451463

452464
// Convert PaddleOCR config.json metadata into Model API format

0 commit comments

Comments
 (0)