Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>de.digitalcollections</groupId>
<artifactId>solr-ocrhighlighting</artifactId>
<version>0.9.5</version>
<version>0.9.6-SNAPSHOT</version>

<name>Solr OCR Highlighting Plugin</name>
<description>
Expand Down
15 changes: 10 additions & 5 deletions src/main/java/com/github/dbmdz/solrocr/formats/OcrParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,8 @@ public OcrBox next() {

private OcrBox prepareNext() {
try {
while (xmlReader.hasNext()) {
OcrBox box = this.readNext(this.xmlReader, this.features);
if (box == null) {
continue;
}
OcrBox box;
while ((box = this.readNext(this.xmlReader, this.features)) != null) {
// Boxes without text or coordinates (if either is requested with a feature flag) are
// ignored since they break things downstream. Skip the current box and continue with next
// one.
Expand Down Expand Up @@ -216,6 +213,14 @@ public PeekingReader getInput() {
*
* <p>Implementers should take care to enable/disable various parsing steps depending on the set
* of features passed in.
*
* @param xmlReader the XML stream reader to read from, positioned at the current location in the
* input stream. Implementers should advance the reader to the next position after reading the
* box.
* @param features the set of features to enable/disable during parsing, as passed in the
* constructor
* @return the next OCR box, or null if there are no more boxes to read
* @throws XMLStreamException if an error occurs while reading the input stream
*/
protected abstract OcrBox readNext(XMLStreamReader2 xmlReader, Set<ParsingFeature> features)
throws XMLStreamException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,13 +213,74 @@ private OcrSnippet format(Passage passage, SourceReader reader) throws IOExcepti
if (trackPages) {
initialPage = determineStartPage(passage.getStartOffset(), reader);
}
OcrSnippet snip = parseFragment(xmlFragment, initialPage);

List<OcrBox> parsed = this.parseWords(xmlFragment, initialPage);
Comment thread
jbaiter marked this conversation as resolved.
if (parsed.isEmpty()) {
return null;
}

OcrBox finalBox = parsed.get(parsed.size() - 1);
if (finalBox.isInHighlight() && finalBox.isHyphenStart()) {
// Edge Case: Since our limit break locator does not truly parse the OCR,
// we can run into situations where the last word of a passage is a
// highlighted hyphenated word, but the following part of the word is not
// included in the passage since it is after a break. In this case, we want
// to expand the passage to include the line with the second part of the
// hyphenated word, since otherwise we would end up with a snippet that only
// contains half of a match.
Comment thread
schmika marked this conversation as resolved.
// We do the expansion here at formatting time instead of passage-building time
// since we can only know about hyphenation after parsing the passage, which
// we don't do at passage-building time for performance reasons (passage-building
// is one of the hottest paths in the codebase)
parsed = this.expandPassageForHyphenation(passage, reader, initialPage, finalBox);
}

OcrSnippet snip = buildFragment(parsed, initialPage);
if (snip != null) {
snip.setScore(passage.getScore());
}
return snip;
}

private List<OcrBox> expandPassageForHyphenation(
Passage passage, SourceReader reader, OcrPage initialPage, OcrBox finalBox)
throws IOException {
int passageEnd = passage.getStartOffset() + passage.getLength();
BreakLocator lineBreakLocator = this.format.getBreakLocator(reader, OcrBlock.LINE);
int lineEndOffset = lineBreakLocator.following(passageEnd);
Comment thread
jbaiter marked this conversation as resolved.
if (lineEndOffset == BreakLocator.DONE) {
// This means that the last part of the hyphenation does not exist in the
// input, which is weird (and likely due to an error during indexing), but we can't do
// anything about it, so we just return the original passage
lineEndOffset = passageEnd;
}
passage.setEndOffset(lineEndOffset);
String expandedFragment = getHighlightedFragment(passage, reader);
List<OcrBox> parsed = this.parseWords(expandedFragment, initialPage);
OcrBox hyphenStart =
parsed.stream()
.filter(
b ->
b.getUlx() == finalBox.getUlx()
&& b.getUly() == finalBox.getUly()
&& b.isHyphenStart())
.findFirst()
.orElseThrow(
() ->
new RuntimeException(
"Could not find the original final box in the expanded passage, this should not happen"));
int hyphenStartIdx = parsed.indexOf(hyphenStart);
if (hyphenStartIdx + 1 < parsed.size()) {
OcrBox hyphenEnd = parsed.get(hyphenStartIdx + 1);
if (hyphenEnd.isHyphenEndOf(hyphenStart)) {
parsed
.get(parsed.indexOf(hyphenStart) + 1)
.setHighlightSpan(hyphenStart.getHighlightSpan());
}
}
return parsed;
}

/** Determine the page an OCR fragment resides on. */
OcrPage determineStartPage(int startOffset, SourceReader reader) throws IOException {
BreakLocator pageBreakLocator = this.format.getBreakLocator(reader, OcrBlock.PAGE);
Expand All @@ -234,9 +295,17 @@ OcrPage determineStartPage(int startOffset, SourceReader reader) throws IOExcept
return this.format.parsePageFragment(pageFragment);
}

/** Parse an {@link OcrSnippet} from an OCR fragment. */
protected OcrSnippet parseFragment(String ocrFragment, OcrPage page) {
List<OcrBox> allBoxes = this.parseWords(ocrFragment, page);
/**
* Build an {@link OcrSnippet} from a parsed OCR fragment.
*
* @param allBoxes the parsed OCR boxes from the fragment, including highlighted and
* non-highlighted ones that are needed to determine the snippet regions and the highlighted
* spans and their coordinates
* @param page the page that the fragment starts on, can be null if page tracking is not enabled
* or the page could not be determined
* @return the built snippet, or null if the fragment did not contain any text
*/
protected OcrSnippet buildFragment(List<OcrBox> allBoxes, OcrPage page) {
if (allBoxes.isEmpty()) {
return null;
}
Expand Down
7 changes: 7 additions & 0 deletions src/main/java/com/github/dbmdz/solrocr/model/OcrBox.java
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,13 @@ public boolean contains(OcrBox other) {
&& other.lry <= this.lry;
}

/**
 * Check whether this box is the trailing part of the hyphenated word started by {@code other}.
 *
 * <p>This holds when this box is hyphenated but is not itself a hyphen start, and both boxes
 * have equal dehyphenated forms and equal highlight spans (two {@code null} values count as
 * equal).
 *
 * @param other the box to test as the hyphen start of this box, must not be null
 * @return true if this box is the hyphen end belonging to {@code other}
 */
public boolean isHyphenEndOf(OcrBox other) {
  if (!this.isHyphenated() || this.isHyphenStart()) {
    return false;
  }
  boolean sameWord = Objects.equals(this.dehyphenatedForm, other.dehyphenatedForm);
  return sameWord && Objects.equals(this.highlightSpan, other.highlightSpan);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand Down
26 changes: 26 additions & 0 deletions src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,32 @@ public void testHyphenationIsResolved() {
assertQ(req, "count(//arr[@name='regions']/lst)=1");
}

/**
 * Indexes the hyphenation hOCR fixture and queries it for "schweizerdeutsch" with line-limited
 * passages and a context size of zero. The snippet text must contain the complete highlighted
 * word, and the two highlight entries must read "Schweizer" and "deutsch:".
 */
@Test
public void testMatchOnHyphenEndExpandsPassage() throws IOException {
  final String docId = "578aa43b-5b3c-4595-bedc-f115300464a9";
  final Path fixturePath = Paths.get("src/test/resources/data/hocr_hyphen.html");
  final String ocrMarkup = new String(Files.readAllBytes(fixturePath), StandardCharsets.UTF_8);

  assertU(adoc("ocr_text", ocrMarkup, "id", docId));
  assertU(commit());
  try {
    SolrQueryRequest request =
        xmlQ("q", "schweizerdeutsch", "hl.ocr.limitBlock", "line", "hl.ocr.contextSize", "0");
    assertQ(
        request,
        "//str[@name='text'][1]/text()='Subjekts im Wechsel der Sprachen. Englisch steht neben Deutsch und <em>Schweizerdeutsch</em>:'",
        "(//arr[@name='regions']/lst)[1]/str[@name='text']/text()='Subjekts im Wechsel der Sprachen. Englisch steht neben Deutsch und <em>Schweizerdeutsch</em>:'",
        "//arr[@name='highlights']/arr/lst[1]/str[@name='text']/text()='Schweizer'",
        "//arr[@name='highlights']/arr/lst[2]/str[@name='text']/text()='deutsch:'");
  } finally {
    // Remove the fixture document again so that other tests see an unchanged index.
    assertU(delI(docId));
    assertU(commit());
  }
}

@Test
public void testMaskedDocumentIsIndexed() {
SolrQueryRequest req = xmlQ("q", "Vögelchen");
Expand Down
Loading