dbmdz · jbaiter · Feb 11, 2026 · Feb 11, 2026
diff --git a/pom.xml b/pom.xml
@@ -6,7 +6,7 @@
 
   <groupId>de.digitalcollections</groupId>
   <artifactId>solr-ocrhighlighting</artifactId>
-  <version>0.9.5</version>
+  <version>0.9.6-SNAPSHOT</version>
 
   <name>Solr OCR Highlighting Plugin</name>
   <description>

diff --git a/src/main/java/com/github/dbmdz/solrocr/formats/OcrParser.java b/src/main/java/com/github/dbmdz/solrocr/formats/OcrParser.java
@@ -133,11 +133,8 @@ public OcrBox next() {
 
   private OcrBox prepareNext() {
     try {
-      while (xmlReader.hasNext()) {
-        OcrBox box = this.readNext(this.xmlReader, this.features);
-        if (box == null) {
-          continue;
-        }
+      OcrBox box;
+      while ((box = this.readNext(this.xmlReader, this.features)) != null) {
         // Boxes without text or coordinates (if either is requested with a feature flag) are
         // ignored since they break things downstream. Skip the current box and continue with next
         // one.
@@ -216,6 +213,14 @@ public PeekingReader getInput() {
    *
    * <p>Implementers should take care to enable/disable various parsing steps depending on the set
    * of features passed in.
+   *
+   * @param xmlReader the XML stream reader to read from, positioned at the current location in the
+   *     input stream. Implementers should advance the reader to the next position after reading the
+   *     box.
+   * @param features the set of features to enable/disable during parsing, as passed in the
+   *     constructor
+   * @return the next OCR box, or null if there are no more boxes to read
+   * @throws XMLStreamException if an error occurs while reading the input stream
    */
   protected abstract OcrBox readNext(XMLStreamReader2 xmlReader, Set<ParsingFeature> features)
       throws XMLStreamException;

diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/OcrPassageFormatter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/OcrPassageFormatter.java
@@ -213,13 +213,74 @@ private OcrSnippet format(Passage passage, SourceReader reader) throws IOExcepti
     if (trackPages) {
       initialPage = determineStartPage(passage.getStartOffset(), reader);
     }
-    OcrSnippet snip = parseFragment(xmlFragment, initialPage);
+
+    List<OcrBox> parsed = this.parseWords(xmlFragment, initialPage);
+    if (parsed.isEmpty()) {
+      return null;
+    }
+
+    OcrBox finalBox = parsed.get(parsed.size() - 1);
+    if (finalBox.isInHighlight() && finalBox.isHyphenStart()) {
+      // Edge Case: Since our limit break locator does not truly parse the OCR,
+      // we can run into situations where the last word of a passage is a
+      // highlighted hyphenated word, but the following part of the word is not
+      // included in the passage since it is after a break. In this case, we want
+      // to expand the passage to include the line with the second part of the
+      // hyphenated word, since otherwise we would end up with a snippet that only
+      // contains half of a match.
+      // We do the expansion here at formatting time instead of passage-building time
+      // since we can only know about hyphenation after parsing the passage, which
+      // we don't do at passage-building time for performance reasons (passage-building
+      // is one of the hottest paths in the codebase)
+      parsed = this.expandPassageForHyphenation(passage, reader, initialPage, finalBox);
+    }
+
+    OcrSnippet snip = buildFragment(parsed, initialPage);
     if (snip != null) {
       snip.setScore(passage.getScore());
     }
     return snip;
   }
 
+  /**
+   * Extend the passage to the next line break after its end and re-parse it, so that the second
+   * part of a highlighted hyphenated word ending the passage is included in the snippet.
+   *
+   * @param passage the passage to extend; its end offset is modified in place
+   * @param finalBox last box of the unexpanded passage, located again by its upper-left corner
+   * @return the boxes parsed from the re-read passage
+   * @throws RuntimeException if {@code finalBox} is not found in the re-parsed passage
+   */
+  private List<OcrBox> expandPassageForHyphenation(
+      Passage passage, SourceReader reader, OcrPage initialPage, OcrBox finalBox)
+      throws IOException {
+    int passageEnd = passage.getStartOffset() + passage.getLength();
+    int lineEndOffset = this.format.getBreakLocator(reader, OcrBlock.LINE).following(passageEnd);
+    // DONE: the second part is missing from the input (likely an indexing error), do not extend.
+    passage.setEndOffset(lineEndOffset == BreakLocator.DONE ? passageEnd : lineEndOffset);
+    List<OcrBox> parsed = this.parseWords(getHighlightedFragment(passage, reader), initialPage);
+    int hyphenStartIdx = -1;
+    for (int i = 0; hyphenStartIdx < 0 && i < parsed.size(); i++) {
+      OcrBox b = parsed.get(i);
+      if (b.getUlx() == finalBox.getUlx() && b.getUly() == finalBox.getUly() && b.isHyphenStart()) {
+        hyphenStartIdx = i;
+      }
+    }
+    if (hyphenStartIdx < 0) {
+      throw new RuntimeException(
+          "Could not find the original final box in the expanded passage, this should not happen");
+    }
+    if (hyphenStartIdx + 1 < parsed.size()) {
+      OcrBox hyphenStart = parsed.get(hyphenStartIdx);
+      OcrBox hyphenEnd = parsed.get(hyphenStartIdx + 1);
+      if (hyphenEnd.isHyphenEndOf(hyphenStart)) {
+        // NOTE(review): isHyphenEndOf already requires equal spans; confirm this call is needed.
+        hyphenEnd.setHighlightSpan(hyphenStart.getHighlightSpan());
+      }
+    }
+    return parsed;
+  }
+
   /** Determine the page an OCR fragment resides on. */
   OcrPage determineStartPage(int startOffset, SourceReader reader) throws IOException {
     BreakLocator pageBreakLocator = this.format.getBreakLocator(reader, OcrBlock.PAGE);
@@ -234,9 +295,17 @@ OcrPage determineStartPage(int startOffset, SourceReader reader) throws IOExcept
     return this.format.parsePageFragment(pageFragment);
   }
 
-  /** Parse an {@link OcrSnippet} from an OCR fragment. */
-  protected OcrSnippet parseFragment(String ocrFragment, OcrPage page) {
-    List<OcrBox> allBoxes = this.parseWords(ocrFragment, page);
+  /**
+   * Build an {@link OcrSnippet} from a parsed OCR fragment.
+   *
+   * @param allBoxes the parsed OCR boxes from the fragment, including highlighted and
+   *     non-highlighted ones that are needed to determine the snippet regions and the highlighted
+   *     spans and their coordinates
+   * @param page the page that the fragment starts on, can be null if page tracking is not enabled
+   *     or the page could not be determined
+   * @return the built snippet, or null if the fragment did not contain any text
+   */
+  protected OcrSnippet buildFragment(List<OcrBox> allBoxes, OcrPage page) {
     if (allBoxes.isEmpty()) {
       return null;
     }

diff --git a/src/main/java/com/github/dbmdz/solrocr/model/OcrBox.java b/src/main/java/com/github/dbmdz/solrocr/model/OcrBox.java
@@ -296,6 +296,13 @@ public boolean contains(OcrBox other) {
         && other.lry <= this.lry;
   }
 
+  /** Whether this is a non-start hyphen part with {@code other}'s dehyphenated form and span. */
+  public boolean isHyphenEndOf(OcrBox other) {
+    if (!this.isHyphenated() || this.isHyphenStart()) return false;
+    return Objects.equals(this.dehyphenatedForm, other.dehyphenatedForm)
+        && Objects.equals(this.highlightSpan, other.highlightSpan);
+  }
+
   @Override
   public boolean equals(Object o) {
     if (this == o) return true;

diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java
@@ -227,6 +227,32 @@ public void testHyphenationIsResolved() {
     assertQ(req, "count(//arr[@name='regions']/lst)=1");
   }
 
+  /**
+   * With line-limited passages and no context, a query for "schweizerdeutsch" must still yield
+   * both highlighted word parts ("Schweizer" and "deutsch:") in the snippet.
+   */
+  @Test
+  public void testMatchOnHyphenEndExpandsPassage() throws IOException {
+    // The document is indexed just for this test and deleted again in the finally block.
+    String docId = "578aa43b-5b3c-4595-bedc-f115300464a9";
+    byte[] ocrBytes = Files.readAllBytes(Paths.get("src/test/resources/data/hocr_hyphen.html"));
+    assertU(adoc("ocr_text", new String(ocrBytes, StandardCharsets.UTF_8), "id", docId));
+    assertU(commit());
+    try {
+      SolrQueryRequest req =
+          xmlQ("q", "schweizerdeutsch", "hl.ocr.limitBlock", "line", "hl.ocr.contextSize", "0");
+      assertQ(
+          req,
+          "//str[@name='text'][1]/text()='Subjekts im Wechsel der Sprachen. Englisch steht neben Deutsch und <em>Schweizerdeutsch</em>:'",
+          "(//arr[@name='regions']/lst)[1]/str[@name='text']/text()='Subjekts im Wechsel der Sprachen. Englisch steht neben Deutsch und <em>Schweizerdeutsch</em>:'",
+          "//arr[@name='highlights']/arr/lst[1]/str[@name='text']/text()='Schweizer'",
+          "//arr[@name='highlights']/arr/lst[2]/str[@name='text']/text()='deutsch:'");
+    } finally {
+      assertU(delI(docId));
+      assertU(commit());
+    }
+  }
+
   @Test
   public void testMaskedDocumentIsIndexed() {
     SolrQueryRequest req = xmlQ("q", "Vögelchen");