from spacy.tokens import Doc
from spacy.util import get_lang_class
import numpy
import re

89
910class StanfordNLPLanguage (Language ):
@@ -69,6 +70,7 @@ class Tokenizer(object):
6970 from_disk = lambda self , * args , ** kwargs : None
7071 to_bytes = lambda self , * args , ** kwargs : None
7172 from_bytes = lambda self , * args , ** kwargs : None
73+ _ws_pattern = re .compile (r"\s+" )
7274
7375 def __init__ (self , snlp , vocab ):
7476 """Initialize the tokenizer.
@@ -98,11 +100,12 @@ def __call__(self, text):
98100 deps = []
99101 lemmas = []
100102 offset = 0
103+ is_aligned = self .check_aligned (text , tokens )
101104 for i , token in enumerate (tokens ):
102105 span = text [offset :]
103106 if not len (span ):
104107 break
105- while not span . startswith ( token . text ):
108+ while len ( span ) and span [ 0 ]. isspace ( ):
106109 # If we encounter leading whitespace, skip one character ahead
107110 offset += 1
108111 span = text [offset :]
@@ -116,6 +119,8 @@ def __call__(self, text):
116119 span = text [offset :]
117120 if i == len (tokens ) - 1 :
118121 spaces .append (False )
122+ elif not is_aligned :
123+ spaces .append (True )
119124 else :
120125 next_token = tokens [i + 1 ]
121126 spaces .append (not span .startswith (next_token .text ))
@@ -155,3 +160,7 @@ def get_tokens_with_heads(self, snlp_doc):
155160 tokens .append (word )
156161 offset += sum (len (token .words ) for token in sentence .tokens )
157162 return tokens , heads
163+
164+ def check_aligned (self , text , tokens ):
165+ token_texts = "" .join (t .text for t in tokens )
166+ return re .sub (self ._ws_pattern , "" , text ) == token_texts
0 commit comments