Skip to content

Commit 768335e

Browse files
committed
Improve handling if tokens/text don't match (see #2)
First check whether the text and the tokens are aligned; if they are not, take the tokens at face value and append whitespace after each one, to be safe.
1 parent 63c1945 commit 768335e

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

spacy_stanfordnlp/language.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from spacy.tokens import Doc
55
from spacy.util import get_lang_class
66
import numpy
7+
import re
78

89

910
class StanfordNLPLanguage(Language):
@@ -69,6 +70,7 @@ class Tokenizer(object):
6970
from_disk = lambda self, *args, **kwargs: None
7071
to_bytes = lambda self, *args, **kwargs: None
7172
from_bytes = lambda self, *args, **kwargs: None
73+
_ws_pattern = re.compile(r"\s+")
7274

7375
def __init__(self, snlp, vocab):
7476
"""Initialize the tokenizer.
@@ -98,11 +100,12 @@ def __call__(self, text):
98100
deps = []
99101
lemmas = []
100102
offset = 0
103+
is_aligned = self.check_aligned(text, tokens)
101104
for i, token in enumerate(tokens):
102105
span = text[offset:]
103106
if not len(span):
104107
break
105-
while not span.startswith(token.text):
108+
while len(span) and span[0].isspace():
106109
# If we encounter leading whitespace, skip one character ahead
107110
offset += 1
108111
span = text[offset:]
@@ -116,6 +119,8 @@ def __call__(self, text):
116119
span = text[offset:]
117120
if i == len(tokens) - 1:
118121
spaces.append(False)
122+
elif not is_aligned:
123+
spaces.append(True)
119124
else:
120125
next_token = tokens[i + 1]
121126
spaces.append(not span.startswith(next_token.text))
@@ -155,3 +160,7 @@ def get_tokens_with_heads(self, snlp_doc):
155160
tokens.append(word)
156161
offset += sum(len(token.words) for token in sentence.tokens)
157162
return tokens, heads
163+
164+
def check_aligned(self, text, tokens):
    """Return True if *tokens* exactly cover *text*, ignoring whitespace.

    Concatenates the token texts and compares them against *text* with
    all whitespace stripped, so only the visible characters are checked.
    """
    joined = "".join(token.text for token in tokens)
    return self._ws_pattern.sub("", text) == joined

0 commit comments

Comments
 (0)