from spacy.tokens import Doc
from spacy.util import get_lang_class
import numpy
import re

89
910class StanfordNLPLanguage (Language ):
@@ -69,6 +70,7 @@ class Tokenizer(object):
6970 from_disk = lambda self , * args , ** kwargs : None
7071 to_bytes = lambda self , * args , ** kwargs : None
7172 from_bytes = lambda self , * args , ** kwargs : None
73+ _ws_pattern = re .compile (r"\s+" )
7274
7375 def __init__ (self , snlp , vocab ):
7476 """Initialize the tokenizer.
@@ -98,11 +100,12 @@ def __call__(self, text):
98100 deps = []
99101 lemmas = []
100102 offset = 0
103+ is_aligned = self .check_aligned (text , tokens )
101104 for i , token in enumerate (tokens ):
102105 span = text [offset :]
103106 if not len (span ):
104107 break
105- while not span . startswith ( token . text ):
108+ while len ( span ) and span [ 0 ]. isspace ( ):
106109 # If we encounter leading whitespace, skip one character ahead
107110 offset += 1
108111 span = text [offset :]
@@ -116,6 +119,8 @@ def __call__(self, text):
116119 span = text [offset :]
117120 if i == len (tokens ) - 1 :
118121 spaces .append (False )
122+ elif not is_aligned :
123+ spaces .append (True )
119124 else :
120125 next_token = tokens [i + 1 ]
121126 spaces .append (not span .startswith (next_token .text ))
@@ -155,3 +160,7 @@ def get_tokens_with_heads(self, snlp_doc):
155160 tokens .append (word )
156161 offset += sum (len (token .words ) for token in sentence .tokens )
157162 return tokens , heads
163+
164+ def check_aligned (self , text , tokens ):
165+ token_texts = "" .join (t .text for t in tokens )
166+ return re .sub (self ._ws_pattern , "" , text ) == token_texts
0 commit comments