Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bindings/python/benches/test_tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
print(f"huggingface \t{readable_size} / s")


def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
def test(model: str, dataset: str, dataset_config: str, threads: List[int]) -> None:
dataset_xnli = load_dataset(dataset, dataset_config)

input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]
Expand Down
5 changes: 3 additions & 2 deletions bindings/python/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import time

from tqdm import tqdm
from typing import List

from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE, WordPiece
Expand Down Expand Up @@ -84,11 +85,11 @@
raise Exception(f"Unknown type {args.type}")


def tokenize_r():
def tokenize_r() -> List:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using the legacy List type from the typing module is inconsistent with the modern built-in list syntax used elsewhere in the codebase. For Python 3.9+, prefer the built-in list type directly (e.g., list instead of List). The typing import added on line 6 can then also be removed if no other constructs from that module are needed.

Copilot uses AI. Check for mistakes.
return tok_r.encode_batch(text)


def tokenize_p():
def tokenize_p() -> List[List[int]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using the legacy List type from the typing module is inconsistent with the modern built-in list syntax used elsewhere in the codebase. For Python 3.9+, prefer the built-in list type directly (e.g., list[list[int]] instead of List[List[int]]).

Copilot uses AI. Check for mistakes.
return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]


Expand Down
40 changes: 20 additions & 20 deletions bindings/python/scripts/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,13 @@ def converted(self):


class AlbertConverter(SpmConverter):
def vocab(self, proto):
def vocab(self, proto) -> list[tuple[str, float]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return [
(piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
for piece in proto.pieces
]

def normalizer(self, proto):
def normalizer(self, proto) -> Sequence:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
normalizers = [Replace("``", '"'), Replace("''", '"')]
if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD())
Expand All @@ -135,7 +135,7 @@ def normalizer(self, proto):
normalizers.append(Replace(Regex(" {2,}"), " "))
return Sequence(normalizers)

def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Suggested change
def post_processor(self, tokenizer) -> TemplateProcessing:
def post_processor(self, tokenizer: Tokenizer) -> TemplateProcessing:

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["[CLS]", "$0", "[SEP]"],
pair=["$1", "[SEP]"],
Expand All @@ -147,7 +147,7 @@ def post_processor(self, tokenizer):


class CamembertConverter(SpmConverter):
def vocab(self, proto):
def vocab(self, proto) -> list[tuple[str, float]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
vocab = [
("<s>NOTUSED", 0.0),
("<pad>", 0.0),
Expand All @@ -157,11 +157,11 @@ def vocab(self, proto):
vocab += [(piece.piece, piece.score) for piece in proto.pieces]
return vocab

def unk_id(self, proto):
def unk_id(self, proto) -> int:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
# See vocab unk position
return 3

def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["<s>", "$0", "</s>"],
pair=["$1", "</s>"],
Expand All @@ -173,7 +173,7 @@ def post_processor(self, tokenizer):


class MBartConverter(SpmConverter):
def vocab(self, proto):
def vocab(self, proto) -> list[tuple[str, float]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
vocab = [
("<s>", 0.0),
("<pad>", 0.0),
Expand Down Expand Up @@ -210,10 +210,10 @@ def vocab(self, proto):
]
return vocab

def unk_id(self, proto):
def unk_id(self, proto) -> int:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return 3

def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["$0", "</s>", "en_XX"],
pair=["$1", "</s>"],
Expand All @@ -225,7 +225,7 @@ def post_processor(self, tokenizer):


class XLMRobertaConverter(SpmConverter):
def vocab(self, proto):
def vocab(self, proto) -> list[tuple[str, float]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
vocab = [
("<s>", 0.0),
("<pad>", 0.0),
Expand All @@ -235,11 +235,11 @@ def vocab(self, proto):
vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
return vocab

def unk_id(self, proto):
def unk_id(self, proto) -> int:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
unk_id = 3
return unk_id

def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["<s>", "$0", "</s>"],
pair=["$1", "</s>"],
Expand All @@ -251,13 +251,13 @@ def post_processor(self, tokenizer):


class XLNetConverter(SpmConverter):
def vocab(self, proto):
def vocab(self, proto) -> list[tuple[str, float]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return [
(piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
for piece in proto.pieces
]

def normalizer(self, proto):
def normalizer(self, proto) -> Sequence:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
normalizers = [Replace("``", '"'), Replace("''", '"')]
if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD())
Expand All @@ -270,7 +270,7 @@ def normalizer(self, proto):
normalizers.append(Replace(Regex(" {2,}"), " "))
return Sequence(normalizers)

def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["$0", "<sep>", "<cls>"],
pair=["$1", "<sep>"],
Expand All @@ -288,7 +288,7 @@ class ReformerConverter(SpmConverter):
class PegasusConverter(SpmConverter):
offset = 103

def vocab(self, proto):
def vocab(self, proto) -> list[tuple[str, float]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
vocab = [
(self.original_tokenizer.pad_token, 0),
(self.original_tokenizer.eos_token, 0),
Expand All @@ -297,10 +297,10 @@ def vocab(self, proto):
vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
return vocab

def unk_id(self, proto):
def unk_id(self, proto) -> int:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for proto is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return proto.trainer_spec.unk_id + self.offset

def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
eos = self.original_tokenizer.eos_token
return TemplateProcessing(
single=["$0", eos],
Expand All @@ -310,7 +310,7 @@ def post_processor(self, tokenizer):


class T5Converter(SpmConverter):
def post_processor(self, tokenizer):
def post_processor(self, tokenizer) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for tokenizer is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["$0", "</s>"],
pair=["$1", "</s>"],
Expand All @@ -330,7 +330,7 @@ def post_processor(self, tokenizer):
}


def check(pretrained, filename):
def check(pretrained, filename) -> tuple[str, datetime.timedelta]:
transformer_tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
converter_class = CONVERTERS[transformer_tokenizer.__class__.__name__]
tokenizer = converter_class(transformer_tokenizer).converted()
Expand Down
6 changes: 3 additions & 3 deletions bindings/python/scripts/spm_parity_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def check_train(args):
print("Ok our trainer is at least more efficient than the SPM one")


def check_diff(spm_diff, tok_diff, sp, tok):
def check_diff(spm_diff, tok_diff, sp, tok) -> bool:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hints for spm_diff, tok_diff, sp, and tok are missing. Consider adding appropriate type hints for the parameters to make the function signature complete.

Copilot uses AI. Check for mistakes.
if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case.
return True
Expand All @@ -147,7 +147,7 @@ def check_diff(spm_diff, tok_diff, sp, tok):
return False


def check_details(line, spm_ids, tok_ids, sp, tok):
def check_details(line, spm_ids, tok_ids, sp, tok) -> bool:
# Encoding can be the same with same result AAA -> A + AA vs AA + A
# We can check that we use at least exactly the same number of tokens.
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
Expand Down Expand Up @@ -206,7 +206,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
return False


def check_encode(args):
def check_encode(args) -> None:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for args is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
sp = cast(Any, spm.SentencePieceProcessor())
sp.Load(args.model_file)

Expand Down
6 changes: 3 additions & 3 deletions bindings/python/tests/bindings/test_pre_tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,12 +292,12 @@ def pre_tokenize(self, pretok, wrong):
pass

class GoodCustomPretok:
def split(self, n, normalized):
def split(self, n, normalized) -> list:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hints for n and normalized are missing. Consider adding appropriate type hints for the parameters to make the function signature complete.

Copilot uses AI. Check for mistakes.
# Here we just test that we can return a List[NormalizedString], it
# does not really make sense to return twice the same otherwise
return [normalized, normalized]

def pre_tokenize(self, pretok):
def pre_tokenize(self, pretok) -> None:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for pretok is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
pretok.split(self.split)

def test_instantiate(self):
Expand Down Expand Up @@ -325,7 +325,7 @@ def get_state(self, c):
else:
return "rest"

def split(self, n, normalized):
def split(self, n, normalized) -> list:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hints for n and normalized are missing. Consider adding appropriate type hints for the parameters to make the function signature complete.

Copilot uses AI. Check for mistakes.
i = 0
# states = {"any", "lower", "upper", "digit", "rest"}
state = "any"
Expand Down
6 changes: 3 additions & 3 deletions bindings/python/tests/bindings/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,21 @@ def test_manual_reload(self):


class TestTemplateProcessing:
def get_bert(self):
def get_bert(self) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the method's parameters are not annotated. Consider adding appropriate type hints for any parameters other than self to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single=["[CLS]", "$0", "[SEP]"],
pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
)

def get_roberta(self):
def get_roberta(self) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the method's parameters are not annotated. Consider adding appropriate type hints for any parameters other than self to make the function signature complete.

Copilot uses AI. Check for mistakes.
return TemplateProcessing(
single="<s> $0 </s>",
pair="<s> $A </s> </s> $B </s>",
special_tokens=[("<s>", 0), ("</s>", 1)],
)

def get_t5_squad(self):
def get_t5_squad(self) -> TemplateProcessing:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the method's parameters are not annotated. Consider adding appropriate type hints for any parameters other than self to make the function signature complete.

Copilot uses AI. Check for mistakes.
# >>> from transformers import AutoTokenizer
# >>> tok = AutoTokenizer.from_pretrained("t5-small")
# >>> tok.tokenize("question: ")
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/tests/bindings/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_encode_formats(self, bert_files):
["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
]

def format(encodings):
def format(encodings) -> list[list[str]]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for encodings is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

Suggested change
def format(encodings) -> list[list[str]]:
def format(encodings: list[Encoding]) -> list[list[str]]:

Copilot uses AI. Check for mistakes.
return [e.tokens for e in encodings]

def test_single(input, is_pretokenized=False):
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/tests/bindings/test_trainers.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def test_train(self, train_files):

def test_train_parallelism_with_custom_pretokenizer(self, train_files):
class GoodCustomPretok:
def split(self, n, normalized):
def split(self, n, normalized) -> list:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hints for n and normalized are missing. Consider adding appropriate type hints for the parameters to make the function signature complete.

Copilot uses AI. Check for mistakes.
# Here we just test that we can return a List[NormalizedString], it
# does not really make sense to return twice the same otherwise
return [normalized, normalized]
Expand Down
16 changes: 8 additions & 8 deletions bindings/python/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DATA_PATH = os.path.join("tests", "data")


def download(url, with_filename=None):
def download(url, with_filename=None) -> str:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hints are missing. Consider adding type hints for url (str) and with_filename (Optional[str] or str | None) to make the function signature complete.

Copilot uses AI. Check for mistakes.
filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
filepath = os.path.join(DATA_PATH, filename)
if not os.path.exists(filepath):
Expand All @@ -30,30 +30,30 @@ def data_dir():


@pytest.fixture(scope="session")
def roberta_files(data_dir):
def roberta_files(data_dir) -> dict[str, str]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return {
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"),
"merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"),
}


@pytest.fixture(scope="session")
def bert_files(data_dir):
def bert_files(data_dir) -> dict[str, str]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return {
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
}


@pytest.fixture(scope="session")
def openai_files(data_dir):
def openai_files(data_dir) -> dict[str, str]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return {
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"),
"merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"),
}


@pytest.fixture(scope="session")
def train_files(data_dir):
def train_files(data_dir) -> dict[str, str]:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
big = download("https://norvig.com/big.txt")
small = os.path.join(DATA_PATH, "small.txt")
with open(small, "w") as f:
Expand All @@ -69,20 +69,20 @@ def train_files(data_dir):


@pytest.fixture(scope="session")
def albert_base(data_dir):
def albert_base(data_dir) -> str:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return download("https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json")


@pytest.fixture(scope="session")
def doc_wiki_tokenizer(data_dir):
def doc_wiki_tokenizer(data_dir) -> str:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return download(
"https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json",
"tokenizer-wiki.json",
)


@pytest.fixture(scope="session")
def doc_pipeline_bert_tokenizer(data_dir):
def doc_pipeline_bert_tokenizer(data_dir) -> str:
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type hint has been added, but the parameter type hint for data_dir is missing. Consider adding the appropriate type hint for the parameter to make the function signature complete.

Copilot uses AI. Check for mistakes.
return download(
"https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json",
"bert-wiki.json",
Expand Down
Loading