fix: added type hints in .py files #1932
Changes from all commits
```diff
@@ -3,6 +3,7 @@
 import time

 from tqdm import tqdm
+from typing import List

 from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE, WordPiece
@@ -84,11 +85,11 @@
     raise Exception(f"Unknown type {args.type}")


-def tokenize_r():
+def tokenize_r() -> List:
     return tok_r.encode_batch(text)


-def tokenize_p():
+def tokenize_p() -> List[List[int]]:
     return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]
```
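The bare `-> List` on `tokenize_r` could be narrowed: `Tokenizer.encode_batch` returns one `Encoding` per input sentence. Below is a minimal sketch of tighter annotations, with the script's module-level globals (`tok_r`, `tok_p`, `text`) turned into parameters so the snippet stands alone; the `SlowTokenizer` protocol is a hypothetical stand-in for whatever object `tok_p` actually is:

```python
from typing import Protocol

from tqdm import tqdm
from tokenizers import Encoding, Tokenizer


class SlowTokenizer(Protocol):
    # Hypothetical protocol: we only assume tok_p exposes an encode()
    # returning token ids, as the benchmark's usage implies.
    def encode(self, sentence: str, add_special_tokens: bool = ...) -> list[int]: ...


def tokenize_r(tok_r: Tokenizer, text: list[str]) -> list[Encoding]:
    # encode_batch yields one Encoding per input sentence
    return tok_r.encode_batch(text)


def tokenize_p(tok_p: SlowTokenizer, text: list[str]) -> list[list[int]]:
    # the slow tokenizer returns raw token ids, sentence by sentence
    return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]
```

Typing `tok_p` through a `Protocol` keeps the benchmark decoupled from any particular slow-tokenizer class.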
```diff
@@ -116,13 +116,13 @@ def converted(self):


 class AlbertConverter(SpmConverter):
-    def vocab(self, proto):
+    def vocab(self, proto) -> list[tuple[str, float]]:
         return [
             (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
             for piece in proto.pieces
         ]

-    def normalizer(self, proto):
+    def normalizer(self, proto) -> Sequence:
         normalizers = [Replace("``", '"'), Replace("''", '"')]
         if not self.original_tokenizer.keep_accents:
             normalizers.append(NFKD())
@@ -135,7 +135,7 @@ def normalizer(self, proto):
             normalizers.append(Replace(Regex(" {2,}"), " "))
         return Sequence(normalizers)

-    def post_processor(self, tokenizer):
+    def post_processor(self, tokenizer) -> TemplateProcessing:
```

On the new `post_processor` signature, Copilot suggested:

```suggestion
    def post_processor(self, tokenizer: Tokenizer) -> TemplateProcessing:
```
Copilot (AI) reviewed the changes on Jan 20, 2026, leaving the same two remarks on each converter method annotated in this file: on every `vocab` and `normalizer` override,

> The return type hint has been added, but the parameter type hint for `proto` is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

and on every `post_processor` override,

> The return type hint has been added, but the parameter type hint for `tokenizer` is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

ashmi8 marked this conversation as resolved.
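What the review asks for would look roughly like the sketch below. The `ModelProto` import path is an assumption (recent `sentencepiece` wheels ship the generated protobuf module), and `SpmConverter` is stubbed so the snippet stands alone:

```python
from sentencepiece import sentencepiece_model_pb2 as model_pb2  # assumption: ships with the pip package

from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence
from tokenizers.processors import TemplateProcessing


class SpmConverter:  # minimal stub standing in for the real base class
    pass


class AlbertConverter(SpmConverter):
    def vocab(self, proto: model_pb2.ModelProto) -> list[tuple[str, float]]:
        ...  # body unchanged from the diff above

    def normalizer(self, proto: model_pb2.ModelProto) -> Sequence:
        ...  # body unchanged from the diff above

    def post_processor(self, tokenizer: Tokenizer) -> TemplateProcessing:
        ...  # body not shown in this diff
```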
```diff
@@ -126,7 +126,7 @@ def check_train(args):
     print("Ok our trainer is at least more efficient than the SPM one")


-def check_diff(spm_diff, tok_diff, sp, tok):
+def check_diff(spm_diff, tok_diff, sp, tok) -> bool:
     if spm_diff == list(reversed(tok_diff)):
         # AAA -> AA+A vs A+AA case.
         return True
@@ -147,7 +147,7 @@ def check_diff(spm_diff, tok_diff, sp, tok):
     return False


-def check_details(line, spm_ids, tok_ids, sp, tok):
+def check_details(line, spm_ids, tok_ids, sp, tok) -> bool:
     # Encoding can be the same with same result AAA -> A + AA vs AA + A
     # We can check that we use at least exactly the same number of tokens.
     for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@@ -206,7 +206,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
     return False


-def check_encode(args):
+def check_encode(args) -> None:
     sp = cast(Any, spm.SentencePieceProcessor())
     sp.Load(args.model_file)
```

ashmi8 marked this conversation as resolved (on the `check_details` signature).
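The diff annotates only return types; the parameter types below are inferred from the call sites (id lists, a `SentencePieceProcessor`, a `Tokenizer`) and are assumptions, not part of the PR:

```python
import sentencepiece as spm
from tokenizers import Tokenizer


def check_diff(
    spm_diff: list[int], tok_diff: list[int], sp: spm.SentencePieceProcessor, tok: Tokenizer
) -> bool:
    # Different splits can still be equivalent, e.g. AAA -> AA+A vs A+AA.
    if spm_diff == list(reversed(tok_diff)):
        return True
    ...  # remaining equivalence checks elided


def check_details(
    line: str, spm_ids: list[int], tok_ids: list[int], sp: spm.SentencePieceProcessor, tok: Tokenizer
) -> bool:
    ...  # walks both id sequences to locate and report the first divergence
```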
```diff
@@ -292,12 +292,12 @@ def pre_tokenize(self, pretok, wrong):
             pass

     class GoodCustomPretok:
-        def split(self, n, normalized):
+        def split(self, n, normalized) -> list:
             # Here we just test that we can return a List[NormalizedString], it
             # does not really make sense to return twice the same otherwise
             return [normalized, normalized]

-        def pre_tokenize(self, pretok):
+        def pre_tokenize(self, pretok) -> None:
             pretok.split(self.split)

     def test_instantiate(self):
@@ -325,7 +325,7 @@ def get_state(self, c):
             else:
                 return "rest"

-        def split(self, n, normalized):
+        def split(self, n, normalized) -> list:
             i = 0
             # states = {"any", "lower", "upper", "digit", "rest"}
             state = "any"
```
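`-> list` is valid but loose; the comment in the test itself says the return value is a `List[NormalizedString]`, so the hint could say so. A sketch using the binding's own classes, plus the wrapping step that attaches such an object as a pre-tokenizer:

```python
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer


class GoodCustomPretok:
    def split(self, n: int, normalized: NormalizedString) -> list[NormalizedString]:
        # The test only needs *a* List[NormalizedString] back, so returning
        # the same string twice is enough.
        return [normalized, normalized]

    def pre_tokenize(self, pretok: PreTokenizedString) -> None:
        pretok.split(self.split)


# Custom pre-tokenizers are wrapped before being assigned to a Tokenizer:
custom = PreTokenizer.custom(GoodCustomPretok())
```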
```diff
@@ -106,21 +106,21 @@ def test_manual_reload(self):


 class TestTemplateProcessing:
-    def get_bert(self):
+    def get_bert(self) -> TemplateProcessing:
         return TemplateProcessing(
             single=["[CLS]", "$0", "[SEP]"],
             pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
             special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
         )

-    def get_roberta(self):
+    def get_roberta(self) -> TemplateProcessing:
         return TemplateProcessing(
             single="<s> $0 </s>",
             pair="<s> $A </s> </s> $B </s>",
             special_tokens=[("<s>", 0), ("</s>", 1)],
         )

-    def get_t5_squad(self):
+    def get_t5_squad(self) -> TemplateProcessing:
         # >>> from transformers import AutoTokenizer
         # >>> tok = AutoTokenizer.from_pretrained("t5-small")
         # >>> tok.tokenize("question: ")
```
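Since the helpers now advertise a `TemplateProcessing` return, here is a quick sketch of where that value goes, attaching it as a tokenizer's post-processor; the toy vocab and ids are invented for illustration:

```python
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.processors import TemplateProcessing


def get_bert() -> TemplateProcessing:
    return TemplateProcessing(
        single=["[CLS]", "$0", "[SEP]"],
        pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )


# Toy tokenizer: the ids in special_tokens above line up with this vocab.
tok = Tokenizer(WordPiece({"[SEP]": 0, "[CLS]": 1, "hello": 2, "[UNK]": 3}, unk_token="[UNK]"))
tok.post_processor = get_bert()
print(tok.encode("hello").tokens)  # ['[CLS]', 'hello', '[SEP]']
```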
```diff
@@ -178,7 +178,7 @@ def test_encode_formats(self, bert_files):
             ["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
         ]

-        def format(encodings):
+        def format(encodings) -> list[list[str]]:
```

A suggested change types the parameter as well:

```suggestion
        def format(encodings: list[Encoding]) -> list[list[str]]:
```
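With the suggestion applied, the signature is fully typed. The body below is a guess at the helper's intent (collecting token strings to compare against the expected lists), not the test's actual code:

```python
from tokenizers import Encoding


def format(encodings: list[Encoding]) -> list[list[str]]:
    # One list of token strings per Encoding, matching the expected
    # [["[CLS]", "my", ...], ...] structure asserted in the test.
    return [e.tokens for e in encodings]
```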
```diff
@@ -160,7 +160,7 @@ def test_train(self, train_files):

     def test_train_parallelism_with_custom_pretokenizer(self, train_files):
         class GoodCustomPretok:
-            def split(self, n, normalized):
+            def split(self, n, normalized) -> list:
                 # Here we just test that we can return a List[NormalizedString], it
                 # does not really make sense to return twice the same otherwise
                 return [normalized, normalized]
```

This is the same `GoodCustomPretok` pattern as above; the fully-typed sketch after that earlier diff applies here too.
```diff
@@ -9,7 +9,7 @@
 DATA_PATH = os.path.join("tests", "data")


-def download(url, with_filename=None):
+def download(url, with_filename=None) -> str:
     filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
     filepath = os.path.join(DATA_PATH, filename)
     if not os.path.exists(filepath):
@@ -30,30 +30,30 @@ def data_dir():


 @pytest.fixture(scope="session")
-def roberta_files(data_dir):
+def roberta_files(data_dir) -> dict[str, str]:
     return {
         "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"),
         "merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"),
     }


 @pytest.fixture(scope="session")
-def bert_files(data_dir):
+def bert_files(data_dir) -> dict[str, str]:
     return {
         "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
     }


 @pytest.fixture(scope="session")
-def openai_files(data_dir):
+def openai_files(data_dir) -> dict[str, str]:
     return {
         "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"),
         "merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"),
     }


 @pytest.fixture(scope="session")
-def train_files(data_dir):
+def train_files(data_dir) -> dict[str, str]:
     big = download("https://norvig.com/big.txt")
     small = os.path.join(DATA_PATH, "small.txt")
     with open(small, "w") as f:
@@ -69,20 +69,20 @@ def train_files(data_dir):


 @pytest.fixture(scope="session")
-def albert_base(data_dir):
+def albert_base(data_dir) -> str:
     return download("https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json")


 @pytest.fixture(scope="session")
-def doc_wiki_tokenizer(data_dir):
+def doc_wiki_tokenizer(data_dir) -> str:
     return download(
         "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json",
         "tokenizer-wiki.json",
     )


 @pytest.fixture(scope="session")
-def doc_pipeline_bert_tokenizer(data_dir):
+def doc_pipeline_bert_tokenizer(data_dir) -> str:
     return download(
         "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json",
         "bert-wiki.json",
```
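`download` hands back a local file path, hence `-> str`, and the fixtures just bundle those paths. A self-contained sketch with the parameter hints the diff leaves off; `str | None` needs Python 3.10+ (use `Optional[str]` before that), and the real fixtures also depend on a `data_dir` fixture omitted here:

```python
import os

import pytest
import requests

DATA_PATH = os.path.join("tests", "data")


def download(url: str, with_filename: str | None = None) -> str:
    # Cache the remote file under tests/data and return its local path.
    filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
    filepath = os.path.join(DATA_PATH, filename)
    if not os.path.exists(filepath):
        os.makedirs(DATA_PATH, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(requests.get(url).content)
    return filepath


@pytest.fixture(scope="session")
def bert_files() -> dict[str, str]:
    return {
        "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
    }
```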
A final review comment:

> Using the legacy `List` type from the `typing` module is inconsistent with the modern `list` syntax used elsewhere in the codebase. For Python 3.9+, prefer the built-in `list` type directly (e.g., `list` instead of `List`). The import statement on line 6 can also be removed if no other typing constructs from the module are needed.
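For reference, the two spellings are interchangeable at runtime on Python 3.9+; the built-in form simply drops the import:

```python
from typing import List  # legacy spelling, requires the import


def old_style(ids: List[int]) -> List[int]:
    return sorted(ids)


def new_style(ids: list[int]) -> list[int]:  # Python 3.9+ built-in generic
    return sorted(ids)
```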