fix: added type hints in .py files #1932
Changes from all commits
```diff
@@ -3,6 +3,7 @@
 import time

 from tqdm import tqdm
+from typing import List

 from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE, WordPiece
@@ -84,11 +85,11 @@
     raise Exception(f"Unknown type {args.type}")


-def tokenize_r():
+def tokenize_r() -> List:
     return tok_r.encode_batch(text)


-def tokenize_p():
+def tokenize_p() -> List[List[int]]:
     return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]
```
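The bare `-> List` on `tokenize_r` could be narrowed: `Tokenizer.encode_batch` returns one `Encoding` per input sentence. Below is a minimal sketch of tighter annotations, with the script's module-level globals (`tok_r`, `tok_p`, `text`) turned into parameters so the snippet stands alone; the `SlowTokenizer` protocol is a hypothetical stand-in for whatever object `tok_p` actually is:

```python
from typing import Protocol

from tqdm import tqdm
from tokenizers import Encoding, Tokenizer


class SlowTokenizer(Protocol):
    # Hypothetical protocol: we only assume tok_p exposes an encode()
    # returning token ids, as the benchmark's usage implies.
    def encode(self, sentence: str, add_special_tokens: bool = ...) -> list[int]: ...


def tokenize_r(tok_r: Tokenizer, text: list[str]) -> list[Encoding]:
    # encode_batch yields one Encoding per input sentence
    return tok_r.encode_batch(text)


def tokenize_p(tok_p: SlowTokenizer, text: list[str]) -> list[list[int]]:
    # the slow tokenizer returns raw token ids, sentence by sentence
    return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]
```

Typing `tok_p` through a `Protocol` keeps the benchmark decoupled from any particular slow-tokenizer class.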
```diff
@@ -116,13 +116,13 @@ def converted(self):


 class AlbertConverter(SpmConverter):
-    def vocab(self, proto):
+    def vocab(self, proto) -> list[tuple[str, float]]:
         return [
             (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
             for piece in proto.pieces
         ]

-    def normalizer(self, proto):
+    def normalizer(self, proto) -> Sequence:
         normalizers = [Replace("``", '"'), Replace("''", '"')]
         if not self.original_tokenizer.keep_accents:
             normalizers.append(NFKD())
@@ -135,7 +135,7 @@ def normalizer(self, proto):
             normalizers.append(Replace(Regex(" {2,}"), " "))
         return Sequence(normalizers)

-    def post_processor(self, tokenizer):
+    def post_processor(self, tokenizer) -> TemplateProcessing:
```

On the new `post_processor` signature, Copilot suggested:

```suggestion
    def post_processor(self, tokenizer: Tokenizer) -> TemplateProcessing:
```
Copilot (AI) reviewed the changes on Jan 20, 2026, leaving the same two remarks on each converter method annotated in this file: on every `vocab` and `normalizer` override,

> The return type hint has been added, but the parameter type hint for `proto` is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

and on every `post_processor` override,

> The return type hint has been added, but the parameter type hint for `tokenizer` is missing. Consider adding an appropriate type hint for the parameter to make the function signature complete.

ashmi8 marked this conversation as resolved.
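What the review asks for would look roughly like the sketch below. The `ModelProto` import path is an assumption (recent `sentencepiece` wheels ship the generated protobuf module), and `SpmConverter` is stubbed so the snippet stands alone:

```python
from sentencepiece import sentencepiece_model_pb2 as model_pb2  # assumption: ships with the pip package

from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence
from tokenizers.processors import TemplateProcessing


class SpmConverter:  # minimal stub standing in for the real base class
    pass


class AlbertConverter(SpmConverter):
    def vocab(self, proto: model_pb2.ModelProto) -> list[tuple[str, float]]:
        ...  # body unchanged from the diff above

    def normalizer(self, proto: model_pb2.ModelProto) -> Sequence:
        ...  # body unchanged from the diff above

    def post_processor(self, tokenizer: Tokenizer) -> TemplateProcessing:
        ...  # body not shown in this diff
```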
```diff
@@ -126,7 +126,7 @@ def check_train(args):
     print("Ok our trainer is at least more efficient than the SPM one")


-def check_diff(spm_diff, tok_diff, sp, tok):
+def check_diff(spm_diff, tok_diff, sp, tok) -> bool:
     if spm_diff == list(reversed(tok_diff)):
         # AAA -> AA+A vs A+AA case.
         return True
@@ -147,7 +147,7 @@ def check_diff(spm_diff, tok_diff, sp, tok):
     return False


-def check_details(line, spm_ids, tok_ids, sp, tok):
+def check_details(line, spm_ids, tok_ids, sp, tok) -> bool:
     # Encoding can be the same with same result AAA -> A + AA vs AA + A
     # We can check that we use at least exactly the same number of tokens.
     for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@@ -206,7 +206,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
     return False


-def check_encode(args):
+def check_encode(args) -> None:
     sp = cast(Any, spm.SentencePieceProcessor())
     sp.Load(args.model_file)
```

ashmi8 marked this conversation as resolved (on the `check_details` signature).
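The diff annotates only return types; the parameter types below are inferred from the call sites (id lists, a `SentencePieceProcessor`, a `Tokenizer`) and are assumptions, not part of the PR:

```python
import sentencepiece as spm
from tokenizers import Tokenizer


def check_diff(
    spm_diff: list[int], tok_diff: list[int], sp: spm.SentencePieceProcessor, tok: Tokenizer
) -> bool:
    # Different splits can still be equivalent, e.g. AAA -> AA+A vs A+AA.
    if spm_diff == list(reversed(tok_diff)):
        return True
    ...  # remaining equivalence checks elided


def check_details(
    line: str, spm_ids: list[int], tok_ids: list[int], sp: spm.SentencePieceProcessor, tok: Tokenizer
) -> bool:
    ...  # walks both id sequences to locate and report the first divergence
```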
```diff
@@ -292,12 +292,12 @@ def pre_tokenize(self, pretok, wrong):
             pass

     class GoodCustomPretok:
-        def split(self, n, normalized):
+        def split(self, n, normalized) -> list:
             # Here we just test that we can return a List[NormalizedString], it
             # does not really make sense to return twice the same otherwise
             return [normalized, normalized]

-        def pre_tokenize(self, pretok):
+        def pre_tokenize(self, pretok) -> None:
             pretok.split(self.split)

     def test_instantiate(self):
@@ -325,7 +325,7 @@ def get_state(self, c):
             else:
                 return "rest"

-        def split(self, n, normalized):
+        def split(self, n, normalized) -> list:
             i = 0
             # states = {"any", "lower", "upper", "digit", "rest"}
             state = "any"
```
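`-> list` is valid but loose; the comment in the test itself says the return value is a `List[NormalizedString]`, so the hint could say so. A sketch using the binding's own classes, plus the wrapping step that attaches such an object as a pre-tokenizer:

```python
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer


class GoodCustomPretok:
    def split(self, n: int, normalized: NormalizedString) -> list[NormalizedString]:
        # The test only needs *a* List[NormalizedString] back, so returning
        # the same string twice is enough.
        return [normalized, normalized]

    def pre_tokenize(self, pretok: PreTokenizedString) -> None:
        pretok.split(self.split)


# Custom pre-tokenizers are wrapped before being assigned to a Tokenizer:
custom = PreTokenizer.custom(GoodCustomPretok())
```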
```diff
@@ -106,21 +106,21 @@ def test_manual_reload(self):


 class TestTemplateProcessing:
-    def get_bert(self):
+    def get_bert(self) -> TemplateProcessing:
         return TemplateProcessing(
             single=["[CLS]", "$0", "[SEP]"],
             pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
             special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
         )

-    def get_roberta(self):
+    def get_roberta(self) -> TemplateProcessing:
         return TemplateProcessing(
             single="<s> $0 </s>",
             pair="<s> $A </s> </s> $B </s>",
             special_tokens=[("<s>", 0), ("</s>", 1)],
         )

-    def get_t5_squad(self):
+    def get_t5_squad(self) -> TemplateProcessing:
         # >>> from transformers import AutoTokenizer
         # >>> tok = AutoTokenizer.from_pretrained("t5-small")
         # >>> tok.tokenize("question: ")
```
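Since the helpers now advertise a `TemplateProcessing` return, here is a quick sketch of where that value goes, attaching it as a tokenizer's post-processor; the toy vocab and ids are invented for illustration:

```python
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.processors import TemplateProcessing


def get_bert() -> TemplateProcessing:
    return TemplateProcessing(
        single=["[CLS]", "$0", "[SEP]"],
        pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )


# Toy tokenizer: the ids in special_tokens above line up with this vocab.
tok = Tokenizer(WordPiece({"[SEP]": 0, "[CLS]": 1, "hello": 2, "[UNK]": 3}, unk_token="[UNK]"))
tok.post_processor = get_bert()
print(tok.encode("hello").tokens)  # ['[CLS]', 'hello', '[SEP]']
```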
```diff
@@ -178,7 +178,7 @@ def test_encode_formats(self, bert_files):
             ["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
         ]

-        def format(encodings):
+        def format(encodings) -> list[list[str]]:
```

A suggested change types the parameter as well:

```suggestion
        def format(encodings: list[Encoding]) -> list[list[str]]:
```
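With the suggestion applied, the signature is fully typed. The body below is a guess at the helper's intent (collecting token strings to compare against the expected lists), not the test's actual code:

```python
from tokenizers import Encoding


def format(encodings: list[Encoding]) -> list[list[str]]:
    # One list of token strings per Encoding, matching the expected
    # [["[CLS]", "my", ...], ...] structure asserted in the test.
    return [e.tokens for e in encodings]
```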
```diff
@@ -160,7 +160,7 @@ def test_train(self, train_files):

     def test_train_parallelism_with_custom_pretokenizer(self, train_files):
         class GoodCustomPretok:
-            def split(self, n, normalized):
+            def split(self, n, normalized) -> list:
                 # Here we just test that we can return a List[NormalizedString], it
                 # does not really make sense to return twice the same otherwise
                 return [normalized, normalized]
```

This is the same `GoodCustomPretok` pattern as above; the fully-typed sketch after that earlier diff applies here too.
```diff
@@ -9,7 +9,7 @@
 DATA_PATH = os.path.join("tests", "data")


-def download(url, with_filename=None):
+def download(url, with_filename=None) -> str:
     filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
     filepath = os.path.join(DATA_PATH, filename)
     if not os.path.exists(filepath):
@@ -30,30 +30,30 @@ def data_dir():


 @pytest.fixture(scope="session")
-def roberta_files(data_dir):
+def roberta_files(data_dir) -> dict[str, str]:
     return {
         "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"),
         "merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"),
     }


 @pytest.fixture(scope="session")
-def bert_files(data_dir):
+def bert_files(data_dir) -> dict[str, str]:
     return {
         "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
     }


 @pytest.fixture(scope="session")
-def openai_files(data_dir):
+def openai_files(data_dir) -> dict[str, str]:
     return {
         "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"),
         "merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"),
     }


 @pytest.fixture(scope="session")
-def train_files(data_dir):
+def train_files(data_dir) -> dict[str, str]:
     big = download("https://norvig.com/big.txt")
     small = os.path.join(DATA_PATH, "small.txt")
     with open(small, "w") as f:
@@ -69,20 +69,20 @@ def train_files(data_dir):


 @pytest.fixture(scope="session")
-def albert_base(data_dir):
+def albert_base(data_dir) -> str:
     return download("https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json")


 @pytest.fixture(scope="session")
-def doc_wiki_tokenizer(data_dir):
+def doc_wiki_tokenizer(data_dir) -> str:
     return download(
         "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json",
         "tokenizer-wiki.json",
     )


 @pytest.fixture(scope="session")
-def doc_pipeline_bert_tokenizer(data_dir):
+def doc_pipeline_bert_tokenizer(data_dir) -> str:
     return download(
         "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json",
         "bert-wiki.json",
```
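`download` hands back a local file path, hence `-> str`, and the fixtures just bundle those paths. A self-contained sketch with the parameter hints the diff leaves off; `str | None` needs Python 3.10+ (use `Optional[str]` before that), and the real fixtures also depend on a `data_dir` fixture omitted here:

```python
import os

import pytest
import requests

DATA_PATH = os.path.join("tests", "data")


def download(url: str, with_filename: str | None = None) -> str:
    # Cache the remote file under tests/data and return its local path.
    filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
    filepath = os.path.join(DATA_PATH, filename)
    if not os.path.exists(filepath):
        os.makedirs(DATA_PATH, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(requests.get(url).content)
    return filepath


@pytest.fixture(scope="session")
def bert_files() -> dict[str, str]:
    return {
        "vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
    }
```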
A final review comment:

> Using the legacy `List` type from the `typing` module is inconsistent with the modern `list` syntax used elsewhere in the codebase. For Python 3.9+, prefer the built-in `list` type directly (e.g., `list` instead of `List`). The import statement on line 6 can also be removed if no other typing constructs from the module are needed.
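For reference, the two spellings are interchangeable at runtime on Python 3.9+; the built-in form simply drops the import:

```python
from typing import List  # legacy spelling, requires the import


def old_style(ids: List[int]) -> List[int]:
    return sorted(ids)


def new_style(ids: list[int]) -> list[int]:  # Python 3.9+ built-in generic
    return sorted(ids)
```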