I've encountered an issue when trying to use transformers-cfg for constrained generation with the recently released gemma-3 models.
Adding a GrammarConstrainedLogitsProcessor from transformers-cfg to the logits_processor causes model.generate() to fail with:
AssertionError: impossible for tokenizer vocab to be less than model vocab
The error occurs regardless of whether the model is loaded via AutoModelForCausalLM or the specific Gemma3ForCausalLM class (and likewise for the tokenizer class). The example below uses Gemma3ForCausalLM:
```python
import torch
import transformers
from transformers import (
    AutoTokenizer,
    Gemma3ForCausalLM,  # Same goes for the auto class
    GenerationConfig,
    # BitsAndBytesConfig
)
import transformers_cfg
from transformers_cfg.grammar_utils import IncrementalGrammarConstraint
from transformers_cfg.generation.logits_process import GrammarConstrainedLogitsProcessor
import time
import gc
import platform
import traceback

MODEL_ID = "google/gemma-3-1b-it"
SIMPLE_GRAMMAR = 'root ::= ("A" | "B")'  # Simple grammar
START_RULE = "root"

# Quantization
USE_4BIT = False
USE_8BIT = False

# Environment Info
print("-" * 60)
print("ENVIRONMENT:")
print(f"- Python: {platform.python_version()}")
print(f"- transformers: {transformers.__version__}")
print(f"- torch: {torch.__version__}")
try:
    import accelerate
    print(f"- accelerate: {accelerate.__version__}")
except ImportError:
    print("- accelerate: Not Installed")
try:
    import bitsandbytes
    print(f"- bitsandbytes: {bitsandbytes.__version__}")
except ImportError:
    print("- bitsandbytes: Not Installed (Needed for quantization)")
try:
    print(f"- trans-cfg: {transformers_cfg.__version__}")
except AttributeError:
    print("- trans-cfg: Installed (version attribute missing)")
print("-" * 60)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Test
try:
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if not tokenizer.chat_template:
        raise AttributeError("Tokenizer missing chat template")

    print("Loading model (Gemma3ForCausalLM)...")
    model = Gemma3ForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
        device_map="auto",
        quantization_config=None,  # No quantization
        trust_remote_code=True,
    ).eval()

    model_vocab = getattr(model.config, "vocab_size", "N/A")
    tokenizer_vocab = getattr(tokenizer, "vocab_size", "N/A")
    print(f"(Post-load check) Model vocab: {model_vocab}, Tokenizer vocab: {tokenizer_vocab}")
    if model_vocab != tokenizer_vocab:
        print("Post-load vocab size mismatch detected!")

    # Grammar Processor
    print("Creating grammar processor...")
    grammar_constraint = IncrementalGrammarConstraint(SIMPLE_GRAMMAR, START_RULE, tokenizer)
    grammar_processor = GrammarConstrainedLogitsProcessor(grammar_constraint)
    logits_processor_list = [grammar_processor]

    messages = [{"role": "user", "content": [{"type": "text", "text": "Output A or B."}]}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
    ).to(model.device)
    input_ids_len = input_ids.shape[1]

    gen_config = GenerationConfig(
        max_new_tokens=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )

    # Expected to fail
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            generation_config=gen_config,
            logits_processor=logits_processor_list,
        )
    output_token_ids = outputs[:, input_ids_len:]
    output_text = tokenizer.decode(output_token_ids[0], skip_special_tokens=True)
except ValueError as e:
    print(f"ERROR Type: {type(e).__name__}")
    print(f"ERROR Message: {e}")
    traceback.print_exc()
    print("-" * 35 + "\n")
except Exception as e:
    print(f"ERROR Type: {type(e).__name__}")
    print(f"ERROR Message: {e}")
    traceback.print_exc()
    print("-" * 35 + "\n")
finally:
    del model
    del tokenizer
    del grammar_processor
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
```

Which for me throws:

```
------------------------------------------------------------
ENVIRONMENT:
- Python: 3.12.3
- transformers: 4.51.1
- torch: 2.4.0
- accelerate: 1.0.1
- bitsandbytes: 0.45.5
- trans-cfg: 0.2.7
------------------------------------------------------------
Using device: cuda
Loading tokenizer...
Loading model (Gemma3ForCausalLM)...
(Post-load check) Model vocab: 262144, Tokenizer vocab: 262144
Creating grammar processor...
`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'cache_implementation': 'hybrid', 'top_k': 64, 'top_p': 0.95, 'bos_token_id': 2}. If this is not desired, please set these values explicitly.
ERROR Type: AssertionError
ERROR Message: impossible for tokenizer vocab to be less than model vocab
-----------------------------------
Traceback (most recent call last):
File "C:\Users\georg\AppData\Local\Temp\ipykernel_45580\3883546750.py", line 95, in <module>
outputs = model.generate(
^^^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\transformers\generation\utils.py", line 2463, in generate
result = self._sample(
^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\transformers\generation\utils.py", line 3448, in _sample
next_token_scores = logits_processor(input_ids, next_token_logits)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\transformers\generation\logits_process.py", line 88, in __call__
scores = processor(input_ids, scores)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\transformers_cfg\generation\logits_process.py", line 164, in __call__
return self.process_logits(input_ids, scores)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\transformers_cfg\generation\logits_process.py", line 157, in process_logits
masked_scores = self.mask_logits(scores, device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\georg\anaconda3\Lib\site-packages\transformers_cfg\generation\logits_process.py", line 77, in mask_logits
acceptance_vocab_size < masked_logits_vocab_size
AssertionError: impossible for tokenizer vocab to be less than model vocab
```
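
Note that the post-load check above prints identical sizes (262144 vs 262144), so the mismatch the assertion trips on presumably isn't `config.vocab_size` vs `tokenizer.vocab_size`, but something else, e.g. the actual width of the logits tensor or the tokenizer length including added special tokens. Here is a hedged diagnostic sketch (run before the `finally` block tears things down) that prints all the candidate sizes using only standard transformers attributes; I haven't confirmed which pair transformers-cfg actually compares:

```python
# Hedged diagnostic: the "vocab sizes" that can disagree on gemma-3.
# tokenizer.vocab_size counts only the base vocabulary, len(tokenizer)
# also includes added special tokens, and the output-embedding rows are
# the width of the scores tensor that logits processors actually receive.
print("config.vocab_size:     ", model.config.vocab_size)
print("tokenizer.vocab_size:  ", tokenizer.vocab_size)
print("len(tokenizer):        ", len(tokenizer))
print("output embedding rows: ", model.get_output_embeddings().weight.shape[0])
```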
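
For anyone blocked by this in the meantime, here's an untested workaround sketch (not part of the transformers-cfg API): wrap the grammar processor so it only ever sees the first `len(tokenizer)` columns of the scores, and mask everything past that cut-off. The `TrimmedGrammarProcessor` name and the choice of `len(tokenizer)` as the boundary are my assumptions; whether this actually sidesteps the assertion depends on which side of the comparison is larger.

```python
import torch
from transformers import LogitsProcessor

class TrimmedGrammarProcessor(LogitsProcessor):
    """Hypothetical workaround: run the wrapped grammar processor on a
    scores slice whose width matches the tokenizer, then fill the padded
    tail with -inf so those extra logit columns can never be sampled."""

    def __init__(self, inner, vocab_size):
        self.inner = inner            # e.g. GrammarConstrainedLogitsProcessor
        self.vocab_size = vocab_size  # assumed cut-off: len(tokenizer)

    def __call__(self, input_ids, scores):
        trimmed = self.inner(input_ids, scores[:, : self.vocab_size])
        out = torch.full_like(scores, float("-inf"))
        out[:, : self.vocab_size] = trimmed
        return out

# Usage in the repro above:
# logits_processor_list = [TrimmedGrammarProcessor(grammar_processor, len(tokenizer))]
```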