transformers-CFG incompatible with gemma-3: causes tokenizer and model vocab size mismatch #127

@GeorgeDeac

Description

I've encountered an issue when trying to use transformers-cfg for constrained generation with the recently released gemma-3 models.

Adding GrammarConstrainedLogitsProcessor from transformers-cfg to the logits_processor causes model.generate() to fail with:
AssertionError: impossible for tokenizer vocab to be less than model vocab

The error occurs regardless of whether the model is loaded using AutoModelForCausalLM or the specific Gemma3ForCausalLM class (same for the tokenizer class). The example below uses Gemma3ForCausalLM:

import torch
import transformers
from transformers import (
    AutoTokenizer,
    Gemma3ForCausalLM, # Same goes for the auto class
    GenerationConfig
    # BitsAndBytesConfig
)
import transformers_cfg
from transformers_cfg.grammar_utils import IncrementalGrammarConstraint
from transformers_cfg.generation.logits_process import GrammarConstrainedLogitsProcessor
import time
import gc
import platform
import traceback

MODEL_ID = "google/gemma-3-1b-it"
SIMPLE_GRAMMAR = 'root ::= ("A" | "B")' # Simple grammar
START_RULE = "root"

# Quantization
USE_4BIT = False
USE_8BIT = False

# Environment Info
print("-" * 60)
print("ENVIRONMENT:")
print(f"- Python:       {platform.python_version()}")
print(f"- transformers: {transformers.__version__}")
print(f"- torch:        {torch.__version__}")
try:
    import accelerate
    print(f"- accelerate:   {accelerate.__version__}")
except ImportError:
    print("- accelerate:   Not Installed")
try:
    import bitsandbytes
    print(f"- bitsandbytes: {bitsandbytes.__version__}")
except ImportError:
    print("- bitsandbytes: Not Installed (needed for quantization)")
try:
    print(f"- trans-cfg:    {transformers_cfg.__version__}")
except AttributeError:
    print("- trans-cfg:    Installed (version attribute missing)")
print("-" * 60)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Test
try:
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None: 
        tokenizer.pad_token = tokenizer.eos_token
    if not tokenizer.chat_template: 
        raise AttributeError("Tokenizer missing chat template")

    print("Loading model (Gemma3ForCausalLM)...")
    model = Gemma3ForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
        device_map="auto",
        quantization_config=None, # No quantization
        trust_remote_code=True
    ).eval()

    model_vocab = getattr(model.config, 'vocab_size', 'N/A')
    tokenizer_vocab = getattr(tokenizer, 'vocab_size', 'N/A')
    print(f"(Post-load check) Model vocab: {model_vocab}, Tokenizer vocab: {tokenizer_vocab}")
    if model_vocab != tokenizer_vocab: 
        print("Post-load vocab size mismatch detected!")

    # Grammar Processor
    print("Creating grammar processor...")
    grammar_constraint = IncrementalGrammarConstraint(SIMPLE_GRAMMAR, START_RULE, tokenizer)
    grammar_processor = GrammarConstrainedLogitsProcessor(grammar_constraint)
    logits_processor_list = [grammar_processor]

    messages = [{"role": "user", "content": [{"type": "text", "text": "Output A or B."}]}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt"
    ).to(model.device)
    input_ids_len = input_ids.shape[1]

    gen_config = GenerationConfig(
        max_new_tokens=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False
    )

    # Expected to Fail
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            generation_config=gen_config,
            logits_processor=logits_processor_list
        )

    output_token_ids = outputs[:, input_ids_len:]
    output_text = tokenizer.decode(output_token_ids[0], skip_special_tokens=True)
    print(f"Generated text: {output_text!r}")

except Exception as e:
    print(f"ERROR Type: {type(e).__name__}")
    print(f"ERROR Message: {e}")
    traceback.print_exc()
    print("-" * 35 + "\n")
finally:
    # Guard the deletions: if loading failed, some of these names were never bound
    for _name in ("grammar_processor", "model", "tokenizer"):
        if _name in globals():
            del globals()[_name]
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

For me, this throws:

------------------------------------------------------------
ENVIRONMENT:
- Python:       3.12.3
- transformers: 4.51.1
- torch:        2.4.0
- accelerate:   1.0.1
- bitsandbytes: 0.45.5
- trans-cfg:    0.2.7
------------------------------------------------------------
Using device: cuda
Loading tokenizer...
Loading model (Gemma3ForCausalLM)...
(Post-load check) Model vocab: 262144, Tokenizer vocab: 262144
Creating grammar processor...
`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'cache_implementation': 'hybrid', 'top_k': 64, 'top_p': 0.95, 'bos_token_id': 2}. If this is not desired, please set these values explicitly.
ERROR Type: AssertionError
ERROR Message: impossible for tokenizer vocab to be less than model vocab
-----------------------------------

Traceback (most recent call last):
  File "C:\Users\georg\AppData\Local\Temp\ipykernel_45580\3883546750.py", line 95, in <module>
    outputs = model.generate(
              ^^^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\transformers\generation\utils.py", line 2463, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\transformers\generation\utils.py", line 3448, in _sample
    next_token_scores = logits_processor(input_ids, next_token_logits)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\transformers\generation\logits_process.py", line 88, in __call__
    scores = processor(input_ids, scores)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\transformers_cfg\generation\logits_process.py", line 164, in __call__
    return self.process_logits(input_ids, scores)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\transformers_cfg\generation\logits_process.py", line 157, in process_logits
    masked_scores = self.mask_logits(scores, device)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\georg\anaconda3\Lib\site-packages\transformers_cfg\generation\logits_process.py", line 77, in mask_logits
    acceptance_vocab_size < masked_logits_vocab_size
AssertionError: impossible for tokenizer vocab to be less than model vocab
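
For reference, the assertion in mask_logits appears to compare the acceptance mask built from the tokenizer against the width of the scores tensor passed in by generate(), so even though the post-load check above printed 262144 for both the model config and the tokenizer, the width the processor sees at mask time is apparently larger. A quick diagnostic sketch (reusing model, tokenizer, and input_ids from the repro above) to compare the different vocab-size notions:

# Diagnostic sketch: compare the vocab-size notions that could disagree here.
# Reuses `model`, `tokenizer`, and `input_ids` from the repro above.
print("tokenizer.vocab_size:   ", tokenizer.vocab_size)     # base vocab only
print("len(tokenizer):         ", len(tokenizer))            # base vocab + added special tokens
print("model.config.vocab_size:", model.config.vocab_size)
with torch.no_grad():
    runtime_logits = model(input_ids).logits
print("logits width at runtime:", runtime_logits.shape[-1])  # what the logits processor actually sees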

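A possible (untested) workaround sketch, assuming the extra logit columns beyond the tokenizer vocab are padding that should never be sampled: slice the scores down to the tokenizer vocab before the grammar processor runs, then pad the result back with -inf. The VocabSliceWrapper name below is mine, not part of either library:

import torch
from transformers import LogitsProcessor

class VocabSliceWrapper(LogitsProcessor):
    """Hypothetical wrapper: hand the grammar processor only the first
    `tokenizer_vocab_size` columns of the scores, then pad the result back
    with -inf so any extra columns can never be sampled."""

    def __init__(self, inner_processor, tokenizer_vocab_size):
        self.inner = inner_processor
        self.vocab_size = tokenizer_vocab_size

    def __call__(self, input_ids, scores):
        extra = scores.shape[-1] - self.vocab_size
        if extra <= 0:
            return self.inner(input_ids, scores)  # nothing to trim
        masked = self.inner(input_ids, scores[:, : self.vocab_size])
        pad = torch.full(
            (scores.shape[0], extra), float("-inf"),
            dtype=scores.dtype, device=scores.device,
        )
        return torch.cat([masked, pad], dim=-1)

# Usage, replacing the processor list in the repro above:
# logits_processor_list = [VocabSliceWrapper(grammar_processor, len(tokenizer))]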