Skip to content

Commit 7882262

Browse files
authored
feat(jailbreak): Validate Jailbreak Detection config at create-time (#1675)
1 parent e4c6a7a commit 7882262

File tree

2 files changed

+316
-3
lines changed

2 files changed

+316
-3
lines changed

nemoguardrails/rails/llm/config.py

Lines changed: 68 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,9 @@
5353
with open(os.path.join(os.path.dirname(__file__), "default_config_v2.yml")) as _fc:
5454
_default_config_v2 = yaml.safe_load(_fc)
5555

56+
# Jailbreak-related strings
57+
JAILBREAK_FLOW_MODEL = "jailbreak detection model"
58+
JAILBREAK_FLOW_HEURISTICS = "jailbreak detection heuristics"
5659

5760
# Extract the COLANGPATH directories.
5861
colang_path_dirs = [
@@ -700,9 +703,9 @@ class JailbreakDetectionConfig(BaseModel):
700703
default=None,
701704
description="The endpoint for the jailbreak detection heuristics/model container.",
702705
)
703-
length_per_perplexity_threshold: float = Field(default=89.79, description="The length/perplexity threshold.")
706+
length_per_perplexity_threshold: float = Field(default=89.79, gt=0, description="The length/perplexity threshold.")
704707
prefix_suffix_perplexity_threshold: float = Field(
705-
default=1845.65, description="The prefix/suffix perplexity threshold."
708+
default=1845.65, gt=0, description="The prefix/suffix perplexity threshold."
706709
)
707710
nim_base_url: Optional[str] = Field(
708711
default=None,
@@ -744,6 +747,15 @@ def migrate_deprecated_fields(self) -> "JailbreakDetectionConfig":
744747
self.nim_base_url = f"http://{self.nim_url}:{port}/v1"
745748
return self
746749

750+
@model_validator(mode="after")
def validate_urls(self) -> "JailbreakDetectionConfig":
    """Validate that the configured endpoints use an HTTP(S) scheme.

    Checks ``nim_base_url`` first and then ``server_endpoint``. A value that
    is unset or empty is skipped. The scheme prefix is compared
    case-insensitively.

    Returns:
        This instance, unchanged.

    Raises:
        ValueError: If either value is set and does not start with
            ``http://`` or ``https://``.
    """
    allowed_prefixes = ("http://", "https://")
    for field_name in ("nim_base_url", "server_endpoint"):
        url = getattr(self, field_name)
        # URI schemes are case-insensitive, so "HTTPS://host" is as valid as
        # "https://host"; lowercase only for the comparison and report the
        # value exactly as the user wrote it.
        if url and not url.lower().startswith(allowed_prefixes):
            raise ValueError(f"{field_name} must start with 'http://' or 'https://', got '{url}'")
    return self
758+
747759
def get_api_key(self) -> Optional[str]:
748760
"""Helper to return an API key (if it exists) from a Jailbreak configuration.
749761
This can come from (in descending order of priority):
@@ -1712,6 +1724,60 @@ def check_output_parser_exists(cls, values):
17121724
)
17131725
return values
17141726

1727+
@root_validator(pre=True, allow_reuse=True)
def check_jailbreak_detection_config(cls, values):
    """Validate jailbreak detection configuration against enabled flows.

    Reads ``rails.config.jailbreak_detection`` and ``rails.input.flows`` from
    the raw input and applies three checks:

    * Config present but neither jailbreak flow enabled: log a warning.
    * "jailbreak detection model" flow enabled: raise if a NIM URL
      (``nim_base_url`` or the deprecated ``nim_url``) is given together with
      an empty ``nim_server_endpoint``; otherwise warn when no
      ``server_endpoint`` is configured either.
    * "jailbreak detection heuristics" flow enabled: warn when no
      ``server_endpoint`` is configured.

    Each nesting level may be a mapping or an object exposing the same names
    as attributes.

    Args:
        values: The raw, not yet validated input values.

    Returns:
        ``values``, unmodified.

    Raises:
        InvalidRailsConfigurationError: If the model flow is enabled with a
            NIM URL but an empty ``nim_server_endpoint``.
    """

    def _read(container, key, default=None):
        # This runs before field validation, so a level can be a plain
        # mapping or an object the caller already built. Use ``.get`` when it
        # exists and fall back to attribute access instead of failing with
        # AttributeError.
        getter = getattr(container, "get", None)
        if callable(getter):
            return getter(key, default)
        return getattr(container, key, default)

    rails = _read(values, "rails") or {}
    config_data = _read(rails, "config") or {}
    input_flows = _read(_read(rails, "input") or {}, "flows") or []

    jailbreak_config = _read(config_data, "jailbreak_detection") or {}
    has_model_flow = JAILBREAK_FLOW_MODEL in input_flows
    has_heuristics_flow = JAILBREAK_FLOW_HEURISTICS in input_flows
    has_any_jailbreak_flow = has_model_flow or has_heuristics_flow

    if isinstance(jailbreak_config, dict):
        config_is_present = bool(jailbreak_config)
    else:
        # For a non-dict object, count it as "present" only when some field
        # was explicitly set, so an all-defaults object does not trigger the
        # orphaned-config warning. Objects without ``model_fields_set`` are
        # treated as present.
        explicitly_set = getattr(jailbreak_config, "model_fields_set", None)
        config_is_present = True if explicitly_set is None else bool(explicitly_set)

    # Case A: Config present but no flow references it
    if config_is_present and not has_any_jailbreak_flow:
        log.warning(
            "Jailbreak detection configuration is present under "
            "rails.config.jailbreak_detection but no jailbreak detection flow "
            "is enabled. To use jailbreak detection, add 'jailbreak detection model' "
            "or 'jailbreak detection heuristics' to rails.input.flows."
        )

    server_endpoint = _read(jailbreak_config, "server_endpoint")

    # Case B: "jailbreak detection model" flow is enabled
    if has_model_flow:
        nim_base_url = _read(jailbreak_config, "nim_base_url")
        nim_url = _read(jailbreak_config, "nim_url")  # deprecated, migrated later
        nim_server_endpoint = _read(jailbreak_config, "nim_server_endpoint", "classify")

        if nim_base_url or nim_url:
            if not nim_server_endpoint:
                raise InvalidRailsConfigurationError(
                    "nim_base_url is set for jailbreak detection model but "
                    "nim_server_endpoint is empty. Both must be configured "
                    "when using NIM-based jailbreak detection."
                )
        elif not server_endpoint:
            log.warning(
                "No endpoint configured for jailbreak detection model. "
                "Will fall back to local in-process detection, which is "
                "not recommended for production."
            )

    # Case C: "jailbreak detection heuristics" flow is enabled
    if has_heuristics_flow and not server_endpoint:
        log.warning(
            "No server_endpoint configured for jailbreak detection heuristics. "
            "Will fall back to local in-process detection, which is "
            "not recommended for production."
        )

    return values
1780+
17151781
@root_validator(pre=True, allow_reuse=True)
17161782
def fill_in_default_values_for_v2_x(cls, values):
17171783
instructions = values.get("instructions", {})

tests/test_jailbreak_config.py

Lines changed: 248 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,13 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15+
import logging
1516
import os
1617
from unittest.mock import patch
1718

18-
from nemoguardrails.rails.llm.config import JailbreakDetectionConfig
19+
import pytest
20+
21+
from nemoguardrails.rails.llm.config import JailbreakDetectionConfig, RailsConfig
1922

2023

2124
class TestJailbreakDetectionConfig:
@@ -184,3 +187,247 @@ def test_get_api_key_api_key_env_var_not_set(self):
184187

185188
auth_token = config.get_api_key()
186189
assert auth_token is None
190+
191+
def test_negative_length_per_perplexity_threshold_raises(self):
    """A negative length/perplexity threshold is rejected with a ValueError."""
    bad_threshold = -1.0
    with pytest.raises(ValueError, match="greater than 0"):
        JailbreakDetectionConfig(length_per_perplexity_threshold=bad_threshold)
195+
196+
def test_negative_prefix_suffix_perplexity_threshold_raises(self):
    """A prefix/suffix perplexity threshold of exactly zero is rejected with a ValueError."""
    bad_threshold = 0
    with pytest.raises(ValueError, match="greater than 0"):
        JailbreakDetectionConfig(prefix_suffix_perplexity_threshold=bad_threshold)
200+
201+
def test_invalid_nim_base_url_raises(self):
    """A nim_base_url with a non-HTTP(S) scheme is rejected with a ValueError."""
    ftp_url = "ftp://localhost:8000/v1"
    with pytest.raises(ValueError, match="nim_base_url must start with"):
        JailbreakDetectionConfig(nim_base_url=ftp_url)
205+
206+
def test_invalid_server_endpoint_raises(self):
    """A server_endpoint given without any scheme is rejected with a ValueError."""
    schemeless_endpoint = "localhost:1337/model"
    with pytest.raises(ValueError, match="server_endpoint must start with"):
        JailbreakDetectionConfig(server_endpoint=schemeless_endpoint)
210+
211+
def test_valid_urls_accepted(self):
    """An https nim_base_url and an http server_endpoint are stored unchanged."""
    nim_url = "https://nim.example.com/v1"
    endpoint = "http://localhost:1337/model"

    config = JailbreakDetectionConfig(nim_base_url=nim_url, server_endpoint=endpoint)

    assert config.nim_base_url == nim_url
    assert config.server_endpoint == endpoint
219+
220+
221+
def _make_rails_config(**kwargs):
    """Build a RailsConfig from a single default main model plus caller overrides.

    Keyword arguments are passed to ``RailsConfig`` and replace the default
    ``models`` entry when they supply one.
    """
    main_model = {"type": "main", "engine": "openai", "model": "gpt-3.5-turbo"}
    settings = {"models": [main_model], **kwargs}
    return RailsConfig(**settings)
228+
229+
230+
class TestJailbreakDetectionCrossValidation:
    """Checks of jailbreak detection settings against the enabled input flows.

    Each test builds a RailsConfig and asserts on the raised error or on the
    warnings captured by ``caplog``.
    """

    _MODEL_FLOW = "jailbreak detection model"
    _HEURISTICS_FLOW = "jailbreak detection heuristics"

    @staticmethod
    def _build(flows, jailbreak):
        """Build a RailsConfig with the given input flows and jailbreak settings.

        ``flows=None`` leaves the ``input`` section out of ``rails`` entirely.
        """
        rails = {}
        if flows is not None:
            rails["input"] = {"flows": list(flows)}
        rails["config"] = {"jailbreak_detection": jailbreak}
        return _make_rails_config(rails=rails)

    def test_model_flow_with_nim_url_but_no_endpoint_raises(self):
        """Model flow with nim_base_url and an empty nim_server_endpoint raises."""
        jailbreak = {
            "nim_base_url": "http://localhost:8000/v1",
            "nim_server_endpoint": "",
        }
        with pytest.raises(Exception, match="nim_server_endpoint is empty"):
            self._build([self._MODEL_FLOW], jailbreak)

    def test_model_flow_with_no_endpoints_warns(self, caplog):
        """Model flow with an empty jailbreak config warns about local fallback."""
        with caplog.at_level(logging.WARNING):
            self._build([self._MODEL_FLOW], {})
        assert "No endpoint configured for jailbreak detection model" in caplog.text

    def test_heuristics_flow_with_no_server_endpoint_warns(self, caplog):
        """Heuristics flow with an empty jailbreak config warns about local fallback."""
        with caplog.at_level(logging.WARNING):
            self._build([self._HEURISTICS_FLOW], {})
        assert "No server_endpoint configured for jailbreak detection heuristics" in caplog.text

    def test_jailbreak_config_present_but_no_flow_warns(self, caplog):
        """Jailbreak config without any input section warns that no flow is enabled."""
        jailbreak = {"nim_base_url": "http://localhost:8000/v1"}
        with caplog.at_level(logging.WARNING):
            self._build(None, jailbreak)
        assert "no jailbreak detection flow is enabled" in caplog.text

    def test_model_flow_with_nim_fully_configured_passes(self, caplog):
        """Model flow with both NIM settings logs nothing about jailbreak."""
        jailbreak = {
            "nim_base_url": "http://localhost:8000/v1",
            "nim_server_endpoint": "classify",
        }
        with caplog.at_level(logging.WARNING):
            config = self._build([self._MODEL_FLOW], jailbreak)
        assert "jailbreak" not in caplog.text.lower()
        assert config.rails.config.jailbreak_detection.nim_base_url == "http://localhost:8000/v1"

    def test_model_flow_with_deprecated_nim_url_no_spurious_warning(self, caplog):
        """Deprecated nim_url/nim_port must not trigger the 'no endpoint' warning."""
        jailbreak = {"nim_url": "localhost", "nim_port": 8000}
        with caplog.at_level(logging.WARNING):
            config = self._build([self._MODEL_FLOW], jailbreak)
        assert "No endpoint configured" not in caplog.text
        # The deprecated fields must have been migrated into nim_base_url.
        assert config.rails.config.jailbreak_detection.nim_base_url == "http://localhost:8000/v1"

    def test_model_flow_with_server_endpoint_passes(self, caplog):
        """Model flow with only server_endpoint logs nothing about jailbreak."""
        jailbreak = {"server_endpoint": "http://localhost:1337/model"}
        with caplog.at_level(logging.WARNING):
            self._build([self._MODEL_FLOW], jailbreak)
        assert "jailbreak" not in caplog.text.lower()

    def test_heuristics_flow_with_server_endpoint_passes(self, caplog):
        """Heuristics flow with server_endpoint logs nothing about jailbreak."""
        jailbreak = {"server_endpoint": "http://localhost:1337/heuristics"}
        with caplog.at_level(logging.WARNING):
            self._build([self._HEURISTICS_FLOW], jailbreak)
        assert "jailbreak" not in caplog.text.lower()

    def test_model_flow_deprecated_nim_url_empty_server_endpoint_raises(self):
        """Deprecated nim_url with an empty nim_server_endpoint raises."""
        jailbreak = {"nim_url": "localhost", "nim_server_endpoint": ""}
        with pytest.raises(Exception, match="nim_server_endpoint is empty"):
            self._build([self._MODEL_FLOW], jailbreak)

    def test_model_flow_nim_port_only_warns(self, caplog):
        """nim_port without nim_url or nim_base_url warns about local fallback."""
        jailbreak = {"nim_port": 9000}
        with caplog.at_level(logging.WARNING):
            self._build([self._MODEL_FLOW], jailbreak)
        assert "No endpoint configured for jailbreak detection model" in caplog.text

    def test_both_flows_nim_only_warns_heuristics(self, caplog):
        """With both flows and only NIM configured, only heuristics warns."""
        jailbreak = {"nim_base_url": "http://localhost:8000/v1"}
        with caplog.at_level(logging.WARNING):
            self._build([self._MODEL_FLOW, self._HEURISTICS_FLOW], jailbreak)
        assert "No endpoint configured for jailbreak detection model" not in caplog.text
        assert "No server_endpoint configured for jailbreak detection heuristics" in caplog.text

    def test_both_flows_server_endpoint_only_passes(self, caplog):
        """With both flows and a server_endpoint, nothing about jailbreak is logged."""
        jailbreak = {"server_endpoint": "http://localhost:1337/model"}
        with caplog.at_level(logging.WARNING):
            self._build([self._MODEL_FLOW, self._HEURISTICS_FLOW], jailbreak)
        assert "jailbreak" not in caplog.text.lower()

    def test_explicit_null_jailbreak_detection_config(self, caplog):
        """An explicit None for jailbreak_detection still yields the model-flow warning."""
        with caplog.at_level(logging.WARNING):
            self._build([self._MODEL_FLOW], None)
        assert "No endpoint configured for jailbreak detection model" in caplog.text

    def test_no_jailbreak_config_no_flow_no_warnings(self, caplog):
        """A default config with no rails section logs nothing about jailbreak."""
        with caplog.at_level(logging.WARNING):
            _make_rails_config()
        assert "jailbreak" not in caplog.text.lower()

0 commit comments

Comments
 (0)