"""Toxicity detection using transformer sequence-classification models.
Default model: ``citizenlab/distilbert-base-multilingual-cased-toxicity``
(DistilBERT multilingual, F1-micro 0.94, 10 languages).
"""
from __future__ import annotations
import os
from typing import Any, Dict, List, Optional, Tuple
from transformers import pipeline
[docs]
class ToxicityDetector:
    """Classify text as toxic or non-toxic.

    Wraps a ``text-classification`` pipeline and compares the toxic score
    derived from its output against a configurable threshold.

    Parameters
    ----------
    model_name : str
        A key in :py:attr:`predefined_models` or any Hugging Face model ID /
        local path suitable for ``text-classification``.
    threshold : float
        Score above which text is considered toxic.

    Raises
    ------
    ValueError
        If *threshold* is not an ``int``/``float``, or is NaN.
    """

    # Short aliases accepted as ``model_name``, mapped to full model IDs.
    predefined_models: Dict[str, str] = {
        "citizenlab": "citizenlab/distilbert-base-multilingual-cased-toxicity",
    }

    def __init__(
        self,
        model_name: str = "citizenlab",
        threshold: float = 0.5,
    ) -> None:
        """Validate *threshold*, resolve *model_name* and build the pipeline."""
        if not isinstance(threshold, (int, float)):
            raise ValueError("threshold must be a number.")
        self.default_threshold = self._checked_threshold(float(threshold))

        # Resolve an alias to its full model ID; unknown names pass through.
        model_path = self.predefined_models.get(model_name, model_name)
        # An existing filesystem path wins, even if it collides with an alias.
        if os.path.exists(model_name):
            model_path = model_name

        self._pipeline = pipeline(
            "text-classification",
            model=model_path,
        )

    @staticmethod
    def _checked_threshold(value: float) -> float:
        """Return *value* unchanged, raising ``ValueError`` if it is NaN.

        NaN compares False against every score, so accepting it would
        silently disable detection instead of failing loudly.
        """
        import math  # local import: this module does not import math at top level

        if math.isnan(value):
            raise ValueError("threshold must be a number.")
        return value

    def detect(
        self,
        text: str,
        threshold: Optional[float] = None,
    ) -> Tuple[bool, float]:
        """Detect whether *text* is toxic.

        Returns ``(is_toxic, score)`` mirroring the
        ``PromptInjectionDetector.detect_injection`` return signature.

        Parameters
        ----------
        text : str
            The text to classify.
        threshold : float, optional
            Per-call override of the instance's default threshold.

        Raises
        ------
        TypeError
            If *text* is not a string.
        ValueError
            If the per-call *threshold* is NaN.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string.")

        if threshold is None:
            cutoff = self.default_threshold
        else:
            # Checked before running the model so a bad override fails fast.
            cutoff = self._checked_threshold(float(threshold))

        # NOTE(review): no truncation argument is passed to the pipeline;
        # confirm how inputs longer than the model's limit should be handled.
        results: List[Dict[str, Any]] = self._pipeline(text)
        # NOTE(review): _extract_toxic_score is not defined in the visible
        # part of this class; presumably it lives elsewhere -- verify.
        toxic_score = self._extract_toxic_score(results)
        # Strictly greater: a score exactly equal to the threshold is not toxic.
        return toxic_score > cutoff, toxic_score

    def report(self, text: str, threshold: Optional[float] = None) -> None:
        """Print a human-readable toxicity summary for *text*.

        Delegates to :py:meth:`detect`, so the same exceptions apply.
        """
        is_toxic, score = self.detect(text, threshold=threshold)
        if is_toxic:
            print(f"Toxic content detected (score={score:.2f}).")
        else:
            print(f"No toxicity detected (score={score:.2f}).")