Source code for pytector.toxicity

"""Toxicity detection using transformer sequence-classification models.

Default model: ``citizenlab/distilbert-base-multilingual-cased-toxicity``
(DistilBERT multilingual, F1-micro 0.94, 10 languages).
"""

from __future__ import annotations

import os
from typing import Any, Dict, List, Optional, Tuple

from transformers import pipeline


class ToxicityDetector:
    """Classify text as toxic or non-toxic.

    Parameters
    ----------
    model_name : str
        A key in :attr:`predefined_models` or any Hugging Face model ID /
        local path suitable for ``text-classification``.
    threshold : float
        Score above which text is considered toxic.

    Raises
    ------
    ValueError
        If *threshold* is not an ``int`` or ``float``.
    """

    predefined_models: Dict[str, str] = {
        "citizenlab": "citizenlab/distilbert-base-multilingual-cased-toxicity",
    }

    # Label vocabularies, compared after lower-casing and mapping "-" / " "
    # to "_" (so "non-toxic", "Non Toxic" and "non_toxic" are equivalent).
    _TOXIC_LABELS = frozenset({"toxic", "label_1"})
    _NON_TOXIC_LABELS = frozenset(
        {"non_toxic", "not_toxic", "nontoxic", "neutral", "label_0"}
    )

    def __init__(
        self,
        model_name: str = "citizenlab",
        threshold: float = 0.5,
    ) -> None:
        """Load the classification pipeline and store the default threshold."""
        if not isinstance(threshold, (int, float)):
            raise ValueError("threshold must be a number.")
        self.default_threshold = float(threshold)

        # Resolve short keys to their full model ID; anything else is passed
        # through unchanged as a Hugging Face model ID or path.
        model_path = self.predefined_models.get(model_name, model_name)
        # A path that exists on disk wins, even if it also happens to match
        # a predefined key.
        if os.path.exists(model_name):
            model_path = model_name

        self._pipeline = pipeline(
            "text-classification",
            model=model_path,
        )

    def detect(
        self,
        text: str,
        threshold: Optional[float] = None,
    ) -> Tuple[bool, float]:
        """Detect whether *text* is toxic.

        Returns ``(is_toxic, score)`` mirroring the
        ``PromptInjectionDetector.detect_injection`` return signature.

        Parameters
        ----------
        text : str
            The text to classify.
        threshold : float, optional
            Per-call override of the threshold given at construction time.

        Returns
        -------
        tuple of (bool, float)
            ``is_toxic`` is ``True`` when the toxicity score is strictly
            greater than the threshold; ``score`` is the toxicity score.

        Raises
        ------
        TypeError
            If *text* is not a string.

        Notes
        -----
        Input longer than the model's maximum sequence length is truncated
        by the tokenizer, so only the leading part of a very long text is
        scored.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string.")

        threshold = self.default_threshold if threshold is None else float(threshold)
        # Without truncation, over-long input fails inside the model instead
        # of producing a score.
        results: List[Dict[str, Any]] = self._pipeline(text, truncation=True)
        toxic_score = self._extract_toxic_score(results)
        return toxic_score > threshold, toxic_score

    def report(self, text: str, threshold: Optional[float] = None) -> None:
        """Print a human-readable toxicity summary."""
        is_toxic, score = self.detect(text, threshold=threshold)
        if is_toxic:
            print(f"Toxic content detected (score={score:.2f}).")
        else:
            print(f"No toxicity detected (score={score:.2f}).")

    @staticmethod
    def _extract_toxic_score(results: List[Dict[str, Any]]) -> float:
        """Normalise pipeline output into a single toxicity probability.

        The citizenlab model returns
        ``[{"label": "toxic"|"non-toxic", "score": float}]``. Other models
        may use ``LABEL_1`` / ``LABEL_0`` conventions or a ``neutral`` label
        for the negative class.

        Parameters
        ----------
        results : list of dict
            Pipeline output; only the first entry is inspected.

        Returns
        -------
        float
            ``0.0`` for empty output; ``score`` for a toxic label;
            ``1.0 - score`` for a non-toxic label; ``score`` unchanged for
            any label that is not recognised.
        """
        if not results:
            return 0.0

        result = results[0]
        # Tolerate a missing/None label and unify separator spelling so that
        # variants such as "not-toxic" or "Non Toxic" are recognised.
        raw_label = result.get("label") or ""
        label = str(raw_label).strip().lower().replace("-", "_").replace(" ", "_")
        score = float(result.get("score", 0.0))

        if label in ToxicityDetector._TOXIC_LABELS:
            return score
        if label in ToxicityDetector._NON_TOXIC_LABELS:
            # The score is the confidence in the negative class.
            return 1.0 - score
        # Unknown label (e.g. a multi-label category such as "insult"):
        # keep the reported score as-is.
        return score