"""Rule-based PII and credential detection using customisable regex patterns.
This module is **pure Python stdlib** — no model downloads, no heavy
dependencies. It ships with sensible defaults for common PII types and
lets users add, remove, or completely replace patterns at construction
time or at runtime.
"""
from __future__ import annotations
import re
from typing import Any, Dict, List, Optional, Tuple
# Built-in detectors: maps the label reported for a match to the regex
# source string that finds it.  Adjacent raw-string pieces are concatenated
# by the parser, so each entry is a single pattern string at runtime.
DEFAULT_PATTERNS: Dict[str, str] = {
    # local-part "@" domain "." alphabetic suffix of two or more letters.
    "EMAIL": (
        r"[a-zA-Z0-9._%+\-]+"
        r"@[a-zA-Z0-9.\-]+"
        r"\.[a-zA-Z]{2,}"
    ),
    # Optional "+1"/"1" prefix, three digits (parentheses optional), then
    # three and four digits; whitespace, "-" or "." may separate the groups.
    "PHONE": r"(?:\+?1[\s\-.]?)?(?:\(?\d{3}\)?[\s\-.]?)\d{3}[\s\-.]?\d{4}",
    # Strict ddd-dd-dddd layout, bounded by word boundaries.
    "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
    # Leading four digits must start with 4, 51-55, 34/37, 6011 or 65;
    # then two groups of four digits and a final group of one to four,
    # each optionally separated by whitespace or "-".
    "CREDIT_CARD": (
        r"\b(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2}|6(?:011|5\d{2}))"
        r"[\s\-]?\d{4}"
        r"[\s\-]?\d{4}"
        r"[\s\-]?\d{1,4}\b"
    ),
    # Four dot-separated numbers, each limited to 0-255.
    "IP_ADDRESS": (
        r"\b"
        r"(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
        r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)"
        r"\b"
    ),
    # "sk"/"pk", an environment tag, then 16+ alphanumerics, joined by
    # "-" or "_".
    "API_KEY": r"(?:sk|pk)[-_](?:live|test|prod|dev)[-_][A-Za-z0-9]{16,}",
    # Three dot-separated segments of [A-Za-z0-9_-], the first one
    # starting with "eyJ".
    "JWT_TOKEN": (
        r"eyJ[A-Za-z0-9_\-]+"
        r"\.[A-Za-z0-9_\-]+"
        r"\.[A-Za-z0-9_\-]+"
    ),
}
class RegexScanner:
    """Scan text for sensitive data using compiled regular expressions.

    Parameters
    ----------
    patterns : dict[str, str] | None
        Mapping of ``{PATTERN_NAME: regex_string}``. Merged with the
        built-in defaults when *use_defaults* is ``True``, or used alone
        when ``False``. On a name clash the caller's pattern replaces the
        built-in one.
    use_defaults : bool
        Whether to include the built-in patterns (EMAIL, PHONE, SSN,
        CREDIT_CARD, IP_ADDRESS, API_KEY, JWT_TOKEN).

    Raises
    ------
    re.error
        If any supplied pattern is not a valid regular expression.
    """

    def __init__(
        self,
        patterns: Optional[Dict[str, str]] = None,
        use_defaults: bool = True,
    ) -> None:
        raw: Dict[str, str] = {}
        if use_defaults:
            raw.update(DEFAULT_PATTERNS)
        if patterns:
            # Applied after the defaults so user entries win on a name clash.
            raw.update(patterns)
        # Source strings are kept alongside the compiled objects so that
        # get_patterns() can hand back what the caller originally supplied.
        self._patterns: Dict[str, str] = raw
        self._compiled: Dict[str, re.Pattern[str]] = {
            name: re.compile(pat) for name, pat in raw.items()
        }

    # ------------------------------------------------------------------
    # Core API
    # ------------------------------------------------------------------

    def scan(self, text: str) -> Tuple[bool, List[Dict[str, Any]]]:
        """Scan *text* against all active patterns.

        Parameters
        ----------
        text : str
            The text to search.

        Returns
        -------
        tuple[bool, list[dict]]
            ``(has_matches, matches)`` where each match dict contains
            ``pattern_name``, ``match``, ``start``, and ``end``. Matches are
            ordered by ``start``; matches sharing a start keep the order in
            which their patterns were registered (the sort is stable).
            Matches from different patterns may overlap.

        Raises
        ------
        TypeError
            If *text* is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string.")
        matches: List[Dict[str, Any]] = []
        for name, compiled in self._compiled.items():
            for m in compiled.finditer(text):
                matches.append({
                    "pattern_name": name,
                    "match": m.group(),
                    "start": m.start(),
                    "end": m.end(),
                })
        matches.sort(key=lambda m: m["start"])
        return bool(matches), matches

    def redact(self, text: str, replacement: str = "[REDACTED]") -> str:
        """Return a copy of *text* with all matches replaced by *replacement*.

        Overlapping or touching matches are merged first, so each merged
        span is replaced exactly once.

        Parameters
        ----------
        text : str
            The text to redact.
        replacement : str
            The string substituted for every merged span.

        Raises
        ------
        TypeError
            If *text* is not a ``str`` (raised by :meth:`scan`).
        """
        _, matches = self.scan(text)
        merged = self._merge_overlapping(matches)
        # Merged spans are sorted and disjoint, so the result can be built in
        # one left-to-right pass: untouched text, replacement, untouched ...
        pieces: List[str] = []
        cursor = 0
        for span in merged:
            pieces.append(text[cursor:span["start"]])
            pieces.append(replacement)
            cursor = span["end"]
        pieces.append(text[cursor:])
        return "".join(pieces)

    def report(self, text: str) -> None:
        """Print a human-readable scan summary to standard output.

        Raises
        ------
        TypeError
            If *text* is not a ``str`` (raised by :meth:`scan`).
        """
        has_matches, matches = self.scan(text)
        if not has_matches:
            print("No matches found.")
            return
        print(f"Regex scan — {len(matches)} match{'es' if len(matches) != 1 else ''} found:")
        for m in matches:
            print(
                f" [{m['pattern_name']}] \"{m['match']}\" "
                f"(pos={m['start']}:{m['end']})"
            )

    # ------------------------------------------------------------------
    # Pattern management
    # ------------------------------------------------------------------

    def add_pattern(self, name: str, pattern: str) -> None:
        """Add or overwrite a pattern at runtime.

        Raises
        ------
        re.error
            If *pattern* is not a valid regular expression. The scanner is
            left unchanged in that case.
        """
        # Compile first: if this raises, neither mapping has been touched,
        # so the source strings and compiled objects cannot drift apart.
        compiled = re.compile(pattern)
        self._patterns[name] = pattern
        self._compiled[name] = compiled

    def remove_pattern(self, name: str) -> None:
        """Remove a pattern by name. No-op if not present."""
        self._patterns.pop(name, None)
        self._compiled.pop(name, None)

    def get_patterns(self) -> Dict[str, str]:
        """Return a copy of the active pattern dictionary."""
        return self._patterns.copy()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _merge_overlapping(
        matches: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Merge overlapping spans so redaction doesn't double-replace.

        Spans that overlap or touch (``start <= previous end``) are fused
        into one. A fused entry keeps the ``pattern_name`` of its earliest
        match; its ``match`` text is extended with the non-overlapping tail
        of each later match. The input list and its dicts are not mutated.

        Parameters
        ----------
        matches : list[dict]
            Match dicts with ``start``, ``end`` and ``match`` keys, in any
            order.

        Returns
        -------
        list[dict]
            New match dicts sorted by ``start`` with pairwise-disjoint spans.
        """
        if not matches:
            return []
        sorted_matches = sorted(matches, key=lambda m: m["start"])
        merged: List[Dict[str, Any]] = [sorted_matches[0].copy()]
        for m in sorted_matches[1:]:
            prev = merged[-1]
            if m["start"] <= prev["end"]:
                if m["end"] > prev["end"]:
                    # The offset must be taken from the OLD end; updating
                    # ``end`` first would make the slice below empty.
                    overlap = prev["end"] - m["start"]
                    prev["match"] = prev["match"] + m["match"][overlap:]
                    prev["end"] = m["end"]
            else:
                merged.append(m.copy())
        return merged