Метрики системы обнаружения мошенничества¶
~3 минуты чтения
Предварительно: Определение задачи, Метрики классификации
В fraud detection одна неправильная метрика может привести к катастрофе. Оптимизация accuracy (99.9% для dummy-классификатора) бессмысленна. Оптимизация только recall (блокируем всё подозрительное) -- и 0.5% легитимных клиентов получают отказ, что при 500M транзакций в день = 2.5M недовольных клиентов. Оптимизация только precision -- и пропускаем 20% фрода ($6M потерь в месяц). Production fraud detection использует 4 уровня метрик: business ($-потери), operational (review queue), model (precision/recall/AUC-PR) и system (latency/throughput). Каждый уровень влияет на следующий, и интервьюер ожидает, что вы покажете эту связь.
Metric Hierarchy¶
graph TD
BIZ["BUSINESS METRICS<br/>Fraud Loss Rate, Revenue Impact, Customer Churn"]
OPS["OPERATIONAL METRICS<br/>False Positive Rate, Review Volume, Investigation Time"]
MODEL["MODEL METRICS<br/>Precision, Recall, AUC-PR, F1, Detection Rate"]
SYS["SYSTEM METRICS<br/>Latency, Throughput, Availability, Error Rate"]
BIZ --> OPS --> MODEL --> SYS
style BIZ fill:#fce4ec,stroke:#c62828
style OPS fill:#fff3e0,stroke:#ef6c00
style MODEL fill:#e8f5e9,stroke:#4caf50
style SYS fill:#e8eaf6,stroke:#3f51b5
Business Metrics¶
Fraud Loss Metrics¶
class FraudLossMetrics:
    """Business-level fraud loss KPIs: loss rate, prevented fraud, false-decline cost.

    Relies on data-access helpers (get_transaction_volume, get_fraud_losses, ...)
    that are presumably defined on the full class elsewhere — not shown here.
    """

    def compute_fraud_loss_rate(self, period: str = "monthly") -> dict:
        """Fraud losses as a fraction of total transaction volume for *period*."""
        volume = self.get_transaction_volume(period)
        losses = self.get_fraud_losses(period)
        result = {
            "fraud_loss_rate": losses / volume,
            "fraud_loss_amount": losses,
            "total_volume": volume,
            "target": 0.001,  # 0.1% loss-rate target
        }
        return result

    def compute_prevented_fraud(self, period: str = "monthly") -> dict:
        """Dollar amount and rate of fraud that was caught and blocked."""
        blocked_amount = self.get_blocked_fraud_amount(period)
        attempted_amount = self.get_total_fraud_attempts(period)
        return {
            "prevented_fraud_amount": blocked_amount,
            "prevention_rate": blocked_amount / attempted_amount,
            "blocked_transactions": self.get_blocked_count(period),
        }

    def compute_false_decline_cost(self, period: str = "monthly") -> dict:
        """Estimated revenue lost to false positives (legitimate txns declined)."""
        declined = self.get_false_decline_transactions(period)
        declined_volume = 0
        revenue_loss = 0
        for txn in declined:
            declined_volume += txn.amount
            # Scale by LTV factor: a declined customer may churn entirely.
            revenue_loss += txn.amount * self.customer_lifetime_value_factor
        return {
            "false_decline_count": len(declined),
            "false_decline_volume": declined_volume,
            "estimated_revenue_loss": revenue_loss,
            "false_decline_rate": len(declined) / self.get_total_declines(period),
        }
Cost-Benefit Analysis¶
def compute_fraud_roi(
    fraud_prevented: float,
    false_decline_cost: float,
    operational_cost: float,
    model_development_cost: float
) -> dict:
    """Return the ROI breakdown for a fraud detection system.

    Net benefit = prevented fraud minus all costs (operations, R&D, and
    revenue lost to false declines). ROI is net benefit over total cost,
    reported as 0 when total cost is zero to avoid division by zero.
    """
    total = operational_cost + model_development_cost + false_decline_cost
    net = fraud_prevented - total
    roi_ratio = net / total if total > 0 else 0
    return {
        "fraud_prevented": fraud_prevented,
        "false_decline_cost": false_decline_cost,
        "operational_cost": operational_cost,
        "development_cost": model_development_cost,
        "total_cost": total,
        "net_benefit": net,
        "roi": roi_ratio,
    }
# Example: $10M prevented against $1.7M total cost.
roi = compute_fraud_roi(
    fraud_prevented=10_000_000,      # $10M prevented
    false_decline_cost=500_000,      # $500K lost revenue
    operational_cost=1_000_000,      # $1M operations
    model_development_cost=200_000   # $200K R&D
)
# ROI = (10M - 1.7M) / 1.7M = 4.88x
Model Metrics¶
Classification Metrics¶
from sklearn.metrics import precision_recall_curve, auc
def compute_model_metrics(y_true, y_pred_proba, threshold=0.5):
    """Threshold-based and threshold-free model evaluation metrics.

    Args:
        y_true: binary ground-truth labels (numpy array of 0/1).
        y_pred_proba: predicted fraud probabilities, same length.
        threshold: decision cutoff for flagging a transaction as fraud.

    Returns a dict with precision/recall/F1, FPR, AUC-PR and raw
    confusion-matrix counts. AUC-PR is used instead of AUC-ROC because
    it is far more informative under heavy class imbalance.
    """
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Confusion-matrix cells from boolean masks.
    actual_pos = y_true == 1
    actual_neg = y_true == 0
    flagged = y_pred == 1
    passed = y_pred == 0
    TP = (flagged & actual_pos).sum()
    FP = (flagged & actual_neg).sum()
    TN = (passed & actual_neg).sum()
    FN = (passed & actual_pos).sum()

    # Guard every ratio against an empty denominator.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    # FPR drives customer friction, so it is first-class here.
    fpr = FP / (FP + TN) if (FP + TN) > 0 else 0

    # Threshold-free metric over the full PR curve.
    precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_pred_proba)
    auc_pr = auc(recall_curve, precision_curve)

    return {
        "precision": precision,  # Target: > 90%
        "recall": recall,        # Target: > 95%
        "f1_score": f1,
        "fpr": fpr,              # Target: < 0.1%
        "auc_pr": auc_pr,        # Target: > 0.7
        "confusion_matrix": {
            "TP": TP, "FP": FP, "TN": TN, "FN": FN
        }
    }
Cost-Sensitive Metrics¶
def compute_cost_weighted_metrics(y_true, y_pred, amounts):
    """Dollar-weighted fraud metrics: count-based recall treats a $5 and a
    $5000 fraud equally, so we also track how much fraud *value* was caught.

    Args:
        y_true: binary labels (numpy array, 1 = fraud).
        y_pred: binary predictions (1 = flagged as fraud).
        amounts: transaction amounts aligned with the labels.
    """
    is_fraud = y_true == 1
    flagged = y_pred == 1

    total_fraud_value = amounts[is_fraud].sum()
    caught_value = amounts[is_fraud & flagged].sum()
    missed_value = amounts[is_fraud & ~flagged].sum()
    declined_value = amounts[~is_fraud & flagged].sum()

    metrics = {
        # Value-based recall: share of fraud dollars we caught.
        "value_recall": caught_value / total_fraud_value,
        # Fraud dollars that slipped through.
        "missed_fraud_value": missed_value,
        # Legitimate dollars we incorrectly blocked.
        "false_decline_value": declined_value,
        # Net saved = caught fraud minus blocked legitimate volume.
        "net_value_saved": caught_value - declined_value,
    }
    return metrics
Detection Metrics by Fraud Type¶
def compute_detection_by_fraud_type(y_true, y_pred, fraud_types):
    """Per-fraud-type precision/recall breakdown.

    Args:
        y_true: binary labels aligned with fraud_types.
        y_pred: binary predictions.
        fraud_types: categorical labels; must expose .unique() and support
            boolean masking (e.g. a pandas Series) — assumed, confirm at caller.
    """
    breakdown = {}
    for ftype in fraud_types.unique():
        sel = fraud_types == ftype
        labels = y_true[sel]
        preds = y_pred[sel]
        # Hoist the shared true-positive count out of both ratios.
        hits = (preds & labels).sum()
        flagged = preds.sum()
        breakdown[ftype] = {
            "recall": hits / labels.sum(),
            "precision": hits / flagged if flagged > 0 else 0,
            "volume": sel.sum(),
            "fraud_count": labels.sum(),
        }
    return breakdown
# Example output:
# {
#   "card_not_present": {"recall": 0.95, "precision": 0.88, ...},
#   "account_takeover": {"recall": 0.92, "precision": 0.75, ...},
#   "promo_abuse": {"recall": 0.85, "precision": 0.90, ...},
# }
# Example output:
# {
# "card_not_present": {"recall": 0.95, "precision": 0.88, ...},
# "account_takeover": {"recall": 0.92, "precision": 0.75, ...},
# "promo_abuse": {"recall": 0.85, "precision": 0.90, ...},
# }
Operational Metrics¶
Review Queue Metrics¶
class ReviewQueueMetrics:
    """Operational metrics for the manual fraud-review queue and its analysts.

    Depends on data-access helpers (get_cases, get_cases_by_analyst) and an
    analyst_count attribute defined on the full class elsewhere — not shown here.
    """

    def compute_queue_metrics(self, period: str = "daily") -> dict:
        """Volume, efficiency, quality and SLA metrics for the review queue."""
        cases = self.get_cases(period)
        total = len(cases)
        handle_times = [c.investigation_time for c in cases]
        return {
            # Volume
            "total_cases": total,
            "cases_per_analyst": total / self.analyst_count,
            # Efficiency
            "avg_investigation_time_min": np.mean(handle_times),
            "median_investigation_time_min": np.median(handle_times),
            # Quality
            "escalation_rate": sum(c.escalated for c in cases) / total,
            "reversal_rate": sum(c.reversed for c in cases) / total,
            # SLA
            "sla_breach_rate": sum(c.sla_breached for c in cases) / total,
            "avg_queue_wait_time_min": np.mean([c.queue_wait_time for c in cases]),
        }

    def compute_analyst_metrics(self, analyst_id: str, period: str = "monthly") -> dict:
        """Per-analyst throughput and decision-quality metrics."""
        cases = self.get_cases_by_analyst(analyst_id, period)
        total = len(cases)
        return {
            "cases_reviewed": total,
            "avg_time_per_case_min": np.mean([c.investigation_time for c in cases]),
            "accuracy": sum(c.decision_correct for c in cases) / total,
            "false_positive_rate": sum(c.was_false_positive for c in cases) / total,
        }
Alert Metrics¶
def compute_alert_metrics(alerts: List[Alert], period: str = "daily") -> dict:
    """Volume, quality, triage and severity metrics for fraud alerts.

    Args:
        alerts: alerts raised in the period; each must expose is_true_positive,
            is_false_positive, auto_resolved, escalated and severity
            ("high"/"medium"/"low").
        period: reporting period label. NOTE(review): currently unused —
            alerts_per_hour assumes a 24-hour window regardless of period.

    Returns:
        Dict of alert KPIs. For an empty alert list all counts and rates are
        zero (previously this raised ZeroDivisionError on a quiet period).
    """
    total = len(alerts)
    if total == 0:
        # No alerts: report explicit zeros instead of dividing by zero.
        return {
            "total_alerts": 0,
            "alerts_per_hour": 0.0,
            "true_positive_rate": 0.0,
            "false_positive_rate": 0.0,
            "auto_resolved_rate": 0.0,
            "escalated_rate": 0.0,
            "high_severity_count": 0,
            "medium_severity_count": 0,
            "low_severity_count": 0,
        }
    return {
        # Volume
        "total_alerts": total,
        "alerts_per_hour": total / 24,
        # Quality
        "true_positive_rate": sum(a.is_true_positive for a in alerts) / total,
        "false_positive_rate": sum(a.is_false_positive for a in alerts) / total,
        # Triage
        "auto_resolved_rate": sum(a.auto_resolved for a in alerts) / total,
        "escalated_rate": sum(a.escalated for a in alerts) / total,
        # By severity
        "high_severity_count": sum(a.severity == "high" for a in alerts),
        "medium_severity_count": sum(a.severity == "medium" for a in alerts),
        "low_severity_count": sum(a.severity == "low" for a in alerts),
    }
System Metrics¶
Real-time Monitoring¶
from prometheus_client import Counter, Histogram, Gauge
# Prometheus instruments for real-time monitoring of the scoring service.
# Request metrics: one counter per final decision outcome.
SCORING_REQUESTS = Counter(
    'fraud_scoring_requests_total',
    'Total scoring requests',
    ['decision']  # approve, review, decline
)
# End-to-end scoring latency; buckets chosen around the ~100ms p99 SLA.
SCORING_LATENCY = Histogram(
    'fraud_scoring_latency_ms',
    'Scoring latency in milliseconds',
    buckets=[10, 25, 50, 100, 250, 500, 1000]
)
# Component latency: per-stage breakdown with tighter (sub-request) buckets.
COMPONENT_LATENCY = Histogram(
    'fraud_component_latency_ms',
    'Component latency',
    ['component'],  # rules, ml_model, graph, feature_store
    buckets=[1, 5, 10, 25, 50, 100]
)
# Model metrics: score distribution — drift here is an early degradation signal.
MODEL_PREDICTION_SCORE = Histogram(
    'fraud_model_prediction_score',
    'Distribution of model predictions',
    buckets=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
)
# Feature store cache hits, labeled by feature type.
FEATURE_CACHE_HIT = Counter(
    'fraud_feature_cache_hit_total',
    'Feature cache hits',
    ['feature_type']
)
# Errors during scoring, labeled by error type.
SCORING_ERRORS = Counter(
    'fraud_scoring_errors_total',
    'Scoring errors',
    ['error_type']
)
SLA Monitoring¶
class SLAMonitor:
    """Compares observed system metrics against fixed SLA targets."""

    def __init__(self):
        # Latency/error/FPR targets are upper bounds; availability is a lower bound.
        self.sla_targets = {
            "latency_p50": 35,  # ms
            "latency_p99": 100,  # ms
            "availability": 99.99,  # %
            "error_rate": 0.1,  # %
            "false_positive_rate": 1.0,  # %
        }

    def check_sla_compliance(self, metrics: dict) -> dict:
        """Return per-metric {target, actual, status, margin}.

        Metrics absent from *metrics* get status "unknown". Margin is the
        remaining headroom as a percentage of the allowed budget; it goes
        negative on a breach.
        """
        report = {}
        for name, target in self.sla_targets.items():
            observed = metrics.get(name)
            if observed is None:
                report[name] = {"status": "unknown"}
                continue
            if name in ["latency_p50", "latency_p99", "error_rate", "false_positive_rate"]:
                # Lower-is-better metrics: breach when above target.
                report[name] = {
                    "target": target,
                    "actual": observed,
                    "status": "ok" if observed <= target else "breach",
                    "margin": (target - observed) / target * 100
                }
            else:  # availability
                # Higher-is-better: margin is measured against the downtime budget.
                report[name] = {
                    "target": target,
                    "actual": observed,
                    "status": "ok" if observed >= target else "breach",
                    "margin": (observed - target) / (100 - target) * 100
                }
        return report
Dashboards¶
Executive Dashboard¶
+-------------------------------------------------------------------------+
| FRAUD DETECTION - EXECUTIVE DASHBOARD |
+-------------------------------------------------------------------------+
| |
| +----------------+ +----------------+ +----------------+ |
| | FRAUD LOSS | | PREVENTED | | FALSE DECLINE | |
| | RATE | | FRAUD | | RATE | |
| | 0.08% | | $2.5M | | 0.5% | |
| | v -15% | | ^ +12% | | v -20% | |
| +----------------+ +----------------+ +----------------+ |
| |
| +-------------------------------------------------------------+ |
| | FRAUD TREND (Last 30 Days) | |
| | 0.2% | | |
| | | *** | |
| | 0.1% | ****** | |
| | | **** | |
| | 0% | ******** Target | |
| | +-------------------------------------------------- | |
| +-------------------------------------------------------------+ |
| |
| +--------------------------------+ +------------------------------+ |
| | TOP FRAUD TYPES | | DETECTION BY CHANNEL | |
| | 1. Card Not Present: 45% | | Online: 95% detected | |
| | 2. Account Takeover: 30% | | Mobile: 92% detected | |
| | 3. Promo Abuse: 15% | | POS: 98% detected | |
| | 4. Other: 10% | | ATM: 99% detected | |
| +--------------------------------+ +------------------------------+ |
| |
+-------------------------------------------------------------------------+
ML Team Dashboard¶
+-------------------------------------------------------------------------+
| MODEL PERFORMANCE DASHBOARD |
+-------------------------------------------------------------------------+
| |
| Model: fraud_xgb_v3.2 | Deployed: 2024-01-15 | Training: 01-10 |
| |
| +-------------------------------------------------------------+ |
| | CLASSIFICATION METRICS | |
| | +---------+---------+---------+---------+---------+ | |
| | |Precision| Recall | F1 | FPR | AUC-PR | | |
| | | 92.3% | 96.1% | 94.2% | 0.08% | 0.847 | | |
| | | ^+1.2% | ^+0.5% | ^+0.8% | v-0.02%| ^+0.02 | | |
| | +---------+---------+---------+---------+---------+ | |
| +-------------------------------------------------------------+ |
| |
| +-------------------------------------------------------------+ |
| | DETECTION BY FRAUD TYPE | |
| | Card Not Present ====================.... 96% | |
| | Account Takeover =================....... 92% | |
| | Promo Abuse ==============.......... 85% | |
| | Return Fraud ===========............. 78% | |
| +-------------------------------------------------------------+ |
| |
| +--------------------------------+ +------------------------------+ |
| | SCORE DISTRIBUTION | | FEATURE IMPORTANCE | |
| | ...###====.. | | 1. velocity_1h: 0.15 | |
| | 0.0 0.5 1.0 | | 2. amount_zscore: 0.12 | |
| | Normal | Fraud | | 3. device_age: 0.10 | |
| | Peak: 0.1 | 0.85 | | 4. is_new_merchant: 0.08 | |
| +--------------------------------+ +------------------------------+ |
| |
+-------------------------------------------------------------------------+
Operations Dashboard¶
+-------------------------------------------------------------------------+
| OPERATIONS DASHBOARD |
+-------------------------------------------------------------------------+
| |
| +----------------+ +----------------+ +----------------+ |
| | REVIEW QUEUE | | AVG HANDLE | | SLA BREACH | |
| | SIZE | | TIME | | RATE | |
| | 124 | | 8.5 min | | 2.3% | |
| | ^ +15 | | v -1.2 min | | v -0.5% | |
| +----------------+ +----------------+ +----------------+ |
| |
| +-------------------------------------------------------------+ |
| | ANALYST PERFORMANCE | |
| | +----------------+--------+----------+----------+---------+ | |
| | | Analyst | Cases | Avg Time | Accuracy | FP Rate | | |
| | +----------------+--------+----------+----------+---------+ | |
| | | Alice | 45 | 7.2 min | 98.2% | 1.8% | | |
| | | Bob | 38 | 9.1 min | 95.5% | 4.5% | | |
| | | Charlie | 42 | 8.0 min | 97.1% | 2.9% | | |
| | +----------------+--------+----------+----------+---------+ | |
| +-------------------------------------------------------------+ |
| |
+-------------------------------------------------------------------------+
Заблуждение: AUC-ROC -- лучшая метрика для fraud detection
При дисбалансе 1:1000 AUC-ROC завышает качество модели, потому что true negative rate (TN/(TN+FP)) огромен при любом пороге. Модель с AUC-ROC 0.98 может иметь precision всего 5% при recall 90%. AUC-PR (Precision-Recall curve) -- корректная метрика: она показывает trade-off именно между precision и recall для minority class. Целевое значение AUC-PR > 0.7 для fraud detection.
Заблуждение: метрики модели = бизнес-результат
Улучшение recall с 94% до 96% может означать +$2M спасённых средств, но если при этом FPR вырос с 0.05% до 0.15%, то 50K лишних клиентов в день получат отказ, что стоит $500K/мес в потерянном LTV. Метрики модели нужно всегда переводить в бизнес-стоимость: cost(FN) = сумма транзакции, cost(FP) = доля клиентов, которые уйдут * LTV. Только тогда можно принять решение о пороге.
Заблуждение: достаточно мониторить только accuracy/precision/recall
В production нужно мониторить 4 уровня: (1) system -- латентность p50/p99, throughput, error rate; (2) model -- score distribution drift, feature drift (PSI), prediction drift; (3) operational -- review queue size, analyst handle time, SLA breach rate; (4) business -- fraud loss rate, false decline cost, customer churn from declines. Дрейф score distribution (вдруг 30% транзакций попадают в review вместо обычных 3%) -- ранний сигнал деградации модели.
Секция для интервью¶
Вопрос: "Какие метрики вы будете отслеживать?"
Слабый ответ: "Precision и recall."
Сильный ответ: "Четыре уровня метрик. Business: fraud loss rate (цель < 0.1%), false decline rate (< 0.5%), ROI системы. Model: precision (> 90%), recall (> 95%), AUC-PR (> 0.7), value-weighted recall (какой % фродовых долларов мы ловим). Operational: review queue size (< 200 кейсов), avg handle time (< 10 мин), SLA breach rate (< 5%). System: p99 latency (< 100 мс), availability (99.99%), error rate (< 0.1%). Критически важно: метрики модели без бизнес-контекста бессмысленны. Recall 96% при precision 30% = 70% алертов ложные, аналитики тонут в фальшивых кейсах."
Вопрос: "Как выбрать между precision и recall?"
Слабый ответ: "Зависит от бизнеса."
Сильный ответ: "Перевожу в доллары. Если средняя фродовая транзакция = $500, а потеря клиента от false decline = $2000 LTV, то cost(FN) = $500, cost(FP) = $2000 * 5% churn probability = $100. FN в 5 раз дороже FP, значит оптимизируем recall с ограничением на FPR < 0.1%. Конкретный порог: фиксирую recall = 95%, смотрю precision. Если precision < 80% -- добавляю третью зону (review) для пограничных скоров, чтобы не блокировать клиентов автоматически. Три зоны: auto-approve (75% транзакций), review (2-3%), auto-decline (0.1%)."