Recommendation System Metrics¶
~3 min read
Prerequisites: Problem Definition | Ranking Metrics | A/B Testing
Recommendation system metrics form a 4-level hierarchy: business metrics (revenue, retention) decompose into product metrics (CTR, conversion), which are estimated through ML metrics (NDCG@10, Recall@K) and protected by guardrail metrics (latency p99, coverage). Amazon attributes 35% of its revenue to recommendations -- roughly $150B/year. Choosing the wrong primary metric is expensive: YouTube optimized for CTR and got a 40% rise in clickbait before switching to a multi-objective target (watch time + satisfaction). Interviews ask not only for the NDCG formulas but also how offline metrics connect to business outcomes.
Metrics Framework¶
Metric Categories¶
graph TD
BM["Business Metrics<br/>Revenue, GMV, Retention<br/>(North Star)"] --> PM["Product Metrics<br/>CTR, Conversion, Engagement<br/>(User Behavior)"]
PM --> ML["ML Model Metrics<br/>Precision, Recall, NDCG<br/>(Model Quality)"]
ML --> SM["System Metrics<br/>Latency, Throughput, Availability<br/>(Infrastructure)"]
style BM fill:#fce4ec,stroke:#c62828
style PM fill:#fff3e0,stroke:#ef6c00
style ML fill:#e8eaf6,stroke:#3f51b5
style SM fill:#e8f5e9,stroke:#4caf50
Offline Metrics (Model Evaluation)¶
Ranking Metrics¶
import numpy as np
from typing import Dict, List

def compute_ranking_metrics(y_true: List[int], y_pred: List[float], k: int = 10) -> Dict[str, float]:
    """
    Compute standard ranking metrics.

    Args:
        y_true: Binary relevance labels (0/1)
        y_pred: Predicted scores (higher = better)
        k: Cutoff for @K metrics
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Sort items by predicted score, descending
    sorted_indices = np.argsort(y_pred)[::-1]
    sorted_labels = y_true[sorted_indices]

    metrics = {}

    # Precision@K: what fraction of the top-K is relevant?
    metrics['precision_at_k'] = np.sum(sorted_labels[:k]) / k

    # Recall@K: what fraction of all relevant items made it into the top-K?
    total_relevant = np.sum(y_true)
    metrics['recall_at_k'] = np.sum(sorted_labels[:k]) / total_relevant if total_relevant > 0 else 0

    # Hit Rate@K: is there at least one relevant item in the top-K?
    metrics['hit_rate_at_k'] = 1 if np.sum(sorted_labels[:k]) > 0 else 0

    # NDCG@K: Normalized Discounted Cumulative Gain
    top_k = sorted_labels[:k]
    dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
    ideal_labels = np.sort(y_true)[::-1][:k]
    idcg = np.sum(ideal_labels / np.log2(np.arange(2, len(ideal_labels) + 2)))
    metrics['ndcg_at_k'] = dcg / idcg if idcg > 0 else 0

    # MRR: reciprocal rank of the first relevant item
    relevant_ranks = np.where(sorted_labels == 1)[0] + 1
    metrics['mrr'] = 1 / relevant_ranks[0] if len(relevant_ranks) > 0 else 0

    # MAP: mean of precision values taken at each relevant position
    precisions = []
    relevant_count = 0
    for i, label in enumerate(sorted_labels):
        if label == 1:
            relevant_count += 1
            precisions.append(relevant_count / (i + 1))
    metrics['map'] = np.mean(precisions) if precisions else 0

    return metrics
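A quick sanity check of compute_ranking_metrics on a toy ranking (the data is purely illustrative):

# Toy example: 6 candidates, 2 of them relevant
y_true = [0, 1, 0, 0, 1, 0]
y_pred = [0.9, 0.8, 0.7, 0.4, 0.3, 0.1]

metrics = compute_ranking_metrics(y_true, y_pred, k=3)
# precision_at_k = 1/3 (one relevant item in the top-3)
# recall_at_k    = 1/2 (one of the two relevant items retrieved)
# mrr            = 1/2 (first relevant item at rank 2)
print(metrics)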
Coverage & Diversity Metrics¶
def compute_coverage_metrics(recommendations: List[List[int]], catalog_size: int) -> Dict[str, float]:
    """
    How much of the catalog is being recommended?
    """
    all_recommended = set()
    for recs in recommendations:
        all_recommended.update(recs)

    metrics = {
        # Catalog coverage: % of items ever recommended
        'catalog_coverage': len(all_recommended) / catalog_size,
        # Gini coefficient: inequality in recommendation frequency
        # (a sketch of compute_gini is given below)
        'gini_coefficient': compute_gini(recommendations),
        # Long-tail coverage: % of long-tail items recommended
        'long_tail_coverage': compute_long_tail_coverage(recommendations),
    }
    return metrics
def compute_diversity_metrics(recommendations: List[List[int]], item_embeddings: Dict) -> Dict[str, float]:
    """
    How diverse are recommendations within a single list?
    """
    intra_list_diversities = []
    for recs in recommendations:
        if len(recs) < 2:
            continue
        # Average pairwise cosine distance between recommended items
        embeddings = [item_embeddings[item_id] for item_id in recs]
        distances = []
        for i in range(len(embeddings)):
            for j in range(i + 1, len(embeddings)):
                dist = cosine_distance(embeddings[i], embeddings[j])
                distances.append(dist)
        intra_list_diversities.append(np.mean(distances))

    return {
        'intra_list_diversity': np.mean(intra_list_diversities) if intra_list_diversities else 0,
        'category_diversity': compute_category_diversity(recommendations),
    }
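compute_gini, referenced in compute_coverage_metrics above but not defined on this page, could look roughly like this; the sketch assumes it receives the same list of per-user recommendation lists:

import numpy as np
from collections import Counter
from typing import List

def compute_gini(recommendations: List[List[int]]) -> float:
    """Gini coefficient of item exposure: 0 = perfectly even, 1 = all impressions on one item."""
    counts = Counter()
    for recs in recommendations:
        counts.update(recs)
    # Note: items that were never recommended are not included here;
    # add zero counts for the full catalog to penalize them as well.
    exposures = np.sort(np.array(list(counts.values()), dtype=float))
    n = len(exposures)
    if n == 0 or exposures.sum() == 0:
        return 0.0
    # Standard formula over ascending-sorted values: sum((2i - n - 1) * x_i) / (n * sum(x))
    index = np.arange(1, n + 1)
    return float(np.sum((2 * index - n - 1) * exposures) / (n * exposures.sum()))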
Novelty Metrics¶
def compute_novelty_metrics(
    recommendations: Dict[int, List[int]],   # user_id -> recommended item_ids
    user_history: Dict[int, List[int]],
    item_popularity: Dict[int, float]
) -> Dict[str, float]:
    """
    How novel/unexpected are the recommendations?
    """
    novelties = []
    for user_id, recs in recommendations.items():
        user_seen = set(user_history.get(user_id, []))
        for item_id in recs:
            if item_id not in user_seen:
                # Novelty: -log(popularity); unpopular items = higher novelty
                pop = item_popularity.get(item_id, 0.01)
                novelties.append(-np.log(pop))

    return {
        'mean_novelty': np.mean(novelties) if novelties else 0,
        'median_novelty': np.median(novelties) if novelties else 0,
    }
Online Metrics (Production)¶
User Engagement Metrics¶
import time
from collections import defaultdict
from typing import Dict, List

class OnlineMetricsCollector:
    """
    Collect and compute online metrics in real time.
    """
    def __init__(self):
        self.metrics = defaultdict(lambda: defaultdict(list))
        self.position_impressions = defaultdict(int)  # needed for per-position CTR

    def log_impression(self, user_id: str, item_ids: List[str], positions: List[int]):
        """Log when recommendations are shown."""
        self.metrics['impressions'][user_id].extend(item_ids)
        for pos in positions:
            self.position_impressions[pos] += 1

    def log_click(self, user_id: str, item_id: str, position: int):
        """Log when a user clicks a recommendation."""
        self.metrics['clicks'][user_id].append({
            'item_id': item_id,
            'position': position,
            'timestamp': time.time()
        })

    def log_conversion(self, user_id: str, item_id: str, value: float):
        """Log when a user converts (purchase, watch, etc.)."""
        self.metrics['conversions'][user_id].append({
            'item_id': item_id,
            'value': value,
            'timestamp': time.time()
        })

    def compute_ctr(self, time_window_hours: int = 24) -> float:
        """Click-through rate (time-window filtering is omitted in this simplified version)."""
        total_impressions = sum(len(v) for v in self.metrics['impressions'].values())
        total_clicks = sum(len(v) for v in self.metrics['clicks'].values())
        return total_clicks / total_impressions if total_impressions > 0 else 0

    def compute_conversion_rate(self) -> float:
        """Conversions per click."""
        total_clicks = sum(len(v) for v in self.metrics['clicks'].values())
        total_conversions = sum(len(v) for v in self.metrics['conversions'].values())
        return total_conversions / total_clicks if total_clicks > 0 else 0

    def compute_position_bias(self) -> Dict[int, float]:
        """CTR by display position, using the per-position impression counts logged above."""
        position_clicks = defaultdict(int)
        for user_clicks in self.metrics['clicks'].values():
            for click in user_clicks:
                position_clicks[click['position']] += 1
        return {
            pos: clicks / self.position_impressions[pos]
            for pos, clicks in position_clicks.items()
            if self.position_impressions.get(pos, 0) > 0
        }
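A minimal illustration of how the collector might be wired into serving code (the identifiers are made up):

collector = OnlineMetricsCollector()

# Serving path: log what was shown, then what was clicked and bought
collector.log_impression(user_id="u1", item_ids=["a", "b", "c"], positions=[1, 2, 3])
collector.log_click(user_id="u1", item_id="b", position=2)
collector.log_conversion(user_id="u1", item_id="b", value=29.99)

print(collector.compute_ctr())              # 1 click / 3 impressions ~= 0.33
print(collector.compute_conversion_rate())  # 1 conversion / 1 click = 1.0
print(collector.compute_position_bias())    # {2: 1.0}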
Revenue Metrics¶
def compute_revenue_metrics(conversions: List[Conversion], recommendations: List[Recommendation]) -> Dict[str, float]:
    """
    Revenue attribution to recommendations.

    Conversion is assumed to expose .value and .source; Recommendation is an impression record.
    """
    rec_revenue = 0
    total_revenue = 0
    rec_conversions = [c for c in conversions if c.source == 'recommendation']

    for conv in conversions:
        total_revenue += conv.value
        # Count revenue only for conversions attributed to recommendations
        if conv.source == 'recommendation':
            rec_revenue += conv.value

    return {
        'total_revenue': total_revenue,
        'recommendation_revenue': rec_revenue,
        'recommendation_revenue_share': rec_revenue / total_revenue if total_revenue > 0 else 0,
        'revenue_per_recommendation': rec_revenue / len(recommendations) if recommendations else 0,
        'average_order_value_rec': rec_revenue / len(rec_conversions) if rec_conversions else 0,
    }
A/B Testing Metrics¶
Experiment Setup¶
class ABTestConfig:
"""
A/B test configuration
"""
def __init__(
self,
experiment_name: str,
variants: Dict[str, float], # variant_name -> traffic %
primary_metric: str,
secondary_metrics: List[str],
guardrail_metrics: List[str],
min_sample_size: int,
min_detectable_effect: float,
):
self.experiment_name = experiment_name
self.variants = variants
self.primary_metric = primary_metric
self.secondary_metrics = secondary_metrics
self.guardrail_metrics = guardrail_metrics
self.min_sample_size = min_sample_size
self.min_detectable_effect = min_detectable_effect
# Example experiment
experiment = ABTestConfig(
experiment_name="deep_ranking_v2",
variants={
"control": 0.5, # Existing model
"treatment": 0.5, # New deep ranking model
},
primary_metric="revenue_per_user",
secondary_metrics=["ctr", "conversion_rate", "session_duration"],
guardrail_metrics=["latency_p99", "error_rate", "coverage"],
min_sample_size=100000,
min_detectable_effect=0.02, # 2% lift
)
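min_sample_size and min_detectable_effect in the config above are linked through a power analysis. A hedged sketch for a proportion metric, assuming a two-sided two-proportion z-test and an illustrative ~2% baseline rate (required_sample_size is not part of the original page):

from scipy import stats
import numpy as np

def required_sample_size(baseline_rate: float, mde: float, alpha: float = 0.05, power: float = 0.8) -> int:
    """Per-group sample size to detect a relative lift `mde` on a proportion metric."""
    p1 = baseline_rate
    p2 = baseline_rate * (1 + mde)            # hypothesized treatment rate
    z_alpha = stats.norm.ppf(1 - alpha / 2)   # two-sided test
    z_beta = stats.norm.ppf(power)
    variance = p1 * (1 - p1) + p2 * (1 - p2)
    n = (z_alpha + z_beta) ** 2 * variance / (p2 - p1) ** 2
    return int(np.ceil(n))

# Example: 2% relative MDE on an assumed ~2% baseline conversion rate
print(required_sample_size(baseline_rate=0.02, mde=0.02))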
Statistical Analysis¶
from scipy import stats
from typing import Dict
import numpy as np

def analyze_ab_test(
    control_values: np.ndarray,
    treatment_values: np.ndarray,
    metric_type: str = "mean",  # "mean" or "proportion"
    alpha: float = 0.05
) -> Dict:
    """
    Analyze A/B test results.

    Returns:
        - effect_size: relative lift
        - p_value: statistical significance
        - confidence_interval: 95% CI for the relative effect
        - is_significant: whether the lift is statistically significant
    """
    if metric_type == "mean":
        # T-test for continuous metrics (e.g. revenue per user)
        control_mean = np.mean(control_values)
        treatment_mean = np.mean(treatment_values)
        t_stat, p_value = stats.ttest_ind(treatment_values, control_values)
        effect_size = (treatment_mean - control_mean) / control_mean

        # Bootstrap confidence interval for the relative effect
        n_bootstrap = 10000
        bootstrap_effects = []
        for _ in range(n_bootstrap):
            c_sample = np.random.choice(control_values, size=len(control_values), replace=True)
            t_sample = np.random.choice(treatment_values, size=len(treatment_values), replace=True)
            bootstrap_effects.append((np.mean(t_sample) - np.mean(c_sample)) / np.mean(c_sample))
        ci_lower = np.percentile(bootstrap_effects, 2.5)
        ci_upper = np.percentile(bootstrap_effects, 97.5)

    elif metric_type == "proportion":
        # Z-test for proportions (CTR, conversion rate)
        control_successes = np.sum(control_values)
        treatment_successes = np.sum(treatment_values)
        control_n = len(control_values)
        treatment_n = len(treatment_values)

        control_rate = control_successes / control_n
        treatment_rate = treatment_successes / treatment_n
        effect_size = (treatment_rate - control_rate) / control_rate

        pooled_rate = (control_successes + treatment_successes) / (control_n + treatment_n)
        se = np.sqrt(pooled_rate * (1 - pooled_rate) * (1 / control_n + 1 / treatment_n))
        z_stat = (treatment_rate - control_rate) / se
        p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))

        # Normal-approximation CI for the relative effect (pooled SE reused for simplicity)
        ci_lower = effect_size - 1.96 * se / control_rate
        ci_upper = effect_size + 1.96 * se / control_rate

    else:
        raise ValueError(f"Unknown metric_type: {metric_type}")

    return {
        'effect_size': effect_size,
        'p_value': p_value,
        'confidence_interval': (ci_lower, ci_upper),
        'is_significant': p_value < alpha,
        'sample_size_control': len(control_values),
        'sample_size_treatment': len(treatment_values),
    }
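A quick check of analyze_ab_test on simulated click data (the rates and sample sizes are made up for illustration):

rng = np.random.default_rng(42)

# Simulate per-user click indicators: control CTR 4.0%, treatment CTR 4.3%
control = rng.binomial(1, 0.040, size=200_000)
treatment = rng.binomial(1, 0.043, size=200_000)

result = analyze_ab_test(control, treatment, metric_type="proportion")
print(result['effect_size'], result['p_value'], result['is_significant'])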
System Metrics¶
Latency Monitoring¶
from prometheus_client import Histogram, Counter, Gauge
# Latency by stage
STAGE_LATENCY = Histogram(
'recommendation_stage_latency_ms',
'Latency by pipeline stage',
['stage'],
buckets=[1, 5, 10, 25, 50, 100, 250, 500, 1000]
)
# End-to-end latency
E2E_LATENCY = Histogram(
'recommendation_e2e_latency_ms',
'End-to-end latency',
buckets=[10, 25, 50, 100, 200, 500, 1000]
)
# Error rates
ERROR_COUNTER = Counter(
'recommendation_errors_total',
'Total errors',
['error_type']
)
# Cache metrics
CACHE_HIT_RATE = Gauge(
'recommendation_cache_hit_rate',
'Cache hit rate',
['cache_type']
)
# Model inference
MODEL_INFERENCE_LATENCY = Histogram(
'model_inference_latency_ms',
'Model inference latency',
['model_name', 'batch_size_bucket']
)
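A sketch of how these collectors could wrap the serving path; retrieve_candidates and rank_candidates are hypothetical stand-ins for the real pipeline stages:

import time

def serve_recommendations(user_id: str):
    start = time.time()
    try:
        t0 = time.time()
        candidates = retrieve_candidates(user_id)         # hypothetical retrieval stage
        STAGE_LATENCY.labels(stage='retrieval').observe((time.time() - t0) * 1000)

        t1 = time.time()
        ranked = rank_candidates(user_id, candidates)      # hypothetical ranking stage
        STAGE_LATENCY.labels(stage='ranking').observe((time.time() - t1) * 1000)
        return ranked
    except Exception:
        ERROR_COUNTER.labels(error_type='serving_error').inc()
        raise
    finally:
        E2E_LATENCY.observe((time.time() - start) * 1000)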
SLA Definitions¶
| Metric | Target | Alert Threshold | Critical Threshold |
|---|---|---|---|
| Latency (p50) | < 35ms | > 50ms | > 75ms |
| Latency (p99) | < 100ms | > 150ms | > 250ms |
| Error Rate | < 0.1% | > 0.5% | > 1% |
| Availability | > 99.9% | < 99.5% | < 99% |
| Cache Hit Rate | > 90% | < 80% | < 70% |
| Throughput | > 100K RPS | N/A | < 50K RPS |
Metric Dashboards¶
Executive Dashboard¶
| KPI | Value | Trend |
|---|---|---|
| Revenue from Recs | $2.5M | +8% |
| CTR | 4.2% | +0.3% |
| Conversion Rate | 2.1% | +0.1% |
ML Team Dashboard¶
| Offline Metric (Test Set) | Value | Change |
|---|---|---|
| NDCG@10 | 0.342 | +2.1% |
| Precision@10 | 0.125 | +1.5% |
| Recall@10 | 0.089 | +0.8% |
| MRR | 0.456 | +1.2% |
| MAP | 0.234 | +0.9% |

| Online Metric (7 days) | Value | Change |
|---|---|---|
| CTR | 4.2% | +0.3% |
| Conversion | 2.1% | +0.1% |
| Revenue per User | $12.50 | +$0.80 |
| Diversity | 0.65 | +0.02 |
| Coverage | 42% | -2% |
Misconception: High offline NDCG = success in production
NDCG@10 can improve on the test set while production CTR drops. Reasons: (1) the offline test set does not reflect the real distribution (selection bias), (2) offline metrics ignore position bias, (3) users adapt their behavior to the new recommendations. Netflix reported a correlation of only ~0.4 between offline NDCG and online CTR. The only reliable evaluation is an A/B test.
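A standard way to correct the selection bias described above is Inverse Propensity Scoring (IPS), also mentioned in the interview answers below: logged feedback is reweighted by the probability that the production policy showed each item. A minimal sketch of a self-normalized IPS estimator, assuming per-impression propensities are logged (the field names are illustrative):

import numpy as np
from typing import Dict, List

def ips_value_estimate(logs: List[Dict]) -> float:
    """
    Self-normalized IPS estimate of the reward (e.g. click rate) a candidate policy
    would collect, computed from data logged under the production policy.

    Each log entry: {'reward': 0/1,
                     'logging_prob': P(production policy shows the item),
                     'new_prob': P(candidate policy shows the item)}
    """
    weights, rewards = [], []
    for entry in logs:
        # Importance weight, with the logging propensity clipped to limit variance
        w = entry['new_prob'] / max(entry['logging_prob'], 1e-3)
        weights.append(w)
        rewards.append(entry['reward'])
    weights = np.asarray(weights, dtype=float)
    rewards = np.asarray(rewards, dtype=float)
    return float(np.sum(weights * rewards) / np.sum(weights)) if len(weights) else 0.0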
Misconception: Running an A/B test for 1-2 days is enough
Detecting a 2% revenue lift at 100K DAU takes at least 7-14 days and 100K+ users per group. The novelty effect (users click on anything new) inflates metrics during the first 3-5 days. Day of week strongly shapes behavior (Friday purchases vs Monday). The minimum is two full weekly cycles. Sequential testing allows stopping earlier when the signal is strong.
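Sequential testing shortens experiments by monitoring results as data arrives; a complementary lever is CUPED, which cuts metric variance using a pre-experiment covariate (it comes up again in the interview answers below). A minimal sketch on simulated data, with all numbers and names purely illustrative:

import numpy as np

rng = np.random.default_rng(0)

# Simulated pre-experiment covariate (x) and in-experiment metric (y), correlated
control_x = rng.gamma(2.0, 5.0, size=50_000)
control_y = control_x * 0.8 + rng.normal(0, 5, size=50_000)
treatment_x = rng.gamma(2.0, 5.0, size=50_000)
treatment_y = treatment_x * 0.8 + rng.normal(0, 5, size=50_000) + 0.2  # small true lift

def cuped_adjust(y: np.ndarray, x: np.ndarray, theta: float) -> np.ndarray:
    """CUPED-adjusted metric: Y - theta * (X - mean(X))."""
    return y - theta * (x - x.mean())

# theta is estimated once on pooled data, then applied to both arms
y_all = np.concatenate([control_y, treatment_y])
x_all = np.concatenate([control_x, treatment_x])
theta = np.cov(y_all, x_all)[0, 1] / np.var(x_all)

control_adj = cuped_adjust(control_y, control_x, theta)
treatment_adj = cuped_adjust(treatment_y, treatment_x, theta)

# The same t-test as before runs on the adjusted, lower-variance data
print(np.var(control_y), np.var(control_adj))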
Misconception: 100% coverage is good
100% coverage means the system recommends EVERYTHING, including low-quality items. Optimal coverage is 30-60% of the catalog. Amazon recommends roughly 40% of its catalog; the rest is a long tail with fewer than 10 sales per year. The Gini coefficient matters more: how evenly impressions are spread across items. Gini > 0.8 means the top 1% of items receive 80% of impressions (popularity bias).
Interview¶
How do you evaluate a recommendation model?¶
"Compute accuracy on the test set."
"Two-stage evaluation. Offline: NDCG@10 (ranking quality), Recall@K (coverage of relevant items), MRR (position of the first relevant item), Coverage (catalog), Diversity (intra-list distance). Online: an A/B test with a primary metric (revenue per user), secondary metrics (CTR, conversion, session duration), and guardrails (latency p99 < 100ms, error rate < 0.1%, coverage > 30%). Offline-online correlation is ~0.4, so the A/B test is the only reliable evaluation."
How do you set up an A/B test for recommendations?¶
"Split users 50/50 for 3 days."
"Power analysis: detecting a 2% MDE at alpha=0.05 and power=0.8 requires 100K+ users per group. User-level randomization (not request-level, so each user sees a single variant). At least 2 weeks, to wash out the novelty effect and cover weekday/weekend cycles. Primary metric: revenue_per_user. Guardrails: latency_p99, error_rate, coverage. CUPED for variance reduction and sequential testing for early stopping. Multiple-comparison correction when there are several variants."
What do you do if offline metrics improve but online metrics drop?¶
"It means the model works; we just need more data."
"Three main causes of offline-online divergence: (1) Selection bias -- the offline test set does not reflect the production distribution; unbiased evaluation via Inverse Propensity Scoring is needed; (2) Position bias -- offline NDCG ignores the fact that position 1 gets ~10x more clicks; (3) Feedback loops -- the model optimizes past patterns while users adapt. Solutions: counterfactual evaluation, interleaving tests (show both models to the same user), bandits for online learning."