Files
bi-detective/generate_data.py
T

447 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Generate synthetic operations data for the BI Detective L5 game.
Single fact table (one row per operation) → aggregated views matching the shape
the BUS220 game.js expects (rides → ops count, avg_duration → success rate %).
Story:
- Headline efficiency 62% (Oct 2023) → 71% (Nov 2023). Volume 800 → 500 ops.
- Volume drop is seasonal (Nov 2022 had similar volume → YoY check defuses it).
- Efficiency rise is a MIX SHIFT: brigade pulled defensive units for refit, share
of "preparation" missions (training + logistics) grew from 15% → 52%; "combat"
missions (recon + fire support + defensive) shrank 85% → 48%.
- Per-category success rates are STABLE across all months; aggregate rises only
because composition shifted toward easier categories.
Output: game_data.js (JS file with `const GAME_DATA = {...}`) + raw_facts.csv
(per-op fact table, for inspection/audit).
Run: python3 generate_data.py
"""
from __future__ import annotations
import csv
import json
import random
from collections import defaultdict
from dataclasses import dataclass
from datetime import date, timedelta
from pathlib import Path
# Generated artifacts are written into the directory that holds this script.
OUT_DIR = Path(__file__).parent

# --- Domain ---
# Mission categories; the per-category tables below are keyed by these names.
CATEGORIES = [
    "training",
    "logistics",
    "recon",
    "fire_support",
    "defensive",
]

# Ukrainian display label for each category.
CATEGORY_LABEL_UK = {
    "training": "Тренування",
    "logistics": "Логістика",
    "recon": "Розвідка",
    "fire_support": "Вогневе ураження",
    "defensive": "Оборонні дії",
}

# Success probability per category. Identical for every month: the story
# relies on the rates staying put while only the category mix moves.
CATEGORY_BASE_SUCCESS = {
    "training": 0.91,
    "logistics": 0.79,
    "recon": 0.67,
    "fire_support": 0.60,
    "defensive": 0.50,
}

# Two-way segment split (the analog of the "Rider Types" tab).
SEGMENT_OF = {
    "training": "preparation",
    "logistics": "preparation",
    "recon": "combat",
    "fire_support": "combat",
    "defensive": "combat",
}

UNITS = [
    "1 БТГр",
    "2 БТГр",
    "3 БТГр",
    "Розвідрота",
    "Інж-сап. рота",
]

# Mission scales, smallest first; SCALE_ORDER is the 1-based rank of each.
SCALES = ["small", "medium", "large"]
SCALE_ORDER = {name: rank for rank, name in enumerate(SCALES, start=1)}
SCALE_LABEL_UK = {
    "small": "Мала",
    "medium": "Середня",
    "large": "Велика",
}

# Total ops per month, following a seasonal curve. Calibration points:
#   - Oct 2023 = 800 (prior month, "normal")
#   - Nov 2023 = 500 (focus month, sharp drop)
#   - Nov 2022 = 480 (similar seasonal level, so a YoY check defuses the
#     volume alarm)
MONTHLY_OPS = {
    "2022-11": 480,
    "2022-12": 460,
    "2023-01": 470,
    "2023-02": 500,
    "2023-03": 600,
    "2023-04": 700,
    "2023-05": 750,
    "2023-06": 820,
    "2023-07": 850,
    "2023-08": 830,
    "2023-09": 780,
    "2023-10": 800,
    "2023-11": 500,
}

# Category share of a month's total ops.
# OLD_MIX applies to every month except Nov 2023, which uses NEW_MIX.
OLD_MIX = {
    "training": 0.05,
    "logistics": 0.10,
    "recon": 0.28,
    "fire_support": 0.27,
    "defensive": 0.30,
}
# Nov 2023 only: defensive units pulled for refit, preparation share grew.
NEW_MIX = {
    "training": 0.16,
    "logistics": 0.36,
    "recon": 0.15,
    "fire_support": 0.16,
    "defensive": 0.17,
}
def category_share(month: str) -> dict[str, float]:
    """Return the category mix that applies to ``month`` ("YYYY-MM").

    Only "2023-11" uses NEW_MIX; every other month gets OLD_MIX. The shared
    module-level dict itself is returned, not a copy.
    """
    if month == "2023-11":
        return NEW_MIX
    return OLD_MIX
# Share of ops per mission scale. Sampled with the same weights in every
# month, so it does not drive the story (a deliberate red herring).
SCALE_SHARE = {
    "small": 0.55,
    "medium": 0.30,
    "large": 0.15,
}

# Relative sampling weight of each hour of the day (index = hour, 0-23).
# Bell-shaped, peaking around midday/afternoon.
HOURLY_WEIGHT = [
    # 00:00-05:00
    0.5, 0.3, 0.2, 0.2, 0.3, 0.5,
    # 06:00-11:00
    0.8, 1.5, 2.5, 3.5, 4.0, 4.0,
    # 12:00-17:00
    3.5, 3.5, 4.0, 4.5, 4.5, 4.0,
    # 18:00-23:00
    3.5, 2.5, 1.8, 1.2, 0.8, 0.5,
]
# --- Generation ---
@dataclass
class Op:
    """A single synthetic operation: one row of the fact table."""

    # Calendar placement.
    date: str          # ISO date string of the day the op took place
    month: str         # "YYYY-MM" key of the month the op belongs to
    weekday: str       # English weekday name
    weekday_num: int   # 0 = Sunday (matches Divvy convention)
    hour: int          # hour of day

    # Descriptive dimensions.
    unit: str          # unit that ran the op
    category: str      # mission category
    segment: str       # "preparation" or "combat"
    scale: str         # mission scale

    # Outcome.
    succeeded: bool
# English weekday names indexed by Sunday-based weekday number (0 = Sunday).
WEEKDAY_NAMES = [
    "Sunday",
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
]
def days_in_month(year: int, month: int) -> list[date]:
    """Return every calendar day of the given month, in ascending order."""
    first = date(year, month, 1)
    collected: list[date] = []
    # No month has more than 31 days, so offset 31 always lands in the next
    # month and the loop always exits through the break.
    for offset in range(32):
        candidate = first + timedelta(days=offset)
        if candidate.month != month:
            break
        collected.append(candidate)
    return collected
def weighted_choice(weights: dict[str, float]) -> str:
    """Pick one key of ``weights`` at random, proportionally to its value.

    Draws from the module-level ``random`` generator.
    """
    population = list(weights)
    (picked,) = random.choices(population, weights=list(weights.values()), k=1)
    return picked
def weighted_int(weights: list[float]) -> int:
    """Pick an index into ``weights`` at random, proportionally to its weight.

    Draws from the module-level ``random`` generator.
    """
    indices = range(len(weights))
    (picked,) = random.choices(indices, weights=weights, k=1)
    return picked
def exact_counts(total: int, shares: dict[str, float]) -> dict[str, int]:
    """Split ``total`` into whole-number counts proportional to ``shares``.

    Each key first gets the integer part of ``total * share``. Whatever is
    still missing from ``total`` is handed out one unit at a time, largest
    fractional remainder first; equal remainders are ordered by key,
    descending. The result keeps the key order of ``shares``.
    """
    ideal = {key: total * share for key, share in shares.items()}
    counts = {key: int(amount) for key, amount in ideal.items()}
    leftover = total - sum(counts.values())
    ranked = sorted(
        shares,
        key=lambda key: (ideal[key] - counts[key], key),
        reverse=True,
    )
    for step in range(leftover):
        counts[ranked[step % len(ranked)]] += 1
    return counts
def generate_ops(seed: int = 42) -> list[Op]:
    """Build the full synthetic fact table, one ``Op`` per operation.

    The puzzle-driving dimensions are exact, not sampled: each month gets the
    category counts returned by ``exact_counts`` and each (month, category)
    bucket gets ``round(count * base_rate)`` successes. The incidental
    dimensions (day within the month, hour, unit, scale) are drawn at random.

    Seeds the module-level ``random`` generator, so output is reproducible
    for a given ``seed`` (and the global RNG state is reset as a side effect).
    """
    random.seed(seed)
    all_ops: list[Op] = []
    for month_key, month_total in MONTHLY_OPS.items():
        calendar_days = days_in_month(int(month_key[:4]), int(month_key[5:7]))
        # Exact per-category counts: no sampling noise on the mix.
        per_category = exact_counts(month_total, category_share(month_key))
        for category, count in per_category.items():
            segment = SEGMENT_OF[category]
            # Exact success count for the bucket: no Bernoulli noise.
            success_quota = round(count * CATEGORY_BASE_SUCCESS[category])
            batch: list[Op] = []
            for index in range(count):
                # The order of the random draws (day, hour, unit, scale) is
                # part of the reproducible output; do not reorder them.
                day = random.choice(calendar_days)
                sunday_based = (day.weekday() + 1) % 7  # Monday=0 -> Sunday=0
                hour = weighted_int(HOURLY_WEIGHT)
                unit = random.choice(UNITS)
                scale = weighted_choice(SCALE_SHARE)
                batch.append(Op(
                    date=day.isoformat(),
                    month=month_key,
                    weekday=WEEKDAY_NAMES[sunday_based],
                    weekday_num=sunday_based,
                    hour=hour,
                    unit=unit,
                    category=category,
                    segment=segment,
                    scale=scale,
                    # The first `success_quota` ops of the bucket succeed ...
                    succeeded=index < success_quota,
                ))
            # ... and the shuffle then spreads them through the bucket.
            random.shuffle(batch)
            all_ops.extend(batch)
    return all_ops
# --- Aggregation: build the 14 GAME_DATA tables game.js expects ---
# Field names follow Divvy convention (rides = count, avg_duration = success rate %)
# even though the meaning is repurposed. See top-of-file note.
def round_pct(x: float) -> float:
    """Turn a fraction into a percentage rounded to one decimal place."""
    percent = x * 100
    return round(percent, 1)
def safe_pct(num: float, den: float) -> float:
    """Return ``num / den`` as a one-decimal percentage, or 0.0 if ``den`` is falsy."""
    if not den:
        return 0.0
    return round_pct(num / den)
def _success_pct(rows: list[Op]) -> float:
    """Return the success rate of ``rows`` in percent; 0.0 when ``rows`` is empty."""
    return safe_pct(sum(1 for o in rows if o.succeeded), len(rows))


def aggregate(ops: list[Op]) -> dict:
    """Roll the per-operation fact table up into the GAME_DATA tables.

    Field names follow the Divvy-style schema: ``rides`` holds an ops count
    and ``avg_duration`` a success rate in percent.

    Args:
        ops: Fact rows, one per operation. May be empty or lack the
            comparison months; the affected tables then carry zero counts
            and 0.0 rates instead of raising.

    Returns:
        A dict with these keys, in this order: monthly_totals, daily_totals,
        day_of_week, day_counts, hourly_totals, hourly_patterns, bike_type,
        duration_buckets, station_comparison, monthly_by_segment,
        daily_by_segment, null_station_rate, yoy_data, yoy_totals, scenario.
    """
    segments = ("preparation", "combat")
    compare_months = ("2023-10", "2023-11")
    out: dict = {}

    month_groups: dict[str, list[Op]] = defaultdict(list)
    date_groups: dict[str, list[Op]] = defaultdict(list)
    for o in ops:
        month_groups[o.month].append(o)
        date_groups[o.date].append(o)
    # Freeze into plain dicts. Looking up a month with no ops must not
    # insert an empty bucket, which would surface as a phantom row in
    # monthly_by_segment while being absent from monthly_totals.
    by_month = dict(month_groups)
    by_date = dict(date_groups)

    def month_rows(month):
        """Ops of ``month``; an empty list when the month has none."""
        return by_month.get(month, [])

    # monthly_totals: [{month, rides, avg_duration, median_duration}]
    out["monthly_totals"] = []
    for m, rows in sorted(by_month.items()):
        pct = _success_pct(rows)
        out["monthly_totals"].append({
            "month": m,
            "rides": len(rows),
            "avg_duration": pct,
            # Kept for schema compatibility; computed exactly like avg_duration.
            "median_duration": pct,
        })

    # daily_totals: [{date, rides, avg_duration}]
    out["daily_totals"] = [
        {"date": d, "rides": len(rows), "avg_duration": _success_pct(rows)}
        for d, rows in sorted(by_date.items())
    ]

    # day_of_week: [{month, weekday, weekday_num, rides, avg_duration}] for Oct + Nov 2023
    out["day_of_week"] = []
    for month in compare_months:
        by_wd: dict[tuple[str, int], list[Op]] = defaultdict(list)
        for o in month_rows(month):
            by_wd[(o.weekday, o.weekday_num)].append(o)
        for (wd, wd_num), rows in sorted(by_wd.items(), key=lambda kv: kv[0][1]):
            out["day_of_week"].append({
                "month": month, "weekday": wd, "weekday_num": wd_num,
                "rides": len(rows),
                "avg_duration": _success_pct(rows),
            })

    # day_counts: [{month, weekday, weekday_num, day_count}] — for calendar artifact.
    # Derived from the calendar alone, independent of ``ops``.
    out["day_counts"] = []
    for month in compare_months:
        year, mo = int(month[:4]), int(month[5:7])
        wd_count: dict[tuple[str, int], int] = defaultdict(int)
        for d in days_in_month(year, mo):
            wd_idx = (d.weekday() + 1) % 7  # Monday=0 -> Sunday=0
            wd_count[(WEEKDAY_NAMES[wd_idx], wd_idx)] += 1
        for (wd, wd_num), cnt in sorted(wd_count.items(), key=lambda kv: kv[0][1]):
            out["day_counts"].append({
                "month": month, "weekday": wd, "weekday_num": wd_num, "day_count": cnt,
            })

    # hourly_totals: [{month, hour, rides}] for Oct + Nov 2023
    out["hourly_totals"] = []
    for month in compare_months:
        for h in range(24):
            n = sum(1 for o in month_rows(month) if o.hour == h)
            out["hourly_totals"].append({"month": month, "hour": h, "rides": n})

    # hourly_patterns: the same hourly counts, split by segment
    out["hourly_patterns"] = []
    for month in compare_months:
        for seg in segments:
            for h in range(24):
                n = sum(1 for o in month_rows(month) if o.hour == h and o.segment == seg)
                out["hourly_patterns"].append({"month": month, "segment": seg, "hour": h, "rides": n})

    # bike_type → mission_category: per-category aggregate Oct + Nov 2023
    out["bike_type"] = []
    for month in compare_months:
        for cat in CATEGORIES:
            rows = [o for o in month_rows(month) if o.category == cat]
            out["bike_type"].append({
                "month": month, "bike_type": cat,  # field name kept for game.js compat
                "rides": len(rows),
                "avg_duration": _success_pct(rows),
            })

    # duration_buckets → mission_scale: per-scale counts Oct + Nov 2023
    out["duration_buckets"] = []
    for month in compare_months:
        for scale in SCALES:
            rows = [o for o in month_rows(month) if o.scale == scale]
            out["duration_buckets"].append({
                "month": month, "bucket": scale, "bucket_order": SCALE_ORDER[scale],
                "rides": len(rows),
            })

    # station_comparison → unit_comparison: per-unit Oct→Nov change,
    # largest October volume first
    out["station_comparison"] = []
    for unit in UNITS:
        oct_n = sum(1 for o in month_rows("2023-10") if o.unit == unit)
        nov_n = sum(1 for o in month_rows("2023-11") if o.unit == unit)
        change = round((nov_n - oct_n) / oct_n * 100, 1) if oct_n else 0.0
        out["station_comparison"].append({
            "station": unit,  # field name kept
            "oct_rides": oct_n, "nov_rides": nov_n, "change_pct": change,
        })
    out["station_comparison"].sort(key=lambda r: -r["oct_rides"])

    # monthly_by_segment: [{month, segment, rides, avg_duration}] — all months
    out["monthly_by_segment"] = []
    for m, rows in sorted(by_month.items()):
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["monthly_by_segment"].append({
                "month": m, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # daily_by_segment: [{date, segment, rides, avg_duration}]
    out["daily_by_segment"] = []
    for d, rows in sorted(by_date.items()):
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["daily_by_segment"].append({
                "date": d, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # null_station_rate: data-quality table, intentionally left empty.
    # (Original author's note: game.js references it but handles it being
    # empty gracefully.)
    out["null_station_rate"] = []

    # yoy_data + yoy_totals: Nov 2022 vs Nov 2023
    out["yoy_data"] = []
    out["yoy_totals"] = []
    for month in ("2022-11", "2023-11"):
        rows = month_rows(month)
        out["yoy_totals"].append({
            "month": month,
            "rides": len(rows),
            "avg_duration": _success_pct(rows),
        })
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["yoy_data"].append({
                "month": month, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # scenario: VP-message metadata
    oct_rows = month_rows("2023-10")
    nov_rows = month_rows("2023-11")
    oct_ops = len(oct_rows)
    nov_ops = len(nov_rows)
    oct_eff = _success_pct(oct_rows)
    nov_eff = _success_pct(nov_rows)
    out["scenario"] = {
        "prior_month": "2023-10",
        "prior_month_label": "Жовтень 2023",
        "focus_month": "2023-11",
        "focus_month_label": "Листопад 2023",
        "prior_rides": oct_ops,  # = ops count for prior month
        "focus_rides": nov_ops,  # = ops count for focus month
        # Same zero guard as station_comparison: no prior-month ops -> 0.0.
        "rides_change_pct": round((nov_ops - oct_ops) / oct_ops * 100, 1) if oct_ops else 0.0,
        "prior_avg_duration": oct_eff,
        "focus_avg_duration": nov_eff,
        "duration_change_pct": round(nov_eff - oct_eff, 1),  # difference of percentages, not relative
    }
    return out
# --- Output ---
def write_facts_csv(ops: list[Op], path: Path) -> None:
    """Write the per-operation fact table to ``path`` as a CSV file.

    The file has a header row followed by one row per op, in the order given.
    ``succeeded`` is written as 0/1; ``weekday_num`` is not exported.

    Args:
        ops: Operations to dump.
        path: Destination file; overwritten if it exists.
    """
    columns = ["date", "month", "weekday", "hour", "unit", "category", "segment", "scale", "succeeded"]
    # Explicit UTF-8: unit names are Cyrillic, and the platform default
    # encoding (e.g. cp1252 on Windows) cannot represent them.
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(columns)
        w.writerows(
            [o.date, o.month, o.weekday, o.hour, o.unit, o.category, o.segment, o.scale, int(o.succeeded)]
            for o in ops
        )
def write_game_data_js(data: dict, path: Path) -> None:
    """Write ``data`` to ``path`` as a JS file defining ``const GAME_DATA``.

    The file starts with a comment header explaining the field-name mapping,
    followed by the data as compact JSON on a single line. Non-ASCII text is
    written verbatim (not \\u-escaped), so the file is always UTF-8.

    Args:
        data: JSON-serializable tables.
        path: Destination file; overwritten if it exists.
    """
    header = (
        "// Auto-generated — do not edit manually. Run: python3 generate_data.py\n"
        "// Synthetic dataset for the L5 BI Detective game (KSE × ХАРТІЯ data-literacy course).\n"
        "// Field-name mapping (kept compatible with BUS220 game.js):\n"
        "// rides = number of operations (ops count)\n"
        "// avg_duration = success rate, percent (0-100)\n"
        "// bike_type = mission_category (training / logistics / recon / fire_support / defensive)\n"
        "// segment = preparation (training+logistics) | combat (recon+fire_support+defensive)\n"
        "// bucket = mission_scale (small/medium/large)\n"
        "// station = unit name\n"
    )
    payload = "const GAME_DATA = " + json.dumps(data, ensure_ascii=False, separators=(",", ":")) + ";\n"
    # Explicit UTF-8: both the header and the payload contain Cyrillic, which
    # the platform default encoding may not be able to represent.
    path.write_text(header + payload, encoding="utf-8")
def main():
    """Generate the dataset, write both output files, and print a sanity report."""
    facts_path = OUT_DIR / "raw_facts.csv"
    js_path = OUT_DIR / "game_data.js"

    print("Generating synthetic operations…")
    ops = generate_ops(seed=42)
    print(f" Total ops generated: {len(ops):,}")
    write_facts_csv(ops, facts_path)
    print(f" Wrote {facts_path}")

    print("Aggregating into game tables…")
    data = aggregate(ops)
    write_game_data_js(data, js_path)
    print(f" Wrote {js_path}")

    # Sanity check — print headline numbers
    scenario = data["scenario"]
    print()
    print("Headline check:")
    for prefix in ("prior", "focus"):
        label = scenario[f"{prefix}_month_label"]
        count = scenario[f"{prefix}_rides"]
        rate = scenario[f"{prefix}_avg_duration"]
        print(f" {label}: {count} ops, {rate}% efficiency")
    print(f" Volume change: {scenario['rides_change_pct']:+.1f}%")
    print(f" Efficiency change: {scenario['duration_change_pct']:+.1f} pp")

    # Index the per-category rows by (month, category); the first row wins.
    category_rows = data["bike_type"]
    row_for = {}
    for row in category_rows:
        row_for.setdefault((row["month"], row["bike_type"]), row)

    print()
    print("Per-category success rate (should be stable across Oct → Nov):")
    for cat in CATEGORIES:
        oct_row = row_for.get(("2023-10", cat))
        nov_row = row_for.get(("2023-11", cat))
        if not (oct_row and nov_row):
            continue
        print(f" {cat:14s}: Oct {oct_row['avg_duration']:5.1f}% (n={oct_row['rides']:3d}) → Nov {nov_row['avg_duration']:5.1f}% (n={nov_row['rides']:3d})")

    print()
    print("Composition (share of total ops):")
    oct_total = sum(row["rides"] for row in category_rows if row["month"] == "2023-10")
    nov_total = sum(row["rides"] for row in category_rows if row["month"] == "2023-11")
    for cat in CATEGORIES:
        oct_row = row_for.get(("2023-10", cat))
        nov_row = row_for.get(("2023-11", cat))
        if not (oct_row and nov_row):
            continue
        oct_share = oct_row["rides"] / oct_total * 100
        nov_share = nov_row["rides"] / nov_total * 100
        print(f" {cat:14s}: Oct {oct_share:5.1f}% → Nov {nov_share:5.1f}%")


if __name__ == "__main__":
    main()