# bi-detective/generate_data.py

"""
Generate synthetic operations data for the BI Detective L5 game.

Single fact table (one row per operation) → aggregated views matching the shape
the BUS220 game.js expects (rides → ops count, avg_duration → success rate %).

Story:
- Headline efficiency 62% (Oct 2023) → 71% (Nov 2023). Volume 800 → 500 ops.
- Volume drop is seasonal (Nov 2022 had similar volume → YoY check defuses it).
- Efficiency rise is a MIX SHIFT: brigade pulled defensive units for refit, share
  of "preparation" missions (training + logistics) grew from 15% → 52%; "combat"
  missions (recon + fire support + defensive) shrank 85% → 48%.
- Per-category success rates are STABLE across all months; aggregate rises only
  because composition shifted toward easier categories.

Output: game_data.js (JS file with `const GAME_DATA = {...}`) + raw_facts.csv
(per-op fact table, for inspection/audit).

Run: python3 generate_data.py
"""

from __future__ import annotations

import csv
import json
import random
from collections import defaultdict
from dataclasses import dataclass
from datetime import date, timedelta
from pathlib import Path

OUT_DIR = Path(__file__).parent

# --- Domain ---

# Mission categories. The list order fixes the row order of the per-category
# table built in aggregate() and of the sanity printout in main().
CATEGORIES = ["training", "logistics", "recon", "fire_support", "defensive"]
# Ukrainian display label per category (not referenced elsewhere in the
# visible part of this file).
CATEGORY_LABEL_UK = {
    "training": "Тренування",
    "logistics": "Логістика",
    "recon": "Розвідка",
    "fire_support": "Вогневе ураження",
    "defensive": "Оборонні дії",
}
# Stable across all months (the whole point — only composition shifts).
# generate_ops() converts each rate into an exact success count per
# (month, category) via round(n_cat * rate), so there is no sampling noise.
CATEGORY_BASE_SUCCESS = {
    "training": 0.91,
    "logistics": 0.79,
    "recon": 0.67,
    "fire_support": 0.60,
    "defensive": 0.50,
}

# 2-segment partition for the "Rider Types" tab analog:
# every category belongs to exactly one of "preparation" / "combat".
SEGMENT_OF = {
    "training": "preparation",
    "logistics": "preparation",
    "recon": "combat",
    "fire_support": "combat",
    "defensive": "combat",
}

# Unit names; generate_ops() assigns one to each op with random.choice,
# i.e. uniformly and independently of category.
UNITS = ["1 БТГр", "2 БТГр", "3 БТГр", "Розвідрота", "Інж-сап. рота"]
# Mission scales, their sort order (emitted as "bucket_order" by aggregate())
# and Ukrainian display labels.
SCALES = ["small", "medium", "large"]
SCALE_ORDER = {"small": 1, "medium": 2, "large": 3}
SCALE_LABEL_UK = {"small": "Мала", "medium": "Середня", "large": "Велика"}

# Per-month totals (ops count) — seasonal pattern, calibrated so:
#   - Oct 2023 = 800 (prior month, "normal")
#   - Nov 2023 = 500 (focus month, sharp drop)
#   - Nov 2022 = 480 (similar seasonal pattern → YoY check defuses volume alarm)
# Keys are "YYYY-MM" strings; generate_ops() slices year and month out of them
# and walks the dict in insertion order, so reordering entries changes the
# seeded random sequence and therefore the generated rows.
MONTHLY_OPS = {
    "2022-11": 480,
    "2022-12": 460,
    "2023-01": 470,
    "2023-02": 500,
    "2023-03": 600,
    "2023-04": 700,
    "2023-05": 750,
    "2023-06": 820,
    "2023-07": 850,
    "2023-08": 830,
    "2023-09": 780,
    "2023-10": 800,
    "2023-11": 500,
}

# Category share of total ops per month.
# All months use the "old" mix EXCEPT Nov 2023 — that's the one with the new pattern.
# Each mix sums to 1.00. OLD_MIX: preparation 0.15 / combat 0.85.
OLD_MIX = {
    "training": 0.05, "logistics": 0.10,
    "recon": 0.28, "fire_support": 0.27, "defensive": 0.30,
}
# NEW_MIX: preparation 0.52 / combat 0.48.
NEW_MIX = {  # Nov 2023 only — defensive units pulled for refit, prep grew
    "training": 0.16, "logistics": 0.36,
    "recon": 0.15, "fire_support": 0.16, "defensive": 0.17,
}

def category_share(month: str) -> dict[str, float]:
    """Return the category mix for *month*: NEW_MIX for "2023-11", OLD_MIX otherwise."""
    if month == "2023-11":
        return NEW_MIX
    return OLD_MIX

# Mission scale distribution (stable, doesn't drive the story — red herring).
# Sampled per op via weighted_choice(); the shares sum to 1.0.
SCALE_SHARE = {"small": 0.55, "medium": 0.30, "large": 0.15}

# Hourly distribution: bell-shaped around midday/afternoon (typical activity window)
# Used as weights for sampling: 24 relative weights, list index = hour of day
# (weighted_int() returns the drawn index).
HOURLY_WEIGHT = [
    0.5, 0.3, 0.2, 0.2, 0.3, 0.5,  # 0-5
    0.8, 1.5, 2.5, 3.5, 4.0, 4.0,  # 6-11
    3.5, 3.5, 4.0, 4.5, 4.5, 4.0,  # 12-17
    3.5, 2.5, 1.8, 1.2, 0.8, 0.5,  # 18-23
]

# --- Generation ---

@dataclass
class Op:
    """One row of the synthetic fact table: a single operation and its outcome."""

    date: str  # ISO date string (generate_ops fills it from date.isoformat())
    month: str  # month key in "YYYY-MM" form, same keys as MONTHLY_OPS
    weekday: str  # English weekday name taken from WEEKDAY_NAMES
    weekday_num: int  # 0 = Sunday (matches Divvy convention)
    hour: int  # hour of day; an index into HOURLY_WEIGHT (0-23)
    unit: str  # one of UNITS
    category: str  # one of CATEGORIES
    segment: str  # SEGMENT_OF[category]: "preparation" or "combat"
    scale: str  # one of the SCALE_SHARE keys: "small" / "medium" / "large"
    succeeded: bool  # outcome flag; write_facts_csv emits it as 0/1

# Index = weekday_num, Sunday first. Callers map date.weekday() (Monday = 0)
# onto this list with (d.weekday() + 1) % 7.
WEEKDAY_NAMES = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

def days_in_month(year: int, month: int) -> list[date]:
    """Return every calendar date of the given month, in ascending order."""
    collected = [date(year, month, 1)]
    while True:
        following = collected[-1] + timedelta(days=1)
        if following.month != month:
            # Stepped into the next month: the list is complete.
            return collected
        collected.append(following)

def weighted_choice(weights: dict[str, float]) -> str:
    """Draw one key at random, with probability proportional to its weight."""
    population = list(weights)
    (picked,) = random.choices(
        population,
        weights=[weights[key] for key in population],
    )
    return picked

def weighted_int(weights: list[float]) -> int:
    """Draw an index into *weights* at random, proportional to the weight stored there."""
    indices = range(len(weights))
    return random.choices(indices, weights=weights)[0]

def exact_counts(total: int, shares: dict[str, float]) -> dict[str, int]:
    """Split *total* into whole-number counts per key, proportional to *shares*.

    Each key first receives the integer part of ``total * share``; the units
    still missing from *total* are then handed out one at a time, largest
    fractional remainder first (ties go to the key that sorts last).
    """
    counts: dict[str, int] = {}
    remainders = []
    for key, share in shares.items():
        ideal = total * share
        whole = int(ideal)
        counts[key] = whole
        remainders.append((ideal - whole, key))
    remainders.sort(reverse=True)

    leftover = total - sum(counts.values())
    for step in range(leftover):
        # Wrap around if there are more leftover units than keys.
        _, key = remainders[step % len(remainders)]
        counts[key] += 1
    return counts

def generate_ops(seed: int = 42) -> list[Op]:
    """
    Build the full fact table, one Op per operation, for every month in MONTHLY_OPS.

    The puzzle-driving dimensions are exact: category counts come from
    exact_counts() and the number of successes per (month, category) is
    round(count * base rate). The incidental dimensions (day within month,
    hour, unit, scale) are drawn from the module-level random generator,
    which is seeded with *seed* so repeated runs yield the same rows.
    """
    random.seed(seed)
    all_ops: list[Op] = []

    for month_key, month_total in MONTHLY_OPS.items():
        calendar_days = days_in_month(int(month_key[:4]), int(month_key[5:7]))

        # Exact per-category volumes for the month (no sampling noise on the mix).
        per_category = exact_counts(month_total, category_share(month_key))

        for category, size in per_category.items():
            segment = SEGMENT_OF[category]
            # Exact number of successes for this (month, category) — no Bernoulli noise.
            wins = round(size * CATEGORY_BASE_SUCCESS[category])

            batch: list[Op] = []
            for position in range(size):
                # Draw order (day, hour, unit, scale) is part of the seeded
                # sequence; changing it changes every generated row.
                day = random.choice(calendar_days)
                hour = weighted_int(HOURLY_WEIGHT)
                unit = random.choice(UNITS)
                scale = weighted_choice(SCALE_SHARE)

                weekday_idx = (day.weekday() + 1) % 7  # Monday=0 → Sunday=0
                batch.append(Op(
                    date=day.isoformat(),
                    month=month_key,
                    weekday=WEEKDAY_NAMES[weekday_idx],
                    weekday_num=weekday_idx,
                    hour=hour,
                    unit=unit,
                    category=category,
                    segment=segment,
                    scale=scale,
                    succeeded=position < wins,  # the first `wins` rows succeed
                ))

            # Successes were assigned to the front of the batch; mix them in.
            random.shuffle(batch)
            all_ops += batch

    return all_ops

# --- Aggregation: build the 14 GAME_DATA tables game.js expects ---
# Field names follow Divvy convention (rides = count, avg_duration = success rate %)
# even though the meaning is repurposed. See top-of-file note.

def round_pct(x: float) -> float:
    """Convert a fraction to a percentage rounded to one decimal place."""
    percent = x * 100
    return round(percent, 1)

def safe_pct(num: float, den: float) -> float:
    """Return num/den as a one-decimal percentage, or 0.0 when *den* is zero."""
    if not den:
        return 0.0
    return round_pct(num / den)

def _success_pct(rows: list[Op]) -> float:
    """Return the percentage of *rows* flagged as succeeded (0.0 for an empty list)."""
    return safe_pct(sum(1 for o in rows if o.succeeded), len(rows))


def aggregate(ops: list[Op]) -> dict:
    """
    Roll the per-op fact table up into the tables written to GAME_DATA.

    Field names follow the Divvy convention kept for game.js: "rides" is an
    ops count and "avg_duration" is a success rate in percent. Groups with no
    ops report a rate of 0.0.

    Returns a dict with the keys monthly_totals, daily_totals, day_of_week,
    day_counts, hourly_totals, hourly_patterns, bike_type, duration_buckets,
    station_comparison, monthly_by_segment, daily_by_segment,
    null_station_rate, yoy_data, yoy_totals and scenario. The insertion order
    of keys and rows is the order they appear in the emitted JSON.

    Months that are absent from *ops* are treated as empty: they produce zero
    counts in the Oct/Nov and YoY comparison tables, are left out of the
    all-months tables, and yield a change of 0.0 instead of a division by zero.
    """
    prior_month, focus_month = "2023-10", "2023-11"
    compare_months = [prior_month, focus_month]
    segments = ["preparation", "combat"]

    month_groups: dict[str, list[Op]] = defaultdict(list)
    date_groups: dict[str, list[Op]] = defaultdict(list)
    for o in ops:
        month_groups[o.month].append(o)
        date_groups[o.date].append(o)
    # Freeze the groupings into plain dicts. Looking up a missing month on a
    # defaultdict inserts an empty entry, which would later surface as phantom
    # zero rows in monthly_by_segment that monthly_totals does not contain.
    by_month = dict(month_groups)
    by_date = dict(date_groups)

    def month_rows(month):
        # Ops of one month; an empty list when the month has no ops at all.
        return by_month.get(month, [])

    out: dict = {}

    # monthly_totals: [{month, rides, avg_duration, median_duration}]
    out["monthly_totals"] = []
    for m, rows in sorted(by_month.items()):
        rate = _success_pct(rows)
        out["monthly_totals"].append({
            "month": m,
            "rides": len(rows),
            "avg_duration": rate,
            "median_duration": rate,  # same as avg for binomial
        })

    # daily_totals: [{date, rides, avg_duration}]
    out["daily_totals"] = [
        {
            "date": d,
            "rides": len(rows),
            "avg_duration": _success_pct(rows),
        }
        for d, rows in sorted(by_date.items())
    ]

    # day_of_week: [{month, weekday, weekday_num, rides, avg_duration}] for Oct + Nov 2023
    out["day_of_week"] = []
    for month in compare_months:
        by_wd: dict[tuple[str, int], list[Op]] = defaultdict(list)
        for o in month_rows(month):
            by_wd[(o.weekday, o.weekday_num)].append(o)
        for (wd, wd_num), rows in sorted(by_wd.items(), key=lambda kv: kv[0][1]):
            out["day_of_week"].append({
                "month": month, "weekday": wd, "weekday_num": wd_num,
                "rides": len(rows),
                "avg_duration": _success_pct(rows),
            })

    # day_counts: [{month, weekday, weekday_num, day_count}] — for calendar artifact
    # Counts calendar days per weekday, independent of how many ops fell on them.
    out["day_counts"] = []
    for month in compare_months:
        year, mo = int(month[:4]), int(month[5:7])
        wd_count: dict[tuple[str, int], int] = defaultdict(int)
        for d in days_in_month(year, mo):
            wd_idx = (d.weekday() + 1) % 7  # Monday=0 → Sunday=0
            wd_count[(WEEKDAY_NAMES[wd_idx], wd_idx)] += 1
        for (wd, wd_num), cnt in sorted(wd_count.items(), key=lambda kv: kv[0][1]):
            out["day_counts"].append({
                "month": month, "weekday": wd, "weekday_num": wd_num, "day_count": cnt,
            })

    # hourly_totals: [{month, hour, rides}] for Oct + Nov 2023
    out["hourly_totals"] = []
    for month in compare_months:
        for h in range(24):
            n = sum(1 for o in month_rows(month) if o.hour == h)
            out["hourly_totals"].append({"month": month, "hour": h, "rides": n})

    # hourly_patterns: per-segment hourly (used by game.js but we keep simpler)
    out["hourly_patterns"] = []
    for month in compare_months:
        for seg in segments:
            for h in range(24):
                n = sum(1 for o in month_rows(month) if o.hour == h and o.segment == seg)
                out["hourly_patterns"].append({"month": month, "segment": seg, "hour": h, "rides": n})

    # bike_type → mission_category: per-category aggregate Oct + Nov 2023
    out["bike_type"] = []
    for month in compare_months:
        for cat in CATEGORIES:
            rows = [o for o in month_rows(month) if o.category == cat]
            out["bike_type"].append({
                "month": month, "bike_type": cat,  # field name kept for game.js compat
                "rides": len(rows),
                "avg_duration": _success_pct(rows),
            })

    # duration_buckets → mission_scale: per-scale aggregate Oct + Nov 2023
    out["duration_buckets"] = []
    for month in compare_months:
        for scale in SCALES:
            rows = [o for o in month_rows(month) if o.scale == scale]
            out["duration_buckets"].append({
                "month": month, "bucket": scale, "bucket_order": SCALE_ORDER[scale],
                "rides": len(rows),
            })

    # station_comparison → unit_comparison: top units, Oct→Nov change
    out["station_comparison"] = []
    for unit in UNITS:
        oct_n = sum(1 for o in month_rows(prior_month) if o.unit == unit)
        nov_n = sum(1 for o in month_rows(focus_month) if o.unit == unit)
        change = round((nov_n - oct_n) / oct_n * 100, 1) if oct_n else 0.0
        out["station_comparison"].append({
            "station": unit,  # field name kept
            "oct_rides": oct_n, "nov_rides": nov_n, "change_pct": change,
        })
    # Busiest prior-month unit first; the sort is stable, so ties keep UNITS order.
    out["station_comparison"].sort(key=lambda r: -r["oct_rides"])

    # monthly_by_segment: [{month, segment, rides, avg_duration}] — all months
    out["monthly_by_segment"] = []
    for m, rows in sorted(by_month.items()):
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["monthly_by_segment"].append({
                "month": m, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # daily_by_segment: [{date, segment, rides, avg_duration}]
    out["daily_by_segment"] = []
    for d, rows in sorted(by_date.items()):
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["daily_by_segment"].append({
                "date": d, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # null_station_rate: data-quality table (game.js doesn't actively use, but referenced)
    out["null_station_rate"] = []  # leave empty; game.js handles missing gracefully

    # yoy_data + yoy_totals: Nov 2022 vs Nov 2023
    out["yoy_data"] = []
    out["yoy_totals"] = []
    for month in ["2022-11", focus_month]:
        rows = month_rows(month)
        out["yoy_totals"].append({
            "month": month,
            "rides": len(rows),
            "avg_duration": _success_pct(rows),
        })
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["yoy_data"].append({
                "month": month, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # scenario: VP-message metadata
    oct_rows = month_rows(prior_month)
    nov_rows = month_rows(focus_month)
    oct_ops = len(oct_rows)
    nov_ops = len(nov_rows)
    oct_eff = _success_pct(oct_rows)
    nov_eff = _success_pct(nov_rows)
    out["scenario"] = {
        "prior_month": prior_month,
        "prior_month_label": "Жовтень 2023",
        "focus_month": focus_month,
        "focus_month_label": "Листопад 2023",
        "prior_rides": oct_ops,        # = ops count for prior month
        "focus_rides": nov_ops,        # = ops count for focus month
        # Same zero guard as the per-unit change in station_comparison.
        "rides_change_pct": round((nov_ops - oct_ops) / oct_ops * 100, 1) if oct_ops else 0.0,
        "prior_avg_duration": oct_eff,
        "focus_avg_duration": nov_eff,
        "duration_change_pct": round(nov_eff - oct_eff, 1),  # difference of percentages, not relative
    }

    return out

# --- Output ---

def write_facts_csv(ops: list[Op], path: Path):
    """
    Write the per-op fact table to *path* as CSV, one row per op after a header row.

    Columns: date, month, weekday, hour, unit, category, segment, scale,
    succeeded (written as 0/1). weekday_num is not exported. An existing file
    at *path* is overwritten.
    """
    # Pin UTF-8: unit names are Cyrillic, and the platform default encoding
    # (used when none is given) may be unable to encode them or may produce a
    # file that differs from machine to machine.
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["date", "month", "weekday", "hour", "unit", "category", "segment", "scale", "succeeded"])
        w.writerows(
            [o.date, o.month, o.weekday, o.hour, o.unit, o.category, o.segment, o.scale, int(o.succeeded)]
            for o in ops
        )

def write_game_data_js(data: dict, path: Path):
    """
    Write *data* to *path* as a JavaScript file declaring ``const GAME_DATA``.

    The file starts with a comment header describing the field-name mapping,
    followed by the data serialized as compact JSON with non-ASCII characters
    kept as-is. An existing file at *path* is overwritten.
    """
    header = (
        "// Auto-generated — do not edit manually. Run: python3 generate_data.py\n"
        "// Synthetic dataset for the L5 BI Detective game (KSE × ХАРТІЯ data-literacy course).\n"
        "// Field-name mapping (kept compatible with BUS220 game.js):\n"
        "//   rides         = number of operations (ops count)\n"
        "//   avg_duration  = success rate, percent (0–100)\n"
        "//   bike_type     = mission_category (training / logistics / recon / fire_support / defensive)\n"
        "//   segment       = preparation (training+logistics) | combat (recon+fire_support+defensive)\n"
        "//   bucket        = mission_scale (small/medium/large)\n"
        "//   station       = unit name\n"
    )
    payload = "const GAME_DATA = " + json.dumps(data, ensure_ascii=False, separators=(",", ":")) + ";\n"
    # Pin UTF-8: both the header and the payload (ensure_ascii=False) contain
    # non-ASCII text, which the platform default encoding may not be able to write.
    path.write_text(header + payload, encoding="utf-8")

def main():
    """Generate the ops, write raw_facts.csv and game_data.js, then print a sanity report."""
    facts_path = OUT_DIR / "raw_facts.csv"
    js_path = OUT_DIR / "game_data.js"

    print("Generating synthetic operations…")
    ops = generate_ops(seed=42)
    print(f"  Total ops generated: {len(ops):,}")

    write_facts_csv(ops, facts_path)
    print(f"  Wrote {facts_path}")

    print("Aggregating into game tables…")
    data = aggregate(ops)

    write_game_data_js(data, js_path)
    print(f"  Wrote {js_path}")

    # Sanity check — headline numbers the story depends on.
    scenario = data["scenario"]
    print()
    print("Headline check:")
    print(f"  {scenario['prior_month_label']}: {scenario['prior_rides']} ops, {scenario['prior_avg_duration']}% efficiency")
    print(f"  {scenario['focus_month_label']}: {scenario['focus_rides']} ops, {scenario['focus_avg_duration']}% efficiency")
    print(f"  Volume change: {scenario['rides_change_pct']:+.1f}%")
    print(f"  Efficiency change: {scenario['duration_change_pct']:+.1f} pp")
    print()
    print("Per-category success rate (should be stable across Oct → Nov):")
    category_rows = data["bike_type"]

    def row_for(month, cat):
        # First per-category row for the given month, or None when absent.
        return next(
            (r for r in category_rows if r["month"] == month and r["bike_type"] == cat),
            None,
        )

    for cat in CATEGORIES:
        before, after = row_for("2023-10", cat), row_for("2023-11", cat)
        if not (before and after):
            continue
        print(f"  {cat:14s}: Oct {before['avg_duration']:5.1f}% (n={before['rides']:3d})  →  Nov {after['avg_duration']:5.1f}% (n={after['rides']:3d})")
    print()
    print("Composition (share of total ops):")
    oct_total = sum(r["rides"] for r in category_rows if r["month"] == "2023-10")
    nov_total = sum(r["rides"] for r in category_rows if r["month"] == "2023-11")
    for cat in CATEGORIES:
        before, after = row_for("2023-10", cat), row_for("2023-11", cat)
        if not (before and after):
            continue
        oct_share = before['rides'] / oct_total * 100
        nov_share = after['rides'] / nov_total * 100
        print(f"  {cat:14s}: Oct {oct_share:5.1f}%  →  Nov {nov_share:5.1f}%")

# Script entry point: regenerate the output files only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()