""" Generate synthetic operations data for the BI Detective L5 game. Single fact table (one row per operation) → aggregated views matching the shape the BUS220 game.js expects (rides → ops count, avg_duration → success rate %). Story: - Headline efficiency 62% (Oct 2023) → 71% (Nov 2023). Volume 800 → 500 ops. - Volume drop is seasonal (Nov 2022 had similar volume → YoY check defuses it). - Efficiency rise is a MIX SHIFT: brigade pulled defensive units for refit, share of "preparation" missions (training + logistics) grew from 15% → 50%; "combat" missions (recon + fire support + defensive) shrank 85% → 50%. - Per-category success rates are STABLE across all months; aggregate rises only because composition shifted toward easier categories. Output: game_data.js (JS file with `const GAME_DATA = {...}`) + raw_facts.csv (per-op fact table, for inspection/audit). Run: python3 generate_data.py """ from __future__ import annotations import csv import json import random from collections import defaultdict from dataclasses import dataclass from datetime import date, timedelta from pathlib import Path OUT_DIR = Path(__file__).parent # --- Domain --- CATEGORIES = ["training", "logistics", "recon", "fire_support", "defensive"] CATEGORY_LABEL_UK = { "training": "Тренування", "logistics": "Логістика", "recon": "Розвідка", "fire_support": "Вогневе ураження", "defensive": "Оборонні дії", } # Stable across all months (the whole point — only composition shifts) CATEGORY_BASE_SUCCESS = { "training": 0.91, "logistics": 0.79, "recon": 0.67, "fire_support": 0.60, "defensive": 0.50, } # 2-segment partition for the "Rider Types" tab analog SEGMENT_OF = { "training": "preparation", "logistics": "preparation", "recon": "combat", "fire_support": "combat", "defensive": "combat", } UNITS = ["1 БТГр", "2 БТГр", "3 БТГр", "Розвідрота", "Інж-сап. 
рота"] SCALES = ["small", "medium", "large"] SCALE_ORDER = {"small": 1, "medium": 2, "large": 3} SCALE_LABEL_UK = {"small": "Мала", "medium": "Середня", "large": "Велика"} # Per-month totals (ops count) — seasonal pattern, calibrated so: # - Oct 2023 = 800 (prior month, "normal") # - Nov 2023 = 500 (focus month, sharp drop) # - Nov 2022 = 480 (similar seasonal pattern → YoY check defuses volume alarm) MONTHLY_OPS = { "2022-11": 480, "2022-12": 460, "2023-01": 470, "2023-02": 500, "2023-03": 600, "2023-04": 700, "2023-05": 750, "2023-06": 820, "2023-07": 850, "2023-08": 830, "2023-09": 780, "2023-10": 800, "2023-11": 500, } # Category share of total ops per month. # All months use the "old" mix EXCEPT Nov 2023 — that's the one with the new pattern. OLD_MIX = { "training": 0.05, "logistics": 0.10, "recon": 0.28, "fire_support": 0.27, "defensive": 0.30, } NEW_MIX = { # Nov 2023 only — defensive units pulled for refit, prep grew "training": 0.16, "logistics": 0.36, "recon": 0.15, "fire_support": 0.16, "defensive": 0.17, } def category_share(month: str) -> dict[str, float]: return NEW_MIX if month == "2023-11" else OLD_MIX # Mission scale distribution (stable, doesn't drive the story — red herring) SCALE_SHARE = {"small": 0.55, "medium": 0.30, "large": 0.15} # Hourly distribution: bell-shaped around midday/afternoon (typical activity window) # Used as weights for sampling. 
HOURLY_WEIGHT = [ 0.5, 0.3, 0.2, 0.2, 0.3, 0.5, # 0-5 0.8, 1.5, 2.5, 3.5, 4.0, 4.0, # 6-11 3.5, 3.5, 4.0, 4.5, 4.5, 4.0, # 12-17 3.5, 2.5, 1.8, 1.2, 0.8, 0.5, # 18-23 ] # --- Generation --- @dataclass class Op: date: str month: str weekday: str weekday_num: int # 0 = Sunday (matches Divvy convention) hour: int unit: str category: str segment: str scale: str succeeded: bool WEEKDAY_NAMES = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"] def days_in_month(year: int, month: int) -> list[date]: d = date(year, month, 1) days = [] while d.month == month: days.append(d) d += timedelta(days=1) return days def weighted_choice(weights: dict[str, float]) -> str: keys = list(weights.keys()) vals = [weights[k] for k in keys] return random.choices(keys, weights=vals, k=1)[0] def weighted_int(weights: list[float]) -> int: return random.choices(range(len(weights)), weights=weights, k=1)[0] def exact_counts(total: int, shares: dict[str, float]) -> dict[str, int]: """Return exact integer counts per key that sum to total, distributed by shares.""" raw = {k: total * v for k, v in shares.items()} counts = {k: int(v) for k, v in raw.items()} deficit = total - sum(counts.values()) # Hand the leftover units to the keys with the largest fractional remainders. fracs = sorted(((raw[k] - counts[k], k) for k in shares), reverse=True) for i in range(deficit): counts[fracs[i % len(fracs)][1]] += 1 return counts def generate_ops(seed: int = 42) -> list[Op]: """ Deterministic on the puzzle-driving dimensions (category share, per-category success rate). Random on incidental dimensions (unit, scale, hour, day-within-month). 
""" random.seed(seed) ops: list[Op] = [] for month, total in MONTHLY_OPS.items(): year = int(month[:4]) mo = int(month[5:7]) days = days_in_month(year, mo) shares = category_share(month) # Exact category counts per month (no sampling noise on the shares) cat_counts = exact_counts(total, shares) for cat, n_cat in cat_counts.items(): seg = SEGMENT_OF[cat] base = CATEGORY_BASE_SUCCESS[cat] # Exact success count for this (month, category) — no Bernoulli noise n_success = round(n_cat * base) # Build the ops list for this bucket, mark first n_success as succeeded bucket = [] for i in range(n_cat): d = random.choice(days) wd_idx = (d.weekday() + 1) % 7 bucket.append(Op( date=d.isoformat(), month=month, weekday=WEEKDAY_NAMES[wd_idx], weekday_num=wd_idx, hour=weighted_int(HOURLY_WEIGHT), unit=random.choice(UNITS), category=cat, segment=seg, scale=weighted_choice(SCALE_SHARE), succeeded=(i < n_success), )) random.shuffle(bucket) # don't keep all successes at front of list ops.extend(bucket) return ops # --- Aggregation: build the 14 GAME_DATA tables game.js expects --- # Field names follow Divvy convention (rides = count, avg_duration = success rate %) # even though the meaning is repurposed. See top-of-file note. 
def round_pct(x: float) -> float:
    """Convert a fraction (0-1) to a percentage rounded to one decimal place."""
    return round(x * 100, 1)


def safe_pct(num: float, den: float) -> float:
    """Return num/den as a rounded percentage, or 0.0 when den is zero."""
    return round_pct(num / den) if den else 0.0


def success_pct(rows: list[Op]) -> float:
    """Return the share of *rows* that succeeded, as a percentage (0.0 if empty)."""
    return safe_pct(sum(1 for o in rows if o.succeeded), len(rows))


def safe_pct_change(new: float, old: float) -> float:
    """Return the relative change old → new in percent, or 0.0 when old is zero."""
    return round((new - old) / old * 100, 1) if old else 0.0


def aggregate(ops: list[Op]) -> dict:
    """Build the GAME_DATA tables from the per-operation fact table.

    Field names keep the shape game.js expects: ``rides`` is an operation
    count and ``avg_duration`` is a success rate in percent.

    Months and dates with no operations never get a row in the per-month or
    per-date tables. The fixed comparison months (2023-10, 2023-11, 2022-11)
    report zero counts and 0.0 rates when they have no operations.

    Returns:
        A dict whose keys are, in insertion order: monthly_totals,
        daily_totals, day_of_week, day_counts, hourly_totals,
        hourly_patterns, bike_type, duration_buckets, station_comparison,
        monthly_by_segment, daily_by_segment, null_station_rate, yoy_data,
        yoy_totals, scenario.
    """
    prior_month = "2023-10"
    focus_month = "2023-11"
    compare_months = [prior_month, focus_month]
    segments = ["preparation", "combat"]

    month_groups: dict[str, list[Op]] = defaultdict(list)
    date_groups: dict[str, list[Op]] = defaultdict(list)
    for o in ops:
        month_groups[o.month].append(o)
        date_groups[o.date].append(o)

    def rows_for(month: str) -> list[Op]:
        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty month that then leaks into the per-month tables below.
        return month_groups.get(month, [])

    out: dict = {}

    # monthly_totals: [{month, rides, avg_duration, median_duration}]
    out["monthly_totals"] = []
    for m in sorted(month_groups):
        rows = month_groups[m]
        pct = success_pct(rows)
        out["monthly_totals"].append({
            "month": m,
            "rides": len(rows),
            "avg_duration": pct,
            "median_duration": pct,  # same as avg for binomial
        })

    # daily_totals: [{date, rides, avg_duration}]
    out["daily_totals"] = [
        {
            "date": d,
            "rides": len(date_groups[d]),
            "avg_duration": success_pct(date_groups[d]),
        }
        for d in sorted(date_groups)
    ]

    # day_of_week: [{month, weekday, weekday_num, rides, avg_duration}] for Oct + Nov 2023
    out["day_of_week"] = []
    for month in compare_months:
        by_wd: dict[tuple[str, int], list[Op]] = defaultdict(list)
        for o in rows_for(month):
            by_wd[(o.weekday, o.weekday_num)].append(o)
        for (wd, wd_num), rows in sorted(by_wd.items(), key=lambda kv: kv[0][1]):
            out["day_of_week"].append({
                "month": month,
                "weekday": wd,
                "weekday_num": wd_num,
                "rides": len(rows),
                "avg_duration": success_pct(rows),
            })

    # day_counts: [{month, weekday, weekday_num, day_count}] — for calendar artifact
    out["day_counts"] = []
    for month in compare_months:
        year, mo = int(month[:4]), int(month[5:7])
        wd_count: dict[tuple[str, int], int] = defaultdict(int)
        for d in days_in_month(year, mo):
            # date.weekday() has Monday = 0; shift so Sunday = 0.
            wd_idx = (d.weekday() + 1) % 7
            wd_count[(WEEKDAY_NAMES[wd_idx], wd_idx)] += 1
        for (wd, wd_num), cnt in sorted(wd_count.items(), key=lambda kv: kv[0][1]):
            out["day_counts"].append({
                "month": month,
                "weekday": wd,
                "weekday_num": wd_num,
                "day_count": cnt,
            })

    # hourly_totals: [{month, hour, rides}] for Oct + Nov 2023
    out["hourly_totals"] = []
    for month in compare_months:
        per_hour: dict[int, int] = defaultdict(int)
        for o in rows_for(month):
            per_hour[o.hour] += 1
        for h in range(24):
            out["hourly_totals"].append({"month": month, "hour": h, "rides": per_hour.get(h, 0)})

    # hourly_patterns: per-segment hourly (used by game.js but we keep simpler)
    out["hourly_patterns"] = []
    for month in compare_months:
        per_seg_hour: dict[tuple[str, int], int] = defaultdict(int)
        for o in rows_for(month):
            per_seg_hour[(o.segment, o.hour)] += 1
        for seg in segments:
            for h in range(24):
                out["hourly_patterns"].append({
                    "month": month,
                    "segment": seg,
                    "hour": h,
                    "rides": per_seg_hour.get((seg, h), 0),
                })

    # bike_type → mission_category: per-category aggregate Oct + Nov 2023
    out["bike_type"] = []
    for month in compare_months:
        by_cat: dict[str, list[Op]] = defaultdict(list)
        for o in rows_for(month):
            by_cat[o.category].append(o)
        for cat in CATEGORIES:
            rows = by_cat.get(cat, [])
            out["bike_type"].append({
                "month": month,
                "bike_type": cat,  # field name kept for game.js compat
                "rides": len(rows),
                "avg_duration": success_pct(rows),
            })

    # duration_buckets → mission_scale: per-scale aggregate Oct + Nov 2023
    out["duration_buckets"] = []
    for month in compare_months:
        per_scale: dict[str, int] = defaultdict(int)
        for o in rows_for(month):
            per_scale[o.scale] += 1
        for scale in SCALES:
            out["duration_buckets"].append({
                "month": month,
                "bucket": scale,
                "bucket_order": SCALE_ORDER[scale],
                "rides": per_scale.get(scale, 0),
            })

    # station_comparison → unit_comparison: top units, Oct→Nov change
    prior_by_unit: dict[str, int] = defaultdict(int)
    for o in rows_for(prior_month):
        prior_by_unit[o.unit] += 1
    focus_by_unit: dict[str, int] = defaultdict(int)
    for o in rows_for(focus_month):
        focus_by_unit[o.unit] += 1
    out["station_comparison"] = []
    for unit in UNITS:
        oct_n = prior_by_unit.get(unit, 0)
        nov_n = focus_by_unit.get(unit, 0)
        out["station_comparison"].append({
            "station": unit,  # field name kept
            "oct_rides": oct_n,
            "nov_rides": nov_n,
            "change_pct": safe_pct_change(nov_n, oct_n),
        })
    # Busiest prior-month units first; the sort is stable, so ties keep UNITS order.
    out["station_comparison"].sort(key=lambda r: -r["oct_rides"])

    # monthly_by_segment: [{month, segment, rides, avg_duration}] — all months
    out["monthly_by_segment"] = []
    for m in sorted(month_groups):
        for seg in segments:
            sub = [o for o in month_groups[m] if o.segment == seg]
            out["monthly_by_segment"].append({
                "month": m,
                "segment": seg,
                "rides": len(sub),
                "avg_duration": success_pct(sub),
            })

    # daily_by_segment: [{date, segment, rides, avg_duration}]
    out["daily_by_segment"] = []
    for d in sorted(date_groups):
        for seg in segments:
            sub = [o for o in date_groups[d] if o.segment == seg]
            out["daily_by_segment"].append({
                "date": d,
                "segment": seg,
                "rides": len(sub),
                "avg_duration": success_pct(sub),
            })

    # null_station_rate: data-quality table (game.js doesn't actively use, but referenced)
    out["null_station_rate"] = []  # leave empty; game.js handles missing gracefully

    # yoy_data + yoy_totals: Nov 2022 vs Nov 2023
    out["yoy_data"] = []
    out["yoy_totals"] = []
    for month in ["2022-11", focus_month]:
        rows = rows_for(month)
        out["yoy_totals"].append({
            "month": month,
            "rides": len(rows),
            "avg_duration": success_pct(rows),
        })
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["yoy_data"].append({
                "month": month,
                "segment": seg,
                "rides": len(sub),
                "avg_duration": success_pct(sub),
            })

    # scenario: VP-message metadata
    oct_rows = rows_for(prior_month)
    nov_rows = rows_for(focus_month)
    oct_ops = len(oct_rows)
    nov_ops = len(nov_rows)
    oct_eff = success_pct(oct_rows)
    nov_eff = success_pct(nov_rows)
    out["scenario"] = {
        "prior_month": prior_month,
        "prior_month_label": "Жовтень 2023",
        "focus_month": focus_month,
        "focus_month_label": "Листопад 2023",
        "prior_rides": oct_ops,  # = ops count for prior month
        "focus_rides": nov_ops,  # = ops count for focus month
        # Guarded like station_comparison: 0.0 when the prior month has no ops.
        "rides_change_pct": safe_pct_change(nov_ops, oct_ops),
        "prior_avg_duration": oct_eff,
        "focus_avg_duration": nov_eff,
        "duration_change_pct": round(nov_eff - oct_eff, 1),  # difference of percentages, not relative
    }

    return out


# --- Output ---


def write_facts_csv(ops: list[Op], path: Path) -> None:
    """Write the per-operation fact table to *path* as UTF-8 CSV.

    One header row, then one row per op; ``succeeded`` is written as 0/1.
    The encoding is explicit because unit names are Cyrillic and the platform
    default encoding may not be able to represent them.
    """
    columns = ["date", "month", "weekday", "hour", "unit", "category", "segment", "scale", "succeeded"]
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(columns)
        w.writerows(
            [o.date, o.month, o.weekday, o.hour, o.unit, o.category, o.segment, o.scale, int(o.succeeded)]
            for o in ops
        )


def write_game_data_js(data: dict, path: Path) -> None:
    """Write *data* to *path* as a UTF-8 JS file defining ``const GAME_DATA``.

    The JSON is compact and keeps non-ASCII characters as-is
    (``ensure_ascii=False``), so the file is written as UTF-8 explicitly
    rather than with the platform default encoding.
    """
    header = (
        "// Auto-generated — do not edit manually. Run: python3 generate_data.py\n"
        "// Synthetic dataset for the L5 BI Detective game (KSE × ХАРТІЯ data-literacy course).\n"
        "// Field-name mapping (kept compatible with BUS220 game.js):\n"
        "// rides = number of operations (ops count)\n"
        "// avg_duration = success rate, percent (0–100)\n"
        "// bike_type = mission_category (training / logistics / recon / fire_support / defensive)\n"
        "// segment = preparation (training+logistics) | combat (recon+fire_support+defensive)\n"
        "// bucket = mission_scale (small/medium/large)\n"
        "// station = unit name\n"
    )
    payload = "const GAME_DATA = " + json.dumps(data, ensure_ascii=False, separators=(",", ":")) + ";\n"
    path.write_text(header + payload, encoding="utf-8")


def main():
    """Generate the dataset, write both output files, and print sanity checks."""
    print("Generating synthetic operations…")
    ops = generate_ops(seed=42)
    print(f" Total ops generated: {len(ops):,}")

    facts_path = OUT_DIR / "raw_facts.csv"
    write_facts_csv(ops, facts_path)
    print(f" Wrote {facts_path}")

    print("Aggregating into game tables…")
    data = aggregate(ops)
    js_path = OUT_DIR / "game_data.js"
    write_game_data_js(data, js_path)
    print(f" Wrote {js_path}")

    # Sanity check — print headline numbers
    s = data["scenario"]
    print()
    print("Headline check:")
    print(f" {s['prior_month_label']}: {s['prior_rides']} ops, {s['prior_avg_duration']}% efficiency")
    print(f" {s['focus_month_label']}: {s['focus_rides']} ops, {s['focus_avg_duration']}% efficiency")
    print(f" Volume change: {s['rides_change_pct']:+.1f}%")
    print(f" Efficiency change: {s['duration_change_pct']:+.1f} pp")

    bt = data["bike_type"]

    def row_for(month: str, cat: str):
        # First bike_type row matching (month, category), or None.
        return next((r for r in bt if r["month"] == month and r["bike_type"] == cat), None)

    print()
    print("Per-category success rate (should be stable across Oct → Nov):")
    for cat in CATEGORIES:
        oct_v = row_for("2023-10", cat)
        nov_v = row_for("2023-11", cat)
        if oct_v and nov_v:
            print(f" {cat:14s}: Oct {oct_v['avg_duration']:5.1f}% (n={oct_v['rides']:3d}) → Nov {nov_v['avg_duration']:5.1f}% (n={nov_v['rides']:3d})")

    print()
    print("Composition (share of total ops):")
    oct_total = sum(r["rides"] for r in bt if r["month"] == "2023-10")
    nov_total = sum(r["rides"] for r in bt if r["month"] == "2023-11")
    for cat in CATEGORIES:
        oct_v = row_for("2023-10", cat)
        nov_v = row_for("2023-11", cat)
        if oct_v and nov_v:
            # A month with no ops reports a 0.0% share instead of dividing by zero.
            oct_share = oct_v["rides"] / oct_total * 100 if oct_total else 0.0
            nov_share = nov_v["rides"] / nov_total * 100 if nov_total else 0.0
            print(f" {cat:14s}: Oct {oct_share:5.1f}% → Nov {nov_share:5.1f}%")


if __name__ == "__main__":
    main()