mirror of
https://github.com/olehomelchenko/bi-detective.git
synced 2026-06-23 22:47:46 +00:00
447 lines
17 KiB
Python
447 lines
17 KiB
Python
"""
|
||
Generate synthetic operations data for the BI Detective L5 game.
|
||
|
||
Single fact table (one row per operation) → aggregated views matching the shape
|
||
the BUS220 game.js expects (rides → ops count, avg_duration → success rate %).
|
||
|
||
Story:
|
||
- Headline efficiency 62% (Oct 2023) → 71% (Nov 2023). Volume 800 → 500 ops.
|
||
- Volume drop is seasonal (Nov 2022 had similar volume → YoY check defuses it).
|
||
- Efficiency rise is a MIX SHIFT: brigade pulled defensive units for refit, share
|
||
of "preparation" missions (training + logistics) grew from 15% → 50%; "combat"
|
||
missions (recon + fire support + defensive) shrank 85% → 50%.
|
||
- Per-category success rates are STABLE across all months; aggregate rises only
|
||
because composition shifted toward easier categories.
|
||
|
||
Output: game_data.js (JS file with `const GAME_DATA = {...}`) + raw_facts.csv
|
||
(per-op fact table, for inspection/audit).
|
||
|
||
Run: python3 generate_data.py
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import csv
|
||
import json
|
||
import random
|
||
from collections import defaultdict
|
||
from dataclasses import dataclass
|
||
from datetime import date, timedelta
|
||
from pathlib import Path
|
||
|
||
# Both output files (raw_facts.csv, game_data.js) are written next to this script.
OUT_DIR = Path(__file__).parent

# --- Domain ---

# Mission categories. aggregate() emits its per-category rows in this order.
CATEGORIES = ["training", "logistics", "recon", "fire_support", "defensive"]

# Ukrainian display labels, keyed by category.
CATEGORY_LABEL_UK = {
    "training": "Тренування",
    "logistics": "Логістика",
    "recon": "Розвідка",
    "fire_support": "Вогневе ураження",
    "defensive": "Оборонні дії",
}

# Success probability per category.
# Stable across all months (the whole point — only composition shifts).
CATEGORY_BASE_SUCCESS = {
    "training": 0.91,
    "logistics": 0.79,
    "recon": 0.67,
    "fire_support": 0.60,
    "defensive": 0.50,
}

# 2-segment partition for the "Rider Types" tab analog:
# every category maps to either "preparation" or "combat".
SEGMENT_OF = {
    "training": "preparation",
    "logistics": "preparation",
    "recon": "combat",
    "fire_support": "combat",
    "defensive": "combat",
}

# Unit names; generate_ops() assigns one uniformly at random to each op.
UNITS = ["1 БТГр", "2 БТГр", "3 БТГр", "Розвідрота", "Інж-сап. рота"]

# Mission scales, their sort order, and Ukrainian display labels.
SCALES = ["small", "medium", "large"]
SCALE_ORDER = {"small": 1, "medium": 2, "large": 3}
SCALE_LABEL_UK = {"small": "Мала", "medium": "Середня", "large": "Велика"}

# Per-month totals (ops count) — seasonal pattern, calibrated so:
# - Oct 2023 = 800 (prior month, "normal")
# - Nov 2023 = 500 (focus month, sharp drop)
# - Nov 2022 = 480 (similar seasonal pattern → YoY check defuses volume alarm)
# NOTE: generate_ops() iterates this dict in insertion order, so reordering the
# entries changes the random stream and therefore the generated dataset.
MONTHLY_OPS = {
    "2022-11": 480,
    "2022-12": 460,
    "2023-01": 470,
    "2023-02": 500,
    "2023-03": 600,
    "2023-04": 700,
    "2023-05": 750,
    "2023-06": 820,
    "2023-07": 850,
    "2023-08": 830,
    "2023-09": 780,
    "2023-10": 800,
    "2023-11": 500,
}

# Category share of total ops per month.
# All months use the "old" mix EXCEPT Nov 2023 — that's the one with the new pattern.
# Preparation (training + logistics) is 15% of the old mix and 52% of the new one.
OLD_MIX = {
    "training": 0.05, "logistics": 0.10,
    "recon": 0.28, "fire_support": 0.27, "defensive": 0.30,
}
NEW_MIX = {  # Nov 2023 only — defensive units pulled for refit, prep grew
    "training": 0.16, "logistics": 0.36,
    "recon": 0.15, "fire_support": 0.16, "defensive": 0.17,
}
|
||
|
||
def category_share(month: str) -> dict[str, float]:
    """Return the category mix (share of ops per category) for *month*.

    Only the focus month "2023-11" uses NEW_MIX; every other month gets
    OLD_MIX. The module-level dict itself is returned, not a copy.
    """
    if month != "2023-11":
        return OLD_MIX
    return NEW_MIX
|
||
|
||
# Mission scale distribution (stable, doesn't drive the story — red herring).
# Used as sampling weights for Op.scale.
SCALE_SHARE = {"small": 0.55, "medium": 0.30, "large": 0.15}

# Hourly distribution: bell-shaped around midday/afternoon (typical activity window).
# Used as weights for sampling; the list index is the hour of day (24 entries).
HOURLY_WEIGHT = [
    0.5, 0.3, 0.2, 0.2, 0.3, 0.5,  # 0-5
    0.8, 1.5, 2.5, 3.5, 4.0, 4.0,  # 6-11
    3.5, 3.5, 4.0, 4.5, 4.5, 4.0,  # 12-17
    3.5, 2.5, 1.8, 1.2, 0.8, 0.5,  # 18-23
]
|
||
|
||
# --- Generation ---
|
||
|
||
@dataclass
class Op:
    """One synthetic operation — a single row of the fact table."""

    date: str  # ISO date string (generate_ops stores d.isoformat())
    month: str  # "YYYY-MM" key taken from MONTHLY_OPS
    weekday: str  # English weekday name looked up in WEEKDAY_NAMES
    weekday_num: int  # 0 = Sunday (matches Divvy convention)
    hour: int  # hour of day, sampled as an index into HOURLY_WEIGHT
    unit: str  # one of UNITS
    category: str  # mission category (a key of the month's category mix)
    segment: str  # SEGMENT_OF[category]: "preparation" or "combat"
    scale: str  # mission scale, a key of SCALE_SHARE
    succeeded: bool  # whether the operation counts as a success
|
||
|
||
# Weekday names indexed Sunday-first (index 0 = Sunday), matching Op.weekday_num.
WEEKDAY_NAMES = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
|
||
|
||
def days_in_month(year: int, month: int) -> list[date]:
    """Return every calendar date of the given month, in ascending order."""
    first = date(year, month, 1)
    collected: list[date] = []
    offset = 0
    while True:
        current = first + timedelta(days=offset)
        if current.month != month:
            # Rolled over into the next month — the list is complete.
            return collected
        collected.append(current)
        offset += 1
|
||
|
||
def weighted_choice(weights: dict[str, float]) -> str:
    """Pick one key of *weights* at random, with probability proportional to its value.

    Draws from the module-level ``random`` generator.
    """
    options = list(weights)
    option_weights = [weights[name] for name in options]
    (picked,) = random.choices(options, weights=option_weights, k=1)
    return picked
|
||
|
||
def weighted_int(weights: list[float]) -> int:
    """Pick a random index into *weights*, with probability proportional to its weight.

    Draws from the module-level ``random`` generator.
    """
    indices = range(len(weights))
    (picked,) = random.choices(indices, weights=weights, k=1)
    return picked
|
||
|
||
def exact_counts(total: int, shares: dict[str, float]) -> dict[str, int]:
    """Return integer counts per key that sum to *total*, distributed by *shares*.

    Uses largest-remainder apportionment: each key first gets the truncated
    value of ``total * share``, then the leftover units go one by one to the
    keys with the largest fractional remainders (ties broken by key, descending).

    The counts sum to *total* as long as the shares sum to at most 1 (up to
    float noise); shares summing to more than 1 are not corrected downward.

    Args:
        total: number of items to distribute.
        shares: fraction of *total* assigned to each key.

    Returns:
        A dict with the same keys (and key order) as *shares*.

    Raises:
        ValueError: if *shares* is empty while *total* is positive — there is
            nothing to hand the units to.
    """
    if not shares:
        if total > 0:
            raise ValueError(
                "exact_counts() needs at least one share to distribute a positive total"
            )
        return {}

    raw = {k: total * v for k, v in shares.items()}
    counts = {k: int(v) for k, v in raw.items()}
    deficit = total - sum(counts.values())

    # Hand the leftover units to the keys with the largest fractional remainders.
    # The modulo lets the hand-out wrap around when the deficit exceeds the key count.
    ranked = sorted(((raw[k] - counts[k], k) for k in shares), reverse=True)
    for i in range(deficit):
        counts[ranked[i % len(ranked)][1]] += 1
    return counts
|
||
|
||
def generate_ops(seed: int = 42) -> list[Op]:
    """Build the full synthetic fact table, one Op per operation.

    Deterministic on the puzzle-driving dimensions (category share, per-category
    success rate). Random on incidental dimensions (unit, scale, hour,
    day-within-month). Seeds the module-level ``random`` generator with *seed*,
    so the same seed reproduces the same list.
    """
    random.seed(seed)
    all_ops: list[Op] = []

    for month_key, month_total in MONTHLY_OPS.items():
        calendar_days = days_in_month(int(month_key[:4]), int(month_key[5:7]))

        # Exact category counts per month (no sampling noise on the shares)
        per_category = exact_counts(month_total, category_share(month_key))

        for category, count in per_category.items():
            segment = SEGMENT_OF[category]
            # Exact success count for this (month, category) — no Bernoulli noise
            successes = round(count * CATEGORY_BASE_SUCCESS[category])

            batch: list[Op] = []
            for idx in range(count):
                # Keep the draw order (day, hour, unit, scale): it fixes the random stream.
                day = random.choice(calendar_days)
                sunday_based = (day.weekday() + 1) % 7  # date.weekday() is Monday=0
                hour = weighted_int(HOURLY_WEIGHT)
                unit = random.choice(UNITS)
                scale = weighted_choice(SCALE_SHARE)
                batch.append(Op(
                    date=day.isoformat(),
                    month=month_key,
                    weekday=WEEKDAY_NAMES[sunday_based],
                    weekday_num=sunday_based,
                    hour=hour,
                    unit=unit,
                    category=category,
                    segment=segment,
                    scale=scale,
                    succeeded=idx < successes,  # the first `successes` ops succeed
                ))

            random.shuffle(batch)  # don't keep all successes at front of list
            all_ops += batch

    return all_ops
|
||
|
||
# --- Aggregation: build the 14 GAME_DATA tables game.js expects ---
|
||
# Field names follow Divvy convention (rides = count, avg_duration = success rate %)
|
||
# even though the meaning is repurposed. See top-of-file note.
|
||
|
||
def round_pct(x: float) -> float:
    """Convert the fraction *x* to a percentage rounded to one decimal place."""
    percent = x * 100
    return round(percent, 1)
|
||
|
||
def safe_pct(num: float, den: float) -> float:
    """Return ``num / den`` as a one-decimal percentage, or 0.0 when *den* is zero."""
    if not den:
        # Empty group: report 0.0 instead of dividing by zero.
        return 0.0
    return round_pct(num / den)
|
||
|
||
def _success_pct(rows: list[Op]) -> float:
    """Return the share of succeeded ops in *rows* as a percentage (0.0 when empty)."""
    return safe_pct(sum(1 for o in rows if o.succeeded), len(rows))


def aggregate(ops: list[Op]) -> dict:
    """Aggregate the per-op fact table into the GAME_DATA tables.

    Field names follow the Divvy convention that game.js expects:
    ``rides`` is an ops count and ``avg_duration`` is a success rate in percent.

    Months with no ops are handled without errors: lookups never add phantom
    months to the all-month tables, and percentage changes against an empty
    prior month are reported as 0.0.

    Args:
        ops: the fact table, one Op per operation.

    Returns:
        A dict with the keys monthly_totals, daily_totals, day_of_week,
        day_counts, hourly_totals, hourly_patterns, bike_type,
        duration_buckets, station_comparison, monthly_by_segment,
        daily_by_segment, null_station_rate, yoy_data, yoy_totals and
        scenario, inserted in that order.
    """
    focus_months = ["2023-10", "2023-11"]
    segments = ["preparation", "combat"]
    out: dict = {}

    # Group once by month and by date. Freeze into plain dicts so that reading
    # an absent month below cannot insert an empty entry (a defaultdict would).
    month_groups = defaultdict(list)
    date_groups = defaultdict(list)
    for o in ops:
        month_groups[o.month].append(o)
        date_groups[o.date].append(o)
    by_month: dict[str, list[Op]] = dict(month_groups)
    by_date: dict[str, list[Op]] = dict(date_groups)

    # monthly_totals: [{month, rides, avg_duration, median_duration}]
    out["monthly_totals"] = [
        {
            "month": m,
            "rides": len(rows),
            "avg_duration": _success_pct(rows),
            "median_duration": _success_pct(rows),  # same as avg for binomial
        }
        for m, rows in sorted(by_month.items())
    ]

    # daily_totals: [{date, rides, avg_duration}]
    out["daily_totals"] = [
        {
            "date": d,
            "rides": len(rows),
            "avg_duration": _success_pct(rows),
        }
        for d, rows in sorted(by_date.items())
    ]

    # day_of_week: [{month, weekday, weekday_num, rides, avg_duration}] for Oct + Nov 2023
    out["day_of_week"] = []
    for month in focus_months:
        by_wd = defaultdict(list)
        for o in by_month.get(month, []):
            by_wd[(o.weekday, o.weekday_num)].append(o)
        for (wd, wd_num), rows in sorted(by_wd.items(), key=lambda kv: kv[0][1]):
            out["day_of_week"].append({
                "month": month, "weekday": wd, "weekday_num": wd_num,
                "rides": len(rows),
                "avg_duration": _success_pct(rows),
            })

    # day_counts: [{month, weekday, weekday_num, day_count}] — for calendar artifact
    out["day_counts"] = []
    for month in focus_months:
        year, mo = int(month[:4]), int(month[5:7])
        wd_count = defaultdict(int)
        for d in days_in_month(year, mo):
            wd_idx = (d.weekday() + 1) % 7  # shift Monday=0 to Sunday=0
            wd_count[(WEEKDAY_NAMES[wd_idx], wd_idx)] += 1
        for (wd, wd_num), cnt in sorted(wd_count.items(), key=lambda kv: kv[0][1]):
            out["day_counts"].append({
                "month": month, "weekday": wd, "weekday_num": wd_num, "day_count": cnt,
            })

    # hourly_totals: [{month, hour, rides}] for Oct + Nov 2023
    out["hourly_totals"] = []
    for month in focus_months:
        per_hour = defaultdict(int)
        for o in by_month.get(month, []):
            per_hour[o.hour] += 1
        for h in range(24):
            out["hourly_totals"].append({"month": month, "hour": h, "rides": per_hour.get(h, 0)})

    # hourly_patterns: per-segment hourly (used by game.js but we keep simpler)
    out["hourly_patterns"] = []
    for month in focus_months:
        per_seg_hour = defaultdict(int)
        for o in by_month.get(month, []):
            per_seg_hour[(o.segment, o.hour)] += 1
        for seg in segments:
            for h in range(24):
                out["hourly_patterns"].append({
                    "month": month, "segment": seg, "hour": h,
                    "rides": per_seg_hour.get((seg, h), 0),
                })

    # bike_type → mission_category: per-category aggregate Oct + Nov 2023
    out["bike_type"] = []
    for month in focus_months:
        for cat in CATEGORIES:
            rows = [o for o in by_month.get(month, []) if o.category == cat]
            out["bike_type"].append({
                "month": month, "bike_type": cat,  # field name kept for game.js compat
                "rides": len(rows),
                "avg_duration": _success_pct(rows),
            })

    # duration_buckets → mission_scale: per-scale aggregate Oct + Nov 2023
    out["duration_buckets"] = []
    for month in focus_months:
        for scale in SCALES:
            rows = [o for o in by_month.get(month, []) if o.scale == scale]
            out["duration_buckets"].append({
                "month": month, "bucket": scale, "bucket_order": SCALE_ORDER[scale],
                "rides": len(rows),
            })

    # station_comparison → unit_comparison: top units, Oct→Nov change
    out["station_comparison"] = []
    for unit in UNITS:
        oct_n = sum(1 for o in by_month.get("2023-10", []) if o.unit == unit)
        nov_n = sum(1 for o in by_month.get("2023-11", []) if o.unit == unit)
        change = round((nov_n - oct_n) / oct_n * 100, 1) if oct_n else 0.0
        out["station_comparison"].append({
            "station": unit,  # field name kept
            "oct_rides": oct_n, "nov_rides": nov_n, "change_pct": change,
        })
    out["station_comparison"].sort(key=lambda r: -r["oct_rides"])

    # monthly_by_segment: [{month, segment, rides, avg_duration}] — all months
    out["monthly_by_segment"] = []
    for m, rows in sorted(by_month.items()):
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["monthly_by_segment"].append({
                "month": m, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # daily_by_segment: [{date, segment, rides, avg_duration}]
    out["daily_by_segment"] = []
    for d, rows in sorted(by_date.items()):
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["daily_by_segment"].append({
                "date": d, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # null_station_rate: data-quality table (game.js doesn't actively use, but referenced)
    out["null_station_rate"] = []  # leave empty; game.js handles missing gracefully

    # yoy_data + yoy_totals: Nov 2022 vs Nov 2023
    out["yoy_data"] = []
    out["yoy_totals"] = []
    for month in ["2022-11", "2023-11"]:
        rows = by_month.get(month, [])
        out["yoy_totals"].append({
            "month": month,
            "rides": len(rows),
            "avg_duration": _success_pct(rows),
        })
        for seg in segments:
            sub = [o for o in rows if o.segment == seg]
            out["yoy_data"].append({
                "month": month, "segment": seg,
                "rides": len(sub),
                "avg_duration": _success_pct(sub),
            })

    # scenario: VP-message metadata
    oct_rows = by_month.get("2023-10", [])
    nov_rows = by_month.get("2023-11", [])
    oct_ops = len(oct_rows)
    nov_ops = len(nov_rows)
    oct_eff = _success_pct(oct_rows)
    nov_eff = _success_pct(nov_rows)
    out["scenario"] = {
        "prior_month": "2023-10",
        "prior_month_label": "Жовтень 2023",
        "focus_month": "2023-11",
        "focus_month_label": "Листопад 2023",
        "prior_rides": oct_ops,  # = ops count for prior month
        "focus_rides": nov_ops,  # = ops count for focus month
        # Same zero guard as station_comparison: an empty prior month yields 0.0.
        "rides_change_pct": round((nov_ops - oct_ops) / oct_ops * 100, 1) if oct_ops else 0.0,
        "prior_avg_duration": oct_eff,
        "focus_avg_duration": nov_eff,
        "duration_change_pct": round(nov_eff - oct_eff, 1),  # difference of percentages, not relative
    }

    return out
|
||
|
||
# --- Output ---
|
||
|
||
def write_facts_csv(ops: list[Op], path: Path):
    """Write the per-op fact table to *path* as a UTF-8 CSV file with a header row.

    The encoding is pinned to UTF-8 because unit names contain Cyrillic text;
    relying on the platform default can fail or produce a locale-dependent file.
    ``succeeded`` is written as 0/1, and ``weekday_num`` is not exported.

    Args:
        ops: operations to dump, one CSV row each, in list order.
        path: destination file; overwritten if it exists.
    """
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["date", "month", "weekday", "hour", "unit", "category", "segment", "scale", "succeeded"])
        w.writerows(
            [o.date, o.month, o.weekday, o.hour, o.unit, o.category, o.segment, o.scale, int(o.succeeded)]
            for o in ops
        )
|
||
|
||
def write_game_data_js(data: dict, path: Path):
    """Write *data* to *path* as a JS file declaring ``const GAME_DATA = {...};``.

    The file starts with a comment header describing the field-name mapping,
    followed by the compact JSON payload. Non-ASCII characters are kept as-is
    (``ensure_ascii=False``), so the file is written explicitly as UTF-8
    instead of the platform default encoding.

    Args:
        data: JSON-serialisable tables, as returned by aggregate().
        path: destination file; overwritten if it exists.
    """
    header = (
        "// Auto-generated — do not edit manually. Run: python3 generate_data.py\n"
        "// Synthetic dataset for the L5 BI Detective game (KSE × ХАРТІЯ data-literacy course).\n"
        "// Field-name mapping (kept compatible with BUS220 game.js):\n"
        "// rides = number of operations (ops count)\n"
        "// avg_duration = success rate, percent (0–100)\n"
        "// bike_type = mission_category (training / logistics / recon / fire_support / defensive)\n"
        "// segment = preparation (training+logistics) | combat (recon+fire_support+defensive)\n"
        "// bucket = mission_scale (small/medium/large)\n"
        "// station = unit name\n"
    )
    payload = "const GAME_DATA = " + json.dumps(data, ensure_ascii=False, separators=(",", ":")) + ";\n"
    path.write_text(header + payload, encoding="utf-8")
|
||
|
||
def main():
    """Generate the dataset, write both output files, and print sanity checks.

    Writes raw_facts.csv and game_data.js into OUT_DIR, then prints the
    headline numbers, per-category success rates and the category composition
    for Oct vs Nov 2023 so the story can be verified by eye.
    """
    facts_path = OUT_DIR / "raw_facts.csv"
    js_path = OUT_DIR / "game_data.js"

    print("Generating synthetic operations…")
    ops = generate_ops(seed=42)
    print(f" Total ops generated: {len(ops):,}")

    write_facts_csv(ops, facts_path)
    print(f" Wrote {facts_path}")

    print("Aggregating into game tables…")
    data = aggregate(ops)

    write_game_data_js(data, js_path)
    print(f" Wrote {js_path}")

    # Sanity check — print headline numbers
    scenario = data["scenario"]
    print()
    print("Headline check:")
    print(f" {scenario['prior_month_label']}: {scenario['prior_rides']} ops, {scenario['prior_avg_duration']}% efficiency")
    print(f" {scenario['focus_month_label']}: {scenario['focus_rides']} ops, {scenario['focus_avg_duration']}% efficiency")
    print(f" Volume change: {scenario['rides_change_pct']:+.1f}%")
    print(f" Efficiency change: {scenario['duration_change_pct']:+.1f} pp")
    print()

    category_rows = data["bike_type"]

    def find_row(month, cat):
        # First matching per-category row for the month, or None when absent.
        return next(
            (r for r in category_rows if r["month"] == month and r["bike_type"] == cat),
            None,
        )

    print("Per-category success rate (should be stable across Oct → Nov):")
    for cat in CATEGORIES:
        prior = find_row("2023-10", cat)
        focus = find_row("2023-11", cat)
        if prior and focus:
            print(f" {cat:14s}: Oct {prior['avg_duration']:5.1f}% (n={prior['rides']:3d}) → Nov {focus['avg_duration']:5.1f}% (n={focus['rides']:3d})")
    print()

    print("Composition (share of total ops):")
    prior_total = sum(r["rides"] for r in category_rows if r["month"] == "2023-10")
    focus_total = sum(r["rides"] for r in category_rows if r["month"] == "2023-11")
    for cat in CATEGORIES:
        prior = find_row("2023-10", cat)
        focus = find_row("2023-11", cat)
        if prior and focus:
            print(f" {cat:14s}: Oct {prior['rides']/prior_total*100:5.1f}% → Nov {focus['rides']/focus_total*100:5.1f}%")


if __name__ == "__main__":
    main()
|