From 37b2a71c11be8ce213cf24ecf16863123cbed4e2 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 09:29:25 +0000
Subject: [PATCH 1/3] Add csv_utilise.sh, sample CSVs, and arbitrary-shape
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing csv_loader.sh already accepts any CSV file generically, but
nothing in the repo helps a user query, list, or export what they loaded —
they had to know psql and the auto-created schema. This change closes that
gap and proves the path end-to-end:

- build/csv_utilise.sh: list / describe / peek / export / drop subcommands
  scoped to tables carrying the loader's _csv_row_id + _loaded_at markers,
  so the te_core_schema tables can never be touched by accident
- build/csv/samples/: three off-domain CSVs (customers, orders, inventory)
  so the loader has something to demo against
- Makefile: csv-load, csv-list, csv-demo targets
- tests/test_csv_utilise.py: 9 unit tests for arg parsing and engine guard
- tests/test_csv_loader_arbitrary_shapes.py: parameterised regression
  (2x3, 10x50, 1x100) covering CSV -> validator -> Postgres -> query,
  skipped cleanly when PostgreSQL is unreachable
- README.md / ARCHITECTURE.md: 'Load any CSV' subsection and new build/ rows

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VBrxqChRJtxdvSpFhiUWUy
---
 ARCHITECTURE.md                           |   4 +-
 Makefile                                  |  27 ++-
 README.md                                 |  24 ++
 build/csv/samples/customers.csv           |   6 +
 build/csv/samples/inventory.csv           |   7 +
 build/csv/samples/orders.csv              |   9 +
 build/csv_utilise.sh                      | 259 ++++++++++++++++++++++
 tests/test_csv_loader_arbitrary_shapes.py | 112 ++++++++++
 tests/test_csv_utilise.py                 |  88 ++++++++
 9 files changed, 534 insertions(+), 2 deletions(-)
 create mode 100644 build/csv/samples/customers.csv
 create mode 100644 build/csv/samples/inventory.csv
 create mode 100644 build/csv/samples/orders.csv
 create mode 100755 build/csv_utilise.sh
 create mode 100644 tests/test_csv_loader_arbitrary_shapes.py
 create mode 100644 tests/test_csv_utilise.py

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 98bc88c..c063be5 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -30,13 +30,15 @@ The three layers can break independently, so we keep them physically separate. T
 |------|-----------|
 | `build/te_core_schema.sql` | PostgreSQL master schema (legacy entry point) |
 | `build/te_seed_data.sql` | Seed data |
-| `build/csv/` | Python CSV validator (`validator.py`) + per-engine shell loaders (`loader_*.sh`) |
+| `build/csv/` | Python CSV validator (`validator.py`), per-engine shell loaders (`loader_*.sh`), and `samples/` |
 | `build/adapters/` | Per-engine deployment adapters (`adapter_postgresql.sh`, `adapter_mariadb.sh`, etc.) |
 | `build/schema/` | Engine-specific DDL and seed data |
 | `build/environments/` | PostgreSQL per-environment launchers (`env_dev.sql`, `env_test.sql`, etc.) |
 | `build/terraform-github-repos/` | GitHub repository management as Infrastructure-as-Code |
 | `build/setup.sh` | Interactive multi-database configuration wizard |
 | `build/deploy_all.sh` | Multi-engine deployment router |
+| `build/csv_loader.sh` | Schema-agnostic CSV ingestion: any CSV → auto-created table |
+| `build/csv_utilise.sh` | Companion to the loader: list / describe / peek / export / drop CSV-loaded tables (PostgreSQL) |
 
 ### `tests/` — correctness coverage
 
diff --git a/Makefile b/Makefile
index c39f78d..2efe6c9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 .PHONY: test-free test-gate test-evals test-e2e test-all \
         lint lint-diff health \
-        eval-list eval-compare eval-summary select-tests
+        eval-list eval-compare eval-summary select-tests \
+        csv-load csv-list csv-demo
 
 # ── Test tiers ────────────────────────────────────────────────────────────────
 
@@ -69,3 +70,27 @@ eval-summary:
 # Mirrors: bun run eval:select
 select-tests:
 	python3 scripts/select_tests.py
+
+# ── CSV loader / utiliser ─────────────────────────────────────────────────────
+
+# Load any CSV file into the target environment's database.
+# Usage: make csv-load FILE=path/to.csv [ENV=dev] [ENGINE=postgresql]
+csv-load:
+	@if [ -z "$(FILE)" ]; then \
+	    echo "Usage: make csv-load FILE=path/to.csv [ENV=dev] [ENGINE=postgresql]"; \
+	    exit 1; \
+	fi
+	bash build/csv_loader.sh "$(FILE)" --env $(or $(ENV),dev) $(if $(ENGINE),--engine $(ENGINE),)
+
+# List CSV-loaded tables in the target environment.
+# Usage: make csv-list [ENV=dev]
+csv-list:
+	bash build/csv_utilise.sh list --env $(or $(ENV),dev)
+
+# One-shot proof: load the three sample CSVs into dev, then list them.
+# Usage: make csv-demo [ENV=dev]
+csv-demo:
+	bash build/csv_loader.sh build/csv/samples/customers.csv --env $(or $(ENV),dev)
+	bash build/csv_loader.sh build/csv/samples/orders.csv    --env $(or $(ENV),dev)
+	bash build/csv_loader.sh build/csv/samples/inventory.csv --env $(or $(ENV),dev)
+	bash build/csv_utilise.sh list --env $(or $(ENV),dev)
diff --git a/README.md b/README.md
index d95435d..9ae0a0a 100644
--- a/README.md
+++ b/README.md
@@ -290,6 +290,30 @@ CSV inputs must have a header row, use comma delimiters, and be UTF-8 encoded wi
 
 Supported loader backends are PostgreSQL, MariaDB/MySQL, SQLite, InfluxDB, Redis, and Teradata. PostgreSQL uses `COPY`, MariaDB/MySQL uses `LOAD DATA LOCAL INFILE`, SQLite uses Python `csv` + `sqlite3`, InfluxDB writes line protocol via the `influx` CLI, Redis writes hashes through `redis-cli`, and Teradata uses BTEQ/FastLoad tooling.
 
+### Load any CSV
+
+The loader is schema-agnostic — drop any CSV file in front of it and a matching table is auto-created in the target environment's database. Every CSV-loaded table is tagged with two marker columns: `_csv_row_id BIGSERIAL PRIMARY KEY` and `_loaded_at TIMESTAMPTZ`. All other columns start as `TEXT`; `ALTER TABLE` afterwards if you need stricter types.
+
+Three sample CSVs ship under `build/csv/samples/` (`customers.csv`, `orders.csv`, `inventory.csv`) — deliberately off-domain from the T&E schema to demonstrate that any shape is accepted.
+
+```bash
+# Single-command happy-path proof (loads all three samples into dev, lists them)
+make csv-demo
+
+# Load any CSV
+make csv-load FILE=path/to/anything.csv          # ENV defaults to dev
+make csv-load FILE=path/to/anything.csv ENV=test ENGINE=postgresql
+
+# Use loaded data — companion script: build/csv_utilise.sh (PostgreSQL only)
+./build/csv_utilise.sh list                       # all CSV-loaded tables in the env
+./build/csv_utilise.sh describe customers         # columns + row count
+./build/csv_utilise.sh peek orders --limit 5      # first N rows
+./build/csv_utilise.sh export inventory dump.csv  # round-trip back to CSV
+./build/csv_utilise.sh drop customers --yes       # remove a CSV-loaded table
+```
+
+`csv_utilise.sh` only sees tables that carry the marker columns, so it cannot accidentally touch the rigid te_core_schema tables.
+
 ---
 
 ## How Parameterisation Works
diff --git a/build/csv/samples/customers.csv b/build/csv/samples/customers.csv
new file mode 100644
index 0000000..423e9b0
--- /dev/null
+++ b/build/csv/samples/customers.csv
@@ -0,0 +1,6 @@
+id,name,email,signup_date
+1,Alice Nguyen,alice@example.com,2024-01-15
+2,Brian O'Connor,brian@example.com,2024-02-03
+3,Chen Wei,chen.wei@example.com,2024-03-22
+4,Diana Patel,diana.patel@example.com,2024-05-10
+5,Eduardo Silva,eduardo@example.com,2024-07-01
diff --git a/build/csv/samples/inventory.csv b/build/csv/samples/inventory.csv
new file mode 100644
index 0000000..ab325ff
--- /dev/null
+++ b/build/csv/samples/inventory.csv
@@ -0,0 +1,7 @@
+sku,description,stock,location
+SKU-001,Wireless headphones – black,42,Warehouse A
+SKU-002,USB-C charger,128,Warehouse B
+SKU-003,Notebook A5 hardcover,75,Warehouse A
+SKU-004,Mechanical keyboard,18,Warehouse C
+SKU-005,Mouse pad – large,200,Warehouse B
+SKU-006,Webcam 1080p,33,Warehouse A
diff --git a/build/csv/samples/orders.csv b/build/csv/samples/orders.csv
new file mode 100644
index 0000000..46eec37
--- /dev/null
+++ b/build/csv/samples/orders.csv
@@ -0,0 +1,9 @@
+order_id,customer_id,product,qty,price
+1001,1,"Wireless headphones, black",1,89.95
+1002,2,USB-C charger,2,19.50
+1003,1,"Notebook, A5 hardcover",3,12.00
+1004,3,Mechanical keyboard,1,145.00
+1005,4,"Mouse pad, large",1,24.99
+1006,2,Webcam 1080p,1,59.00
+1007,5,"Cable organiser, 6-pack",1,15.75
+1008,3,Desk lamp,1,42.50
diff --git a/build/csv_utilise.sh b/build/csv_utilise.sh
new file mode 100755
index 0000000..7246b0e
--- /dev/null
+++ b/build/csv_utilise.sh
@@ -0,0 +1,259 @@
+#!/usr/bin/env bash
+# =============================================================================
+# csv_utilise.sh — Utilise CSV-loaded tables
+# =============================================================================
+# Companion to csv_loader.sh. Lists, describes, peeks at, exports, or drops
+# tables that were created by the CSV loader. CSV-loaded tables are
+# identified by the marker columns the loader always adds:
+#   _csv_row_id  BIGSERIAL PRIMARY KEY
+#   _loaded_at   TIMESTAMPTZ
+#
+# Usage:
+#   ./csv_utilise.sh list                            [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh describe <table>                [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh peek <table> [--limit N]        [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh export <table> <out.csv>        [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh drop <table> --yes              [--env ENV] [--engine ENG]
+#
+#   ENG  defaults to value from config (or postgresql).
+#   ENV  defaults to dev.
+#
+# Only the postgresql engine is implemented in this script. Other engines
+# return a clear "not implemented" message and exit 2.
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CONFIG_LOCAL="${SCRIPT_DIR}/config.local.env"
+CONFIG_DEFAULT="${SCRIPT_DIR}/config.env"
+
+GREEN=$'\033[0;32m'; RED=$'\033[0;31m'; YELLOW=$'\033[1;33m'
+CYAN=$'\033[0;36m';  BOLD=$'\033[1m';   NC=$'\033[0m'
+
+log()   { echo -e "${GREEN}[✓]${NC} $*"; }
+warn()  { echo -e "${YELLOW}[⚠]${NC} $*"; }
+error() { echo -e "${RED}[✗]${NC} $*" >&2; }
+info()  { echo -e "${CYAN}[i]${NC} $*"; }
+
+usage() {
+   cat <<EOF
+
+${BOLD}Usage:${NC}
+  ./csv_utilise.sh <command> [args] [--env ENV] [--engine ENG]
+
+${BOLD}Commands:${NC}
+  list                        List CSV-loaded tables in the target schema
+  describe <table>            Show columns and row count for a table
+  peek <table> [--limit N]    Show first N rows (default 10)
+  export <table> <out.csv>    Export the table back to CSV
+  drop <table> --yes          Drop a CSV-loaded table (requires --yes)
+
+${BOLD}Options:${NC}
+  --env  <env>                dev | test | staging | prod   (default: dev)
+  --engine <engine>           postgresql                    (default: from config)
+  --limit <N>                 row limit for 'peek'          (default: 10)
+  --yes                       confirmation flag for 'drop'
+  --help, -h                  show this message
+
+${BOLD}Examples:${NC}
+  ./csv_utilise.sh list --env dev
+  ./csv_utilise.sh describe customers
+  ./csv_utilise.sh peek orders --limit 5
+  ./csv_utilise.sh export inventory /tmp/inventory_dump.csv
+  ./csv_utilise.sh drop orders --yes --env test
+
+EOF
+}
+
+# ── Parse arguments ───────────────────────────────────────────────────────────
+COMMAND=""
+TABLE=""
+OUT_FILE=""
+TARGET_ENV="dev"
+ENGINE_OVERRIDE=""
+PEEK_LIMIT="10"
+CONFIRM_DROP="false"
+
+if [[ $# -eq 0 ]]; then usage; exit 1; fi
+
+case "$1" in
+   --help|-h) usage; exit 0 ;;
+   list|describe|peek|export|drop) COMMAND="$1"; shift ;;
+   *) error "Unknown command: $1"; usage; exit 1 ;;
+esac
+
+# Positional args for some commands
+case "$COMMAND" in
+   describe|peek|drop)
+      if [[ $# -eq 0 || "${1:0:2}" == "--" ]]; then
+         error "Command '${COMMAND}' requires a <table> argument."; usage; exit 1
+      fi
+      TABLE="$1"; shift
+      ;;
+   export)
+      if [[ $# -lt 2 || "${1:0:2}" == "--" || "${2:0:2}" == "--" ]]; then
+         error "Command 'export' requires <table> and <out.csv> arguments."; usage; exit 1
+      fi
+      TABLE="$1"; OUT_FILE="$2"; shift 2
+      ;;
+esac
+
+while [[ $# -gt 0 ]]; do
+   case "$1" in
+      --env|--engine|--limit) [[ $# -ge 2 ]] || { error "Option '$1' requires a value."; usage; exit 1; } ;;&  # guard, then fall through (bash 4+)
+      --env)     TARGET_ENV="$2"; shift ;;
+      --engine)  ENGINE_OVERRIDE="$2"; shift ;;
+      --limit)   PEEK_LIMIT="$2"; shift ;;
+      --yes)     CONFIRM_DROP="true" ;;
+      --help|-h) usage; exit 0 ;;
+      *) error "Unknown argument: $1"; usage; exit 1 ;;
+   esac; shift   # consume the option itself (its value, if any, was consumed in the clause above)
+done
+
+# ── Sanitise table identifier (before any side effects) ──────────────────────
+if [[ -n "$TABLE" ]]; then
+   if [[ ! "$TABLE" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
+      error "Invalid table name: '${TABLE}'. Allowed: letters, digits, underscore; must not start with a digit."
+      exit 1
+   fi
+fi
+
+# ── Early engine check (avoids loading config for unsupported engines) ───────
+if [[ -n "$ENGINE_OVERRIDE" && "$ENGINE_OVERRIDE" != "postgresql" ]]; then
+   error "csv_utilise.sh: engine '${ENGINE_OVERRIDE}' is not implemented."
+   error "Only 'postgresql' is supported. Run with --engine postgresql."
+   exit 2
+fi
+
+# ── Load configuration ────────────────────────────────────────────────────────
+if [[ -f "$CONFIG_LOCAL" ]]; then
+   source "$CONFIG_LOCAL"
+elif [[ -f "$CONFIG_DEFAULT" ]]; then
+   source "$CONFIG_DEFAULT"
+   warn "config.local.env not found — using defaults. Run ./setup.sh to configure."
+else
+   error "No config found. Run ./setup.sh first."
+   exit 1
+fi
+
+DB_ENGINE="${ENGINE_OVERRIDE:-${DB_ENGINE:-postgresql}}"
+
+if [[ "$DB_ENGINE" != "postgresql" ]]; then
+   error "csv_utilise.sh: engine '${DB_ENGINE}' is not implemented."
+   error "Only 'postgresql' is supported. Run with --engine postgresql."
+   exit 2
+fi
+
+# ── Resolve PostgreSQL connection details ────────────────────────────────────
+[[ "$TARGET_ENV" =~ ^[A-Za-z0-9_]+$ ]] || { error "Invalid --env value: '${TARGET_ENV}'. Allowed: letters, digits, underscore."; exit 1; }
+E="${TARGET_ENV^^}"; DB_VAR="PG_DB_${E}"; SCHEMA_VAR="PG_SCHEMA_${E}"
+PG_HOST="${PGHOST:-${PG_HOST:-localhost}}"
+PG_PORT="${PGPORT:-${PG_PORT:-5432}}"
+PG_USER="${PGUSER:-${PG_SUPERUSER:-postgres}}"
+DB_NAME="${!DB_VAR:-}"; SCHEMA="${!SCHEMA_VAR:-}"   # indirect expansion (no eval); unset vars yield "" so the check below reports them
+
+if [[ -z "$DB_NAME" || -z "$SCHEMA" ]]; then
+   error "Could not resolve database / schema for env '${TARGET_ENV}'."
+   error "Check that PG_DB_${E} and PG_SCHEMA_${E} are set in config.local.env."
+   exit 1
+fi
+
+[[ -n "${PG_SUPERUSER_PASSWORD:-}" ]] && export PGPASSWORD="${PG_SUPERUSER_PASSWORD}"
+
+PSQL=(psql -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${DB_NAME}" -v ON_ERROR_STOP=1)
+
+# ── Reachability probe (gives a friendly error when DB is down) ──────────────
+if ! "${PSQL[@]}" -tA -c "SELECT 1" >/dev/null 2>&1; then
+   error "Cannot reach PostgreSQL at ${PG_HOST}:${PG_PORT} as ${PG_USER}/${DB_NAME}."
+   error "Check the database is running and config.local.env credentials are correct."
+   exit 3
+fi
+
+# ── Helper: assert table is a CSV-loaded table ───────────────────────────────
+assert_csv_table() {
+   local tbl="$1"
+   local cnt
+   cnt=$("${PSQL[@]}" -tA -c "
+      SELECT COUNT(*)
+      FROM information_schema.columns
+      WHERE table_schema = '${SCHEMA}'
+        AND table_name   = '${tbl}'
+        AND column_name IN ('_csv_row_id','_loaded_at');
+   ")
+   if [[ "$cnt" != "2" ]]; then
+      error "Table '${SCHEMA}.${tbl}' is not a CSV-loaded table (missing marker columns)."
+      error "Use the original loader to create it: ./csv_loader.sh <file>.csv"
+      exit 1
+   fi
+}
+
+# ── Commands ──────────────────────────────────────────────────────────────────
+case "$COMMAND" in
+
+   list)
+      info "CSV-loaded tables in ${DB_NAME}.${SCHEMA}:"
+      "${PSQL[@]}" -P pager=off -c "
+         SELECT t.table_name AS table,
+                pg_size_pretty(pg_total_relation_size(format('%I.%I', t.table_schema, t.table_name)::regclass)) AS size
+         FROM information_schema.tables t
+         WHERE t.table_schema = '${SCHEMA}'
+           AND EXISTS (SELECT 1 FROM information_schema.columns c
+                       WHERE c.table_schema = t.table_schema
+                         AND c.table_name   = t.table_name
+                         AND c.column_name  = '_csv_row_id')
+           AND EXISTS (SELECT 1 FROM information_schema.columns c
+                       WHERE c.table_schema = t.table_schema
+                         AND c.table_name   = t.table_name
+                         AND c.column_name  = '_loaded_at')
+         ORDER BY t.table_name;
+      "
+      ;;
+
+   describe)
+      assert_csv_table "$TABLE"
+      info "Columns of ${SCHEMA}.${TABLE}:"
+      "${PSQL[@]}" -P pager=off -c "
+         SELECT column_name, data_type, is_nullable
+         FROM information_schema.columns
+         WHERE table_schema = '${SCHEMA}' AND table_name = '${TABLE}'
+         ORDER BY ordinal_position;
+      "
+      ROW_COUNT=$("${PSQL[@]}" -tA -c "SELECT COUNT(*) FROM \"${SCHEMA}\".\"${TABLE}\";")
+      log "Row count: ${ROW_COUNT}"
+      ;;
+
+   peek)
+      if [[ ! "$PEEK_LIMIT" =~ ^[0-9]+$ ]]; then   # validated first so bad input fails before any DB round-trip
+         error "--limit must be a non-negative integer (got: '${PEEK_LIMIT}')."
+         exit 1
+      fi
+      assert_csv_table "$TABLE"
+      info "First ${PEEK_LIMIT} row(s) of ${SCHEMA}.${TABLE}:"
+      "${PSQL[@]}" -P pager=off -c "SELECT * FROM \"${SCHEMA}\".\"${TABLE}\" ORDER BY _csv_row_id LIMIT ${PEEK_LIMIT};"
+      ;;
+
+   export)
+      assert_csv_table "$TABLE"
+      # Create the target directory, then stream COPY ... TO STDOUT through a shell redirect so the path is never spliced into a psql command (quotes/spaces are safe).
+      OUT_DIR="$(dirname -- "$OUT_FILE")"
+      mkdir -p -- "$OUT_DIR"
+      info "Exporting ${SCHEMA}.${TABLE} to ${OUT_FILE}..."
+      "${PSQL[@]}" -c "COPY (SELECT * FROM \"${SCHEMA}\".\"${TABLE}\" ORDER BY _csv_row_id) TO STDOUT WITH (FORMAT CSV, HEADER TRUE)" > "$OUT_FILE"
+      log "Export complete: ${OUT_FILE}"
+      ;;
+
+   drop)
+      assert_csv_table "$TABLE"
+      if [[ "$CONFIRM_DROP" != "true" ]]; then
+         error "Refusing to drop '${SCHEMA}.${TABLE}' without --yes."
+         exit 1
+      fi
+      warn "Dropping ${SCHEMA}.${TABLE}..."
+      "${PSQL[@]}" -c "DROP TABLE \"${SCHEMA}\".\"${TABLE}\";"
+      log "Dropped ${SCHEMA}.${TABLE}"
+      ;;
+esac
+
+unset PGPASSWORD
+exit 0
diff --git a/tests/test_csv_loader_arbitrary_shapes.py b/tests/test_csv_loader_arbitrary_shapes.py
new file mode 100644
index 0000000..ae0038f
--- /dev/null
+++ b/tests/test_csv_loader_arbitrary_shapes.py
@@ -0,0 +1,112 @@
+"""End-to-end regression for the 'any CSV file' guarantee.
+
+Generates CSV files of varying shapes (column count × row count), runs them
+through build/csv_loader.sh into a live PostgreSQL instance, and asserts the
+auto-created table contains the right number of rows. Complements the
+offline Tier P scenarios in evals/datasets/tier_p/ — those exercise the
+validator only; these exercise the full validator → loader → DB pipeline.
+
+Skipped automatically when PostgreSQL is not reachable.
+"""
+import csv
+import os
+import shutil
+import subprocess
+import tempfile
+import unittest
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.integration
+
+ROOT       = Path(__file__).resolve().parents[1]
+LOADER     = ROOT / "build" / "csv_loader.sh"
+UTILISE    = ROOT / "build" / "csv_utilise.sh"
+BUILD_DIR  = ROOT / "build"
+
+
+def _can_connect_pg() -> bool:
+    if not shutil.which("psql"):
+        return False
+    try:
+        r = subprocess.run(
+            ["psql", "-tA", "-c", "SELECT 1"],
+            capture_output=True, text=True, timeout=5,
+            env={**os.environ, "PGUSER": os.environ.get("PGUSER", "postgres")},
+        )
+        return r.returncode == 0
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        return False
+
+
+_PG_AVAILABLE = _can_connect_pg()
+_CONFIG_PRESENT = (BUILD_DIR / "config.local.env").exists()
+_SKIP_REASON = (
+    "PostgreSQL not reachable" if not _PG_AVAILABLE
+    else "build/config.local.env not present — run ./build/setup.sh"
+)
+
+
+def _write_csv(path: Path, n_cols: int, n_rows: int) -> None:
+    header = [f"col_{i}" for i in range(n_cols)]
+    with path.open("w", encoding="utf-8", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(header)
+        for r in range(n_rows):
+            w.writerow([f"r{r}c{i}" for i in range(n_cols)])
+
+
+@unittest.skipUnless(_PG_AVAILABLE and _CONFIG_PRESENT, _SKIP_REASON)
+class CsvLoaderArbitraryShapes(unittest.TestCase):
+    """Loads CSVs of varying shape into Postgres and verifies row counts."""
+
+    SHAPES = [
+        ("tiny",    2, 3),
+        ("medium", 10, 50),
+        ("skinny",  1, 100),
+    ]
+
+    def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            # Use a unique table name so parallel runs don't collide.
+            table = f"arb_{label}_{os.getpid()}"
+            csv_path = Path(tmp) / f"{table}.csv"
+            _write_csv(csv_path, n_cols, n_rows)
+
+            load = subprocess.run(
+                ["bash", str(LOADER), str(csv_path), "--env", "dev"],
+                capture_output=True, text=True, cwd=ROOT,
+            )
+            self.assertEqual(
+                load.returncode, 0,
+                f"Loader failed for {label}: stdout={load.stdout[-500:]} stderr={load.stderr[-500:]}",
+            )
+
+            try:
+                # Verify via csv_utilise.sh describe (also asserts marker columns present).
+                describe = subprocess.run(
+                    ["bash", str(UTILISE), "describe", table, "--env", "dev"],
+                    capture_output=True, text=True, cwd=ROOT,
+                )
+                self.assertEqual(describe.returncode, 0, describe.stderr)
+                self.assertIn(f"Row count: {n_rows}", describe.stdout)
+            finally:
+                # Always drop the table — keep dev clean.
+                subprocess.run(
+                    ["bash", str(UTILISE), "drop", table, "--yes", "--env", "dev"],
+                    capture_output=True, text=True, cwd=ROOT,
+                )
+
+    def test_shape_tiny(self):
+        self._run_loader_and_count(2, 3, "tiny")
+
+    def test_shape_medium(self):
+        self._run_loader_and_count(10, 50, "medium")
+
+    def test_shape_skinny(self):
+        self._run_loader_and_count(1, 100, "skinny")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_csv_utilise.py b/tests/test_csv_utilise.py
new file mode 100644
index 0000000..fccbbf9
--- /dev/null
+++ b/tests/test_csv_utilise.py
@@ -0,0 +1,88 @@
+"""Tests for build/csv_utilise.sh.
+
+Argument-parsing, identifier-validation, and engine-guard paths are covered as unit tests (no DB).
+Database-backed paths (list/describe/peek/export/drop against a real schema)
+live alongside the integration tests in test_csv_loader_arbitrary_shapes.py.
+"""
+import os
+import subprocess
+import unittest
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+SCRIPT = Path(__file__).resolve().parents[1] / "build" / "csv_utilise.sh"
+
+
+def run(args, env=None):
+    """Run csv_utilise.sh with the given args; capture output."""
+    return subprocess.run(
+        ["bash", str(SCRIPT), *args],
+        env={**os.environ, **(env or {})},
+        capture_output=True,
+        text=True,
+    )
+
+
+class CsvUtiliseArgumentParsing(unittest.TestCase):
+    def test_no_args_shows_usage_and_exits_nonzero(self):
+        """No args → exit 1 with usage banner on stdout."""
+        r = run([])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("Usage:", r.stdout)
+
+    def test_unknown_command_exits_nonzero(self):
+        """Unknown subcommand → exit 1 with 'Unknown command' on stderr."""
+        r = run(["frobnicate"])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("Unknown command", r.stderr)
+
+    def test_describe_requires_table_arg(self):
+        """`describe` with no positional table → exit 1 with a helpful message."""
+        r = run(["describe"])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("requires a <table> argument", r.stderr)
+
+    def test_peek_requires_table_arg(self):
+        """`peek` with no positional table → exit 1 with a helpful message."""
+        r = run(["peek"])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("requires a <table> argument", r.stderr)
+
+    def test_export_requires_two_positional_args(self):
+        """`export` needs both <table> and <out.csv> → exit 1 otherwise."""
+        r = run(["export", "only_one"])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("requires <table> and <out.csv>", r.stderr)
+
+    def test_drop_requires_table_arg(self):
+        """`drop` with no positional table → exit 1."""
+        r = run(["drop"])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("requires a <table> argument", r.stderr)
+
+    def test_help_flag_exits_zero(self):
+        """`--help` short-circuits to a 0 exit with the usage banner."""
+        r = run(["--help"])
+        self.assertEqual(r.returncode, 0)
+        self.assertIn("Usage:", r.stdout)
+
+    def test_invalid_table_name_rejected(self):
+        """Identifier validation rejects names with spaces or punctuation."""
+        # Use --engine postgresql so we get past engine validation; the
+        # identifier check happens before any DB connection.
+        r = run(["describe", "bad name; DROP TABLE x", "--engine", "postgresql"])
+        self.assertEqual(r.returncode, 1)
+        self.assertIn("Invalid table name", r.stderr)
+
+    def test_unimplemented_engine_returns_clear_error(self):
+        """Engines other than postgresql exit 2 with a 'not implemented' message."""
+        r = run(["list", "--engine", "redis"])
+        self.assertEqual(r.returncode, 2)
+        self.assertIn("not implemented", r.stderr)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 2b0d3c363949e682c5ac73dbbd4db9816240c17e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 14:01:56 +0000
Subject: [PATCH 2/3] ci: install dev requirements; fail loudly when pytest is
 missing locally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Windows CI runner uses `python -m unittest discover` to find test files,
but every test module imports pytest at top-level for `pytestmark` markers.
Without pytest installed, all tests error at import time with
ModuleNotFoundError. `requirements-dev.txt` already pins pytest +
pytest-cov + flake8 + bandit; install it before the test step runs.

Also: scripts/test.sh and scripts/test.ps1 used to treat a missing pytest
as a silent PASS (`SKIP`), which masks real failures during local
development. Both now fail with a one-line install hint pointing at
requirements-dev.txt — consistent with what CI now does.
---
 .github/workflows/python-validator-tests.yml | 3 +++
 scripts/test.ps1                             | 5 +++--
 scripts/test.sh                              | 5 +++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-validator-tests.yml b/.github/workflows/python-validator-tests.yml
index 10fc725..4a65912 100644
--- a/.github/workflows/python-validator-tests.yml
+++ b/.github/workflows/python-validator-tests.yml
@@ -19,6 +19,9 @@ jobs:
         with:
           python-version: '3.11'
 
+      - name: Install dev dependencies
+        run: pip install -r requirements-dev.txt
+
       - name: Run validator unit tests (PowerShell)
         shell: pwsh
         run: powershell -NoProfile -ExecutionPolicy Bypass -File "tests/run_python_tests.ps1"
diff --git a/scripts/test.ps1 b/scripts/test.ps1
index e852585..f89a32e 100644
--- a/scripts/test.ps1
+++ b/scripts/test.ps1
@@ -54,8 +54,9 @@ try {
         Record -Layer 'pytest unit' -Pass $pass -Detail "exit=$LASTEXITCODE"
         Write-Host "[layer 1] $(if ($pass) {'PASS'} else {'FAIL'})" -ForegroundColor $(if ($pass) {'Green'} else {'Red'})
     } else {
-        Write-Host "[layer 1] SKIP: pytest not installed (pip install pytest)" -ForegroundColor Yellow
-        Record -Layer 'pytest unit' -Pass $true -Detail 'skipped: pytest not installed'
+        Write-Host "[layer 1] FAIL: pytest not installed" -ForegroundColor Red
+        Write-Host "    Run: pip install -r requirements-dev.txt" -ForegroundColor Yellow
+        Record -Layer 'pytest unit' -Pass $false -Detail 'pytest missing — pip install -r requirements-dev.txt'
     }
 
     # --- Layer 2: SQL test suite ---
diff --git a/scripts/test.sh b/scripts/test.sh
index 32bc68b..0dbfe87 100644
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -67,8 +67,9 @@ if command -v pytest >/dev/null 2>&1; then
         record "pytest unit" FAIL "exit=$?"
     fi
 else
-    echo "${Y}[layer 1] SKIP: pytest not installed${X}"
-    record "pytest unit" PASS "skipped"
+    echo "${R}[layer 1] FAIL: pytest not installed${X}"
+    echo "${Y}    Run: pip install -r requirements-dev.txt${X}"
+    record "pytest unit" FAIL "pytest missing — pip install -r requirements-dev.txt"
 fi
 
 # --- Layer 2: SQL suite ---

From 0af407ab024e5c779d8ee2f1a08fd4217de9f144 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 14:09:00 +0000
Subject: [PATCH 3/3] test: locate Git Bash on Windows runners
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On windows-latest GitHub Actions runners, `subprocess.run(["bash", ...])`
resolves to C:\Windows\System32\bash.exe — the WSL shim — which fails with
"Windows Subsystem for Linux has no installed distributions." Git Bash is
preinstalled at C:\Program Files\Git\bin\bash.exe; prefer it explicitly
and skip the test class only if no real bash is found.

Same helper added to test_csv_loader_arbitrary_shapes.py for consistency
(those tests already skip without Postgres, so the impact is preemptive).
---
 tests/test_csv_loader_arbitrary_shapes.py | 24 ++++++++++++++++---
 tests/test_csv_utilise.py                 | 28 ++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/tests/test_csv_loader_arbitrary_shapes.py b/tests/test_csv_loader_arbitrary_shapes.py
index ae0038f..f01f730 100644
--- a/tests/test_csv_loader_arbitrary_shapes.py
+++ b/tests/test_csv_loader_arbitrary_shapes.py
@@ -12,6 +12,7 @@
 import os
 import shutil
 import subprocess
+import sys
 import tempfile
 import unittest
 from pathlib import Path
@@ -26,6 +27,23 @@
 BUILD_DIR  = ROOT / "build"
 
 
+def _find_bash():
+    """Locate a real bash; on Windows prefer Git Bash over the WSL shim."""
+    if sys.platform == "win32":
+        for c in (r"C:\Program Files\Git\bin\bash.exe",
+                  r"C:\Program Files (x86)\Git\bin\bash.exe"):
+            if Path(c).exists():
+                return c
+        which = shutil.which("bash")
+        if which and "system32" not in which.lower():
+            return which
+        return None
+    return shutil.which("bash") or "bash"
+
+
+_BASH = _find_bash()
+
+
 def _can_connect_pg() -> bool:
     if not shutil.which("psql"):
         return False
@@ -75,7 +93,7 @@ def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None:
             _write_csv(csv_path, n_cols, n_rows)
 
             load = subprocess.run(
-                ["bash", str(LOADER), str(csv_path), "--env", "dev"],
+                [_BASH, str(LOADER), str(csv_path), "--env", "dev"],
                 capture_output=True, text=True, cwd=ROOT,
             )
             self.assertEqual(
@@ -86,7 +104,7 @@ def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None:
             try:
                 # Verify via csv_utilise.sh describe (also asserts marker columns present).
                 describe = subprocess.run(
-                    ["bash", str(UTILISE), "describe", table, "--env", "dev"],
+                    [_BASH, str(UTILISE), "describe", table, "--env", "dev"],
                     capture_output=True, text=True, cwd=ROOT,
                 )
                 self.assertEqual(describe.returncode, 0, describe.stderr)
@@ -94,7 +112,7 @@ def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None:
             finally:
                 # Always drop the table — keep dev clean.
                 subprocess.run(
-                    ["bash", str(UTILISE), "drop", table, "--yes", "--env", "dev"],
+                    [_BASH, str(UTILISE), "drop", table, "--yes", "--env", "dev"],
                     capture_output=True, text=True, cwd=ROOT,
                 )
 
diff --git a/tests/test_csv_utilise.py b/tests/test_csv_utilise.py
index fccbbf9..10869ee 100644
--- a/tests/test_csv_utilise.py
+++ b/tests/test_csv_utilise.py
@@ -5,7 +5,9 @@
 live alongside the integration tests in test_csv_loader_arbitrary_shapes.py.
 """
 import os
+import shutil
 import subprocess
+import sys
 import unittest
 from pathlib import Path
 
@@ -16,16 +18,40 @@
 SCRIPT = Path(__file__).resolve().parents[1] / "build" / "csv_utilise.sh"
 
 
+def _find_bash():
+    """Locate a real bash. On Windows, PATH-resolved `bash` is often WSL
+    (which may have no distro installed); prefer Git Bash explicitly.
+    Returns the absolute path, or None if no working bash is found."""
+    if sys.platform == "win32":
+        candidates = [
+            r"C:\Program Files\Git\bin\bash.exe",
+            r"C:\Program Files (x86)\Git\bin\bash.exe",
+        ]
+        for c in candidates:
+            if Path(c).exists():
+                return c
+        # Last resort: PATH lookup, but skip wsl shims.
+        which = shutil.which("bash")
+        if which and "system32" not in which.lower():
+            return which
+        return None
+    return shutil.which("bash") or "bash"
+
+
+_BASH = _find_bash()
+
+
 def run(args, env=None):
     """Run csv_utilise.sh with the given args; capture output."""
     return subprocess.run(
-        ["bash", str(SCRIPT), *args],
+        [_BASH, str(SCRIPT), *args],
         env={**os.environ, **(env or {})},
         capture_output=True,
         text=True,
     )
 
 
+@unittest.skipIf(_BASH is None, "No working bash found (Git Bash recommended on Windows)")
 class CsvUtiliseArgumentParsing(unittest.TestCase):
     def test_no_args_shows_usage_and_exits_nonzero(self):
         """No args → exit 1 with usage banner on stdout."""