diff --git a/.github/workflows/python-validator-tests.yml b/.github/workflows/python-validator-tests.yml
index 10fc725..4a65912 100644
--- a/.github/workflows/python-validator-tests.yml
+++ b/.github/workflows/python-validator-tests.yml
@@ -19,6 +19,9 @@ jobs:
with:
python-version: '3.11'
+ - name: Install dev dependencies
+ run: pip install -r requirements-dev.txt
+
- name: Run validator unit tests (PowerShell)
shell: pwsh
run: powershell -NoProfile -ExecutionPolicy Bypass -File "tests/run_python_tests.ps1"
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 98bc88c..c063be5 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -30,13 +30,15 @@ The three layers can break independently, so we keep them physically separate. T
|------|-----------|
| `build/te_core_schema.sql` | PostgreSQL master schema (legacy entry point) |
| `build/te_seed_data.sql` | Seed data |
-| `build/csv/` | Python CSV validator (`validator.py`) + per-engine shell loaders (`loader_*.sh`) |
+| `build/csv/` | Python CSV validator (`validator.py`), per-engine shell loaders (`loader_*.sh`), and `samples/` |
| `build/adapters/` | Per-engine deployment adapters (`adapter_postgresql.sh`, `adapter_mariadb.sh`, etc.) |
| `build/schema/` | Engine-specific DDL and seed data |
| `build/environments/` | PostgreSQL per-environment launchers (`env_dev.sql`, `env_test.sql`, etc.) |
| `build/terraform-github-repos/` | GitHub repository management as Infrastructure-as-Code |
| `build/setup.sh` | Interactive multi-database configuration wizard |
| `build/deploy_all.sh` | Multi-engine deployment router |
+| `build/csv_loader.sh` | Schema-agnostic CSV ingestion: any CSV → auto-created table |
+| `build/csv_utilise.sh` | Companion to the loader: list / describe / peek / export / drop CSV-loaded tables (PostgreSQL) |
### `tests/` — correctness coverage
diff --git a/Makefile b/Makefile
index c39f78d..2efe6c9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
.PHONY: test-free test-gate test-evals test-e2e test-all \
lint lint-diff health \
- eval-list eval-compare eval-summary select-tests
+ eval-list eval-compare eval-summary select-tests \
+ csv-load csv-list csv-demo
# ── Test tiers ────────────────────────────────────────────────────────────────
@@ -69,3 +70,27 @@ eval-summary:
# Mirrors: bun run eval:select
select-tests:
python3 scripts/select_tests.py
+
+# ── CSV loader / utiliser ─────────────────────────────────────────────────────
+
+# Load any CSV file into the target environment's database.
+# Usage: make csv-load FILE=path/to.csv [ENV=dev] [ENGINE=postgresql]
+#   FILE    required; path of the CSV handed to build/csv_loader.sh
+#   ENV     optional; defaults to dev
+#   ENGINE  optional; forwarded as --engine only when set
+# Every value is double-quoted in the recipe so the shell neither word-splits
+# nor glob-expands it.
+csv-load:
+	@if [ -z "$(strip $(FILE))" ]; then \
+		echo "Usage: make csv-load FILE=path/to.csv [ENV=dev] [ENGINE=postgresql]"; \
+		exit 1; \
+	fi
+	bash build/csv_loader.sh "$(FILE)" --env "$(or $(ENV),dev)" $(if $(ENGINE),--engine "$(ENGINE)",)
+
+# List CSV-loaded tables in the target environment.
+# Usage: make csv-list [ENV=dev]
+# Thin wrapper around `build/csv_utilise.sh list`; ENV falls back to dev when
+# unset or empty ($(or ...) picks the first non-empty argument).
+csv-list:
+ bash build/csv_utilise.sh list --env $(or $(ENV),dev)
+
+# One-shot proof: load the three sample CSVs into ENV (default: dev), then list them.
+# Usage: make csv-demo [ENV=dev]
+# Each recipe line runs in its own shell; make stops at the first line that
+# exits non-zero, so the final listing only runs after all three loads succeed.
+csv-demo:
+ bash build/csv_loader.sh build/csv/samples/customers.csv --env $(or $(ENV),dev)
+ bash build/csv_loader.sh build/csv/samples/orders.csv --env $(or $(ENV),dev)
+ bash build/csv_loader.sh build/csv/samples/inventory.csv --env $(or $(ENV),dev)
+ bash build/csv_utilise.sh list --env $(or $(ENV),dev)
diff --git a/README.md b/README.md
index d95435d..9ae0a0a 100644
--- a/README.md
+++ b/README.md
@@ -290,6 +290,30 @@ CSV inputs must have a header row, use comma delimiters, and be UTF-8 encoded wi
Supported loader backends are PostgreSQL, MariaDB/MySQL, SQLite, InfluxDB, Redis, and Teradata. PostgreSQL uses `COPY`, MariaDB/MySQL uses `LOAD DATA LOCAL INFILE`, SQLite uses Python `csv` + `sqlite3`, InfluxDB writes line protocol via the `influx` CLI, Redis writes hashes through `redis-cli`, and Teradata uses BTEQ/FastLoad tooling.
+### Load any CSV
+
+The loader is schema-agnostic — drop any CSV file in front of it and a matching table is auto-created in the target environment's database. Every CSV-loaded table is tagged with two marker columns: `_csv_row_id BIGSERIAL PRIMARY KEY` and `_loaded_at TIMESTAMPTZ`. All other columns start as `TEXT`; `ALTER TABLE` afterwards if you need stricter types.
+
+Three sample CSVs ship under `build/csv/samples/` (`customers.csv`, `orders.csv`, `inventory.csv`) — deliberately off-domain from the T&E schema to demonstrate that any shape is accepted.
+
+```bash
+# Single-command happy-path proof (loads all three samples into dev, lists them)
+make csv-demo
+
+# Load any CSV
+make csv-load FILE=path/to/anything.csv # ENV defaults to dev
+make csv-load FILE=path/to/anything.csv ENV=test ENGINE=postgresql
+
+# Use loaded data — companion script: build/csv_utilise.sh (PostgreSQL only)
+./build/csv_utilise.sh list # all CSV-loaded tables in the env
+./build/csv_utilise.sh describe customers # columns + row count
+./build/csv_utilise.sh peek orders --limit 5 # first N rows
+./build/csv_utilise.sh export inventory dump.csv # round-trip back to CSV
+./build/csv_utilise.sh drop customers --yes # remove a CSV-loaded table
+```
+
+`csv_utilise.sh` only sees tables that carry the marker columns, so it cannot accidentally touch the rigid te_core_schema tables.
+
---
## How Parameterisation Works
diff --git a/build/csv/samples/customers.csv b/build/csv/samples/customers.csv
new file mode 100644
index 0000000..423e9b0
--- /dev/null
+++ b/build/csv/samples/customers.csv
@@ -0,0 +1,6 @@
+id,name,email,signup_date
+1,Alice Nguyen,alice@example.com,2024-01-15
+2,Brian O'Connor,brian@example.com,2024-02-03
+3,Chen Wei,chen.wei@example.com,2024-03-22
+4,Diana Patel,diana.patel@example.com,2024-05-10
+5,Eduardo Silva,eduardo@example.com,2024-07-01
diff --git a/build/csv/samples/inventory.csv b/build/csv/samples/inventory.csv
new file mode 100644
index 0000000..ab325ff
--- /dev/null
+++ b/build/csv/samples/inventory.csv
@@ -0,0 +1,7 @@
+sku,description,stock,location
+SKU-001,Wireless headphones – black,42,Warehouse A
+SKU-002,USB-C charger,128,Warehouse B
+SKU-003,Notebook A5 hardcover,75,Warehouse A
+SKU-004,Mechanical keyboard,18,Warehouse C
+SKU-005,Mouse pad – large,200,Warehouse B
+SKU-006,Webcam 1080p,33,Warehouse A
diff --git a/build/csv/samples/orders.csv b/build/csv/samples/orders.csv
new file mode 100644
index 0000000..46eec37
--- /dev/null
+++ b/build/csv/samples/orders.csv
@@ -0,0 +1,9 @@
+order_id,customer_id,product,qty,price
+1001,1,"Wireless headphones, black",1,89.95
+1002,2,USB-C charger,2,19.50
+1003,1,"Notebook, A5 hardcover",3,12.00
+1004,3,Mechanical keyboard,1,145.00
+1005,4,"Mouse pad, large",1,24.99
+1006,2,Webcam 1080p,1,59.00
+1007,5,"Cable organiser, 6-pack",1,15.75
+1008,3,Desk lamp,1,42.50
diff --git a/build/csv_utilise.sh b/build/csv_utilise.sh
new file mode 100755
index 0000000..7246b0e
--- /dev/null
+++ b/build/csv_utilise.sh
@@ -0,0 +1,259 @@
+#!/usr/bin/env bash
+# =============================================================================
+# csv_utilise.sh — Utilise CSV-loaded tables
+# =============================================================================
+# Companion to csv_loader.sh. Lists, describes, peeks at, exports, or drops
+# tables that were created by the CSV loader. CSV-loaded tables are
+# identified by the marker columns the loader always adds:
+# _csv_row_id BIGSERIAL PRIMARY KEY
+# _loaded_at TIMESTAMPTZ
+#
+# Usage:
+#   ./csv_utilise.sh list                        [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh describe <table>            [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh peek     <table> [--limit N] [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh export   <table> <out.csv>  [--env ENV] [--engine ENG]
+#   ./csv_utilise.sh drop     <table> --yes      [--env ENV] [--engine ENG]
+#
+# ENG defaults to value from config (or postgresql).
+# ENV defaults to dev.
+#
+# Only the postgresql engine is implemented in this script. Other engines
+# return a clear "not implemented" message and exit 2.
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CONFIG_LOCAL="${SCRIPT_DIR}/config.local.env"
+CONFIG_DEFAULT="${SCRIPT_DIR}/config.env"
+
+GREEN=$'\033[0;32m'; RED=$'\033[0;31m'; YELLOW=$'\033[1;33m'
+CYAN=$'\033[0;36m'; BOLD=$'\033[1m'; NC=$'\033[0m'
+
+# Console logging helpers. Each prints a coloured status tag followed by all of
+# its arguments joined with spaces ($*). `echo -e` means backslash escapes in the
+# message are interpreted. Only error() writes to stderr; log, warn and info
+# write to stdout.
+log() { echo -e "${GREEN}[✓]${NC} $*"; }
+warn() { echo -e "${YELLOW}[⚠]${NC} $*"; }
+error() { echo -e "${RED}[✗]${NC} $*" >&2; }
+info() { echo -e "${CYAN}[i]${NC} $*"; }
+
+# Print the help banner to stdout. The caller chooses the exit status
+# (0 after --help, 1 after a usage error).
+usage() {
+    # Unquoted heredoc delimiter on purpose: ${BOLD}/${NC} and $(basename "$0")
+    # must be expanded inside the banner.
+    cat <<EOF
+
+${BOLD}Usage:${NC} $(basename "$0") <command> [args] [--env ENV] [--engine ENG]
+
+${BOLD}Commands:${NC}
+  list                        List CSV-loaded tables in the target schema
+  describe <table>            Show columns and row count for a table
+  peek <table> [--limit N]    Show first N rows (default 10)
+  export <table> <out.csv>    Export the table back to CSV
+  drop <table> --yes          Drop a CSV-loaded table (requires --yes)
+
+${BOLD}Options:${NC}
+  --env <env>       dev | test | staging | prod (default: dev)
+  --engine <eng>    postgresql (default: from config)
+  --limit <n>       row limit for 'peek' (default: 10)
+  --yes             confirmation flag for 'drop'
+  --help, -h        show this message
+
+${BOLD}Examples:${NC}
+  ./csv_utilise.sh list --env dev
+  ./csv_utilise.sh describe customers
+  ./csv_utilise.sh peek orders --limit 5
+  ./csv_utilise.sh export inventory /tmp/inventory_dump.csv
+  ./csv_utilise.sh drop orders --yes --env test
+
+EOF
+}
+
+# ── Parse arguments ───────────────────────────────────────────────────────────
+# Grammar: <command> [positional args] [options]. Results land in the
+# variables below; defaults are the values assigned here.
+COMMAND=""
+TABLE=""
+OUT_FILE=""
+TARGET_ENV="dev"
+ENGINE_OVERRIDE=""
+PEEK_LIMIT="10"
+CONFIRM_DROP="false"
+
+if [[ $# -eq 0 ]]; then usage; exit 1; fi
+
+# The first word selects the sub-command (or asks for help).
+case "$1" in
+    --help|-h) usage; exit 0 ;;
+    list|describe|peek|export|drop) COMMAND="$1"; shift ;;
+    *) error "Unknown command: $1"; usage; exit 1 ;;
+esac
+
+# Positional args for some commands. A word starting with "--" is treated as an
+# option, i.e. as a missing positional.
+# NOTE: tests/test_csv_utilise.py asserts on the two error messages below
+# verbatim, so their wording must not change without updating the tests.
+case "$COMMAND" in
+    describe|peek|drop)
+        if [[ $# -eq 0 || "${1:0:2}" == "--" ]]; then
+            error "Command '${COMMAND}' requires a argument."; usage; exit 1
+        fi
+        TABLE="$1"; shift
+        ;;
+    export)
+        if [[ $# -lt 2 || "${1:0:2}" == "--" || "${2:0:2}" == "--" ]]; then
+            error "Command 'export' requires and arguments."; usage; exit 1
+        fi
+        TABLE="$1"; OUT_FILE="$2"; shift 2
+        ;;
+esac
+
+# require_value OPTION REMAINING_ARGC
+# Abort with a clear message when a value-taking option is the last word.
+# Without this guard the extra `shift` below fails under `set -e` and the
+# script dies with exit 1 and no explanation.
+require_value() {
+    if [[ "$2" -lt 2 ]]; then
+        error "Option '$1' requires a value."; usage; exit 1
+    fi
+}
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --env)     require_value "$1" "$#"; TARGET_ENV="$2"; shift ;;
+        --engine)  require_value "$1" "$#"; ENGINE_OVERRIDE="$2"; shift ;;
+        # An explicitly empty value keeps the default of 10, as before.
+        --limit)   require_value "$1" "$#"; PEEK_LIMIT="${2:-10}"; shift ;;
+        --yes)     CONFIRM_DROP="true" ;;
+        --help|-h) usage; exit 0 ;;
+        *)         error "Unknown argument: $1"; usage; exit 1 ;;
+    esac
+    shift
+done
+
+# ── Sanitise table identifier (before any side effects) ──────────────────────
+# TABLE is later interpolated into SQL text, so only plain identifiers pass.
+# Commands without a table (empty TABLE) skip the check.
+TABLE_NAME_RE='^[a-zA-Z_][a-zA-Z0-9_]*$'
+if [[ -n "$TABLE" && ! "$TABLE" =~ $TABLE_NAME_RE ]]; then
+    error "Invalid table name: '${TABLE}'. Allowed: letters, digits, underscore; must not start with a digit."
+    exit 1
+fi
+
+# ── Early engine check (avoids loading config for unsupported engines) ───────
+case "$ENGINE_OVERRIDE" in
+    ""|postgresql)
+        # No override given, or the one supported engine: carry on.
+        ;;
+    *)
+        error "csv_utilise.sh: engine '${ENGINE_OVERRIDE}' is not implemented."
+        error "Only 'postgresql' is supported. Run with --engine postgresql."
+        exit 2
+        ;;
+esac
+
+# ── Load configuration ────────────────────────────────────────────────────────
+# config.local.env takes precedence; config.env is the fallback (with a warning).
+if [[ ! -f "$CONFIG_LOCAL" && ! -f "$CONFIG_DEFAULT" ]]; then
+    error "No config found. Run ./setup.sh first."
+    exit 1
+fi
+
+if [[ -f "$CONFIG_LOCAL" ]]; then
+    source "$CONFIG_LOCAL"
+else
+    source "$CONFIG_DEFAULT"
+    warn "config.local.env not found — using defaults. Run ./setup.sh to configure."
+fi
+
+# --engine beats DB_ENGINE from the config; postgresql is the last resort.
+DB_ENGINE="${ENGINE_OVERRIDE:-${DB_ENGINE:-postgresql}}"
+
+case "$DB_ENGINE" in
+    postgresql)
+        ;;
+    *)
+        error "csv_utilise.sh: engine '${DB_ENGINE}' is not implemented."
+        error "Only 'postgresql' is supported. Run with --engine postgresql."
+        exit 2
+        ;;
+esac
+
+# ── Resolve PostgreSQL connection details ────────────────────────────────────
+# TARGET_ENV becomes part of a variable name (PG_DB_<ENV> / PG_SCHEMA_<ENV>),
+# so it must be a plain identifier. Rejecting anything else up front keeps
+# user input out of variable-name construction.
+if [[ ! "$TARGET_ENV" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
+    error "Invalid environment name: '${TARGET_ENV}'. Expected e.g. dev, test, staging or prod."
+    exit 1
+fi
+
+E="${TARGET_ENV^^}"
+# Standard libpq variables win over the values from the config file.
+PG_HOST="${PGHOST:-${PG_HOST:-localhost}}"
+PG_PORT="${PGPORT:-${PG_PORT:-5432}}"
+PG_USER="${PGUSER:-${PG_SUPERUSER:-postgres}}"
+
+# Indirect expansion instead of `eval echo`: nothing is re-parsed as shell
+# code, and the `:-` default keeps `set -u` from aborting on an unset variable
+# so the friendly error below is actually reachable.
+DB_VAR="PG_DB_${E}"
+SCHEMA_VAR="PG_SCHEMA_${E}"
+DB_NAME="${!DB_VAR:-}"
+SCHEMA="${!SCHEMA_VAR:-}"
+
+if [[ -z "$DB_NAME" || -z "$SCHEMA" ]]; then
+    error "Could not resolve database / schema for env '${TARGET_ENV}'."
+    error "Check that PG_DB_${E} and PG_SCHEMA_${E} are set in config.local.env."
+    exit 1
+fi
+
+# Hand the password to psql via the environment only when one is configured.
+if [[ -n "${PG_SUPERUSER_PASSWORD:-}" ]]; then
+    export PGPASSWORD="${PG_SUPERUSER_PASSWORD}"
+fi
+
+# Base psql invocation shared by every command; ON_ERROR_STOP makes SQL errors fatal.
+PSQL=(psql -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${DB_NAME}" -v ON_ERROR_STOP=1)
+
+# ── Reachability probe (gives a friendly error when DB is down) ──────────────
+# Run a trivial query with all output discarded; any failure means exit 3.
+"${PSQL[@]}" -tA -c "SELECT 1" >/dev/null 2>&1 || {
+    error "Cannot reach PostgreSQL at ${PG_HOST}:${PG_PORT} as ${PG_USER}/${DB_NAME}."
+    error "Check the database is running and config.local.env credentials are correct."
+    exit 3
+}
+
+# ── Helper: assert table is a CSV-loaded table ───────────────────────────────
+# assert_csv_table TABLE
+# Exits the script with status 1 unless TABLE in ${SCHEMA} has both marker
+# columns (_csv_row_id and _loaded_at). A missing table fails the same check,
+# because it has zero matching columns.
+assert_csv_table() {
+    local tbl="$1"
+    local marker_count
+    # ${tbl} was validated as a plain identifier during argument parsing and
+    # ${SCHEMA} comes from the config file, so both are safe to interpolate.
+    marker_count=$("${PSQL[@]}" -tA -c "
+        SELECT COUNT(*)
+        FROM information_schema.columns
+        WHERE table_schema = '${SCHEMA}'
+          AND table_name = '${tbl}'
+          AND column_name IN ('_csv_row_id','_loaded_at');
+    ")
+    if [[ "$marker_count" != "2" ]]; then
+        error "Table '${SCHEMA}.${tbl}' is not a CSV-loaded table (missing marker columns)."
+        error "Use the original loader to create it: ./csv_loader.sh <file>.csv"
+        exit 1
+    fi
+}
+
+# ── Commands ──────────────────────────────────────────────────────────────────
+# Every command that takes a table first proves it is a CSV-loaded table, so
+# tables without the marker columns can never be read, exported or dropped.
+case "$COMMAND" in
+
+    list)
+        info "CSV-loaded tables in ${DB_NAME}.${SCHEMA}:"
+        # A table qualifies only when BOTH marker columns exist.
+        "${PSQL[@]}" -P pager=off -c "
+            SELECT t.table_name AS table,
+                   pg_size_pretty(pg_total_relation_size(format('%I.%I', t.table_schema, t.table_name)::regclass)) AS size
+            FROM information_schema.tables t
+            WHERE t.table_schema = '${SCHEMA}'
+              AND EXISTS (SELECT 1 FROM information_schema.columns c
+                          WHERE c.table_schema = t.table_schema
+                            AND c.table_name = t.table_name
+                            AND c.column_name = '_csv_row_id')
+              AND EXISTS (SELECT 1 FROM information_schema.columns c
+                          WHERE c.table_schema = t.table_schema
+                            AND c.table_name = t.table_name
+                            AND c.column_name = '_loaded_at')
+            ORDER BY t.table_name;
+        "
+        ;;
+
+    describe)
+        assert_csv_table "$TABLE"
+        info "Columns of ${SCHEMA}.${TABLE}:"
+        "${PSQL[@]}" -P pager=off -c "
+            SELECT column_name, data_type, is_nullable
+            FROM information_schema.columns
+            WHERE table_schema = '${SCHEMA}' AND table_name = '${TABLE}'
+            ORDER BY ordinal_position;
+        "
+        ROW_COUNT=$("${PSQL[@]}" -tA -c "SELECT COUNT(*) FROM \"${SCHEMA}\".\"${TABLE}\";")
+        # The integration test greps for this exact "Row count: N" line.
+        log "Row count: ${ROW_COUNT}"
+        ;;
+
+    peek)
+        assert_csv_table "$TABLE"
+        # Digits only: the value is spliced into the LIMIT clause below.
+        if [[ ! "$PEEK_LIMIT" =~ ^[0-9]+$ ]]; then
+            error "--limit must be a positive integer (got: '${PEEK_LIMIT}')."
+            exit 1
+        fi
+        info "First ${PEEK_LIMIT} row(s) of ${SCHEMA}.${TABLE}:"
+        "${PSQL[@]}" -P pager=off -c "SELECT * FROM \"${SCHEMA}\".\"${TABLE}\" ORDER BY _csv_row_id LIMIT ${PEEK_LIMIT};"
+        ;;
+
+    export)
+        assert_csv_table "$TABLE"
+        # Create the destination directory if needed before redirecting into it.
+        OUT_DIR="$(dirname "$OUT_FILE")"
+        mkdir -p "$OUT_DIR"
+        info "Exporting ${SCHEMA}.${TABLE} to ${OUT_FILE}..."
+        # Stream to STDOUT and let the shell write the file. The path never
+        # enters the \copy command text, so quotes or spaces in OUT_FILE cannot
+        # break (or inject into) the meta-command.
+        "${PSQL[@]}" -c "\\copy (SELECT * FROM \"${SCHEMA}\".\"${TABLE}\" ORDER BY _csv_row_id) TO STDOUT WITH (FORMAT CSV, HEADER TRUE)" > "$OUT_FILE"
+        log "Export complete: ${OUT_FILE}"
+        ;;
+
+    drop)
+        assert_csv_table "$TABLE"
+        # Destructive: insist on the explicit --yes flag.
+        if [[ "$CONFIRM_DROP" != "true" ]]; then
+            error "Refusing to drop '${SCHEMA}.${TABLE}' without --yes."
+            exit 1
+        fi
+        warn "Dropping ${SCHEMA}.${TABLE}..."
+        "${PSQL[@]}" -c "DROP TABLE \"${SCHEMA}\".\"${TABLE}\";"
+        log "Dropped ${SCHEMA}.${TABLE}"
+        ;;
+esac
+
+unset PGPASSWORD
+exit 0
diff --git a/scripts/test.ps1 b/scripts/test.ps1
index e852585..f89a32e 100644
--- a/scripts/test.ps1
+++ b/scripts/test.ps1
@@ -54,8 +54,9 @@ try {
Record -Layer 'pytest unit' -Pass $pass -Detail "exit=$LASTEXITCODE"
Write-Host "[layer 1] $(if ($pass) {'PASS'} else {'FAIL'})" -ForegroundColor $(if ($pass) {'Green'} else {'Red'})
} else {
- Write-Host "[layer 1] SKIP: pytest not installed (pip install pytest)" -ForegroundColor Yellow
- Record -Layer 'pytest unit' -Pass $true -Detail 'skipped: pytest not installed'
+ Write-Host "[layer 1] FAIL: pytest not installed" -ForegroundColor Red
+ Write-Host " Run: pip install -r requirements-dev.txt" -ForegroundColor Yellow
+ Record -Layer 'pytest unit' -Pass $false -Detail 'pytest missing — pip install -r requirements-dev.txt'
}
# --- Layer 2: SQL test suite ---
diff --git a/scripts/test.sh b/scripts/test.sh
index 32bc68b..0dbfe87 100644
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -67,8 +67,9 @@ if command -v pytest >/dev/null 2>&1; then
record "pytest unit" FAIL "exit=$?"
fi
else
- echo "${Y}[layer 1] SKIP: pytest not installed${X}"
- record "pytest unit" PASS "skipped"
+ echo "${R}[layer 1] FAIL: pytest not installed${X}"
+ echo "${Y} Run: pip install -r requirements-dev.txt${X}"
+ record "pytest unit" FAIL "pytest missing — pip install -r requirements-dev.txt"
fi
# --- Layer 2: SQL suite ---
diff --git a/tests/test_csv_loader_arbitrary_shapes.py b/tests/test_csv_loader_arbitrary_shapes.py
new file mode 100644
index 0000000..f01f730
--- /dev/null
+++ b/tests/test_csv_loader_arbitrary_shapes.py
@@ -0,0 +1,130 @@
+"""End-to-end regression for the 'any CSV file' guarantee.
+
+Generates CSV files of varying shapes (column count × row count), runs them
+through build/csv_loader.sh into a live PostgreSQL instance, and asserts the
+auto-created table contains the right number of rows. Complements the
+offline Tier P scenarios in evals/datasets/tier_p/ — those exercise the
+validator only; these exercise the full validator → loader → DB pipeline.
+
+Skipped automatically when PostgreSQL is not reachable.
+"""
+import csv
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.integration
+
+ROOT = Path(__file__).resolve().parents[1]
+LOADER = ROOT / "build" / "csv_loader.sh"
+UTILISE = ROOT / "build" / "csv_utilise.sh"
+BUILD_DIR = ROOT / "build"
+
+
+def _find_bash():
+    """Return the bash executable to use, or None when none is usable.
+
+    Off Windows this is whatever is on PATH (falling back to the bare name
+    "bash"). On Windows the Git Bash install locations are tried first, then
+    PATH, ignoring any hit under system32 (the WSL shim).
+    """
+    if sys.platform != "win32":
+        return shutil.which("bash") or "bash"
+
+    git_bash_locations = (
+        r"C:\Program Files\Git\bin\bash.exe",
+        r"C:\Program Files (x86)\Git\bin\bash.exe",
+    )
+    for candidate in git_bash_locations:
+        if Path(candidate).exists():
+            return candidate
+
+    on_path = shutil.which("bash")
+    if on_path and "system32" not in on_path.lower():
+        return on_path
+    return None
+
+
+_BASH = _find_bash()
+
+
+def _can_connect_pg() -> bool:
+    """Report whether `psql -tA -c "SELECT 1"` succeeds within five seconds.
+
+    PGUSER defaults to "postgres" for the probe when the environment does not
+    set it. A missing psql binary or a timeout counts as "not reachable".
+    """
+    if not shutil.which("psql"):
+        return False
+
+    probe_env = dict(os.environ)
+    probe_env.setdefault("PGUSER", "postgres")
+    try:
+        probe = subprocess.run(
+            ["psql", "-tA", "-c", "SELECT 1"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            env=probe_env,
+        )
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        return False
+    return probe.returncode == 0
+
+
+# Both flags are evaluated once, at import time, and gate the test class below.
+_PG_AVAILABLE = _can_connect_pg()
+# True when build/config.local.env exists; the skip message points at
+# ./build/setup.sh when it does not.
+_CONFIG_PRESENT = (BUILD_DIR / "config.local.env").exists()
+# Only the first unmet condition is reported: an unreachable database masks a
+# missing config file.
+_SKIP_REASON = (
+ "PostgreSQL not reachable" if not _PG_AVAILABLE
+ else "build/config.local.env not present — run ./build/setup.sh"
+)
+
+
+def _write_csv(path: Path, n_cols: int, n_rows: int) -> None:
+    """Write a synthetic UTF-8 CSV to `path`.
+
+    The header is col_0 .. col_{n_cols-1}; data cell (r, c) holds "r{r}c{c}".
+    """
+    column_ids = range(n_cols)
+    table = [[f"col_{c}" for c in column_ids]]
+    table.extend([f"r{r}c{c}" for c in column_ids] for r in range(n_rows))
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        csv.writer(handle).writerows(table)
+
+
+@unittest.skipIf(_BASH is None, "No working bash found (Git Bash recommended on Windows)")
+@unittest.skipUnless(_PG_AVAILABLE and _CONFIG_PRESENT, _SKIP_REASON)
+class CsvLoaderArbitraryShapes(unittest.TestCase):
+    """Loads CSVs of varying shape into Postgres and verifies row counts.
+
+    Skipped when no usable bash exists, when PostgreSQL is unreachable, or when
+    build/config.local.env is missing.
+    """
+
+    # (label, n_cols, n_rows): single source of truth for the test_shape_* methods.
+    SHAPES = [
+        ("tiny", 2, 3),
+        ("medium", 10, 50),
+        ("skinny", 1, 100),
+    ]
+
+    def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None:
+        """Load one generated CSV into dev, check its row count, then drop it.
+
+        The CSV file is named after the table; presumably csv_loader.sh derives
+        the table name from the file stem (confirm against the loader).
+        """
+        with tempfile.TemporaryDirectory() as tmp:
+            # Use a unique table name so parallel runs don't collide.
+            table = f"arb_{label}_{os.getpid()}"
+            csv_path = Path(tmp) / f"{table}.csv"
+            _write_csv(csv_path, n_cols, n_rows)
+
+            try:
+                # The load sits inside the try so that a loader which creates
+                # the table and then fails still gets cleaned up below.
+                load = subprocess.run(
+                    [_BASH, str(LOADER), str(csv_path), "--env", "dev"],
+                    capture_output=True, text=True, cwd=ROOT,
+                )
+                self.assertEqual(
+                    load.returncode, 0,
+                    f"Loader failed for {label}: stdout={load.stdout[-500:]} stderr={load.stderr[-500:]}",
+                )
+
+                # Verify via csv_utilise.sh describe (also asserts marker columns present).
+                describe = subprocess.run(
+                    [_BASH, str(UTILISE), "describe", table, "--env", "dev"],
+                    capture_output=True, text=True, cwd=ROOT,
+                )
+                self.assertEqual(describe.returncode, 0, describe.stderr)
+                self.assertIn(f"Row count: {n_rows}", describe.stdout)
+            finally:
+                # Always drop the table to keep dev clean. Best effort: the
+                # result is ignored because the table may not exist.
+                subprocess.run(
+                    [_BASH, str(UTILISE), "drop", table, "--yes", "--env", "dev"],
+                    capture_output=True, text=True, cwd=ROOT,
+                )
+
+    def _run_shape(self, label: str) -> None:
+        """Look `label` up in SHAPES and run the load/verify cycle for it."""
+        matches = [shape for shape in self.SHAPES if shape[0] == label]
+        self.assertEqual(len(matches), 1, f"SHAPES must define exactly one '{label}' entry")
+        _, n_cols, n_rows = matches[0]
+        self._run_loader_and_count(n_cols, n_rows, label)
+
+    def test_shape_tiny(self):
+        self._run_shape("tiny")
+
+    def test_shape_medium(self):
+        self._run_shape("medium")
+
+    def test_shape_skinny(self):
+        self._run_shape("skinny")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_csv_utilise.py b/tests/test_csv_utilise.py
new file mode 100644
index 0000000..10869ee
--- /dev/null
+++ b/tests/test_csv_utilise.py
@@ -0,0 +1,114 @@
+"""Tests for build/csv_utilise.sh.
+
+Argument-parsing and reachability paths are covered as unit tests (no DB).
+Database-backed paths (list/describe/peek/export/drop against a real schema)
+live alongside the integration tests in test_csv_loader_arbitrary_shapes.py.
+"""
+import os
+import shutil
+import subprocess
+import sys
+import unittest
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+SCRIPT = Path(__file__).resolve().parents[1] / "build" / "csv_utilise.sh"
+
+
+def _find_bash():
+    """Find a bash that can actually run scripts.
+
+    On Windows a PATH-resolved `bash` is often the WSL shim (possibly with no
+    distro installed), so the Git Bash install locations are preferred and any
+    PATH hit under system32 is ignored. Elsewhere PATH is trusted, falling
+    back to the bare name "bash".
+
+    Returns the path to use, or None if no working bash is found.
+    """
+    if sys.platform != "win32":
+        return shutil.which("bash") or "bash"
+
+    git_bash = next(
+        (
+            location
+            for location in (
+                r"C:\Program Files\Git\bin\bash.exe",
+                r"C:\Program Files (x86)\Git\bin\bash.exe",
+            )
+            if Path(location).exists()
+        ),
+        None,
+    )
+    if git_bash is not None:
+        return git_bash
+
+    # Last resort: PATH lookup, but skip wsl shims.
+    path_hit = shutil.which("bash")
+    if path_hit and "system32" not in path_hit.lower():
+        return path_hit
+    return None
+
+
+_BASH = _find_bash()
+
+
+def run(args, env=None):
+    """Invoke csv_utilise.sh through bash and return the CompletedProcess.
+
+    `args` are appended after the script path. `env` entries, if given, are
+    layered over the current process environment. stdout and stderr are
+    captured as text.
+    """
+    child_env = dict(os.environ)
+    child_env.update(env or {})
+    command = [_BASH, str(SCRIPT)] + list(args)
+    return subprocess.run(command, env=child_env, capture_output=True, text=True)
+
+
+@unittest.skipIf(_BASH is None, "No working bash found (Git Bash recommended on Windows)")
+class CsvUtiliseArgumentParsing(unittest.TestCase):
+ """Exit codes and messages of csv_utilise.sh argument handling.
+
+ The cases below expect the script to exit before it reaches the database.
+ """
+
+ # NOTE(review): several assertions match the script's error text verbatim.
+ # Substrings such as "requires a argument" read oddly; presumably a
+ # <table> placeholder was lost. Confirm against build/csv_utilise.sh and
+ # change script and tests together.
+ def test_no_args_shows_usage_and_exits_nonzero(self):
+ """No args → exit 1 with usage banner on stdout."""
+ r = run([])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("Usage:", r.stdout)
+
+ def test_unknown_command_exits_nonzero(self):
+ """Unknown subcommand → exit 1 with 'Unknown command' on stderr."""
+ r = run(["frobnicate"])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("Unknown command", r.stderr)
+
+ def test_describe_requires_table_arg(self):
+ """`describe` with no positional table → exit 1 with a helpful message."""
+ r = run(["describe"])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("requires a argument", r.stderr)
+
+ def test_peek_requires_table_arg(self):
+ """`peek` with no positional table → exit 1 with a helpful message."""
+ r = run(["peek"])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("requires a argument", r.stderr)
+
+ def test_export_requires_two_positional_args(self):
+ """`export` needs both <table> and <out.csv> → exit 1 otherwise."""
+ r = run(["export", "only_one"])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("requires and ", r.stderr)
+
+ def test_drop_requires_table_arg(self):
+ """`drop` with no positional table → exit 1."""
+ r = run(["drop"])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("requires a argument", r.stderr)
+
+ def test_help_flag_exits_zero(self):
+ """`--help` short-circuits to a 0 exit with the usage banner."""
+ r = run(["--help"])
+ self.assertEqual(r.returncode, 0)
+ self.assertIn("Usage:", r.stdout)
+
+ def test_invalid_table_name_rejected(self):
+ """Identifier validation rejects names with spaces or punctuation."""
+ # Use --engine postgresql so we get past engine validation; the
+ # identifier check happens before any DB connection.
+ r = run(["describe", "bad name; DROP TABLE x", "--engine", "postgresql"])
+ self.assertEqual(r.returncode, 1)
+ self.assertIn("Invalid table name", r.stderr)
+
+ def test_unimplemented_engine_returns_clear_error(self):
+ """Engines other than postgresql exit 2 with a 'not implemented' message."""
+ r = run(["list", "--engine", "redis"])
+ self.assertEqual(r.returncode, 2)
+ self.assertIn("not implemented", r.stderr)
+
+
+if __name__ == "__main__":
+ unittest.main()