From 37b2a71c11be8ce213cf24ecf16863123cbed4e2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Jun 2026 09:29:25 +0000 Subject: [PATCH 1/3] Add csv_utilise.sh, sample CSVs, and arbitrary-shape tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing csv_loader.sh already accepts any CSV file generically, but nothing in the repo helps a user query, list, or export what they loaded — they had to know psql and the auto-created schema. This change closes that gap and proves the path end-to-end: - build/csv_utilise.sh: list / describe / peek / export / drop subcommands scoped to tables carrying the loader's _csv_row_id + _loaded_at markers, so the te_core_schema tables can never be touched by accident - build/csv/samples/: three off-domain CSVs (customers, orders, inventory) so the loader has something to demo against - Makefile: csv-load, csv-list, csv-demo targets - tests/test_csv_utilise.py: 9 unit tests for arg parsing and engine guard - tests/test_csv_loader_arbitrary_shapes.py: parameterised regression (2x3, 10x50, 1x100) covering CSV -> validator -> Postgres -> query, skipped cleanly when PostgreSQL is unreachable - README.md / ARCHITECTURE.md: 'Load any CSV' subsection and new build/ rows Co-Authored-By: Claude Opus 4.7 Claude-Session: https://claude.ai/code/session_01VBrxqChRJtxdvSpFhiUWUy --- ARCHITECTURE.md | 4 +- Makefile | 27 ++- README.md | 24 ++ build/csv/samples/customers.csv | 6 + build/csv/samples/inventory.csv | 7 + build/csv/samples/orders.csv | 9 + build/csv_utilise.sh | 259 ++++++++++++++++++++++ tests/test_csv_loader_arbitrary_shapes.py | 112 ++++++++++ tests/test_csv_utilise.py | 88 ++++++++ 9 files changed, 534 insertions(+), 2 deletions(-) create mode 100644 build/csv/samples/customers.csv create mode 100644 build/csv/samples/inventory.csv create mode 100644 build/csv/samples/orders.csv create mode 100755 build/csv_utilise.sh create mode 100644 tests/test_csv_loader_arbitrary_shapes.py 
create mode 100644 tests/test_csv_utilise.py diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 98bc88c..c063be5 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -30,13 +30,15 @@ The three layers can break independently, so we keep them physically separate. T |------|-----------| | `build/te_core_schema.sql` | PostgreSQL master schema (legacy entry point) | | `build/te_seed_data.sql` | Seed data | -| `build/csv/` | Python CSV validator (`validator.py`) + per-engine shell loaders (`loader_*.sh`) | +| `build/csv/` | Python CSV validator (`validator.py`), per-engine shell loaders (`loader_*.sh`), and `samples/` | | `build/adapters/` | Per-engine deployment adapters (`adapter_postgresql.sh`, `adapter_mariadb.sh`, etc.) | | `build/schema/` | Engine-specific DDL and seed data | | `build/environments/` | PostgreSQL per-environment launchers (`env_dev.sql`, `env_test.sql`, etc.) | | `build/terraform-github-repos/` | GitHub repository management as Infrastructure-as-Code | | `build/setup.sh` | Interactive multi-database configuration wizard | | `build/deploy_all.sh` | Multi-engine deployment router | +| `build/csv_loader.sh` | Schema-agnostic CSV ingestion: any CSV → auto-created table | +| `build/csv_utilise.sh` | Companion to the loader: list / describe / peek / export / drop CSV-loaded tables (PostgreSQL) | ### `tests/` — correctness coverage diff --git a/Makefile b/Makefile index c39f78d..2efe6c9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ .PHONY: test-free test-gate test-evals test-e2e test-all \ lint lint-diff health \ - eval-list eval-compare eval-summary select-tests + eval-list eval-compare eval-summary select-tests \ + csv-load csv-list csv-demo # ── Test tiers ──────────────────────────────────────────────────────────────── @@ -69,3 +70,27 @@ eval-summary: # Mirrors: bun run eval:select select-tests: python3 scripts/select_tests.py + +# ── CSV loader / utiliser ───────────────────────────────────────────────────── + +# Load any CSV file into the 
target environment's database. +# Usage: make csv-load FILE=path/to.csv [ENV=dev] [ENGINE=postgresql] +csv-load: + @if [ -z "$(FILE)" ]; then \ + echo "Usage: make csv-load FILE=path/to.csv [ENV=dev] [ENGINE=postgresql]"; \ + exit 1; \ + fi + bash build/csv_loader.sh "$(FILE)" --env $(or $(ENV),dev) $(if $(ENGINE),--engine $(ENGINE),) + +# List CSV-loaded tables in the target environment. +# Usage: make csv-list [ENV=dev] +csv-list: + bash build/csv_utilise.sh list --env $(or $(ENV),dev) + +# One-shot proof: load the three sample CSVs into dev, then list them. +# Usage: make csv-demo [ENV=dev] +csv-demo: + bash build/csv_loader.sh build/csv/samples/customers.csv --env $(or $(ENV),dev) + bash build/csv_loader.sh build/csv/samples/orders.csv --env $(or $(ENV),dev) + bash build/csv_loader.sh build/csv/samples/inventory.csv --env $(or $(ENV),dev) + bash build/csv_utilise.sh list --env $(or $(ENV),dev) diff --git a/README.md b/README.md index d95435d..9ae0a0a 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,30 @@ CSV inputs must have a header row, use comma delimiters, and be UTF-8 encoded wi Supported loader backends are PostgreSQL, MariaDB/MySQL, SQLite, InfluxDB, Redis, and Teradata. PostgreSQL uses `COPY`, MariaDB/MySQL uses `LOAD DATA LOCAL INFILE`, SQLite uses Python `csv` + `sqlite3`, InfluxDB writes line protocol via the `influx` CLI, Redis writes hashes through `redis-cli`, and Teradata uses BTEQ/FastLoad tooling. +### Load any CSV + +The loader is schema-agnostic — drop any CSV file in front of it and a matching table is auto-created in the target environment's database. Every CSV-loaded table is tagged with two marker columns: `_csv_row_id BIGSERIAL PRIMARY KEY` and `_loaded_at TIMESTAMPTZ`. All other columns start as `TEXT`; `ALTER TABLE` afterwards if you need stricter types. 
+ +Three sample CSVs ship under `build/csv/samples/` (`customers.csv`, `orders.csv`, `inventory.csv`) — deliberately off-domain from the T&E schema to demonstrate that any shape is accepted. + +```bash +# Single-command happy-path proof (loads all three samples into dev, lists them) +make csv-demo + +# Load any CSV +make csv-load FILE=path/to/anything.csv # ENV defaults to dev +make csv-load FILE=path/to/anything.csv ENV=test ENGINE=postgresql + +# Use loaded data — companion script: build/csv_utilise.sh (PostgreSQL only) +./build/csv_utilise.sh list # all CSV-loaded tables in the env +./build/csv_utilise.sh describe customers # columns + row count +./build/csv_utilise.sh peek orders --limit 5 # first N rows +./build/csv_utilise.sh export inventory dump.csv # round-trip back to CSV +./build/csv_utilise.sh drop customers --yes # remove a CSV-loaded table +``` + +`csv_utilise.sh` only sees tables that carry the marker columns, so it cannot accidentally touch the rigid te_core_schema tables. 
+ --- ## How Parameterisation Works diff --git a/build/csv/samples/customers.csv b/build/csv/samples/customers.csv new file mode 100644 index 0000000..423e9b0 --- /dev/null +++ b/build/csv/samples/customers.csv @@ -0,0 +1,6 @@ +id,name,email,signup_date +1,Alice Nguyen,alice@example.com,2024-01-15 +2,Brian O'Connor,brian@example.com,2024-02-03 +3,Chen Wei,chen.wei@example.com,2024-03-22 +4,Diana Patel,diana.patel@example.com,2024-05-10 +5,Eduardo Silva,eduardo@example.com,2024-07-01 diff --git a/build/csv/samples/inventory.csv b/build/csv/samples/inventory.csv new file mode 100644 index 0000000..ab325ff --- /dev/null +++ b/build/csv/samples/inventory.csv @@ -0,0 +1,7 @@ +sku,description,stock,location +SKU-001,Wireless headphones – black,42,Warehouse A +SKU-002,USB-C charger,128,Warehouse B +SKU-003,Notebook A5 hardcover,75,Warehouse A +SKU-004,Mechanical keyboard,18,Warehouse C +SKU-005,Mouse pad – large,200,Warehouse B +SKU-006,Webcam 1080p,33,Warehouse A diff --git a/build/csv/samples/orders.csv b/build/csv/samples/orders.csv new file mode 100644 index 0000000..46eec37 --- /dev/null +++ b/build/csv/samples/orders.csv @@ -0,0 +1,9 @@ +order_id,customer_id,product,qty,price +1001,1,"Wireless headphones, black",1,89.95 +1002,2,USB-C charger,2,19.50 +1003,1,"Notebook, A5 hardcover",3,12.00 +1004,3,Mechanical keyboard,1,145.00 +1005,4,"Mouse pad, large",1,24.99 +1006,2,Webcam 1080p,1,59.00 +1007,5,"Cable organiser, 6-pack",1,15.75 +1008,3,Desk lamp,1,42.50 diff --git a/build/csv_utilise.sh b/build/csv_utilise.sh new file mode 100755 index 0000000..7246b0e --- /dev/null +++ b/build/csv_utilise.sh @@ -0,0 +1,259 @@ +#!/usr/bin/env bash +# ============================================================================= +# csv_utilise.sh — Utilise CSV-loaded tables +# ============================================================================= +# Companion to csv_loader.sh. Lists, describes, peeks at, exports, or drops +# tables that were created by the CSV loader. 
CSV-loaded tables are +# identified by the marker columns the loader always adds: +# _csv_row_id BIGSERIAL PRIMARY KEY +# _loaded_at TIMESTAMPTZ +# +# Usage: +# ./csv_utilise.sh list [--env ENV] [--engine ENG] +# ./csv_utilise.sh describe <table> [--env ENV] [--engine ENG] +# ./csv_utilise.sh peek
<table> [--limit N] [--env ENV] [--engine ENG] +# ./csv_utilise.sh export
<table> <out_file> [--env ENV] [--engine ENG] +# ./csv_utilise.sh drop <table>
--yes [--env ENV] [--engine ENG] +# +# ENG defaults to value from config (or postgresql). +# ENV defaults to dev. +# +# Only the postgresql engine is implemented in this script. Other engines +# return a clear "not implemented" message and exit 2. +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CONFIG_LOCAL="${SCRIPT_DIR}/config.local.env" +CONFIG_DEFAULT="${SCRIPT_DIR}/config.env" + +GREEN=$'\033[0;32m'; RED=$'\033[0;31m'; YELLOW=$'\033[1;33m' +CYAN=$'\033[0;36m'; BOLD=$'\033[1m'; NC=$'\033[0m' + +log() { echo -e "${GREEN}[✓]${NC} $*"; } +warn() { echo -e "${YELLOW}[⚠]${NC} $*"; } +error() { echo -e "${RED}[✗]${NC} $*" >&2; } +info() { echo -e "${CYAN}[i]${NC} $*"; } + +usage() { + cat < [args] [--env ENV] [--engine ENG] + +${BOLD}Commands:${NC} + list List CSV-loaded tables in the target schema + describe
Show columns and row count for a table + peek
[--limit N] Show first N rows (default 10) + export
Export the table back to CSV + drop
--yes Drop a CSV-loaded table (requires --yes) + +${BOLD}Options:${NC} + --env dev | test | staging | prod (default: dev) + --engine postgresql (default: from config) + --limit row limit for 'peek' (default: 10) + --yes confirmation flag for 'drop' + --help, -h show this message + +${BOLD}Examples:${NC} + ./csv_utilise.sh list --env dev + ./csv_utilise.sh describe customers + ./csv_utilise.sh peek orders --limit 5 + ./csv_utilise.sh export inventory /tmp/inventory_dump.csv + ./csv_utilise.sh drop orders --yes --env test + +EOF +} + +# ── Parse arguments ─────────────────────────────────────────────────────────── +COMMAND="" +TABLE="" +OUT_FILE="" +TARGET_ENV="dev" +ENGINE_OVERRIDE="" +PEEK_LIMIT="10" +CONFIRM_DROP="false" + +if [[ $# -eq 0 ]]; then usage; exit 1; fi + +case "$1" in + --help|-h) usage; exit 0 ;; + list|describe|peek|export|drop) COMMAND="$1"; shift ;; + *) error "Unknown command: $1"; usage; exit 1 ;; +esac + +# Positional args for some commands +case "$COMMAND" in + describe|peek|drop) + if [[ $# -eq 0 || "${1:0:2}" == "--" ]]; then + error "Command '${COMMAND}' requires a
argument."; usage; exit 1 + fi + TABLE="$1"; shift + ;; + export) + if [[ $# -lt 2 || "${1:0:2}" == "--" || "${2:0:2}" == "--" ]]; then + error "Command 'export' requires
and arguments."; usage; exit 1 + fi + TABLE="$1"; OUT_FILE="$2"; shift 2 + ;; +esac + +while [[ $# -gt 0 ]]; do + case "$1" in + --env) shift; TARGET_ENV="${1:-}" ;; + --engine) shift; ENGINE_OVERRIDE="${1:-}" ;; + --limit) shift; PEEK_LIMIT="${1:-10}" ;; + --yes) CONFIRM_DROP="true" ;; + --help|-h) usage; exit 0 ;; + *) error "Unknown argument: $1"; usage; exit 1 ;; + esac + shift +done + +# ── Sanitise table identifier (before any side effects) ────────────────────── +if [[ -n "$TABLE" ]]; then + if [[ ! "$TABLE" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then + error "Invalid table name: '${TABLE}'. Allowed: letters, digits, underscore; must not start with a digit." + exit 1 + fi +fi + +# ── Early engine check (avoids loading config for unsupported engines) ─────── +if [[ -n "$ENGINE_OVERRIDE" && "$ENGINE_OVERRIDE" != "postgresql" ]]; then + error "csv_utilise.sh: engine '${ENGINE_OVERRIDE}' is not implemented." + error "Only 'postgresql' is supported. Run with --engine postgresql." + exit 2 +fi + +# ── Load configuration ──────────────────────────────────────────────────────── +if [[ -f "$CONFIG_LOCAL" ]]; then + source "$CONFIG_LOCAL" +elif [[ -f "$CONFIG_DEFAULT" ]]; then + source "$CONFIG_DEFAULT" + warn "config.local.env not found — using defaults. Run ./setup.sh to configure." +else + error "No config found. Run ./setup.sh first." + exit 1 +fi + +DB_ENGINE="${ENGINE_OVERRIDE:-${DB_ENGINE:-postgresql}}" + +if [[ "$DB_ENGINE" != "postgresql" ]]; then + error "csv_utilise.sh: engine '${DB_ENGINE}' is not implemented." + error "Only 'postgresql' is supported. Run with --engine postgresql." 
+ exit 2 +fi + +# ── Resolve PostgreSQL connection details ──────────────────────────────────── +E="${TARGET_ENV^^}" +PG_HOST="${PGHOST:-${PG_HOST:-localhost}}" +PG_PORT="${PGPORT:-${PG_PORT:-5432}}" +PG_USER="${PGUSER:-${PG_SUPERUSER:-postgres}}" +DB_NAME="$(eval echo "\$PG_DB_${E}")" +SCHEMA="$(eval echo "\$PG_SCHEMA_${E}")" + +if [[ -z "$DB_NAME" || -z "$SCHEMA" ]]; then + error "Could not resolve database / schema for env '${TARGET_ENV}'." + error "Check that PG_DB_${E} and PG_SCHEMA_${E} are set in config.local.env." + exit 1 +fi + +[[ -n "${PG_SUPERUSER_PASSWORD:-}" ]] && export PGPASSWORD="${PG_SUPERUSER_PASSWORD}" + +PSQL=(psql -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${DB_NAME}" -v ON_ERROR_STOP=1) + +# ── Reachability probe (gives a friendly error when DB is down) ────────────── +if ! "${PSQL[@]}" -tA -c "SELECT 1" >/dev/null 2>&1; then + error "Cannot reach PostgreSQL at ${PG_HOST}:${PG_PORT} as ${PG_USER}/${DB_NAME}." + error "Check the database is running and config.local.env credentials are correct." + exit 3 +fi + +# ── Helper: assert table is a CSV-loaded table ─────────────────────────────── +assert_csv_table() { + local tbl="$1" + local cnt + cnt=$("${PSQL[@]}" -tA -c " + SELECT COUNT(*) + FROM information_schema.columns + WHERE table_schema = '${SCHEMA}' + AND table_name = '${tbl}' + AND column_name IN ('_csv_row_id','_loaded_at'); + ") + if [[ "$cnt" != "2" ]]; then + error "Table '${SCHEMA}.${tbl}' is not a CSV-loaded table (missing marker columns)." 
+ error "Use the original loader to create it: ./csv_loader.sh .csv" + exit 1 + fi +} + +# ── Commands ────────────────────────────────────────────────────────────────── +case "$COMMAND" in + + list) + info "CSV-loaded tables in ${DB_NAME}.${SCHEMA}:" + "${PSQL[@]}" -P pager=off -c " + SELECT t.table_name AS table, + pg_size_pretty(pg_total_relation_size(format('%I.%I', t.table_schema, t.table_name)::regclass)) AS size + FROM information_schema.tables t + WHERE t.table_schema = '${SCHEMA}' + AND EXISTS (SELECT 1 FROM information_schema.columns c + WHERE c.table_schema = t.table_schema + AND c.table_name = t.table_name + AND c.column_name = '_csv_row_id') + AND EXISTS (SELECT 1 FROM information_schema.columns c + WHERE c.table_schema = t.table_schema + AND c.table_name = t.table_name + AND c.column_name = '_loaded_at') + ORDER BY t.table_name; + " + ;; + + describe) + assert_csv_table "$TABLE" + info "Columns of ${SCHEMA}.${TABLE}:" + "${PSQL[@]}" -P pager=off -c " + SELECT column_name, data_type, is_nullable + FROM information_schema.columns + WHERE table_schema = '${SCHEMA}' AND table_name = '${TABLE}' + ORDER BY ordinal_position; + " + ROW_COUNT=$("${PSQL[@]}" -tA -c "SELECT COUNT(*) FROM \"${SCHEMA}\".\"${TABLE}\";") + log "Row count: ${ROW_COUNT}" + ;; + + peek) + assert_csv_table "$TABLE" + if [[ ! "$PEEK_LIMIT" =~ ^[0-9]+$ ]]; then + error "--limit must be a positive integer (got: '${PEEK_LIMIT}')." + exit 1 + fi + info "First ${PEEK_LIMIT} row(s) of ${SCHEMA}.${TABLE}:" + "${PSQL[@]}" -P pager=off -c "SELECT * FROM \"${SCHEMA}\".\"${TABLE}\" ORDER BY _csv_row_id LIMIT ${PEEK_LIMIT};" + ;; + + export) + assert_csv_table "$TABLE" + # Validate output path is writable + OUT_DIR="$(dirname "$OUT_FILE")" + mkdir -p "$OUT_DIR" + info "Exporting ${SCHEMA}.${TABLE} to ${OUT_FILE}..." 
+ "${PSQL[@]}" -c "\\COPY (SELECT * FROM \"${SCHEMA}\".\"${TABLE}\" ORDER BY _csv_row_id) TO '${OUT_FILE}' WITH (FORMAT CSV, HEADER TRUE)" + log "Export complete: ${OUT_FILE}" + ;; + + drop) + assert_csv_table "$TABLE" + if [[ "$CONFIRM_DROP" != "true" ]]; then + error "Refusing to drop '${SCHEMA}.${TABLE}' without --yes." + exit 1 + fi + warn "Dropping ${SCHEMA}.${TABLE}..." + "${PSQL[@]}" -c "DROP TABLE \"${SCHEMA}\".\"${TABLE}\";" + log "Dropped ${SCHEMA}.${TABLE}" + ;; +esac + +unset PGPASSWORD +exit 0 diff --git a/tests/test_csv_loader_arbitrary_shapes.py b/tests/test_csv_loader_arbitrary_shapes.py new file mode 100644 index 0000000..ae0038f --- /dev/null +++ b/tests/test_csv_loader_arbitrary_shapes.py @@ -0,0 +1,112 @@ +"""End-to-end regression for the 'any CSV file' guarantee. + +Generates CSV files of varying shapes (column count × row count), runs them +through build/csv_loader.sh into a live PostgreSQL instance, and asserts the +auto-created table contains the right number of rows. Complements the +offline Tier P scenarios in evals/datasets/tier_p/ — those exercise the +validator only; these exercise the full validator → loader → DB pipeline. + +Skipped automatically when PostgreSQL is not reachable. 
+""" +import csv +import os +import shutil +import subprocess +import tempfile +import unittest +from pathlib import Path + +import pytest + +pytestmark = pytest.mark.integration + +ROOT = Path(__file__).resolve().parents[1] +LOADER = ROOT / "build" / "csv_loader.sh" +UTILISE = ROOT / "build" / "csv_utilise.sh" +BUILD_DIR = ROOT / "build" + + +def _can_connect_pg() -> bool: + if not shutil.which("psql"): + return False + try: + r = subprocess.run( + ["psql", "-tA", "-c", "SELECT 1"], + capture_output=True, text=True, timeout=5, + env={**os.environ, "PGUSER": os.environ.get("PGUSER", "postgres")}, + ) + return r.returncode == 0 + except (subprocess.TimeoutExpired, FileNotFoundError): + return False + + +_PG_AVAILABLE = _can_connect_pg() +_CONFIG_PRESENT = (BUILD_DIR / "config.local.env").exists() +_SKIP_REASON = ( + "PostgreSQL not reachable" if not _PG_AVAILABLE + else "build/config.local.env not present — run ./build/setup.sh" +) + + +def _write_csv(path: Path, n_cols: int, n_rows: int) -> None: + header = [f"col_{i}" for i in range(n_cols)] + with path.open("w", encoding="utf-8", newline="") as f: + w = csv.writer(f) + w.writerow(header) + for r in range(n_rows): + w.writerow([f"r{r}c{i}" for i in range(n_cols)]) + + +@unittest.skipUnless(_PG_AVAILABLE and _CONFIG_PRESENT, _SKIP_REASON) +class CsvLoaderArbitraryShapes(unittest.TestCase): + """Loads CSVs of varying shape into Postgres and verifies row counts.""" + + SHAPES = [ + ("tiny", 2, 3), + ("medium", 10, 50), + ("skinny", 1, 100), + ] + + def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None: + with tempfile.TemporaryDirectory() as tmp: + # Use a unique table name so parallel runs don't collide. 
+ table = f"arb_{label}_{os.getpid()}" + csv_path = Path(tmp) / f"{table}.csv" + _write_csv(csv_path, n_cols, n_rows) + + load = subprocess.run( + ["bash", str(LOADER), str(csv_path), "--env", "dev"], + capture_output=True, text=True, cwd=ROOT, + ) + self.assertEqual( + load.returncode, 0, + f"Loader failed for {label}: stdout={load.stdout[-500:]} stderr={load.stderr[-500:]}", + ) + + try: + # Verify via csv_utilise.sh describe (also asserts marker columns present). + describe = subprocess.run( + ["bash", str(UTILISE), "describe", table, "--env", "dev"], + capture_output=True, text=True, cwd=ROOT, + ) + self.assertEqual(describe.returncode, 0, describe.stderr) + self.assertIn(f"Row count: {n_rows}", describe.stdout) + finally: + # Always drop the table — keep dev clean. + subprocess.run( + ["bash", str(UTILISE), "drop", table, "--yes", "--env", "dev"], + capture_output=True, text=True, cwd=ROOT, + ) + + def test_shape_tiny(self): + self._run_loader_and_count(2, 3, "tiny") + + def test_shape_medium(self): + self._run_loader_and_count(10, 50, "medium") + + def test_shape_skinny(self): + self._run_loader_and_count(1, 100, "skinny") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_csv_utilise.py b/tests/test_csv_utilise.py new file mode 100644 index 0000000..fccbbf9 --- /dev/null +++ b/tests/test_csv_utilise.py @@ -0,0 +1,88 @@ +"""Tests for build/csv_utilise.sh. + +Argument-parsing and reachability paths are covered as unit tests (no DB). +Database-backed paths (list/describe/peek/export/drop against a real schema) +live alongside the integration tests in test_csv_loader_arbitrary_shapes.py. 
+""" +import os +import subprocess +import unittest +from pathlib import Path + +import pytest + +pytestmark = pytest.mark.unit + +SCRIPT = Path(__file__).resolve().parents[1] / "build" / "csv_utilise.sh" + + +def run(args, env=None): + """Run csv_utilise.sh with the given args; capture output.""" + return subprocess.run( + ["bash", str(SCRIPT), *args], + env={**os.environ, **(env or {})}, + capture_output=True, + text=True, + ) + + +class CsvUtiliseArgumentParsing(unittest.TestCase): + def test_no_args_shows_usage_and_exits_nonzero(self): + """No args → exit 1 with usage banner on stdout.""" + r = run([]) + self.assertEqual(r.returncode, 1) + self.assertIn("Usage:", r.stdout) + + def test_unknown_command_exits_nonzero(self): + """Unknown subcommand → exit 1 with 'Unknown command' on stderr.""" + r = run(["frobnicate"]) + self.assertEqual(r.returncode, 1) + self.assertIn("Unknown command", r.stderr) + + def test_describe_requires_table_arg(self): + """`describe` with no positional table → exit 1 with a helpful message.""" + r = run(["describe"]) + self.assertEqual(r.returncode, 1) + self.assertIn("requires a
argument", r.stderr) + + def test_peek_requires_table_arg(self): + """`peek` with no positional table → exit 1 with a helpful message.""" + r = run(["peek"]) + self.assertEqual(r.returncode, 1) + self.assertIn("requires a
argument", r.stderr) + + def test_export_requires_two_positional_args(self): + """`export` needs both
and → exit 1 otherwise.""" + r = run(["export", "only_one"]) + self.assertEqual(r.returncode, 1) + self.assertIn("requires
and ", r.stderr) + + def test_drop_requires_table_arg(self): + """`drop` with no positional table → exit 1.""" + r = run(["drop"]) + self.assertEqual(r.returncode, 1) + self.assertIn("requires a
argument", r.stderr) + + def test_help_flag_exits_zero(self): + """`--help` short-circuits to a 0 exit with the usage banner.""" + r = run(["--help"]) + self.assertEqual(r.returncode, 0) + self.assertIn("Usage:", r.stdout) + + def test_invalid_table_name_rejected(self): + """Identifier validation rejects names with spaces or punctuation.""" + # Use --engine postgresql so we get past engine validation; the + # identifier check happens before any DB connection. + r = run(["describe", "bad name; DROP TABLE x", "--engine", "postgresql"]) + self.assertEqual(r.returncode, 1) + self.assertIn("Invalid table name", r.stderr) + + def test_unimplemented_engine_returns_clear_error(self): + """Engines other than postgresql exit 2 with a 'not implemented' message.""" + r = run(["list", "--engine", "redis"]) + self.assertEqual(r.returncode, 2) + self.assertIn("not implemented", r.stderr) + + +if __name__ == "__main__": + unittest.main() From 2b0d3c363949e682c5ac73dbbd4db9816240c17e Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Jun 2026 14:01:56 +0000 Subject: [PATCH 2/3] ci: install dev requirements; fail loudly when pytest is missing locally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Windows CI runner uses `python -m unittest discover` to find test files, but every test module imports pytest at top-level for `pytestmark` markers. Without pytest installed, all tests error at import time with ModuleNotFoundError. `requirements-dev.txt` already pins pytest + pytest-cov + flake8 + bandit; install it before the test step runs. Also: scripts/test.sh and scripts/test.ps1 used to treat a missing pytest as a silent PASS (`SKIP`), which masks real failures during local development. Both now fail with a one-line install hint pointing at requirements-dev.txt — consistent with what CI now does. 
--- .github/workflows/python-validator-tests.yml | 3 +++ scripts/test.ps1 | 5 +++-- scripts/test.sh | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-validator-tests.yml b/.github/workflows/python-validator-tests.yml index 10fc725..4a65912 100644 --- a/.github/workflows/python-validator-tests.yml +++ b/.github/workflows/python-validator-tests.yml @@ -19,6 +19,9 @@ jobs: with: python-version: '3.11' + - name: Install dev dependencies + run: pip install -r requirements-dev.txt + - name: Run validator unit tests (PowerShell) shell: pwsh run: powershell -NoProfile -ExecutionPolicy Bypass -File "tests/run_python_tests.ps1" diff --git a/scripts/test.ps1 b/scripts/test.ps1 index e852585..f89a32e 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -54,8 +54,9 @@ try { Record -Layer 'pytest unit' -Pass $pass -Detail "exit=$LASTEXITCODE" Write-Host "[layer 1] $(if ($pass) {'PASS'} else {'FAIL'})" -ForegroundColor $(if ($pass) {'Green'} else {'Red'}) } else { - Write-Host "[layer 1] SKIP: pytest not installed (pip install pytest)" -ForegroundColor Yellow - Record -Layer 'pytest unit' -Pass $true -Detail 'skipped: pytest not installed' + Write-Host "[layer 1] FAIL: pytest not installed" -ForegroundColor Red + Write-Host " Run: pip install -r requirements-dev.txt" -ForegroundColor Yellow + Record -Layer 'pytest unit' -Pass $false -Detail 'pytest missing — pip install -r requirements-dev.txt' } # --- Layer 2: SQL test suite --- diff --git a/scripts/test.sh b/scripts/test.sh index 32bc68b..0dbfe87 100644 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -67,8 +67,9 @@ if command -v pytest >/dev/null 2>&1; then record "pytest unit" FAIL "exit=$?" 
fi else - echo "${Y}[layer 1] SKIP: pytest not installed${X}" - record "pytest unit" PASS "skipped" + echo "${R}[layer 1] FAIL: pytest not installed${X}" + echo "${Y} Run: pip install -r requirements-dev.txt${X}" + record "pytest unit" FAIL "pytest missing — pip install -r requirements-dev.txt" fi # --- Layer 2: SQL suite --- From 0af407ab024e5c779d8ee2f1a08fd4217de9f144 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Jun 2026 14:09:00 +0000 Subject: [PATCH 3/3] test: locate Git Bash on Windows runners MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On windows-latest GitHub Actions runners, `subprocess.run(["bash", ...])` resolves to C:\Windows\System32\bash.exe — the WSL shim — which fails with "Windows Subsystem for Linux has no installed distributions." Git Bash is preinstalled at C:\Program Files\Git\bin\bash.exe; prefer it explicitly and skip the test class only if no real bash is found. Same helper added to test_csv_loader_arbitrary_shapes.py for consistency (those tests already skip without Postgres, so the impact is preemptive). 
--- tests/test_csv_loader_arbitrary_shapes.py | 24 ++++++++++++++++--- tests/test_csv_utilise.py | 28 ++++++++++++++++++++++- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/tests/test_csv_loader_arbitrary_shapes.py b/tests/test_csv_loader_arbitrary_shapes.py index ae0038f..f01f730 100644 --- a/tests/test_csv_loader_arbitrary_shapes.py +++ b/tests/test_csv_loader_arbitrary_shapes.py @@ -12,6 +12,7 @@ import os import shutil import subprocess +import sys import tempfile import unittest from pathlib import Path @@ -26,6 +27,23 @@ BUILD_DIR = ROOT / "build" +def _find_bash(): + """Locate a real bash; on Windows prefer Git Bash over the WSL shim.""" + if sys.platform == "win32": + for c in (r"C:\Program Files\Git\bin\bash.exe", + r"C:\Program Files (x86)\Git\bin\bash.exe"): + if Path(c).exists(): + return c + which = shutil.which("bash") + if which and "system32" not in which.lower(): + return which + return None + return shutil.which("bash") or "bash" + + +_BASH = _find_bash() + + def _can_connect_pg() -> bool: if not shutil.which("psql"): return False @@ -75,7 +93,7 @@ def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None: _write_csv(csv_path, n_cols, n_rows) load = subprocess.run( - ["bash", str(LOADER), str(csv_path), "--env", "dev"], + [_BASH, str(LOADER), str(csv_path), "--env", "dev"], capture_output=True, text=True, cwd=ROOT, ) self.assertEqual( @@ -86,7 +104,7 @@ def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None: try: # Verify via csv_utilise.sh describe (also asserts marker columns present). describe = subprocess.run( - ["bash", str(UTILISE), "describe", table, "--env", "dev"], + [_BASH, str(UTILISE), "describe", table, "--env", "dev"], capture_output=True, text=True, cwd=ROOT, ) self.assertEqual(describe.returncode, 0, describe.stderr) @@ -94,7 +112,7 @@ def _run_loader_and_count(self, n_cols: int, n_rows: int, label: str) -> None: finally: # Always drop the table — keep dev clean. 
subprocess.run( - ["bash", str(UTILISE), "drop", table, "--yes", "--env", "dev"], + [_BASH, str(UTILISE), "drop", table, "--yes", "--env", "dev"], capture_output=True, text=True, cwd=ROOT, ) diff --git a/tests/test_csv_utilise.py b/tests/test_csv_utilise.py index fccbbf9..10869ee 100644 --- a/tests/test_csv_utilise.py +++ b/tests/test_csv_utilise.py @@ -5,7 +5,9 @@ live alongside the integration tests in test_csv_loader_arbitrary_shapes.py. """ import os +import shutil import subprocess +import sys import unittest from pathlib import Path @@ -16,16 +18,40 @@ SCRIPT = Path(__file__).resolve().parents[1] / "build" / "csv_utilise.sh" +def _find_bash(): + """Locate a real bash. On Windows, PATH-resolved `bash` is often WSL + (which may have no distro installed); prefer Git Bash explicitly. + Returns the absolute path, or None if no working bash is found.""" + if sys.platform == "win32": + candidates = [ + r"C:\Program Files\Git\bin\bash.exe", + r"C:\Program Files (x86)\Git\bin\bash.exe", + ] + for c in candidates: + if Path(c).exists(): + return c + # Last resort: PATH lookup, but skip wsl shims. + which = shutil.which("bash") + if which and "system32" not in which.lower(): + return which + return None + return shutil.which("bash") or "bash" + + +_BASH = _find_bash() + + def run(args, env=None): """Run csv_utilise.sh with the given args; capture output.""" return subprocess.run( - ["bash", str(SCRIPT), *args], + [_BASH, str(SCRIPT), *args], env={**os.environ, **(env or {})}, capture_output=True, text=True, ) +@unittest.skipIf(_BASH is None, "No working bash found (Git Bash recommended on Windows)") class CsvUtiliseArgumentParsing(unittest.TestCase): def test_no_args_shows_usage_and_exits_nonzero(self): """No args → exit 1 with usage banner on stdout."""