diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1d19af0..5355e12 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,7 +32,26 @@ jobs: run: uv sync --all-groups - name: Install dependencies - run: uv pip install -e . + run: uv pip install -e ".[neo4j]" + + # Keep generated docs in lockstep with the code being released: regenerate the + # README `canpy --help` block and the Neo4j schema.json from source, and commit + # them back to main. Releases are cut from main HEAD, so this fast-forwards; + # best-effort if main moved. + - name: Sync generated docs (README --help + Neo4j schema) + if: startsWith(github.ref, 'refs/tags/') + run: | + uv run python scripts/update_readme.py + uv run canpy --emit schema > schema.neo4j.json + if git diff --quiet README.md schema.neo4j.json; then + echo "Generated docs already current." + else + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add README.md schema.neo4j.json + git commit -m "docs: sync README --help and Neo4j schema for ${GITHUB_REF#refs/tags/}" + git push origin HEAD:main || echo "::warning::could not push doc sync to main (diverged?)" + fi - name: Run tests id: test @@ -51,6 +70,17 @@ jobs: - name: Build package run: uv build + # Platform-independent, version-locked release assets published alongside the + # wheels/sdist: the Neo4j schema contract (so a consumer can validate + # producer/consumer compatibility without installing the package) and the + # cargo-dist-style install script. 
+ - name: Stage release assets (Neo4j schema + installer script) + run: | + mkdir -p release-assets + uv run canpy --emit schema > release-assets/schema.json + cp packaging/install/canpy-installer.sh release-assets/canpy-installer.sh + ls -lh release-assets + - name: Get version from tag id: tag_name run: | @@ -77,7 +107,9 @@ jobs: - name: Publish release on GitHub uses: softprops/action-gh-release@v1 with: - files: dist/* + files: | + dist/* + release-assets/* body: | ## Release Notes (from CHANGELOG.md) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a47607..6200df6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.0] - 2026-06-20 + +### Added +- **Neo4j property-graph output** (`--emit neo4j`). The same in-memory analysis (`PyApplication`) is projected to a labeled property graph, mirroring the `codeanalyzer-typescript` backend. Node labels are `Py`-prefixed and relationship types are `PY_`-prefixed (e.g. `:PyClass`, `PY_CALLS`) so multiple language analyzers can coexist in one database without label or relationship-type collisions. Two writers: + - **`graph.cypher` snapshot** (default) — a self-contained Cypher script (constraints + indexes, a scoped wipe of the project's prior subgraph, then batched `UNWIND … MERGE`). Load it with `cypher-shell < graph.cypher`. Needs no extra dependencies. + - **Live Bolt push** (`--neo4j-uri`) — an **incremental** writer: only modules whose `content_hash` changed are rewritten, and on a full run modules whose source file vanished are pruned. Requires the optional `neo4j` driver (`pip install 'codeanalyzer-python[neo4j]'`). 
+- **`--emit schema`** — emit the machine-readable, version-stamped Neo4j schema contract (`schema.json`: node labels, relationships, properties, constraints, indexes). Needs no project; bundled in every release as a GitHub Release asset and checked in as `schema.neo4j.json`. A `schema_version` (`1.0.0`) is stamped onto every graph's `:PyApplication` node. +- **New CLI options** mirroring the TypeScript analyzer's entrypoints: `--emit {json,neo4j,schema}`, `--app-name`, `--neo4j-uri`, `--neo4j-user`, `--neo4j-password`, `--neo4j-database`. `-i/--input` is now optional (not required for `--emit schema`). The four Neo4j connection options also read from the standard `NEO4J_URI` / `NEO4J_USERNAME` / `NEO4J_PASSWORD` / `NEO4J_DATABASE` environment variables when the flag is omitted (an explicit flag wins), so the password need not appear in shell history or the process list. +- **`codeanalyzer.neo4j`** package: `catalog` (the single source-of-truth schema catalog), `project` (pure IR → graph rows), `cypher` (snapshot writer), `bolt` (incremental writer), and `rows` (the output-agnostic intermediate). +- **Schema conformance test** (`test/test_neo4j_schema.py`, always runs) — asserts the emitter never produces a label/relationship/property the catalog doesn't declare, and that the checked-in `schema.neo4j.json` is up to date (i.e. has been regenerated from the catalog). +- **Neo4j Testcontainers integration test** (`test/test_neo4j_bolt.py`, opt-in via `RUN_CONTAINER_TESTS=1`) — spins up a real Neo4j and asserts the pushed graph, idempotent re-push, vanished-declaration cleanup, and full-run orphan pruning. +- **Install script** (`packaging/install/canpy-installer.sh`) — a `curl … | sh` installer that provisions the CLI via uv / pipx / pip, published as a release asset. +- **`schema-uml.drawio`** — a clean UML of the `analysis.json` schema (the `PyApplication` containment tree). + +### Changed +- **The CLI command is now `canpy`** (was `codeanalyzer`), matching the `cants` (TypeScript) sibling. 
The PyPI package name is unchanged (`codeanalyzer-python`), as is the importable `codeanalyzer` module. The old `codeanalyzer` command is retained as a **deprecated alias** that prints a notice (to stderr) and then runs unchanged; it will be removed in a future release. +- The README `canpy --help` block is now generated from the live CLI (`scripts/update_readme.py`, between HTML-comment markers) so it can't drift from the code. +- The release workflow now installs the `[neo4j]` extra, syncs both the README `--help` block and `schema.neo4j.json` from source before publishing, and uploads the schema contract (`schema.json`) and installer script as GitHub Release assets. + ## [0.1.15] - 2026-05-15 ### Fixed diff --git a/README.md b/README.md index f7c4b1a..bafd948 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # A Python Static Analysis Toolkit (and Library) -A comprehensive static analysis tool for Python source code that provides symbol table generation, call graph analysis, and semantic analysis using Jedi, CodeQL, and Tree-sitter. +A comprehensive static analysis tool for Python source code that provides symbol table generation, call graph analysis, and semantic analysis using Jedi, CodeQL, and Tree-sitter — emitted as the canonical `analysis.json`, or projected into a **Neo4j property graph**. 
## Installation @@ -10,6 +10,18 @@ A comprehensive static analysis tool for Python source code that provides symbol pip install codeanalyzer-python ``` +For the optional **live Neo4j push** (`--emit neo4j --neo4j-uri …`), install the `neo4j` extra: + +```bash +pip install 'codeanalyzer-python[neo4j]' +``` + +Or install the CLI as an isolated tool with the one-line installer (provisions via uv / pipx / pip): + +```bash +curl --proto '=https' --tlsv1.2 -LsSf https://github.com/codellm-devkit/codeanalyzer-python/releases/latest/download/canpy-installer.sh | sh +``` + ### Prerequisites - Python 3.12 or higher @@ -56,64 +68,109 @@ pyenv global 3.12.0 # or pyenv local 3.12.0 for project-specific ## Usage -The codeanalyzer provides a command-line interface for performing static analysis on Python projects. +`canpy` provides a command-line interface for performing static analysis on Python projects. ### Basic Usage ```bash -codeanalyzer --input /path/to/python/project +canpy --input /path/to/python/project ``` ### Command Line Options -To view the available options and commands, run `codeanalyzer --help`. You should see output similar to the following: +To view the available options and commands, run `canpy --help`. You should see output similar to the following: -```bash -❯ codeanalyzer --help + - Usage: codeanalyzer [OPTIONS] COMMAND [ARGS]... +```text +$ canpy --help - Static Analysis on Python source code using Jedi, CodeQL and Tree sitter. + Usage: canpy [OPTIONS] COMMAND [ARGS]... + Static Analysis on Python source code using Jedi, CodeQL and Tree sitter. -╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * --input -i PATH Path to the project root directory. [default: None] [required] │ -│ --output -o PATH Output directory for artifacts. [default: None] │ -│ --format -f [json|msgpack] Output format: json or msgpack. [default: json] │ -│ --codeql --no-codeql Enable CodeQL-based analysis. 
[default: no-codeql] │ -│ --eager --lazy Enable eager or lazy analysis. Defaults to lazy. [default: lazy] │ -│ --cache-dir -c PATH Directory to store analysis cache. [default: None] │ -│ --clear-cache --keep-cache Clear cache after analysis. [default: clear-cache] │ -│ -v INTEGER Increase verbosity: -v, -vv, -vvv [default: 0] │ -│ --help Show this message and exit. │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --input -i PATH Path to the project root directory │ +│ (not required for --emit schema). │ +│ --output -o PATH Output directory for artifacts. │ +│ --format -f [json|msgpack] Output format for --emit json: │ +│ json or msgpack. │ +│ [default: json] │ +│ --emit [json|neo4j|schema] Output target: json │ +│ (analysis.json, default) | neo4j │ +│ (graph.cypher or live Bolt push) | │ +│ schema (the Neo4j schema.json │ +│ contract). │ +│ [default: json] │ +│ --app-name TEXT Logical application name for the │ +│ graph :PyApplication anchor │ +│ (default: input dir name). │ +│ --neo4j-uri TEXT Push the graph to a live Neo4j │ +│ over Bolt (incremental); omit to │ +│ write graph.cypher. │ +│ [env var: NEO4J_URI] │ +│ --neo4j-user TEXT Neo4j username. │ +│ [env var: NEO4J_USERNAME] │ +│ [default: neo4j] │ +│ --neo4j-password TEXT Neo4j password. Prefer the env var │ +│ over the flag (the flag is visible │ +│ in shell history / process list). │ +│ [env var: NEO4J_PASSWORD] │ +│ [default: neo4j] │ +│ --neo4j-database TEXT Neo4j database name (default: │ +│ server default). │ +│ [env var: NEO4J_DATABASE] │ +│ --codeql --no-codeql Enable CodeQL-based analysis. │ +│ [default: no-codeql] │ +│ --ray --no-ray Enable Ray for distributed │ +│ analysis. │ +│ [default: no-ray] │ +│ --eager --lazy Enable eager or lazy analysis. │ +│ Defaults to lazy. 
│ +│ [default: lazy] │ +│ --skip-tests --include-tests Skip test files in analysis. │ +│ [default: skip-tests] │ +│ --file-name PATH Analyze only the specified file │ +│ (relative to input directory). │ +│ --cache-dir -c PATH Directory to store analysis cache. │ +│ Defaults to '.codeanalyzer' in the │ +│ input directory. │ +│ --clear-cache --keep-cache Clear cache after analysis. By │ +│ default, cache is retained. │ +│ [default: keep-cache] │ +│ -v INTEGER Increase verbosity: -v, -vv, -vvv │ +│ [default: 0] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` + + ### Examples 1. **Basic analysis with symbol table:** ```bash - codeanalyzer --input ./my-python-project + canpy --input ./my-python-project ``` - This will print the symbol table to stdout in JSON format to the standard output. If you want to save the output, you can use the `--output` option. + This will print the symbol table to stdout in JSON format. If you want to save the output, you can use the `--output` option. ```bash - codeanalyzer --input ./my-python-project --output /path/to/analysis-results + canpy --input ./my-python-project --output /path/to/analysis-results ``` Now, you can find the analysis results in `analysis.json` in the specified directory. 2. **Change output format to msgpack:** ```bash - codeanalyzer --input ./my-python-project --output /path/to/analysis-results --format msgpack + canpy --input ./my-python-project --output /path/to/analysis-results --format msgpack ``` This will save the analysis results in `analysis.msgpack` in the specified directory. 3. **Analysis with CodeQL enabled:** ```bash - codeanalyzer --input ./my-python-project --codeql + canpy --input ./my-python-project --codeql ``` Every run produces a symbol table **and** a call graph. By default, edges come from Jedi's lexical analysis. 
Adding `--codeql` resolves additional edges (including RPC / third-party / dynamically-dispatched targets) and merges them with the Jedi-derived edges. CodeQL also backfills resolved callees on Jedi-emitted call sites where Jedi couldn't resolve them. @@ -121,158 +178,62 @@ To view the available options and commands, run `codeanalyzer --help`. You shoul 4. **Eager analysis with custom cache directory:** ```bash - codeanalyzer --input ./my-python-project --eager --cache-dir /path/to/custom-cache + canpy --input ./my-python-project --eager --cache-dir /path/to/custom-cache ``` - This will rebuild the analysis cache at every run and store it in `/path/to/custom-cache/.codeanalyzer`. The cache will be cleared by default after analysis unless you specify `--keep-cache`. + This will rebuild the analysis cache at every run and store it in `/path/to/custom-cache/.codeanalyzer`. - If you provide --cache-dir, the cache will be stored in that directory. If not specified, it defaults to `.codeanalyzer` in the current working directory (`$PWD`). - -5. **Quiet mode (minimal output):** +5. **Emit a Neo4j snapshot, or push to a live database:** ```bash - codeanalyzer --input /path/to/my-python-project --quiet + canpy --input ./my-python-project --emit neo4j --output ./out # → ./out/graph.cypher + canpy --input ./my-python-project --emit neo4j \ + --neo4j-uri bolt://localhost:7687 --neo4j-user neo4j --neo4j-password secret ``` -## Output - -By default, analysis results are printed to stdout in JSON format. When using the `--output` option, results are saved to `analysis.json` in the specified directory. If you use the `--format=msgpack` option, the results will be saved in `analysis.msgpack`, which is a binary format that can be more efficient for storage and transmission. - -## Development - -This project uses [uv](https://docs.astral.sh/uv/) for dependency management during development. - -### Development Setup - -1. 
Install [uv](https://docs.astral.sh/uv/getting-started/installation/) -![logo](https://github.com/codellm-devkit/codeanalyzer-python/blob/main/docs/assets/logo.png?raw=true) - -# A Python Static Analysis Toolkit (and Library) - -A comprehensive static analysis tool for Python source code that provides symbol table generation, call graph analysis, and semantic analysis using Jedi, CodeQL, and Tree-sitter. - -## Installation - -```bash -pip install codeanalyzer-python -``` - -### Prerequisites - -- Python 3.12 or higher - -#### System Package Requirements - -The tool creates virtual environments internally using Python's built-in `venv` module. - -**Ubuntu/Debian systems:** -```bash -sudo apt update -sudo apt install python3.12-venv python3-dev build-essential -``` - -**Fedora/RHEL/CentOS systems:** -```bash -sudo dnf group install "Development Tools" -sudo dnf install python3-pip python3-venv python3-devel -``` -or on older versions: -```bash -sudo yum groupinstall "Development Tools" -sudo yum install python3-pip python3-venv python3-devel -``` - -**macOS systems:** -```bash -# Install Xcode Command Line Tools (for compilation) -xcode-select --install - -# If using Homebrew Python (recommended) -brew install python@3.12 - -# If using pyenv (popular Python version manager) -# First ensure pyenv is properly installed and configured -pyenv install 3.12.0 # or latest 3.12.x version -pyenv global 3.12.0 # or pyenv local 3.12.0 for project-specific - -# If using system Python, you may need to install certificates -/Applications/Python\ 3.12/Install\ Certificates.command -``` +6. **Emit the Neo4j schema contract:** + ```bash + canpy --emit schema # print schema.json to stdout (no project needed) + canpy --emit schema --output ./out # → ./out/schema.json + ``` -> **Note:** These packages are required as the tool uses Python's built-in `venv` module to create isolated environments for analysis. 
+## Output targets -## Usage +`canpy` builds one analysis in memory and can emit it three ways (`--emit`): -The codeanalyzer provides a command-line interface for performing static analysis on Python projects. +### `analysis.json` (default) -### Basic Usage +A `PyApplication` document — the canonical CLDK contract: -```bash -codeanalyzer --input /path/to/python/project +```jsonc +{ + "symbol_table": { /* file path → module (classes, functions, variables, imports, …) */ }, + "call_graph": [ /* CALL_DEP edges: { source, target, weight, provenance } keyed by callable signature */ ] +} ``` -### Command Line Options +By default this is printed to stdout in JSON; with `--output` it is written to `analysis.json` (or `analysis.msgpack` with `--format msgpack`, a more compact binary format). -To view the available options and commands, run `codeanalyzer --help`. You should see output similar to the following: +### Neo4j graph -```bash -❯ codeanalyzer --help +`--emit neo4j` projects the same analysis into a labeled property graph. Every node label is `Py`-prefixed and every relationship type is `PY_`-prefixed (e.g. `:PyClass`, `PY_CALLS`) so multiple language analyzers can share one database without label or relationship-type collisions. Declarations are keyed by their signature under a shared `:PySymbol` label; calls, imports, inheritance, decorators, and call sites are relationships: - Usage: codeanalyzer [OPTIONS] COMMAND [ARGS]... +- **Without `--neo4j-uri`** — writes a self-contained `graph.cypher` (constraints + indexes, a scoped wipe, then batched `MERGE`s). Load it with `cypher-shell < graph.cypher`. Needs no extra dependencies. +- **With `--neo4j-uri`** — pushes to a live Neo4j over Bolt **incrementally**: only modules whose content hash changed are rewritten, and on a full run modules whose source file vanished are pruned. Requires the `neo4j` extra. Every graph carries a `schema_version` on its `:PyApplication` node. 
- Static Analysis on Python source code using Jedi, CodeQL and Tree sitter. +Call-graph endpoints that aren't present in the symbol table (third-party / framework / RPC targets) are materialized as `:PyExternal` ghost nodes, mirroring the analyzer's own ghost-node behaviour. +The connection options also read from the standard Neo4j environment variables — `NEO4J_URI`, `NEO4J_USERNAME`, `NEO4J_PASSWORD`, `NEO4J_DATABASE` — when the corresponding flag is omitted (an explicit flag wins). Prefer the env var for the password so it doesn't land in shell history or the process list: -╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * --input -i PATH Path to the project root directory. [default: None] [required] │ -│ --output -o PATH Output directory for artifacts. [default: None] │ -│ --format -f [json|msgpack] Output format: json or msgpack. [default: json]. │ -│ --codeql --no-codeql Enable CodeQL-based analysis. [default: no-codeql] │ -│ --eager --lazy Enable eager or lazy analysis. Defaults to lazy. [default: lazy] │ -│ --cache-dir -c PATH Directory to store analysis cache. [default: None] │ -│ --clear-cache --keep-cache Clear cache after analysis. [default: clear-cache] │ -│ -v INTEGER Increase verbosity: -v, -vv, -vvv [default: 0] │ -│ --help Show this message and exit. │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +```sh +export NEO4J_URI=bolt://localhost:7687 +export NEO4J_PASSWORD=secret +canpy -i ./my-project --emit neo4j # credentials picked up from the environment ``` -### Examples +### Schema contract -1. **Basic analysis with symbol table:** - ```bash - codeanalyzer --input ./my-python-project - ``` +`--emit schema` writes the machine-readable, version-stamped Neo4j schema (`schema.json`: node labels, relationships, properties, constraints, and indexes). 
The command needs no project; the resulting contract is checked into the repo as `schema.neo4j.json` and bundled in every release as a GitHub Release asset, so a consumer can validate producer/consumer compatibility without invoking the tool. The shape of the contract matches the [`codeanalyzer-typescript`](https://github.com/codellm-devkit/codeanalyzer-typescript) backend. - This will print the symbol table to stdout in JSON format to the standard output. If you want to save the output, you can use the `--output` option. - - ```bash - codeanalyzer --input ./my-python-project --output /path/to/analysis-results - ``` - - Now, you can find the analysis results in `analysis.json` in the specified directory. - -2. **Analysis with CodeQL enabled:** - ```bash - codeanalyzer --input ./my-python-project --codeql - ``` - Every run produces a symbol table **and** a call graph. By default, edges come from Jedi's lexical analysis. Adding `--codeql` resolves additional edges (including RPC / third-party / dynamically-dispatched targets) and merges them with the Jedi-derived edges. CodeQL also backfills resolved callees on Jedi-emitted call sites where Jedi couldn't resolve them. - - ***Note: CodeQL integration is experimental. The CLI is downloaded into `/codeql/` on first use and reused thereafter.*** - -3. **Eager analysis with custom cache directory:** - ```bash - codeanalyzer --input ./my-python-project --eager --cache-dir /path/to/custom-cache - ``` - This will rebuild the analysis cache at every run and store it in `/path/to/custom-cache/.codeanalyzer`. The cache will be cleared by default after analysis unless you specify `--keep-cache`. - - If you provide --cache-dir, the cache will be stored in that directory. If not specified, it defaults to `.codeanalyzer` in the current working directory (`$PWD`). - -4. 
**Save output in msgpack format:** - ```bash - codeanalyzer --input ./my-python-project --output /path/to/analysis-results --format msgpack - ``` - -### Output - -By default, analysis results are printed to stdout in JSON format. When using the `--output` option, results are saved to `analysis.json` in the specified directory. +A UML of the `analysis.json` schema (the `PyApplication` containment tree) is checked in as [`schema-uml.drawio`](./schema-uml.drawio). ## Development @@ -296,10 +257,9 @@ This project uses [uv](https://docs.astral.sh/uv/) for dependency management dur ### Running from Source -When developing, you can run the tool directly from source: - ```bash -uv run codeanalyzer --input /path/to/python/project +uv run canpy --input /path/to/python/project +uv run canpy --emit schema > schema.neo4j.json # regenerate the checked-in schema contract ``` ### Running Tests @@ -308,49 +268,19 @@ uv run codeanalyzer --input /path/to/python/project uv run pytest --pspec -s ``` -### Development Dependencies - -The project includes additional dependency groups for development: - -- **test**: pytest and related testing tools -- **dev**: development tools like ipdb - -Install all groups with: -```bash -uv sync --all-groups -``` - -2. Clone the repository: - ```bash - git clone https://github.com/codellm-devkit/codeanalyzer-python - cd codeanalyzer-python - ``` - -3. Install dependencies using uv: - ```bash - uv sync --all-groups - ``` - This will install all dependencies including development and test dependencies. - -### Running from Source - -When developing, you can run the tool directly from source: - -```bash -uv run codeanalyzer --input /path/to/python/project -``` - -### Running Tests +The Neo4j schema-conformance test always runs. 
The Neo4j **bolt** integration test spins up a real +Neo4j via [Testcontainers](https://testcontainers.com/) and is **opt-in** — it needs a container +runtime (Docker or Podman) and is enabled with an environment variable: ```bash -uv run pytest --pspec -s +RUN_CONTAINER_TESTS=1 uv run pytest test/test_neo4j_bolt.py -s ``` ### Development Dependencies The project includes additional dependency groups for development: -- **test**: pytest and related testing tools +- **test**: pytest and related testing tools (plus `neo4j` + `testcontainers` for the opt-in Neo4j test) - **dev**: development tools like ipdb Install all groups with: diff --git a/codeanalyzer/__main__.py b/codeanalyzer/__main__.py index 19e7f2a..d386d3b 100644 --- a/codeanalyzer/__main__.py +++ b/codeanalyzer/__main__.py @@ -7,13 +7,18 @@ from codeanalyzer.utils import _set_log_level, logger from codeanalyzer.config import OutputFormat from codeanalyzer.schema import model_dump_json -from codeanalyzer.options import AnalysisOptions +from codeanalyzer.options import AnalysisOptions, EmitTarget def main( input: Annotated[ - Path, typer.Option("-i", "--input", help="Path to the project root directory.") - ], + Optional[Path], + typer.Option( + "-i", + "--input", + help="Path to the project root directory (not required for --emit schema).", + ), + ] = None, output: Annotated[ Optional[Path], typer.Option("-o", "--output", help="Output directory for artifacts."), @@ -23,10 +28,61 @@ def main( typer.Option( "-f", "--format", - help="Output format: json or msgpack.", + help="Output format for --emit json: json or msgpack.", case_sensitive=False, ), ] = OutputFormat.JSON, + emit: Annotated[ + EmitTarget, + typer.Option( + "--emit", + help="Output target: json (analysis.json, default) | neo4j (graph.cypher or live " + "Bolt push) | schema (the Neo4j schema.json contract).", + case_sensitive=False, + ), + ] = EmitTarget.JSON, + app_name: Annotated[ + Optional[str], + typer.Option( + "--app-name", + help="Logical 
application name for the graph :PyApplication anchor " + "(default: input dir name).", + ), + ] = None, + neo4j_uri: Annotated[ + Optional[str], + typer.Option( + "--neo4j-uri", + envvar="NEO4J_URI", + help="Push the graph to a live Neo4j over Bolt (incremental); omit to write " + "graph.cypher. [env: NEO4J_URI]", + ), + ] = None, + neo4j_user: Annotated[ + str, + typer.Option( + "--neo4j-user", + envvar="NEO4J_USERNAME", + help="Neo4j username. [env: NEO4J_USERNAME]", + ), + ] = "neo4j", + neo4j_password: Annotated[ + str, + typer.Option( + "--neo4j-password", + envvar="NEO4J_PASSWORD", + help="Neo4j password. Prefer the env var over the flag (the flag is visible in shell " + "history / process list). [env: NEO4J_PASSWORD]", + ), + ] = "neo4j", + neo4j_database: Annotated[ + Optional[str], + typer.Option( + "--neo4j-database", + envvar="NEO4J_DATABASE", + help="Neo4j database name (default: server default). [env: NEO4J_DATABASE]", + ), + ] = None, using_codeql: Annotated[ bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.") ] = False, @@ -78,6 +134,12 @@ def main( input=input, output=output, format=format, + emit=emit, + app_name=app_name, + neo4j_uri=neo4j_uri, + neo4j_user=neo4j_user, + neo4j_password=neo4j_password, + neo4j_database=neo4j_database, using_codeql=using_codeql, using_ray=using_ray, rebuild_analysis=rebuild_analysis, @@ -89,6 +151,18 @@ def main( ) _set_log_level(options.verbosity) + + # The schema contract is a static artifact — no project analysis required. + if options.emit == EmitTarget.SCHEMA: + from codeanalyzer.neo4j.emit import emit_schema + + emit_schema(options.output) + return + + # Every other target requires an input project. 
+ if options.input is None: + logger.error("Missing option '-i' / '--input' (required for --emit json | neo4j).") + raise typer.Exit(code=1) if not options.input.exists(): logger.error(f"Input path '{options.input}' does not exist.") raise typer.Exit(code=1) @@ -112,7 +186,11 @@ def main( with Codeanalyzer(options) as analyzer: artifacts = analyzer.analyze() - if options.output is None: + if options.emit == EmitTarget.NEO4J: + from codeanalyzer.neo4j.emit import emit_neo4j + + emit_neo4j(artifacts, options) + elif options.output is None: print(model_dump_json(artifacts, separators=(",", ":"))) else: options.output.mkdir(parents=True, exist_ok=True) @@ -142,7 +220,7 @@ def _write_output(artifacts, output_dir: Path, format: OutputFormat): app = typer.Typer( callback=main, - name="codeanalyzer", + name="canpy", help="Static Analysis on Python source code using Jedi, CodeQL and Tree sitter.", invoke_without_command=True, no_args_is_help=True, @@ -151,5 +229,20 @@ def _write_output(artifacts, output_dir: Path, format: OutputFormat): pretty_exceptions_show_locals=False, ) +def deprecated_main() -> None: + """Entry point for the legacy ``codeanalyzer`` command. Prints a one-line + deprecation notice to stderr (so piped stdout — e.g. ``--emit schema`` — stays + clean) and then runs the CLI unchanged. Kept for backwards compatibility; will + be removed in a future release.""" + import sys + + print( + "codeanalyzer: this command has been renamed to `canpy`. 
The `codeanalyzer` " + "alias is deprecated and will be removed in a future release — please use `canpy`.", + file=sys.stderr, + ) + app() + + if __name__ == "__main__": app() diff --git a/codeanalyzer/neo4j/__init__.py b/codeanalyzer/neo4j/__init__.py new file mode 100644 index 0000000..72dff05 --- /dev/null +++ b/codeanalyzer/neo4j/__init__.py @@ -0,0 +1,46 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Neo4j output: a pure projection of the :class:`PyApplication` IR to graph rows, +plus the two writers (cypher snapshot / bolt incremental). Nothing here runs +unless ``--emit neo4j`` (or ``--emit schema``) is selected. 
+""" +from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer +from codeanalyzer.neo4j.catalog import ( + MARKER_LABELS, + NODE_LABELS, + REL_TYPES, + SCHEMA_VERSION, + build_schema_document, +) +from codeanalyzer.neo4j.cypher import render_cypher +from codeanalyzer.neo4j.project import project +from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow + +__all__ = [ + "project", + "render_cypher", + "bolt_writer", + "BoltConfig", + "build_schema_document", + "SCHEMA_VERSION", + "NODE_LABELS", + "REL_TYPES", + "MARKER_LABELS", + "GraphRows", + "NodeRow", + "EdgeRow", +] diff --git a/codeanalyzer/neo4j/bolt.py b/codeanalyzer/neo4j/bolt.py new file mode 100644 index 0000000..4ae102b --- /dev/null +++ b/codeanalyzer/neo4j/bolt.py @@ -0,0 +1,223 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""The incremental writer: push :class:`GraphRows` into a live Neo4j over Bolt. +Unlike the snapshot writer, this one reads the DB's current state and updates +only what changed. + +Algorithm (the module subgraph is the unit of idempotent replacement): + 1. ensure constraints + indexes. + 2. diff each module's ``content_hash`` against the DB → the set of changed modules. + 3. 
# Variable-length relationship pattern over the ownership relationships; used by
# the orphan prune to reach every node declared (transitively) under a module.
DESCENDANTS = "[:PY_DECLARES|PY_HAS_METHOD|PY_HAS_ATTRIBUTE|PY_DECLARES_VAR|PY_HAS_CALLSITE*1..]"
# Maximum number of rows sent to the server in a single UNWIND statement.
BATCH = 1000


@dataclass
class BoltConfig:
    """Connection settings for the live Bolt push."""

    uri: str
    user: str
    password: str
    # Target database; when falsy, no ``database`` argument is passed to the session.
    database: Optional[str] = None


def bolt_writer(rows: GraphRows, cfg: BoltConfig, full_run: bool) -> None:
    """Push ``rows`` into a live Neo4j over Bolt, rewriting only changed modules.

    Steps, in order: run the schema DDL; diff each module's ``content_hash``
    against the database; upsert the shared (module-less) nodes; for every
    changed module purge its outgoing edges and the nodes it no longer emits,
    then upsert its nodes; upsert the edges whose source is shared or belongs to
    a changed module; finally, on a full run only, prune modules that are absent
    from ``rows``.

    Args:
        rows: The projected graph to write.
        cfg: Bolt connection settings.
        full_run: ``True`` when ``rows`` describes the whole project. Pruning is
            skipped otherwise, because a targeted run cannot tell a deleted
            file from one that simply was not analyzed.

    Raises:
        RuntimeError: If the optional ``neo4j`` driver is not installed.
    """
    try:
        import neo4j  # noqa: WPS433 (lazy, optional dependency)
    except ImportError as exc:  # pragma: no cover - exercised only without the extra
        raise RuntimeError(
            "The 'neo4j' driver is required for '--emit neo4j --neo4j-uri'. "
            "Install it with: pip install 'codeanalyzer-python[neo4j]'"
        ) from exc

    driver = neo4j.GraphDatabase.driver(cfg.uri, auth=(cfg.user, cfg.password))
    session_kwargs = {"database": cfg.database} if cfg.database else {}

    def session():
        return driver.session(**session_kwargs)

    try:
        # 1. schema (DDL runs in its own autocommit transactions). Each result is
        #    consumed so a failing statement raises here instead of being lost.
        with session() as s:
            for stmt in [*CONSTRAINTS, *INDEXES]:
                s.run(stmt).consume()

        # Partition nodes by owning module; shared nodes have no _module.
        by_module: Dict[str, List[NodeRow]] = {}
        shared: List[NodeRow] = []
        module_of: Dict[str, str] = {}  # node value → owning module
        for n in rows.nodes:
            m = n.props.get("_module")
            if isinstance(m, str):
                by_module.setdefault(m, []).append(n)
                module_of[n.value] = m
            else:
                shared.append(n)

        # 2. diff content_hash.
        db_hash: Dict[str, Optional[str]] = {}
        with session() as s:
            res = s.run("MATCH (m:PyModule) RETURN m.file_key AS k, m.content_hash AS h")
            for rec in res:
                db_hash[rec["k"]] = rec["h"]
        changed = set()
        for m, nodes in by_module.items():
            row_hash = _hash_of(nodes, m)
            # Unknown to the DB, or no usable hash on our side, counts as changed.
            if m not in db_hash or row_hash is None or row_hash != db_hash.get(m):
                changed.add(m)
        logger.info(
            f"neo4j(bolt): {len(by_module)} modules ({len(changed)} changed), "
            f"{len(shared)} shared nodes, {len(rows.edges)} edges"
        )

        # 3. shared nodes are always upserted (MERGE-only).
        _upsert_nodes(session, neo4j, shared)

        # 4. per changed module: purge owned edges + vanished decls, then upsert its nodes.
        for m in changed:
            nodes = by_module[m]
            keys = [n.value for n in nodes]
            with session() as s:
                # Defaults bind the current loop values (no late-binding surprises).
                def _purge(tx, module=m, node_keys=keys):
                    tx.run("MATCH (x {_module: $m})-[r]->() DELETE r", m=module)
                    tx.run(
                        "MATCH (x {_module: $m}) "
                        "WHERE NOT coalesce(x.signature, x.id, x.file_key) IN $keys "
                        "DETACH DELETE x",
                        m=module,
                        keys=node_keys,
                    )

                s.execute_write(_purge)
            _upsert_nodes(session, neo4j, nodes)

        # 5. upsert edges owned by a changed module (owner = source node's module) or shared.
        edges = [
            e
            for e in rows.edges
            if module_of.get(e.from_ref.value) is None or module_of.get(e.from_ref.value) in changed
        ]
        _upsert_edges(session, neo4j, edges)

        # 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted).
        if full_run:
            present = list(by_module.keys())
            with session() as s:
                # The OPTIONAL MATCH yields one row per (module, descendant path), so the
                # modules must be counted DISTINCT to report modules rather than rows.
                res = s.run(
                    "MATCH (m:PyModule) WHERE NOT m.file_key IN $present "
                    f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m "
                    "RETURN count(DISTINCT m) AS pruned",
                    present=present,
                )
                pruned = res.single()
                pruned_count = pruned["pruned"] if pruned else 0
            logger.info(f"neo4j(bolt): pruned {pruned_count} vanished module(s)")
        else:
            logger.info(
                "neo4j(bolt): targeted run — orphan pruning skipped (deleted files not removed)"
            )
    finally:
        driver.close()


# ----------------------------------------------------------------------------------------------
# Batched upserts
# ----------------------------------------------------------------------------------------------


def _upsert_nodes(session, neo4j, nodes: List[NodeRow]) -> None:
    """MERGE ``nodes`` in batches, one statement shape per (label set, key property).

    ``session`` is a zero-argument factory returning a driver session; ``neo4j``
    is the lazily imported driver module, forwarded to :func:`_to_params`.
    """
    groups: Dict[str, List[NodeRow]] = {}
    for n in nodes:
        groups.setdefault(f"{':'.join(n.labels)}|{n.key_prop}", []).append(n)

    for group in groups.values():
        labels = group[0].labels
        key_prop = group[0].key_prop
        # labels[0] is the MERGE label; any further labels are added with SET.
        set_labels = f", n:{':'.join(labels[1:])}" if len(labels) > 1 else ""
        cypher = (
            f"UNWIND $rows AS row MERGE (n:{labels[0]} {{{key_prop}: row.k}}) "
            f"SET n += row.p{set_labels}"
        )
        for batch in chunk(group, BATCH):
            payload = [{"k": n.value, "p": _to_params(n.props, neo4j)} for n in batch]
            with session() as s:
                # Consume so a server-side failure is raised for this batch.
                s.run(cypher, rows=payload).consume()


def _upsert_edges(session, neo4j, edges: List[EdgeRow]) -> None:
    """MERGE ``edges`` in batches, grouped by relationship type and endpoint shape.

    Both endpoints are MATCHed, so an edge whose endpoint node is missing from
    the database produces no relationship.
    """
    groups: Dict[str, List[EdgeRow]] = {}
    for e in edges:
        key = f"{e.type}|{e.from_ref.label}.{e.from_ref.key_prop}|{e.to_ref.label}.{e.to_ref.key_prop}"
        groups.setdefault(key, []).append(e)

    for group in groups.values():
        first = group[0]
        from_ref, to_ref = first.from_ref, first.to_ref
        cypher = (
            f"UNWIND $rows AS row "
            f"MATCH (a:{from_ref.label} {{{from_ref.key_prop}: row.f}}) "
            f"MATCH (b:{to_ref.label} {{{to_ref.key_prop}: row.t}}) "
            f"MERGE (a)-[r:{first.type}]->(b) SET r += row.p"
        )
        for batch in chunk(group, BATCH):
            payload = [
                {"f": e.from_ref.value, "t": e.to_ref.value, "p": _to_params(e.props, neo4j)}
                for e in batch
            ]
            with session() as s:
                # Consume so a server-side failure is raised for this batch.
                s.run(cypher, rows=payload).consume()


# ----------------------------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------------------------


def _hash_of(nodes: List[NodeRow], file_key: str) -> Optional[str]:
    """Return the ``content_hash`` of the ``PyModule`` row keyed ``file_key``.

    Returns ``None`` when no such row is in ``nodes`` or its hash is not a string.
    """
    for n in nodes:
        if n.labels[0] == "PyModule" and n.value == file_key:
            h = n.props.get("content_hash")
            return h if isinstance(h, str) else None
    return None


def _to_params(props, neo4j) -> dict:
    """Map props to driver params. The Python driver already distinguishes int
    from float, so unlike the JS driver no integer coercion is needed — this is a
    straight passthrough kept symmetric with the snapshot writer's shape."""
    return dict(props)
# Contract version stamped into the emitted schema document.
SCHEMA_VERSION = "1.0.0"

# PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}.


@dataclass
class NodeLabel:
    """Catalog entry describing one node label and its typed properties."""

    label: str  # the specific label (also the catalog key)
    merge_label: str  # the label the uniqueness constraint / MERGE is on
    key: str  # name of the key property
    properties: Dict[str, str]  # property name → PropType


@dataclass
class RelType:
    """Catalog entry describing one relationship type and its allowed endpoints."""

    type: str
    from_labels: List[str]
    to_labels: List[str]
    properties: Dict[str, str] = field(default_factory=dict)  # property name → PropType


# Labels layered onto a node in addition to its primary/specific label.
MARKER_LABELS: List[str] = []

# Source-span properties shared by several declaration labels.
_SPAN = {"start_line": "integer", "end_line": "integer"}


NODE_LABELS: List[NodeLabel] = [
    NodeLabel(
        "PyApplication",
        "PyApplication",
        "name",
        {"name": "string", "schema_version": "string"},
    ),
    NodeLabel(
        "PyModule",
        "PyModule",
        "file_key",
        {
            "file_key": "string",
            "module_name": "string",
            "content_hash": "string",
            "last_modified": "float",
            "file_size": "integer",
            "_module": "string",
        },
    ),
    NodeLabel(
        "PyClass",
        "PySymbol",
        "signature",
        {
            "signature": "string",
            "name": "string",
            "code": "string",
            "base_classes": "string[]",
            "docstring": "string",
            **_SPAN,
            "_module": "string",
        },
    ),
    NodeLabel(
        "PyCallable",
        "PySymbol",
        "signature",
        {
            "signature": "string",
            "name": "string",
            "path": "string",
            "return_type": "string",
            "cyclomatic_complexity": "integer",
            "code": "string",
            "code_start_line": "integer",
            **_SPAN,
            "docstring": "string",
            "decorators": "string[]",
            "parameters_json": "string",
            "accessed_symbols_json": "string",
            "_module": "string",
        },
    ),
    NodeLabel(
        "PyExternal",
        "PySymbol",
        "signature",
        {"signature": "string", "name": "string"},
    ),
    NodeLabel("PyPackage", "PyPackage", "name", {"name": "string"}),
    NodeLabel(
        "PyDecorator",
        "PyDecorator",
        "name",
        {"name": "string"},
    ),
    NodeLabel(
        "PyCallSite",
        "PyCallSite",
        "id",
        {
            "id": "string",
            "method_name": "string",
            "receiver_expr": "string",
            "receiver_type": "string",
            "argument_types": "string[]",
            "return_type": "string",
            "callee_signature": "string",
            "is_constructor_call": "boolean",
            "start_line": "integer",
            "start_column": "integer",
            "end_line": "integer",
            "end_column": "integer",
            "_module": "string",
        },
    ),
    NodeLabel(
        "PyAttribute",
        "PyAttribute",
        "id",
        {
            "id": "string",
            "name": "string",
            "type": "string",
            "docstring": "string",
            **_SPAN,
            "_module": "string",
        },
    ),
    NodeLabel(
        "PyVariable",
        "PyVariable",
        "id",
        {
            "id": "string",
            "name": "string",
            "type": "string",
            "initializer": "string",
            "scope": "string",
            **_SPAN,
            "_module": "string",
        },
    ),
]

# Labels that a PY_DECLARES relationship may point at.
_DECL_TARGETS = ["PyClass", "PyCallable"]


REL_TYPES: List[RelType] = [
    RelType("PY_HAS_MODULE", ["PyApplication"], ["PyModule"]),
    RelType("PY_DECLARES", ["PyModule", "PyClass", "PyCallable"], _DECL_TARGETS),
    RelType("PY_HAS_METHOD", ["PyClass"], ["PyCallable"]),
    RelType("PY_HAS_ATTRIBUTE", ["PyClass"], ["PyAttribute"]),
    RelType("PY_DECLARES_VAR", ["PyModule", "PyCallable"], ["PyVariable"]),
    RelType("PY_HAS_CALLSITE", ["PyCallable"], ["PyCallSite"]),
    RelType("PY_RESOLVES_TO", ["PyCallSite"], ["PyCallable", "PyExternal"]),
    RelType(
        "PY_CALLS",
        ["PyCallable", "PyExternal"],
        ["PyCallable", "PyExternal"],
        {"weight": "integer", "provenance": "string[]"},
    ),
    RelType("PY_EXTENDS", ["PyClass"], ["PyClass"]),
    RelType(
        "PY_IMPORTS",
        ["PyModule"],
        ["PyPackage"],
        {"imported_names": "string[]", "aliases": "string[]"},
    ),
    RelType("PY_DECORATED_BY", ["PyCallable"], ["PyDecorator"]),
]


@dataclass
class SchemaDocument:
    """Typed shape of the document returned (as a plain dict) by
    :func:`build_schema_document`."""

    schema_version: str
    generator: str
    marker_labels: List[str]
    node_labels: List[NodeLabel]
    relationship_types: List[RelType]
    constraints: List[str]
    indexes: List[str]


def build_schema_document() -> dict:
    """Build the full machine-readable schema document emitted by ``--emit schema``.

    Every list and dict in the result is a fresh copy, so callers may mutate the
    returned document without altering the module-level catalog.
    """
    return {
        "schema_version": SCHEMA_VERSION,
        "generator": "codeanalyzer-python",
        "marker_labels": list(MARKER_LABELS),
        "node_labels": [
            {
                "label": n.label,
                "merge_label": n.merge_label,
                "key": n.key,
                # Copy: the catalog entry's dict must not be shared with callers.
                "properties": dict(n.properties),
            }
            for n in NODE_LABELS
        ],
        "relationship_types": [
            {
                "type": r.type,
                # Copies: endpoint lists (e.g. _DECL_TARGETS) are shared catalog state.
                "from": list(r.from_labels),
                "to": list(r.to_labels),
                "properties": dict(r.properties),
            }
            for r in REL_TYPES
        ],
        "constraints": list(CONSTRAINTS),
        "indexes": list(INDEXES),
    }
a/codeanalyzer/neo4j/cypher.py b/codeanalyzer/neo4j/cypher.py new file mode 100644 index 0000000..ad77297 --- /dev/null +++ b/codeanalyzer/neo4j/cypher.py @@ -0,0 +1,138 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""The snapshot writer: render :class:`GraphRows` to a self-contained ``.cypher`` +script. Running it (e.g. ``cypher-shell < graph.cypher``) rebuilds this project's +subgraph from scratch — constraints, a scoped wipe of the prior version, then +batched ``UNWIND … MERGE`` for nodes and edges. + +This artifact is intentionally NOT incremental: a static script has no view of +the live DB, so it expresses the full truth. Incremental updates are the bolt +writer's job. 
# Maximum number of rows inlined into a single UNWIND literal.
BATCH = 500


def render_cypher(rows: GraphRows, app_name: str) -> str:
    """Render ``rows`` as one Cypher script and return it as a string.

    The script lists the constraint and index statements, then the wipe of the
    application named ``app_name``, then the node statements, then the
    relationship statements. Sections are separated by a blank line and the
    script ends with a newline.
    """
    script = [
        "// ── constraints & indexes ──",
        *(f"{ddl};" for ddl in CONSTRAINTS),
        *(f"{ddl};" for ddl in INDEXES),
        "",
        "// ── wipe this project's prior subgraph (externals/packages/decorators are shared) ──",
        _wipe(app_name),
        "",
        "// ── nodes ──",
        *_node_statements(rows.nodes),
        "",
        "// ── relationships ──",
        *_edge_statements(rows.edges),
        "",
    ]
    return "\n".join(script)


def _wipe(app_name: str) -> str:
    """Return the statement that detach-deletes the application node, its
    modules, and everything reachable from them over the ownership relationships."""
    literal = cypher_value(app_name)
    clauses = (
        f"MATCH (a:PyApplication {{name: {literal}}})",
        "OPTIONAL MATCH (a)-[:PY_HAS_MODULE]->(m:PyModule)",
        "OPTIONAL MATCH (m)-[:PY_DECLARES|PY_HAS_METHOD|PY_HAS_ATTRIBUTE|PY_DECLARES_VAR|PY_HAS_CALLSITE*1..]->(x)",
        "DETACH DELETE x, m, a;",
    )
    return "\n".join(clauses)


# ----------------------------------------------------------------------------------------------
# Nodes — grouped by their full label set + key property, batched into UNWIND lists.
# ----------------------------------------------------------------------------------------------


def _node_statements(nodes: List[NodeRow]) -> List[str]:
    """Return one ``UNWIND … MERGE`` statement per batch of same-shaped nodes.

    Groups keep first-seen order; the first label is the MERGE label and any
    remaining labels are added in the SET clause.
    """
    grouped: Dict[str, List[NodeRow]] = {}
    for node in nodes:
        bucket = f"{':'.join(node.labels)}|{node.key_prop}"
        if bucket not in grouped:
            grouped[bucket] = []
        grouped[bucket].append(node)

    statements: List[str] = []
    for members in grouped.values():
        head = members[0]
        merge_label = head.labels[0]
        extra_labels = head.labels[1:]
        label_clause = (", n:" + ":".join(extra_labels)) if extra_labels else ""
        # The statement tail is identical for every batch of this group.
        tail = f"MERGE (n:{merge_label} {{{head.key_prop}: row.k}})\nSET n += row.p{label_clause};"
        for batch in chunk(members, BATCH):
            literal_rows = [
                f" {{k: {cypher_value(node.value)}, p: {cypher_map(node.props)}}}"
                for node in batch
            ]
            listing = ",\n".join(literal_rows)
            statements.append(f"UNWIND [\n{listing}\n] AS row\n{tail}")
    return statements


# ----------------------------------------------------------------------------------------------
# Edges — grouped by (type, endpoint labels + key props), batched.
# ----------------------------------------------------------------------------------------------


def _edge_statements(edges: List[EdgeRow]) -> List[str]:
    """Return one ``UNWIND … MATCH … MERGE`` statement per batch of same-shaped edges.

    Both endpoints are MATCHed by label and key property before the relationship
    is merged and its properties set.
    """
    grouped: Dict[str, List[EdgeRow]] = {}
    for edge in edges:
        source, target = edge.from_ref, edge.to_ref
        bucket = f"{edge.type}|{source.label}.{source.key_prop}|{target.label}.{target.key_prop}"
        if bucket not in grouped:
            grouped[bucket] = []
        grouped[bucket].append(edge)

    statements: List[str] = []
    for members in grouped.values():
        head = members[0]
        source, target = head.from_ref, head.to_ref
        # The statement tail is identical for every batch of this group.
        tail = (
            f"MATCH (a:{source.label} {{{source.key_prop}: row.f}})\n"
            f"MATCH (b:{target.label} {{{target.key_prop}: row.t}})\n"
            f"MERGE (a)-[r:{head.type}]->(b)\n"
            f"SET r += row.p;"
        )
        for batch in chunk(members, BATCH):
            literal_rows = [
                f" {{f: {cypher_value(edge.from_ref.value)}, t: {cypher_value(edge.to_ref.value)}, "
                f"p: {cypher_map(edge.props)}}}"
                for edge in batch
            ]
            listing = ",\n".join(literal_rows)
            statements.append(f"UNWIND [\n{listing}\n] AS row\n{tail}")
    return statements
def emit_schema(output: Optional[Path]) -> None:
    """Emit the Neo4j schema contract (``schema.json``) — a static artifact derived
    from the in-repo catalog, independent of any analyzed project.

    Args:
        output: Directory to write ``schema.json`` into (created if missing).
            With ``None`` the document is printed to stdout instead.
    """
    doc = json.dumps(build_schema_document(), indent=2) + "\n"
    if output is None:
        print(doc, end="")
        return
    output.mkdir(parents=True, exist_ok=True)
    target = output / "schema.json"
    # Pin the encoding so the artifact does not depend on the host locale.
    target.write_text(doc, encoding="utf-8")
    logger.info(f"Neo4j schema written to {target}")


def emit_neo4j(app: PyApplication, options: AnalysisOptions) -> None:
    """Project the analysis to a graph and write it: a live Bolt push when
    ``--neo4j-uri`` is set, otherwise a self-contained ``graph.cypher`` snapshot.

    The application name is ``options.app_name`` when set, else the name of the
    resolved input directory. The snapshot goes to ``options.output``, or to the
    current working directory when that is ``None``.
    """
    app_name = options.app_name or Path(options.input).resolve().name
    rows = project(app, app_name)

    if options.neo4j_uri:
        cfg = BoltConfig(
            uri=options.neo4j_uri,
            user=options.neo4j_user,
            password=options.neo4j_password,
            database=options.neo4j_database,
        )
        # A full run (no single-file restriction) makes orphan pruning safe.
        full_run = options.file_name is None
        bolt_writer(rows, cfg, full_run)
        return

    out_dir = options.output if options.output is not None else Path.cwd()
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / "graph.cypher"
    # The script contains non-ASCII text (its section banners, at minimum), so
    # writing with the locale default encoding can fail or mis-encode on a
    # non-UTF-8 host. Always write UTF-8.
    target.write_text(render_cypher(rows, app_name), encoding="utf-8")
    logger.info(f"Neo4j graph written to {target}")
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""``project()`` — the pure projection from the canonical :class:`PyApplication` +IR to graph rows. It walks the same recursive symbol table the call-graph builder +walks, but instead of collecting callables it emits nodes + edges. No I/O: the +writers (cypher snapshot / bolt incremental) consume the returned +:class:`GraphRows`. + +Modelling decisions (mirror of the TypeScript backend): + - signature-keyed declarations (PyClass, PyCallable) carry a shared ``:PySymbol`` + label (the global-identity / MERGE key). + - call sites, decorators, class attributes and variables are first-class nodes. + - call-graph endpoints absent from the symbol table become ``:PyExternal`` ghost + nodes, so RPC / third-party / framework edges are preserved (matching the + analyzer's own ghost-node behaviour). + - every project-owned node carries an internal ``_module`` provenance prop, so + the incremental writer can delete exactly what a re-analyzed module emitted. 
def project(app: PyApplication, app_name: str) -> GraphRows:
    """Turn ``app`` into graph rows rooted at a ``PyApplication`` node named ``app_name``.

    Every module in ``app.symbol_table`` is emitted first, together with all of
    its declarations; afterwards each entry of ``app.call_graph`` becomes a
    ``PY_CALLS`` edge between its two endpoints.
    """
    builder = RowBuilder()
    root = builder.node(
        ["PyApplication"], "name", app_name, {"schema_version": SCHEMA_VERSION}
    )

    for file_key, module in app.symbol_table.items():
        module_ref = builder.node(
            ["PyModule"], "file_key", file_key, _module_props(module, file_key)
        )
        builder.edge("PY_HAS_MODULE", root, module_ref)
        _project_module_body(builder, file_key, module_ref, module)

    # Call-graph edges come last, so every project callable has already been
    # emitted; any endpoint still unknown is materialized as a :PyExternal node.
    for call in app.call_graph:
        caller = _call_endpoint(builder, call.source)
        callee = _call_endpoint(builder, call.target)
        provenance = list(call.provenance or [])
        builder.edge("PY_CALLS", caller, callee, _call_edge_props(call.weight, provenance))

    return builder.finish()


def _sym(signature: str) -> NodeRef:
    """Reference to the ``:PySymbol`` node keyed by ``signature``."""
    return NodeRef("PySymbol", "signature", signature)


def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef:
    """Resolve a call-graph endpoint to a node reference.

    A signature the builder has already seen is referenced as a ``:PySymbol``;
    anything else is emitted as a ``:PySymbol:PyExternal`` node whose ``name`` is
    the text after the last dot (the whole signature when it has no dot).
    """
    if not b.has_key(signature):
        short_name = signature.rpartition(".")[2]
        return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": short_name})
    return _sym(signature)


# ----------------------------------------------------------------------------------------------
# Module body
# ----------------------------------------------------------------------------------------------


def _project_module_body(b: RowBuilder, file_key: str, mod_ref: NodeRef, mod: PyModule) -> None:
    """Emit a module's functions, classes, variables and imports, in that order."""
    top_level_functions = mod.functions or {}
    for function in top_level_functions.values():
        _project_callable(b, file_key, mod_ref, "PY_DECLARES", function)

    top_level_classes = mod.classes or {}
    for class_ in top_level_classes.values():
        _project_class(b, file_key, mod_ref, "PY_DECLARES", class_)

    for variable in mod.variables or []:
        _project_variable(b, file_key, mod_ref, file_key, variable)

    _project_imports(b, mod_ref, mod)


def _project_imports(b: RowBuilder, mod_ref: NodeRef, mod: PyModule) -> None:
    """Emit one ``PY_IMPORTS`` edge per imported module, to a ``:PyPackage`` node.

    All bindings for the same imported module are collapsed into a single edge
    carrying the sorted imported names and aliases; imports with a falsy
    ``module`` are skipped.
    """
    names_by_module: dict = {}
    aliases_by_module: dict = {}
    for binding in mod.imports or []:
        imported = binding.module
        if not imported:
            # Presumably a relative `from . import x` — no package to point at.
            continue
        # Register both buckets up front so a bare `import x` still yields an edge.
        names = names_by_module.setdefault(imported, set())
        aliases = aliases_by_module.setdefault(imported, set())
        if binding.name:
            names.add(binding.name)
        if binding.alias:
            aliases.add(binding.alias)

    for imported, names in names_by_module.items():
        package = b.node(["PyPackage"], "name", imported, {})
        edge_props = prune(
            {
                "imported_names": sorted(names) or None,
                "aliases": sorted(aliases_by_module[imported]) or None,
            }
        )
        b.edge("PY_IMPORTS", mod_ref, package, edge_props)


# ----------------------------------------------------------------------------------------------
# Declarations
# ----------------------------------------------------------------------------------------------


def _project_class(
    b: RowBuilder, file_key: str, parent: NodeRef, parent_rel: str, cl: PyClass
) -> None:
    """Emit a class node, link it to ``parent`` via ``parent_rel``, then emit its
    base-class edges, methods, attributes and inner classes."""
    class_ref = b.node(
        ["PySymbol", "PyClass"], "signature", cl.signature, _class_props(cl, file_key)
    )
    b.edge(parent_rel, parent, class_ref)

    for base_signature in cl.base_classes or []:
        b.edge_to_symbol("PY_EXTENDS", class_ref, base_signature)

    methods = cl.methods or {}
    for method in methods.values():
        _project_callable(b, file_key, class_ref, "PY_HAS_METHOD", method)

    attributes = cl.attributes or {}
    for attribute in attributes.values():
        _project_attribute(b, file_key, class_ref, cl.signature, attribute)

    nested_classes = cl.inner_classes or {}
    for nested in nested_classes.values():
        _project_class(b, file_key, class_ref, "PY_DECLARES", nested)


def _project_callable(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_rel: str, c: PyCallable
) -> None:
    """Emit a callable node, link it to ``owner`` via ``owner_rel``, then emit its
    decorators, call sites, local variables and nested declarations."""
    callable_ref = b.node(
        ["PySymbol", "PyCallable"], "signature", c.signature, _callable_props(c, file_key)
    )
    b.edge(owner_rel, owner, callable_ref)

    for decorator in c.decorators or []:
        _project_decorator(b, callable_ref, decorator)

    for site in c.call_sites or []:
        site_ref = b.node(
            ["PyCallSite"], "id", _call_site_id(file_key, site), _call_site_props(site, file_key)
        )
        b.edge("PY_HAS_CALLSITE", callable_ref, site_ref)
        if site.callee_signature:
            b.edge_to_symbol("PY_RESOLVES_TO", site_ref, site.callee_signature)

    for variable in c.local_variables or []:
        _project_variable(b, file_key, callable_ref, c.signature, variable)

    nested_callables = c.inner_callables or {}
    for nested in nested_callables.values():
        _project_callable(b, file_key, callable_ref, "PY_DECLARES", nested)

    nested_classes = c.inner_classes or {}
    for nested_class in nested_classes.values():
        _project_class(b, file_key, callable_ref, "PY_DECLARES", nested_class)


def _call_site_id(file_key: str, s: PyCallsite) -> str:
    """Build a call site's id from its file key and source span."""
    return f"{file_key}#{s.start_line}:{s.start_column}-{s.end_line}:{s.end_column}"


def _project_attribute(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_sig: str, a: PyClassAttribute
) -> None:
    """Emit a class attribute node, keyed ``<owner_sig>.<name>``, owned by ``owner``."""
    attribute_id = f"{owner_sig}.{a.name}"
    props = _attribute_props(a, attribute_id, file_key)
    attribute_ref = b.node(["PyAttribute"], "id", attribute_id, props)
    b.edge("PY_HAS_ATTRIBUTE", owner, attribute_ref)


def _project_variable(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_id: str, v: PyVariableDeclaration
) -> None:
    """Emit a variable node, keyed ``<owner_id>#<name>@<start_line>``, owned by ``owner``."""
    variable_id = f"{owner_id}#{v.name}@{v.start_line}"
    props = _variable_props(v, variable_id, file_key)
    variable_ref = b.node(["PyVariable"], "id", variable_id, props)
    b.edge("PY_DECLARES_VAR", owner, variable_ref)


def _project_decorator(b: RowBuilder, on: NodeRef, decorator: str) -> None:
    """Emit a decorator node keyed by its text and link ``on`` to it."""
    decorator_ref = b.node(["PyDecorator"], "name", decorator, {"name": decorator})
    b.edge("PY_DECORATED_BY", on, decorator_ref)


# ----------------------------------------------------------------------------------------------
# Property flattening
# ----------------------------------------------------------------------------------------------


def _module_props(mod: PyModule, file_key: str) -> Props:
    """Flatten a module into node properties (``None`` values dropped)."""
    raw = {
        "module_name": mod.module_name,
        "content_hash": mod.content_hash,
        "last_modified": mod.last_modified,
        "file_size": mod.file_size,
        "_module": file_key,
    }
    return prune(raw)


def _class_props(cl: PyClass, file_key: str) -> Props:
    """Flatten a class into node properties (``None`` values dropped)."""
    raw = {
        "name": cl.name,
        "code": cl.code,
        "base_classes": list(cl.base_classes or []),
        "docstring": _docstring_of(cl.comments),
        "start_line": cl.start_line,
        "end_line": cl.end_line,
        "_module": file_key,
    }
    return prune(raw)


def _callable_props(c: PyCallable, file_key: str) -> Props:
    """Flatten a callable into node properties (``None`` values dropped).

    Parameters and accessed symbols are stored as JSON strings.
    """
    raw = {
        "name": c.name,
        "path": c.path,
        "return_type": c.return_type,
        "cyclomatic_complexity": c.cyclomatic_complexity,
        "code": c.code,
        "code_start_line": c.code_start_line,
        "start_line": c.start_line,
        "end_line": c.end_line,
        "docstring": _docstring_of(c.comments),
        "decorators": list(c.decorators or []),
        "parameters_json": _stringify_if(c.parameters),
        "accessed_symbols_json": _stringify_if(c.accessed_symbols),
        "_module": file_key,
    }
    return prune(raw)


def _attribute_props(a: PyClassAttribute, attr_id: str, file_key: str) -> Props:
    """Flatten a class attribute into node properties (``None`` values dropped)."""
    raw = {
        "id": attr_id,
        "name": a.name,
        "type": a.type,
        "docstring": _docstring_of(a.comments),
        "start_line": a.start_line,
        "end_line": a.end_line,
        "_module": file_key,
    }
    return prune(raw)


def _variable_props(v: PyVariableDeclaration, var_id: str, file_key: str) -> Props:
    """Flatten a variable declaration into node properties (``None`` values dropped)."""
    raw = {
        "id": var_id,
        "name": v.name,
        "type": v.type,
        "initializer": v.initializer,
        "scope": v.scope,
        "start_line": v.start_line,
        "end_line": v.end_line,
        "_module": file_key,
    }
    return prune(raw)


def _call_site_props(s: PyCallsite, file_key: str) -> Props:
    """Flatten a call site into node properties (``None`` values dropped)."""
    raw = {
        "id": _call_site_id(file_key, s),
        "method_name": s.method_name,
        "receiver_expr": s.receiver_expr,
        "receiver_type": s.receiver_type,
        "argument_types": list(s.argument_types or []),
        "return_type": s.return_type,
        "callee_signature": s.callee_signature,
        "is_constructor_call": s.is_constructor_call,
        "start_line": s.start_line,
        "start_column": s.start_column,
        "end_line": s.end_line,
        "end_column": s.end_column,
        "_module": file_key,
    }
    return prune(raw)


def _call_edge_props(weight: int, provenance: List[str]) -> Props:
    """Properties of a ``PY_CALLS`` edge (``provenance`` is copied)."""
    raw = {"weight": weight, "provenance": list(provenance)}
    return prune(raw)


def _docstring_of(comments: Optional[List[PyComment]]) -> Optional[str]:
    """Join the content of all docstring comments with newlines.

    Returns ``None`` when ``comments`` is empty/``None`` or holds no docstring.
    """
    docstrings = []
    for comment in comments or []:
        if comment.is_docstring:
            docstrings.append(comment.content)
    if not docstrings:
        return None
    return "\n".join(docstrings)


def _stringify_if(value: Any) -> Optional[str]:
    """JSON-encode ``value`` with sorted keys; ``None`` for ``None`` or an empty
    list/dict. Objects json cannot encode are converted by :func:`_jsonable`."""
    is_empty_container = isinstance(value, (list, dict)) and len(value) == 0
    if value is None or is_empty_container:
        return None
    return json.dumps(value, default=_jsonable, sort_keys=True)


def _jsonable(o: Any) -> Any:
    """``json.dumps`` fallback: use ``model_dump()`` or ``dict()`` when the object
    has one (checked in that order), otherwise its ``str()``."""
    for dumper in ("model_dump", "dict"):
        if hasattr(o, dumper):
            return getattr(o, dumper)()
    return str(o)
``None`` values are pruned (in Neo4j a null property is +simply absence). +""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +# A property value: a primitive, or a homogeneous list of primitives. +Scalar = Union[str, int, float, bool] +Prop = Union[Scalar, List[str], List[int], List[float], List[bool]] +Props = Dict[str, Prop] + + +@dataclass(frozen=True) +class NodeRef: + """How an edge addresses one of its endpoints: the label + key property to + MATCH on, and the value.""" + + label: str # the label carrying the uniqueness constraint (e.g. "PySymbol", "PyModule") + key_prop: str # "signature" | "file_key" | "name" | "id" + value: str + + +@dataclass +class NodeRow: + labels: List[str] # labels[0] is the constrained MERGE label; the rest are SET as extra labels + key_prop: str + value: str + props: Props + + +@dataclass +class EdgeRow: + type: str + from_ref: NodeRef + to_ref: NodeRef + props: Props + + +@dataclass +class GraphRows: + nodes: List[NodeRow] = field(default_factory=list) + edges: List[EdgeRow] = field(default_factory=list) + + +def prune(p: Dict[str, Optional[Prop]]) -> Props: + """Drop ``None`` entries — in Neo4j a null property means "absent", so we + never store one. Empty lists are kept (a present-but-empty array is legal).""" + return {k: v for k, v in p.items() if v is not None} + + +class RowBuilder: + """Accumulates nodes/edges with ``MERGE`` semantics in memory, so the same + node touched many times (a hot external symbol, a canonical decorator) + collapses to one row, and cross-reference edges to a target that never + materialized are dropped (the "edge-only-when-resolved" rule). 
+ """ + + def __init__(self) -> None: + self._nodes: Dict[str, NodeRow] = {} # key: f"{labels[0]} {value}" + self._edges: List[EdgeRow] = [] + self._deferred: List[EdgeRow] = [] # edges gated against node existence at finish() + self._keys: set = set() # every node value seen, for resolved-gating + + def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef: + """Upsert a node. Re-seeing the same ``(labels[0], value)`` merges props + (last write wins) and unions labels — the in-memory analog of + ``MERGE (n:Label {key}) SET n += props``.""" + node_id = f"{labels[0]} {value}" + existing = self._nodes.get(node_id) + if existing is not None: + existing.props.update(props) + for label in labels: + if label not in existing.labels: + existing.labels.append(label) + else: + self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props)) + self._keys.add(value) + return NodeRef(labels[0], key_prop, value) + + def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None: + """An edge whose endpoints are known to exist (both ends emitted this run).""" + self._edges.append(EdgeRow(type_, from_ref, to_ref, dict(props or {}))) + + def edge_to_symbol( + self, type_: str, from_ref: NodeRef, target_signature: str, props: Optional[Props] = None + ) -> None: + """An edge to a ``:PySymbol`` target that may be external/library code not + present in the graph. 
Deferred and kept only if the target signature was + actually emitted as a node — so PY_EXTENDS / PY_RESOLVES_TO never dangle (the + string fallback lives on the source node's props).""" + self._deferred.append( + EdgeRow( + type_, + from_ref, + NodeRef("PySymbol", "signature", target_signature), + dict(props or {}), + ) + ) + + def has_key(self, value: str) -> bool: + return value in self._keys + + def finish(self) -> GraphRows: + for e in self._deferred: + if f"{e.to_ref.label} {e.to_ref.value}" in self._nodes: # gate on the target's own label-qualified node id (same key node() uses); a same-valued node under another label must not satisfy it + self._edges.append(e) + nodes = sorted(self._nodes.values(), key=lambda n: f"{n.labels[0]} {n.value}") + edges = sorted(self._edges, key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}") + return GraphRows(nodes, edges) + + +# ---------------------------------------------------------------------------------------------- +# Cypher literal rendering (used by the snapshot writer; the bolt writer passes params instead). +# ---------------------------------------------------------------------------------------------- + + +def cypher_value(v: Prop) -> str: + """Render a property value as a Cypher literal.""" + if isinstance(v, bool): + return "true" if v else "false" + if isinstance(v, str): + return _cypher_string(v) + if isinstance(v, (int, float)): + # bools are handled above; int/float fall through here. + if isinstance(v, float) and (v != v or v in (float("inf"), float("-inf"))): + return "null" + return repr(v) if isinstance(v, float) else str(v) + if isinstance(v, list): + return "[" + ", ".join(cypher_value(x) for x in v) + "]" + return "null" + + +def cypher_map(props: Props) -> str: + """Render a props map as a Cypher map literal: ``{key: value, ...}``. 
+ Keys are valid identifiers.""" + return "{" + ", ".join(f"{k}: {cypher_value(v)}" for k, v in props.items()) + "}" + + +def _cypher_string(s: str) -> str: + escaped = ( + s.replace("\\", "\\\\") + .replace("'", "\\'") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") + ) + return f"'{escaped}'" + + +def chunk(items: list, size: int) -> list: + """Split a list into chunks of at most ``size`` (UNWIND batch sizing).""" + return [items[i : i + size] for i in range(0, len(items), size)] diff --git a/codeanalyzer/neo4j/schema.py b/codeanalyzer/neo4j/schema.py new file mode 100644 index 0000000..0dcbe98 --- /dev/null +++ b/codeanalyzer/neo4j/schema.py @@ -0,0 +1,39 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""The Cypher DDL — uniqueness constraints and indexes — shared by both writers. +Run BEFORE any load so MERGE uses an index seek (not a label scan) and the +identity invariant is enforced by the database. Every statement is idempotent +(``IF NOT EXISTS``). 
+""" +from typing import List + +CONSTRAINTS: List[str] = [ + "CREATE CONSTRAINT py_symbol_sig IF NOT EXISTS FOR (s:PySymbol) REQUIRE s.signature IS UNIQUE", + "CREATE CONSTRAINT py_app_name IF NOT EXISTS FOR (a:PyApplication) REQUIRE a.name IS UNIQUE", + "CREATE CONSTRAINT py_module_key IF NOT EXISTS FOR (m:PyModule) REQUIRE m.file_key IS UNIQUE", + "CREATE CONSTRAINT py_package_name IF NOT EXISTS FOR (p:PyPackage) REQUIRE p.name IS UNIQUE", + "CREATE CONSTRAINT py_decorator_name IF NOT EXISTS FOR (d:PyDecorator) REQUIRE d.name IS UNIQUE", + "CREATE CONSTRAINT py_callsite_id IF NOT EXISTS FOR (c:PyCallSite) REQUIRE c.id IS UNIQUE", + "CREATE CONSTRAINT py_attribute_id IF NOT EXISTS FOR (a:PyAttribute) REQUIRE a.id IS UNIQUE", + "CREATE CONSTRAINT py_variable_id IF NOT EXISTS FOR (v:PyVariable) REQUIRE v.id IS UNIQUE", +] + +INDEXES: List[str] = [ + "CREATE INDEX py_callable_name IF NOT EXISTS FOR (c:PyCallable) ON (c.name)", + "CREATE INDEX py_class_name IF NOT EXISTS FOR (c:PyClass) ON (c.name)", + "CREATE FULLTEXT INDEX py_code_fts IF NOT EXISTS FOR (c:PyCallable) ON EACH [c.code, c.docstring]", +] diff --git a/codeanalyzer/options/__init__.py b/codeanalyzer/options/__init__.py index db09fc0..127a183 100644 --- a/codeanalyzer/options/__init__.py +++ b/codeanalyzer/options/__init__.py @@ -1,3 +1,3 @@ -from .options import AnalysisOptions +from .options import AnalysisOptions, EmitTarget, OutputFormat -__all__ = ["AnalysisOptions"] \ No newline at end of file +__all__ = ["AnalysisOptions", "EmitTarget", "OutputFormat"] \ No newline at end of file diff --git a/codeanalyzer/options/options.py b/codeanalyzer/options/options.py index 1602d45..541fb85 100644 --- a/codeanalyzer/options/options.py +++ b/codeanalyzer/options/options.py @@ -9,11 +9,31 @@ class OutputFormat(str, Enum): MSGPACK = "msgpack" +class EmitTarget(str, Enum): + """Output target selected by ``--emit``. + + - ``json`` : the canonical ``analysis.json`` (symbol table + call graph). 
+ - ``neo4j`` : project the analysis into a labeled property graph — a + ``graph.cypher`` snapshot, or a live Bolt push with ``--neo4j-uri``. + - ``schema`` : the machine-readable, version-stamped Neo4j schema contract. + """ + + JSON = "json" + NEO4J = "neo4j" + SCHEMA = "schema" + + @dataclass class AnalysisOptions: input: Path output: Optional[Path] = None format: OutputFormat = OutputFormat.JSON + emit: EmitTarget = EmitTarget.JSON + app_name: Optional[str] = None + neo4j_uri: Optional[str] = None + neo4j_user: str = "neo4j" + neo4j_password: str = "neo4j" + neo4j_database: Optional[str] = None using_codeql: bool = False using_ray: bool = False rebuild_analysis: bool = False diff --git a/neo4j-schema.drawio b/neo4j-schema.drawio new file mode 100644 index 0000000..afe5f1a --- /dev/null +++ b/neo4j-schema.drawio @@ -0,0 +1,148 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packaging/install/canpy-installer.sh b/packaging/install/canpy-installer.sh new file mode 100755 index 0000000..e02ae8c --- /dev/null +++ b/packaging/install/canpy-installer.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# canpy installer — installs the codeanalyzer-python (`canpy`) CLI as an isolated +# tool. Mirrors the cargo-dist installer pattern, but because this is a pure-Python +# package (published to PyPI) it installs via uv / pipx / pip rather than downloading a binary. +# +# Usage: +# curl --proto '=https' --tlsv1.2 -LsSf https://github.com/codellm-devkit/codeanalyzer-python/releases/latest/download/canpy-installer.sh | sh +# +# Environment overrides: +# CANPY_VERSION release version, e.g. 
0.2.0 (default: latest on PyPI) +# CANPY_NEO4J set to 1 to include the [neo4j] extra (live Bolt push driver) +# CANPY_INSTALLER force a backend: uv | pipx | pip (default: auto-detect) +set -eu + +PKG="codeanalyzer-python" +BIN="canpy" +VERSION="${CANPY_VERSION:-}" + +# `pkg[extra]==version` / `pkg[extra]` — assemble the PyPI requirement string. +extra="" +[ "${CANPY_NEO4J:-0}" = "1" ] && extra="[neo4j]" +if [ -n "$VERSION" ]; then + spec="${PKG}${extra}==${VERSION}" +else + spec="${PKG}${extra}" +fi + +pick_backend() { + if [ -n "${CANPY_INSTALLER:-}" ]; then + echo "$CANPY_INSTALLER"; return + fi + if command -v uv >/dev/null 2>&1; then echo uv; return; fi + if command -v pipx >/dev/null 2>&1; then echo pipx; return; fi + echo pip +} + +backend="$(pick_backend)" +echo "canpy: installing $spec via $backend ..." + +case "$backend" in + uv) + # `uv tool install` puts an isolated CLI on the uv tool path (~/.local/bin by default). + uv tool install --force "$spec" + ;; + pipx) + pipx install --force "$spec" + ;; + pip) + # Fall back to a user install. Prefer python3; require it to exist. + if command -v python3 >/dev/null 2>&1; then py=python3; elif command -v python >/dev/null 2>&1; then py=python; else + echo "canpy: need python3 (or uv / pipx) to install" >&2; exit 1 + fi + "$py" -m pip install --user --upgrade "$spec" + ;; + *) + echo "canpy: unknown installer backend '$backend' (use uv | pipx | pip)" >&2 + exit 1 + ;; +esac + +if command -v "$BIN" >/dev/null 2>&1; then + echo "canpy: installed — $("$BIN" --help >/dev/null 2>&1 && echo "run '$BIN --help' to get started")" +else + echo "canpy: installed, but '$BIN' is not on your PATH yet." + echo "canpy: add your tool bin dir to PATH (e.g. export PATH=\"\$HOME/.local/bin:\$PATH\")." 
+fi diff --git a/pyproject.toml b/pyproject.toml index fca4ee7..4b2b57c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "codeanalyzer-python" -version = "0.1.15" -description = "Static Analysis on Python source code using Jedi, CodeQL and Treesitter." +version = "0.2.0" +description = "Static Analysis on Python source code using Jedi, CodeQL and Treesitter — emits analysis.json or a Neo4j property graph." readme = "README.md" authors = [ { name = "Rahul Krishna", email = "i.m.ralk@gmail.com" } @@ -45,12 +45,22 @@ dependencies = [ "packaging>=25.0", ] +[project.optional-dependencies] +# The Neo4j Bolt driver is only needed for `--emit neo4j --neo4j-uri ...` (a live +# push). The `graph.cypher` snapshot and `--emit schema` need no extra packages. +neo4j = [ + "neo4j>=5.0.0,<6.0.0", +] + [dependency-groups] test = [ "pytest>=7.0.0,<8.0.0", "pytest-asyncio>=0.14.0,<0.15.0", "pytest-cov>=2.10.0,<3.0.0", - "pytest-pspec>=0.0.3" + "pytest-pspec>=0.0.3", + # Neo4j integration test (opt-in; spins up a real Neo4j via Testcontainers). + "neo4j>=5.0.0,<6.0.0", + "testcontainers[neo4j]>=4.0.0,<5.0.0; python_version >= '3.11'", ] dev = [ "ipdb>=0.13.0,<0.14.0", @@ -58,7 +68,9 @@ dev = [ ] [project.scripts] -codeanalyzer = "codeanalyzer.__main__:app" +canpy = "codeanalyzer.__main__:app" +# Deprecated alias kept for backwards compatibility — warns, then delegates to canpy. 
+codeanalyzer = "codeanalyzer.__main__:deprecated_main" [build-system] requires = ["hatchling"] @@ -72,6 +84,8 @@ include = ["codeanalyzer/py.typed"] include = [ "codeanalyzer", "codeanalyzer/py.typed", + "schema.neo4j.json", + "schema-uml.drawio", "README.md", "LICENSE", "NOTICE" diff --git a/schema-uml.drawio b/schema-uml.drawio new file mode 100644 index 0000000..88a1258 --- /dev/null +++ b/schema-uml.drawio @@ -0,0 +1,103 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/schema.neo4j.json b/schema.neo4j.json new file mode 100644 index 0000000..ffccf29 --- /dev/null +++ b/schema.neo4j.json @@ -0,0 +1,279 @@ +{ + "schema_version": "1.0.0", + "generator": "codeanalyzer-python", + "marker_labels": [], + "node_labels": [ + { + "label": "PyApplication", + "merge_label": "PyApplication", + "key": "name", + "properties": { + "name": "string", + "schema_version": "string" + } + }, + { + "label": "PyModule", + "merge_label": "PyModule", + "key": "file_key", + "properties": { + "file_key": "string", + "module_name": "string", + "content_hash": "string", + "last_modified": "float", + "file_size": "integer", + "_module": "string" + } + }, + { + "label": "PyClass", + "merge_label": "PySymbol", + "key": "signature", + "properties": { + "signature": "string", + "name": "string", + "code": "string", + "base_classes": "string[]", + "docstring": "string", + "start_line": "integer", + "end_line": "integer", + "_module": "string" + } + }, + { + "label": "PyCallable", + "merge_label": "PySymbol", + "key": "signature", + "properties": { + "signature": "string", + "name": "string", + "path": "string", + "return_type": "string", + "cyclomatic_complexity": "integer", + "code": "string", + "code_start_line": "integer", + "start_line": "integer", + "end_line": "integer", + "docstring": 
"string", + "decorators": "string[]", + "parameters_json": "string", + "accessed_symbols_json": "string", + "_module": "string" + } + }, + { + "label": "PyExternal", + "merge_label": "PySymbol", + "key": "signature", + "properties": { + "signature": "string", + "name": "string" + } + }, + { + "label": "PyPackage", + "merge_label": "PyPackage", + "key": "name", + "properties": { + "name": "string" + } + }, + { + "label": "PyDecorator", + "merge_label": "PyDecorator", + "key": "name", + "properties": { + "name": "string" + } + }, + { + "label": "PyCallSite", + "merge_label": "PyCallSite", + "key": "id", + "properties": { + "id": "string", + "method_name": "string", + "receiver_expr": "string", + "receiver_type": "string", + "argument_types": "string[]", + "return_type": "string", + "callee_signature": "string", + "is_constructor_call": "boolean", + "start_line": "integer", + "start_column": "integer", + "end_line": "integer", + "end_column": "integer", + "_module": "string" + } + }, + { + "label": "PyAttribute", + "merge_label": "PyAttribute", + "key": "id", + "properties": { + "id": "string", + "name": "string", + "type": "string", + "docstring": "string", + "start_line": "integer", + "end_line": "integer", + "_module": "string" + } + }, + { + "label": "PyVariable", + "merge_label": "PyVariable", + "key": "id", + "properties": { + "id": "string", + "name": "string", + "type": "string", + "initializer": "string", + "scope": "string", + "start_line": "integer", + "end_line": "integer", + "_module": "string" + } + } + ], + "relationship_types": [ + { + "type": "PY_HAS_MODULE", + "from": [ + "PyApplication" + ], + "to": [ + "PyModule" + ], + "properties": {} + }, + { + "type": "PY_DECLARES", + "from": [ + "PyModule", + "PyClass", + "PyCallable" + ], + "to": [ + "PyClass", + "PyCallable" + ], + "properties": {} + }, + { + "type": "PY_HAS_METHOD", + "from": [ + "PyClass" + ], + "to": [ + "PyCallable" + ], + "properties": {} + }, + { + "type": "PY_HAS_ATTRIBUTE", + "from": 
[ + "PyClass" + ], + "to": [ + "PyAttribute" + ], + "properties": {} + }, + { + "type": "PY_DECLARES_VAR", + "from": [ + "PyModule", + "PyCallable" + ], + "to": [ + "PyVariable" + ], + "properties": {} + }, + { + "type": "PY_HAS_CALLSITE", + "from": [ + "PyCallable" + ], + "to": [ + "PyCallSite" + ], + "properties": {} + }, + { + "type": "PY_RESOLVES_TO", + "from": [ + "PyCallSite" + ], + "to": [ + "PyCallable", + "PyExternal" + ], + "properties": {} + }, + { + "type": "PY_CALLS", + "from": [ + "PyCallable", + "PyExternal" + ], + "to": [ + "PyCallable", + "PyExternal" + ], + "properties": { + "weight": "integer", + "provenance": "string[]" + } + }, + { + "type": "PY_EXTENDS", + "from": [ + "PyClass" + ], + "to": [ + "PyClass" + ], + "properties": {} + }, + { + "type": "PY_IMPORTS", + "from": [ + "PyModule" + ], + "to": [ + "PyPackage" + ], + "properties": { + "imported_names": "string[]", + "aliases": "string[]" + } + }, + { + "type": "PY_DECORATED_BY", + "from": [ + "PyCallable" + ], + "to": [ + "PyDecorator" + ], + "properties": {} + } + ], + "constraints": [ + "CREATE CONSTRAINT py_symbol_sig IF NOT EXISTS FOR (s:PySymbol) REQUIRE s.signature IS UNIQUE", + "CREATE CONSTRAINT py_app_name IF NOT EXISTS FOR (a:PyApplication) REQUIRE a.name IS UNIQUE", + "CREATE CONSTRAINT py_module_key IF NOT EXISTS FOR (m:PyModule) REQUIRE m.file_key IS UNIQUE", + "CREATE CONSTRAINT py_package_name IF NOT EXISTS FOR (p:PyPackage) REQUIRE p.name IS UNIQUE", + "CREATE CONSTRAINT py_decorator_name IF NOT EXISTS FOR (d:PyDecorator) REQUIRE d.name IS UNIQUE", + "CREATE CONSTRAINT py_callsite_id IF NOT EXISTS FOR (c:PyCallSite) REQUIRE c.id IS UNIQUE", + "CREATE CONSTRAINT py_attribute_id IF NOT EXISTS FOR (a:PyAttribute) REQUIRE a.id IS UNIQUE", + "CREATE CONSTRAINT py_variable_id IF NOT EXISTS FOR (v:PyVariable) REQUIRE v.id IS UNIQUE" + ], + "indexes": [ + "CREATE INDEX py_callable_name IF NOT EXISTS FOR (c:PyCallable) ON (c.name)", + "CREATE INDEX py_class_name IF NOT EXISTS FOR 
(c:PyClass) ON (c.name)", + "CREATE FULLTEXT INDEX py_code_fts IF NOT EXISTS FOR (c:PyCallable) ON EACH [c.code, c.docstring]" + ] +} diff --git a/scripts/update_readme.py b/scripts/update_readme.py new file mode 100644 index 0000000..75cb8f0 --- /dev/null +++ b/scripts/update_readme.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +"""Regenerate the ``canpy --help`` block in README.md from the actual Typer CLI, +so the documented options can never drift from the code. Mirrors the TypeScript +backend's ``scripts/update-readme.ts``. + +Run it directly:: + + uv run python scripts/update_readme.py # rewrite the block in place + uv run python scripts/update_readme.py --check # exit 1 if the block is stale + +The release workflow runs the in-place form before publishing and commits the +result back to main. Exits non-zero if the marker block is missing. +""" +from __future__ import annotations + +import os +import re +import sys +from pathlib import Path + +README = Path(__file__).resolve().parents[1] / "README.md" +BEGIN = "" +END = "" +WIDTH = 100 # fixed render width so the box is deterministic across machines + + +def render_help() -> str: + """Render ``canpy --help`` deterministically: fixed width, no color, no + dependence on the host terminal.""" + os.environ["COLUMNS"] = str(WIDTH) + os.environ["TERM"] = "dumb" + os.environ["NO_COLOR"] = "1" + + from click.testing import CliRunner + from typer.main import get_command + + from codeanalyzer.__main__ import app + + result = CliRunner().invoke(get_command(app), ["--help"], prog_name="canpy") + if result.exit_code != 0: # pragma: no cover - help should always render + raise SystemExit(f"update_readme: `canpy --help` exited {result.exit_code}\n{result.output}") + # Drop rich's right-edge padding so the block is free of trailing whitespace. 
+ return "\n".join(line.rstrip() for line in result.output.split("\n")).strip("\n") + + +def main() -> int: + block = f"{BEGIN}\n\n```text\n$ canpy --help\n\n{render_help()}\n```\n\n{END}" + md = README.read_text(encoding="utf-8")  # explicit UTF-8: do not depend on the locale default when the text may hold non-ASCII characters + if BEGIN not in md or END not in md: + print(f"update_readme: markers {BEGIN} … {END} not found in README.md", file=sys.stderr) + return 1 + updated = re.sub(re.escape(BEGIN) + r"[\s\S]*?" + re.escape(END), lambda _: block, md) + + if updated == md: + print("README --help block already current") + return 0 + if "--check" in sys.argv[1:]: + print("README --help block is STALE — run: uv run python scripts/update_readme.py", file=sys.stderr) + return 1 + README.write_text(updated, encoding="utf-8")  # write back with the same encoding used for the read + print("README --help block updated") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/test/sample_graph_app.py b/test/sample_graph_app.py new file mode 100644 index 0000000..11124f4 --- /dev/null +++ b/test/sample_graph_app.py @@ -0,0 +1,152 @@ +"""A small, hand-built :class:`PyApplication` that exercises every Neo4j +projection path (module, class + inheritance + methods + attributes + inner +class, callable + decorators + call sites + local vars + inner callable, module +variables, imports, and a call graph with a resolved edge and a ghost edge). + +Built directly from the schema models so the Neo4j tests need neither Jedi nor a +virtualenv — they stay fast and deterministic. 
+""" +from __future__ import annotations + +from codeanalyzer.schema import ( + PyApplication, + PyCallable, + PyClass, + PyClassAttribute, + PyComment, + PyImport, + PyModule, + PyVariableDeclaration, +) +from codeanalyzer.schema.py_schema import PyCallEdge, PyCallsite + + +def make_sample_app() -> PyApplication: + announce = PyCallable( + name="announce", + path="src/service.py", + signature="src.service.Service.announce", + comments=[PyComment(content="Announce something.", is_docstring=True)], + return_type="None", + code="def announce(self):\n ...", + start_line=10, + end_line=12, + code_start_line=10, + cyclomatic_complexity=1, + ) + inner = PyClass( + name="Inner", + signature="src.service.Service.Inner", + code="class Inner:\n ...", + start_line=14, + end_line=15, + ) + service = PyClass( + name="Service", + signature="src.service.Service", + comments=[PyComment(content="A service.", is_docstring=True)], + code="class Service(BaseService):\n ...", + base_classes=["src.service.BaseService"], + methods={"announce": announce}, + attributes={ + "name": PyClassAttribute(name="name", type="str", start_line=8, end_line=8) + }, + inner_classes={"Inner": inner}, + start_line=6, + end_line=15, + ) + base_service = PyClass( + name="BaseService", + signature="src.service.BaseService", + code="class BaseService:\n ...", + start_line=1, + end_line=4, + ) + helper = PyCallable( + name="helper", + path="src/service.py", + signature="src.service.helper", + decorators=["staticmethod"], + return_type="int", + code="def helper():\n Service().announce()\n requests.get(url)", + start_line=17, + end_line=20, + code_start_line=17, + cyclomatic_complexity=2, + call_sites=[ + PyCallsite( + method_name="announce", + receiver_expr="Service()", + receiver_type="src.service.Service", + callee_signature="src.service.Service.announce", + start_line=18, + start_column=4, + end_line=18, + end_column=22, + ) + ], + local_variables=[ + PyVariableDeclaration( + name="url", type="str", 
initializer="'x'", scope="function", + start_line=18, end_line=18, + ) + ], + ) + service_mod = PyModule( + file_path="src/service.py", + module_name="src.service", + imports=[PyImport(module="os", name="path", alias="p")], + classes={"Service": service, "BaseService": base_service}, + functions={"helper": helper}, + variables=[ + PyVariableDeclaration( + name="CONFIG", type="dict", initializer="{}", scope="module", + start_line=2, end_line=2, + ) + ], + content_hash="hash-service-v1", + last_modified=1.0, + file_size=100, + ) + util_mod = PyModule( + file_path="src/util.py", + module_name="src.util", + functions={ + "util_fn": PyCallable( + name="util_fn", + path="src/util.py", + signature="src.util.util_fn", + return_type="int", + code="def util_fn():\n return 1", + start_line=1, + end_line=2, + code_start_line=1, + cyclomatic_complexity=1, + ) + }, + content_hash="hash-util-v1", + last_modified=1.0, + file_size=40, + ) + + call_graph = [ + # resolved edge — both endpoints live in the symbol table + PyCallEdge( + source="src.service.helper", + target="src.service.Service.announce", + weight=1, + provenance=["jedi"], + ), + # ghost edge — target is third-party, materialized as an :PyExternal node + PyCallEdge( + source="src.service.helper", + target="requests.get", + weight=2, + provenance=["jedi", "codeql"], + ), + ] + + return PyApplication( + symbol_table={"src/service.py": service_mod, "src/util.py": util_mod}, + call_graph=call_graph, + ) diff --git a/test/test_neo4j_bolt.py b/test/test_neo4j_bolt.py new file mode 100644 index 0000000..ee84e01 --- /dev/null +++ b/test/test_neo4j_bolt.py @@ -0,0 +1,133 @@ +"""Integration test for the Neo4j bolt writer. Spins up a real Neo4j via +Testcontainers, projects the sample app to graph rows, pushes them, and asserts +the graph in the database — including the incremental behaviours (idempotent +re-push, vanished-declaration cleanup, and full-run orphan pruning). 
+ +This suite needs a container runtime reachable by Testcontainers (Docker, or +Podman via DOCKER_HOST), so it is OPT-IN: it is skipped by default (CI release +gate, and contributors without a runtime) and runs only with +``RUN_CONTAINER_TESTS=1`` set. The no-container schema conformance test always +runs (see ``test_neo4j_schema.py``). +""" +import os + +import pytest + +from codeanalyzer.neo4j import project +from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer + +from sample_graph_app import make_sample_app + +pytestmark = pytest.mark.skipif( + not os.environ.get("RUN_CONTAINER_TESTS"), + reason="opt-in: set RUN_CONTAINER_TESTS=1 (needs Docker/Podman) to run the Neo4j bolt test", +) + +# Imported lazily so a machine without the extras doesn't error at collection time. +Neo4jContainer = pytest.importorskip("testcontainers.neo4j").Neo4jContainer +neo4j = pytest.importorskip("neo4j") + +_PASSWORD = "testpassword123" + + +@pytest.fixture(scope="module") +def neo4j_container(): + with Neo4jContainer("neo4j:5", password=_PASSWORD) as container: + yield container + + +@pytest.fixture(scope="module") +def driver(neo4j_container): + uri = neo4j_container.get_connection_url() + drv = neo4j.GraphDatabase.driver(uri, auth=("neo4j", _PASSWORD)) + yield drv + drv.close() + + +@pytest.fixture +def cfg(neo4j_container): + return BoltConfig( + uri=neo4j_container.get_connection_url(), + user="neo4j", + password=_PASSWORD, + database=None, + ) + + +def _num(driver, cypher, **params): + with driver.session() as session: + rec = session.run(cypher, **params).single() + if rec is None: + return 0 + value = rec[0] + return value if value is not None else 0 + + +@pytest.fixture(autouse=True) +def _clean_db(driver): + with driver.session() as session: + session.run("MATCH (n) DETACH DELETE n") + yield + + +def test_full_push_materializes_the_whole_graph_and_schema(driver, cfg): + rows = project(make_sample_app(), "sample-app") + bolt_writer(rows, cfg, full_run=True) + + # Every 
projected node/edge lands. + assert _num(driver, "MATCH (n) RETURN count(n)") == len(rows.nodes) + assert _num(driver, "MATCH ()-[r]->() RETURN count(r)") == len(rows.edges) + + # Shared :PySymbol label spans the signature-keyed declaration kinds. + symbol = _num(driver, "MATCH (s:PySymbol) RETURN count(s)") + kinds = _num( + driver, + "MATCH (s:PySymbol) WHERE s:PyCallable OR s:PyClass OR s:PyExternal RETURN count(s)", + ) + assert symbol > 0 + assert kinds == symbol + + # Constraints + indexes were created up front. + assert _num(driver, "SHOW CONSTRAINTS YIELD name RETURN count(*)") >= 8 + assert _num(driver, "SHOW INDEXES YIELD name RETURN count(*)") >= 3 + + # The known resolved call edge from the fixture (helper -> Service.announce). + assert ( + _num( + driver, + "MATCH (:PyCallable {name:$c})-[:PY_CALLS]->(t:PyCallable {name:$n}) RETURN count(*)", + c="helper", + n="announce", + ) + > 0 + ) + # The ghost edge resolved to an :PyExternal node. + assert _num(driver, "MATCH (e:PyExternal) RETURN count(e)") >= 1 + + +def test_re_pushing_identical_analysis_is_idempotent(driver, cfg): + rows = project(make_sample_app(), "sample-app") + bolt_writer(rows, cfg, full_run=True) + bolt_writer(rows, cfg, full_run=True) + assert _num(driver, "MATCH (n) RETURN count(n)") == len(rows.nodes) + assert _num(driver, "MATCH ()-[r]->() RETURN count(r)") == len(rows.edges) + + +def test_a_full_run_prunes_a_module_whose_source_vanished(driver, cfg): + bolt_writer(project(make_sample_app(), "sample-app"), cfg, full_run=True) + + # Drop one module from a fresh app and re-push as a full run. + app = make_sample_app() + victim = sorted(app.symbol_table.keys())[0] + del app.symbol_table[victim] + rows = project(app, "sample-app") + bolt_writer(rows, cfg, full_run=True) + + # The victim's module-scoped nodes are gone. + assert _num(driver, "MATCH (n {_module:$m}) RETURN count(n)", m=victim) == 0 + + # The surviving module-scoped graph matches the reduced projection. 
(Shared + # :PyExternal/:PyPackage/:PyDecorator nodes are MERGE-only and never pruned, so we + # compare only _module-tagged nodes.) + module_scoped = sum(1 for n in rows.nodes if "_module" in n.props) + assert _num(driver, "MATCH (n) WHERE n._module IS NOT NULL RETURN count(n)") == module_scoped diff --git a/test/test_neo4j_schema.py b/test/test_neo4j_schema.py new file mode 100644 index 0000000..401b465 --- /dev/null +++ b/test/test_neo4j_schema.py @@ -0,0 +1,96 @@ +"""Schema conformance test (no container needed). Projects the sample app and +asserts that the real emitter only ever produces node labels, relationship types +and properties that the catalog (``codeanalyzer/neo4j/catalog.py``) declares. +This is the anti-drift guard: if ``project.py`` grows a label or property that +``catalog.py`` doesn't declare, this fails — keeping the published +``schema.neo4j.json`` honest. It also checks the checked-in ``schema.neo4j.json`` +is regenerated (run ``canpy --emit schema > schema.neo4j.json``). 
+""" +import json +from pathlib import Path + +from codeanalyzer.neo4j import NODE_LABELS, REL_TYPES, build_schema_document, project +from codeanalyzer.neo4j.catalog import MARKER_LABELS +from codeanalyzer.neo4j.cypher import render_cypher + +from sample_graph_app import make_sample_app + +_BY_LABEL = {n.label: n for n in NODE_LABELS} +_MERGE_OF = {n.label: n.merge_label for n in NODE_LABELS} +_REL_BY_TYPE = {r.type: r for r in REL_TYPES} +_MARKERS = set(MARKER_LABELS) + + +def _specific_label(labels): + """The specific (catalog) label for a node row: the non-merge, non-marker label.""" + merge = labels[0] + if merge != "PySymbol": + return merge + for label in labels: + if label != "PySymbol" and label not in _MARKERS: + return label + return "PySymbol" + + +def _merge_labels_for(specifics): + return {_MERGE_OF[s] for s in specifics} + + +def test_every_emitted_node_label_and_property_is_declared(): + rows = project(make_sample_app(), "sample-app") + assert rows.nodes, "projection produced no nodes" + for node in rows.nodes: + specific = _specific_label(node.labels) + decl = _BY_LABEL.get(specific) + assert decl is not None, f"undeclared node label: {':'.join(node.labels)}" + assert node.labels[0] == decl.merge_label + for label in node.labels: + ok = label == decl.merge_label or label == specific or label in _MARKERS + assert ok, f"unexpected label '{label}' on {specific}" + for key in node.props: + assert key in decl.properties, f"undeclared property '{specific}.{key}'" + + +def test_every_emitted_relationship_type_property_and_endpoint_is_declared(): + rows = project(make_sample_app(), "sample-app") + assert rows.edges, "projection produced no edges" + for edge in rows.edges: + decl = _REL_BY_TYPE.get(edge.type) + assert decl is not None, f"undeclared relationship type: {edge.type}" + assert edge.from_ref.label in _merge_labels_for(decl.from_labels), ( + f"bad source {edge.from_ref.label} for {edge.type}" + ) + assert edge.to_ref.label in 
_merge_labels_for(decl.to_labels), ( + f"bad target {edge.to_ref.label} for {edge.type}" + ) + for key in edge.props: + assert key in decl.properties, f"undeclared property on {edge.type}.{key}" + + +def test_all_catalog_node_kinds_and_relationships_are_exercised(): + """Guards the fixture itself: every catalog label/rel should appear at least + once, so the conformance asserts above actually cover the whole schema.""" + rows = project(make_sample_app(), "sample-app") + seen_labels = {_specific_label(n.labels) for n in rows.nodes} + seen_rels = {e.type for e in rows.edges} + assert {n.label for n in NODE_LABELS} <= seen_labels + assert {r.type for r in REL_TYPES} <= seen_rels + + +def test_render_cypher_is_deterministic_and_self_contained(): + app = make_sample_app() + a = render_cypher(project(app, "sample-app"), "sample-app") + b = render_cypher(project(make_sample_app(), "sample-app"), "sample-app") + assert a == b, "cypher rendering must be deterministic" + assert "CREATE CONSTRAINT" in a + assert "DETACH DELETE" in a + assert "MERGE (n:PySymbol {signature: row.k})" in a + + +def test_checked_in_schema_matches_catalog(): + """Run `canpy --emit schema > schema.neo4j.json` if this fails.""" + on_disk_path = Path(__file__).resolve().parents[1] / "schema.neo4j.json" + assert on_disk_path.exists(), "schema.neo4j.json is missing — regenerate it" + on_disk = json.loads(on_disk_path.read_text()) + fresh = build_schema_document() + assert on_disk == fresh