diff --git a/EXPERIMENT.md b/EXPERIMENT.md index c2b856e7a..b4ac7bf67 100644 --- a/EXPERIMENT.md +++ b/EXPERIMENT.md @@ -24,6 +24,7 @@ All configurations live in [`config.yaml`](src/bcbench/agent/shared/config.yaml) | `skills.enabled` | `false` | Copy **only** `instructions/-/skills/` | | `agents.enabled` and `agents.name` | `false` | Copy **only** `instructions/-/agents/` and pass `--agent=` to the CLI | | `mcp.servers` | _(none)_ | List of MCP servers to register | +| `plugins.install` | _(none)_ | List of plugins to install (Copilot CLI **and** Claude Code). Each entry has `repo` (marketplace OWNER/REPO or git URL), `sha` (commit to pin the marketplace to), and `name` (`<plugin>@<marketplace>`). The marketplace is cloned at the pinned SHA and added locally before install | Note: `instructions.enabled: true` is a superset — you don't also need to enable `skills` or `agents` to get them. Use `skills`/`agents` when you want to isolate the effect of just that piece. @@ -119,6 +120,7 @@ Each run uploads artifacts and updates a `leaderboard//` branc - [ ] Skills (`skills.enabled: true`) - [ ] Custom agents (`agents.enabled: true`, name: ___) - [ ] MCP servers (list below) +- [ ] Plugins (list below) - [ ] Other (describe) ### Agent & Model diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py index ae2e38214..5b1db4964 100644 --- a/src/bcbench/agent/claude/agent.py +++ b/src/bcbench/agent/claude/agent.py @@ -6,7 +6,7 @@ import yaml from bcbench.agent.claude.metrics import parse_metrics -from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks +from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, install_plugins, parse_tool_usage_from_hooks from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError @@ -38,8 +38,13 @@ def run_claude_code( logger.info(f"Running Claude Code on: 
{entry.instance_id}") + claude_cmd = shutil.which("claude") + if not claude_cmd: + raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.") + prompt: str = build_prompt(entry, repo_path, claude_config, category, al_mcp=al_mcp) mcp_config_json, mcp_server_names = build_mcp_config(claude_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name) + plugins: list[str] | None = install_plugins(claude_config, claude_cmd) lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.CLAUDE, al_lsp=al_lsp, container_name=container_name) instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) @@ -47,6 +52,7 @@ def run_claude_code( tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir) config = ExperimentConfiguration( mcp_servers=mcp_server_names, + plugins=plugins, al_lsp_enabled=lsp_plugin_dir is not None, custom_instructions=instructions_enabled, skills_enabled=skills_enabled, @@ -56,10 +62,6 @@ def run_claude_code( logger.info(f"Executing Claude Code in directory: {repo_path}") logger.debug(f"Using prompt:\n{prompt}") - claude_cmd = shutil.which("claude") - if not claude_cmd: - raise AgentError("Claude Code not found in PATH. 
Please ensure it is installed and available.") - try: cmd_args = [ claude_cmd, diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py index a7879cacc..d45f76db2 100644 --- a/src/bcbench/agent/copilot/agent.py +++ b/src/bcbench/agent/copilot/agent.py @@ -9,7 +9,7 @@ import yaml from bcbench.agent.copilot.metrics import parse_metrics -from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks +from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, install_plugins, parse_tool_usage_from_hooks from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError @@ -41,8 +41,15 @@ def run_copilot_agent( logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}") + # Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell, + # which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples). + copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot") + if not copilot_cmd: + raise AgentError("Copilot CLI not found in PATH. 
Please ensure it is installed and available.") + prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp) mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name) + plugins: list[str] | None = install_plugins(copilot_config, copilot_cmd) lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.COPILOT, al_lsp=al_lsp, container_name=container_name) instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) @@ -50,6 +57,7 @@ def run_copilot_agent( tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir) config = ExperimentConfiguration( mcp_servers=mcp_server_names, + plugins=plugins, al_lsp_enabled=lsp_plugin_dir is not None, custom_instructions=instructions_enabled, skills_enabled=skills_enabled, @@ -59,12 +67,6 @@ def run_copilot_agent( logger.info(f"Executing Copilot CLI in directory: {repo_path}") logger.debug(f"Using prompt:\n{prompt}") - # Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell, - # which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples). - copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot") - if not copilot_cmd: - raise AgentError("Copilot CLI not found in PATH. 
Please ensure it is installed and available.") - try: cmd_args = [ copilot_cmd, diff --git a/src/bcbench/agent/shared/__init__.py b/src/bcbench/agent/shared/__init__.py index 7820106ed..34b35c5ee 100644 --- a/src/bcbench/agent/shared/__init__.py +++ b/src/bcbench/agent/shared/__init__.py @@ -3,6 +3,7 @@ from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks from bcbench.agent.shared.lsp import build_al_lsp_plugin from bcbench.agent.shared.mcp import build_mcp_config +from bcbench.agent.shared.plugins import install_plugins from bcbench.agent.shared.prompt import build_prompt -__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"] +__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "install_plugins", "parse_tool_usage_from_hooks"] diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml index b5dd27f64..a675d1564 100644 --- a/src/bcbench/agent/shared/config.yaml +++ b/src/bcbench/agent/shared/config.yaml @@ -117,3 +117,21 @@ mcp: # type: "stdio" # command: "npx" # args: ["-y", "@modelcontextprotocol/server-filesystem", "{{repo_path}}"] + +# controls: +# which plugins to install before running the agent (works for both Copilot CLI and Claude Code) +# https://docs.github.com/en/copilot/concepts/agents/about-plugins +# Each entry pins a marketplace to a specific commit SHA for reproducibility: the marketplace +# repo is cloned at that SHA and added locally (no marketplace is assumed to be registered by +# default), then the plugin is installed via `<cli> plugin install <plugin>@<marketplace>`. 
+# - repo: marketplace GitHub repo as OWNER/REPO (or a full git URL)
+# - sha: commit SHA to pin the marketplace to
+# - name: install spec, "<plugin>@<marketplace>"
+plugins:
+  install: []
+  # Example: the awesome-copilot plugin from github/awesome-copilot
+  # https://github.com/github/awesome-copilot/blob/main/plugins/awesome-copilot/README.md
+  # Uncomment to enable locally (update `sha` to the commit you want to pin):
+  # - repo: "github/awesome-copilot"
+  #   sha: "28c3a14af4e6232091071ddb40272f72d9d96b2f"
+  #   name: "awesome-copilot@awesome-copilot"
diff --git a/src/bcbench/agent/shared/plugins.py b/src/bcbench/agent/shared/plugins.py
new file mode 100644
index 000000000..27a2d2421
--- /dev/null
+++ b/src/bcbench/agent/shared/plugins.py
@@ -0,0 +1,114 @@
+import subprocess
+import tempfile
+from typing import Any
+
+from bcbench.exceptions import AgentError
+from bcbench.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Keys that every `plugins.install` entry must provide (non-empty).
+_REQUIRED_SPEC_KEYS = ("repo", "sha", "name")
+
+
+def _marketplace_url(repo: str) -> str:
+    """Return the git remote to clone for a marketplace `repo` setting.
+
+    Full URLs (anything containing `://`) and scp-style remotes such as
+    `git@github.com:owner/repo.git` are passed through unchanged; a bare OWNER/REPO
+    shorthand is expanded to the matching https://github.com URL.
+    """
+    if "://" in repo or ("@" in repo and ":" in repo):
+        return repo
+    # Tolerate a shorthand that already ends in ".git" so we never build "<repo>.git.git".
+    return f"https://github.com/{repo.removesuffix('.git')}.git"
+
+
+def _plugin_name(spec: str) -> str:
+    """Return the plugin part of a `<plugin>@<marketplace>` install spec."""
+    return spec.split("@", 1)[0]
+
+
+def _validate_spec(spec: Any) -> tuple[str, str, str]:
+    """Return `(repo, sha, name)` from one `plugins.install` entry.
+
+    Raises:
+        AgentError: If the entry is not a mapping or a required key is missing or empty.
+    """
+    if not isinstance(spec, dict):
+        raise AgentError(f"Invalid plugins.install entry {spec!r}: expected a mapping with {', '.join(_REQUIRED_SPEC_KEYS)}")
+    missing = [key for key in _REQUIRED_SPEC_KEYS if not spec.get(key)]
+    if missing:
+        raise AgentError(f"Invalid plugins.install entry {spec!r}: missing {', '.join(missing)}")
+    # YAML may parse an unquoted value (e.g. an all-digit SHA) as a non-string; subprocess needs str.
+    return str(spec["repo"]), str(spec["sha"]), str(spec["name"])
+
+
+def _clone_marketplace_at_sha(repo: str, sha: str) -> str:
+    """Clone the marketplace `repo` into a fresh temp directory, check out `sha`, and return the path.
+
+    The directory is not removed here: its path is what gets registered with the CLI.
+    NOTE(review): confirm whether the CLI copies the marketplace or keeps reading this path
+    before adding any cleanup.
+    """
+    clone_dir = tempfile.mkdtemp(prefix="bcbench-plugin-")
+    url = _marketplace_url(repo)
+    logger.info(f"Cloning marketplace {url} at {sha} into {clone_dir}")
+    _run(["git", "clone", "--quiet", url, clone_dir])
+    _run(["git", "-C", clone_dir, "checkout", "--quiet", sha])
+    return clone_dir
+
+
+def install_plugins(config: dict[str, Any], cli_cmd: str) -> list[str] | None:
+    """Install Copilot/Claude plugins declared in config, pinned to a specific marketplace SHA.
+
+    Each entry under `plugins.install` must provide `repo` (marketplace OWNER/REPO or git URL),
+    `sha` (commit to pin the marketplace to), and `name` (`<plugin>@<marketplace>` install spec).
+    The marketplace is cloned at the pinned SHA and added locally before installing, so no
+    marketplace is assumed to be registered by default.
+
+    Args:
+        config: Agent configuration; plugins are read from `config["plugins"]["install"]`.
+        cli_cmd: CLI executable used for the `plugin marketplace add` and `plugin install` commands.
+
+    Returns:
+        Installed plugin names, or None when no plugins are configured.
+
+    Raises:
+        AgentError: If an entry is malformed or any git/CLI command fails.
+    """
+    # A `plugins:` key whose children are all commented out parses as None, so guard both levels.
+    specs: list[Any] = (config.get("plugins") or {}).get("install") or []
+
+    if not specs:
+        return None
+
+    # Validate every entry first so a malformed one fails before anything is cloned or installed.
+    entries = [_validate_spec(spec) for spec in specs]
+
+    plugin_names: list[str] = []
+    for repo, sha, name in entries:
+        marketplace_dir = _clone_marketplace_at_sha(repo, sha)
+
+        logger.info(f"Registering marketplace from {marketplace_dir}")
+        _run([cli_cmd, "plugin", "marketplace", "add", marketplace_dir])
+
+        logger.info(f"Installing plugin: {name}")
+        _run([cli_cmd, "plugin", "install", name])
+
+        plugin_names.append(_plugin_name(name))
+
+    logger.info(f"Installed plugins: {plugin_names}")
+    return plugin_names
+
+
+def _run(cmd: list[str]) -> None:
+    """Run `cmd`, raising AgentError if it cannot be started or exits with a non-zero status."""
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=False)
+    except OSError as err:
+        # e.g. FileNotFoundError when `git` is not installed; report it as AgentError like any other command failure
+        raise AgentError(f"Plugin command could not be started: {' '.join(cmd)}\n{err}") from err
+    if result.returncode != 0:
+        # Some CLIs report errors on stdout, so fall back to it when stderr is empty.
+        raise AgentError(f"Plugin command failed: {' '.join(cmd)}\n{result.stderr or result.stdout}")
diff --git a/src/bcbench/results/display.py b/src/bcbench/results/display.py
index 4a6c2e030..a5d11c375 100644
--- a/src/bcbench/results/display.py
+++ b/src/bcbench/results/display.py
@@ -26,6 +26,7 @@ def create_console_summary(results: Sequence[BaseEvaluationResult], summary: Eva
     console.print(f"Total Processed: [bold]{len(results)}[/bold], using [bold]{results[0].agent_name}({results[0].model})[/bold]")
     console.print(f"Category: [bold]{results[0].category.value}[/bold]")
     console.print(f"MCP Servers: [bold]{', '.join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else 'None'}[/bold]")
+    console.print(f"Plugins: [bold]{', '.join(results[0].experiment.plugins) if results[0].experiment and results[0].experiment.plugins else 'None'}[/bold]")
     console.print(f"AL LSP: [bold]{'Yes' if results[0].experiment and results[0].experiment.al_lsp_enabled else 'No'}[/bold]")
     console.print(f"Custom Instructions: [bold]{'Yes' if results[0].experiment and results[0].experiment.custom_instructions else 'No'}[/bold]")
     console.print(f"Skills: [bold]{'Yes' if results[0].experiment and results[0].experiment.skills_enabled else 'No'}[/bold]")
@@ -93,6 +94,7 @@ def create_github_job_summary(results: Sequence[BaseEvaluationResult], summary:
         f"Total entries processed: {len(results)}, using **{results[0].agent_name} ({results[0].model})**",
         f"- Category: `{results[0].category.value}`",
         f"- MCP Servers used: {', '.join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else 'None'}",
+        f"- Plugins: {', '.join(results[0].experiment.plugins) if results[0].experiment and results[0].experiment.plugins else 'None'}",
         f"- AL LSP: {'Yes' if results[0].experiment and results[0].experiment.al_lsp_enabled else 'No'}",
         f"- Custom Instructions: {'Yes' if results[0].experiment and results[0].experiment.custom_instructions else 'No'}",
         f"- Skills: {'Yes' if results[0].experiment and results[0].experiment.skills_enabled else 'No'}",
diff --git a/src/bcbench/types.py b/src/bcbench/types.py
index b70821d05..a7760912c 100644
--- a/src/bcbench/types.py
+++ b/src/bcbench/types.py
@@ -83,6 +83,9 @@ class ExperimentConfiguration(BaseModel):
     # MCP server names used in experiment (if any)
     mcp_servers: list[str] | None = None
 
+    # Plugin names installed for this experiment (if any)
+    plugins: list[str] | None = None
+
     # Whether the AL LSP server was enabled for this experiment
     al_lsp_enabled: bool = False
 
@@ -101,7 +104,7 @@ def is_empty(self) -> bool:
         An empty configuration means no special experiment settings were used.
         This is useful for comparing with None (no experiment) vs default experiment.
         """
-        return self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None
+        return self.mcp_servers is None and self.plugins is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None
 
 
 class AgentType(StrEnum):
diff --git a/tests/test_experiment_configuration.py b/tests/test_experiment_configuration.py
index fd3c2b83d..123baf60e 100644
--- a/tests/test_experiment_configuration.py
+++ b/tests/test_experiment_configuration.py
@@ -8,11 +8,19 @@ def test_default_values(self):
         config = ExperimentConfiguration()
 
         assert config.mcp_servers is None
+        assert config.plugins is None
         assert config.al_lsp_enabled is False
         assert config.custom_instructions is False
         assert config.skills_enabled is False
         assert config.custom_agent is None
 
+    def test_with_plugins(self):
+        plugins = ["plugin-1", "plugin-2"]
+        config = ExperimentConfiguration(plugins=plugins)
+
+        assert config.plugins == plugins
+        assert not config.is_empty()
+
     def test_with_mcp_servers(self):
         mcp_servers = ["mcp-server-1", "mcp-server-2"]
         config = ExperimentConfiguration(mcp_servers=mcp_servers)
diff --git a/tests/test_plugins_config.py b/tests/test_plugins_config.py
new file mode 100644
index 000000000..3822d0061
--- /dev/null
+++ b/tests/test_plugins_config.py
@@ -0,0 +1,106 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from bcbench.agent.shared.plugins import install_plugins
+from bcbench.exceptions import AgentError
+
+PLUGIN_SPEC = {
+    "repo": "github/awesome-copilot",
+    "sha": "28c3a14af4e6232091071ddb40272f72d9d96b2f",
+    "name": "awesome-copilot@awesome-copilot",
+}
+
+
+@pytest.fixture
+def fake_run(monkeypatch):
+    mock = MagicMock(return_value=MagicMock(returncode=0, stderr=""))
+    monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", mock)
+    return mock
+
+
+@pytest.fixture(autouse=True)
+def fake_mkdtemp(monkeypatch):
+    monkeypatch.setattr("bcbench.agent.shared.plugins.tempfile.mkdtemp", lambda **_: "/tmp/market")
+
+
+def _commands(fake_run) -> list[list[str]]:
+    return [c.args[0] for c in fake_run.call_args_list]
+
+
+class TestInstallPlugins:
+    def test_returns_none_when_no_plugins(self, fake_run):
+        assert install_plugins({"plugins": {"install": []}}, "copilot") is None
+        assert install_plugins({}, "claude") is None
+        fake_run.assert_not_called()
+
+    def test_returns_none_when_plugins_section_is_null(self, fake_run):
+        assert install_plugins({"plugins": None}, "copilot") is None
+        assert install_plugins({"plugins": {"install": None}}, "copilot") is None
+        fake_run.assert_not_called()
+
+    def test_returns_plugin_names(self, fake_run):
+        names = install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        assert names == ["awesome-copilot"]
+
+    def test_clones_marketplace_pinned_to_sha(self, fake_run):
+        install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        commands = _commands(fake_run)
+        assert commands[0] == ["git", "clone", "--quiet", "https://github.com/github/awesome-copilot.git", "/tmp/market"]
+        assert commands[1] == ["git", "-C", "/tmp/market", "checkout", "--quiet", "28c3a14af4e6232091071ddb40272f72d9d96b2f"]
+
+    def test_always_adds_marketplace_then_installs(self, fake_run):
+        install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "claude")
+
+        commands = _commands(fake_run)
+        assert commands[2] == ["claude", "plugin", "marketplace", "add", "/tmp/market"]
+        assert commands[3] == ["claude", "plugin", "install", "awesome-copilot@awesome-copilot"]
+
+    def test_uses_provided_cli_command(self, fake_run):
+        install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        cli_commands = [c for c in _commands(fake_run) if c[0] != "git"]
+        assert all(c[0] == "copilot" for c in cli_commands)
+
+    def test_full_git_url_repo(self, fake_run):
+        spec = {**PLUGIN_SPEC, "repo": "https://gitlab.com/o/r.git"}
+
+        install_plugins({"plugins": {"install": [spec]}}, "copilot")
+
+        assert _commands(fake_run)[0] == ["git", "clone", "--quiet", "https://gitlab.com/o/r.git", "/tmp/market"]
+
+    def test_scp_style_repo_is_passed_through(self, fake_run):
+        spec = {**PLUGIN_SPEC, "repo": "git@github.com:o/r.git"}
+
+        install_plugins({"plugins": {"install": [spec]}}, "copilot")
+
+        assert _commands(fake_run)[0] == ["git", "clone", "--quiet", "git@github.com:o/r.git", "/tmp/market"]
+
+    def test_shorthand_repo_with_git_suffix(self, fake_run):
+        spec = {**PLUGIN_SPEC, "repo": "github/awesome-copilot.git"}
+
+        install_plugins({"plugins": {"install": [spec]}}, "copilot")
+
+        assert _commands(fake_run)[0] == ["git", "clone", "--quiet", "https://github.com/github/awesome-copilot.git", "/tmp/market"]
+
+    def test_raises_on_failure(self, monkeypatch):
+        monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", MagicMock(return_value=MagicMock(returncode=1, stderr="boom")))
+
+        with pytest.raises(AgentError):
+            install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+    def test_raises_when_command_cannot_start(self, monkeypatch):
+        monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", MagicMock(side_effect=FileNotFoundError("git")))
+
+        with pytest.raises(AgentError):
+            install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+    def test_invalid_entry_raises_before_running_anything(self, fake_run):
+        incomplete = {key: value for key, value in PLUGIN_SPEC.items() if key != "sha"}
+
+        with pytest.raises(AgentError):
+            install_plugins({"plugins": {"install": [PLUGIN_SPEC, incomplete]}}, "copilot")
+
+        fake_run.assert_not_called()