Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions EXPERIMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ All configurations live in [`config.yaml`](src/bcbench/agent/shared/config.yaml)
| `skills.enabled` | `false` | Copy **only** `instructions/<owner>-<repo>/skills/` |
| `agents.enabled` and `agents.name` | `false` | Copy **only** `instructions/<owner>-<repo>/agents/` and pass `--agent=<name>` to the CLI |
| `mcp.servers` | _(none)_ | List of MCP servers to register |
| `plugins.install` | _(none)_ | List of plugins to install (Copilot CLI **and** Claude Code). Each entry has `repo` (marketplace OWNER/REPO or git URL), `sha` (commit to pin the marketplace to), and `name` (`<plugin>@<marketplace>`). The marketplace is cloned at the pinned SHA and added locally before install |

Note: `instructions.enabled: true` is a superset — you don't also need to enable `skills` or `agents` to get them. Use `skills`/`agents` when you want to isolate the effect of just that piece.

Expand Down Expand Up @@ -119,6 +120,7 @@ Each run uploads artifacts and updates a `leaderboard/<category>/<run_id>` branc
- [ ] Skills (`skills.enabled: true`)
- [ ] Custom agents (`agents.enabled: true`, name: ___)
- [ ] MCP servers (list below)
- [ ] Plugins (list below)
- [ ] Other (describe)

### Agent & Model
Expand Down
12 changes: 7 additions & 5 deletions src/bcbench/agent/claude/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import yaml

from bcbench.agent.claude.metrics import parse_metrics
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, install_plugins, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
Expand Down Expand Up @@ -38,15 +38,21 @@ def run_claude_code(

logger.info(f"Running Claude Code on: {entry.instance_id}")

claude_cmd = shutil.which("claude")
if not claude_cmd:
raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")

prompt: str = build_prompt(entry, repo_path, claude_config, category, al_mcp=al_mcp)
mcp_config_json, mcp_server_names = build_mcp_config(claude_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
plugins: list[str] | None = install_plugins(claude_config, claude_cmd)
lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.CLAUDE, al_lsp=al_lsp, container_name=container_name)
instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
plugins=plugins,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
Expand All @@ -56,10 +62,6 @@ def run_claude_code(
logger.info(f"Executing Claude Code in directory: {repo_path}")
logger.debug(f"Using prompt:\n{prompt}")

claude_cmd = shutil.which("claude")
if not claude_cmd:
raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")

try:
cmd_args = [
claude_cmd,
Expand Down
16 changes: 9 additions & 7 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import yaml

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, install_plugins, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
Expand Down Expand Up @@ -41,15 +41,23 @@ def run_copilot_agent(

logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")

# Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
# which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples).
copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot")
if not copilot_cmd:
raise AgentError("Copilot CLI not found in PATH. Please ensure it is installed and available.")

prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
plugins: list[str] | None = install_plugins(copilot_config, copilot_cmd)
lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.COPILOT, al_lsp=al_lsp, container_name=container_name)
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
plugins=plugins,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
Expand All @@ -59,12 +67,6 @@ def run_copilot_agent(
logger.info(f"Executing Copilot CLI in directory: {repo_path}")
logger.debug(f"Using prompt:\n{prompt}")

# Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
# which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples).
copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot")
if not copilot_cmd:
raise AgentError("Copilot CLI not found in PATH. Please ensure it is installed and available.")

try:
cmd_args = [
copilot_cmd,
Expand Down
3 changes: 2 additions & 1 deletion src/bcbench/agent/shared/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
from bcbench.agent.shared.lsp import build_al_lsp_plugin
from bcbench.agent.shared.mcp import build_mcp_config
from bcbench.agent.shared.plugins import install_plugins
from bcbench.agent.shared.prompt import build_prompt

__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "install_plugins", "parse_tool_usage_from_hooks"]
18 changes: 18 additions & 0 deletions src/bcbench/agent/shared/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,21 @@ mcp:
# type: "stdio"
# command: "npx"
# args: ["-y", "@modelcontextprotocol/server-filesystem", "{{repo_path}}"]

# controls:
# which plugins to install before running the agent (works for both Copilot CLI and Claude Code)
# https://docs.github.com/en/copilot/concepts/agents/about-plugins
# Each entry pins a marketplace to a specific commit SHA for reproducibility: the marketplace
# repo is cloned at that SHA and added locally (no marketplace is assumed to be registered by
# default), then the plugin is installed via `<cli> plugin install <plugin>@<marketplace>`.
# - repo: marketplace GitHub repo as OWNER/REPO (or a full git URL)
# - sha: commit SHA to pin the marketplace to
# - name: install spec, "<plugin-name>@<marketplace-name>"
plugins:
install: []
# Example: the awesome-copilot plugin from github/awesome-copilot
# https://github.com/github/awesome-copilot/blob/main/plugins/awesome-copilot/README.md
# To enable locally, change `install: []` above to `install:` and uncomment the entry below (update `sha` to the commit you want to pin):
# - repo: "github/awesome-copilot"
# sha: "28c3a14af4e6232091071ddb40272f72d9d96b2f"
# name: "awesome-copilot@awesome-copilot"
64 changes: 64 additions & 0 deletions src/bcbench/agent/shared/plugins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import subprocess
import tempfile
from typing import Any

from bcbench.exceptions import AgentError
from bcbench.logger import get_logger

# Module-level logger, named after this module.
logger = get_logger(__name__)


def _marketplace_url(repo: str) -> str:
    """Return a clonable git location for a marketplace ``repo`` setting.

    Args:
        repo: Either a GitHub ``OWNER/REPO`` shorthand or an explicit git location
            (a ``scheme://...`` URL or an scp-style ``[user@]host:path`` address).

    Returns:
        ``repo`` unchanged when it already is an explicit location, otherwise the
        ``https://github.com/OWNER/REPO.git`` URL for the shorthand.
    """
    if "://" in repo:
        return repo

    # scp-style addresses (e.g. git@github.com:owner/repo.git) have a colon before any
    # slash; an OWNER/REPO shorthand contains no colon, so pass those through untouched.
    before_colon, colon, _ = repo.partition(":")
    if colon and "/" not in before_colon:
        return repo

    # Tolerate a shorthand that already carries ".git" or stray slashes so the
    # suffix is never doubled.
    shorthand = repo.strip("/").removesuffix(".git")
    return f"https://github.com/{shorthand}.git"


def _plugin_name(spec: str) -> str:
    """Return the part of an install spec that precedes the first ``@``.

    A spec without ``@`` is returned unchanged.
    """
    plugin, _separator, _marketplace = spec.partition("@")
    return plugin


def _clone_marketplace_at_sha(repo: str, sha: str) -> str:
    """Clone a marketplace repository into a new temp directory and check out ``sha``.

    Args:
        repo: Marketplace repository, resolved to a URL via ``_marketplace_url``.
        sha: Revision passed to ``git checkout`` inside the clone.

    Returns:
        Path of the temporary directory holding the checkout. The directory is
        not removed by this function.
    """
    target = tempfile.mkdtemp(prefix="bcbench-plugin-")
    remote = _marketplace_url(repo)
    logger.info(f"Cloning marketplace {remote} at {sha} into {target}")

    clone_cmd = ["git", "clone", "--quiet", remote, target]
    checkout_cmd = ["git", "-C", target, "checkout", "--quiet", sha]
    # Run in order; a failing clone raises before the checkout is attempted.
    for git_cmd in (clone_cmd, checkout_cmd):
        _run(git_cmd)

    return target


def install_plugins(config: dict[str, Any], cli_cmd: str) -> list[str] | None:
"""Install Copilot/Claude plugins declared in config, pinned to a specific marketplace SHA.

Each entry under `plugins.install` must provide `repo` (marketplace OWNER/REPO or git URL),
`sha` (commit to pin the marketplace to), and `name` (`<plugin>@<marketplace>` install spec).
The marketplace is cloned at the pinned SHA and added locally before installing, so no
marketplace is assumed to be registered by default.

Returns:
Installed plugin names, or None when no plugins are configured.
"""
specs: list[dict[str, str]] = config.get("plugins", {}).get("install") or []

if not specs:
return None

plugin_names: list[str] = []
for spec in specs:
repo, sha, name = spec["repo"], spec["sha"], spec["name"]
marketplace_dir = _clone_marketplace_at_sha(repo, sha)

logger.info(f"Registering marketplace from {marketplace_dir}")
_run([cli_cmd, "plugin", "marketplace", "add", marketplace_dir])

logger.info(f"Installing plugin: {name}")
_run([cli_cmd, "plugin", "install", name])

plugin_names.append(_plugin_name(name))

logger.info(f"Installed plugins: {plugin_names}")
return plugin_names


def _run(cmd: list[str]) -> None:
    """Run ``cmd`` and raise ``AgentError`` when it cannot start or exits non-zero.

    Args:
        cmd: Argument vector passed to ``subprocess.run`` (no shell involved).

    Raises:
        AgentError: If the executable cannot be launched, or the process returns a
            non-zero exit code (the captured stderr is included in the message).
    """
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except OSError as err:
        # Raised when the executable (e.g. git or the agent CLI) is missing or not
        # runnable; report it through the same error type as a failed command.
        raise AgentError(f"Plugin command failed: {' '.join(cmd)}\n{err}") from err

    if result.returncode != 0:
        raise AgentError(f"Plugin command failed: {' '.join(cmd)}\n{result.stderr}")
2 changes: 2 additions & 0 deletions src/bcbench/results/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def create_console_summary(results: Sequence[BaseEvaluationResult], summary: Eva
console.print(f"Total Processed: [bold]{len(results)}[/bold], using [bold]{results[0].agent_name}({results[0].model})[/bold]")
console.print(f"Category: [bold]{results[0].category.value}[/bold]")
console.print(f"MCP Servers: [bold]{', '.join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else 'None'}[/bold]")
console.print(f"Plugins: [bold]{', '.join(results[0].experiment.plugins) if results[0].experiment and results[0].experiment.plugins else 'None'}[/bold]")
console.print(f"AL LSP: [bold]{'Yes' if results[0].experiment and results[0].experiment.al_lsp_enabled else 'No'}[/bold]")
console.print(f"Custom Instructions: [bold]{'Yes' if results[0].experiment and results[0].experiment.custom_instructions else 'No'}[/bold]")
console.print(f"Skills: [bold]{'Yes' if results[0].experiment and results[0].experiment.skills_enabled else 'No'}[/bold]")
Expand Down Expand Up @@ -93,6 +94,7 @@ def create_github_job_summary(results: Sequence[BaseEvaluationResult], summary:
f"Total entries processed: {len(results)}, using **{results[0].agent_name} ({results[0].model})**",
f"- Category: `{results[0].category.value}`",
f"- MCP Servers used: {', '.join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else 'None'}",
f"- Plugins: {', '.join(results[0].experiment.plugins) if results[0].experiment and results[0].experiment.plugins else 'None'}",
f"- AL LSP: {'Yes' if results[0].experiment and results[0].experiment.al_lsp_enabled else 'No'}",
f"- Custom Instructions: {'Yes' if results[0].experiment and results[0].experiment.custom_instructions else 'No'}",
f"- Skills: {'Yes' if results[0].experiment and results[0].experiment.skills_enabled else 'No'}",
Expand Down
5 changes: 4 additions & 1 deletion src/bcbench/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ class ExperimentConfiguration(BaseModel):
# MCP server names used in experiment (if any)
mcp_servers: list[str] | None = None

# Plugin names installed for this experiment (if any)
plugins: list[str] | None = None

# Whether the AL LSP server was enabled for this experiment
al_lsp_enabled: bool = False

Expand All @@ -101,7 +104,7 @@ def is_empty(self) -> bool:
An empty configuration means no special experiment settings were used.
This is useful for comparing with None (no experiment) vs default experiment.
"""
return self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None
return self.mcp_servers is None and self.plugins is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None


class AgentType(StrEnum):
Expand Down
8 changes: 8 additions & 0 deletions tests/test_experiment_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,19 @@ def test_default_values(self):
config = ExperimentConfiguration()

assert config.mcp_servers is None
assert config.plugins is None
assert config.al_lsp_enabled is False
assert config.custom_instructions is False
assert config.skills_enabled is False
assert config.custom_agent is None

def test_with_plugins(self):
    """A configuration built with plugins stores them and is not reported as empty."""
    expected = ["plugin-1", "plugin-2"]

    config = ExperimentConfiguration(plugins=expected)

    assert not config.is_empty()
    assert config.plugins == expected

def test_with_mcp_servers(self):
mcp_servers = ["mcp-server-1", "mcp-server-2"]
config = ExperimentConfiguration(mcp_servers=mcp_servers)
Expand Down
73 changes: 73 additions & 0 deletions tests/test_plugins_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from unittest.mock import MagicMock

import pytest

from bcbench.agent.shared.plugins import install_plugins
from bcbench.exceptions import AgentError

# One complete plugin entry (repo, sha, name) reused by the tests in this module.
PLUGIN_SPEC = {
    "repo": "github/awesome-copilot",
    "sha": "28c3a14af4e6232091071ddb40272f72d9d96b2f",
    "name": "awesome-copilot@awesome-copilot",
}


@pytest.fixture
def fake_run(monkeypatch):
    """Patch ``subprocess.run`` in the plugins module with a mock that reports success.

    Returns the mock so tests can inspect the recorded calls.
    """
    completed = MagicMock(returncode=0, stderr="")
    run_stub = MagicMock(return_value=completed)
    monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", run_stub)
    return run_stub


@pytest.fixture(autouse=True)
def fake_mkdtemp(monkeypatch):
    """Make ``tempfile.mkdtemp`` in the plugins module return a fixed path for every test."""

    def _fixed_dir(**_kwargs):
        # Accepts keyword arguments only, mirroring how the code under test calls mkdtemp.
        return "/tmp/market"

    monkeypatch.setattr("bcbench.agent.shared.plugins.tempfile.mkdtemp", _fixed_dir)


def _commands(fake_run) -> list[list[str]]:
    """Return the first positional argument of every recorded call, in call order."""
    argv_lists: list[list[str]] = []
    for recorded in fake_run.call_args_list:
        argv_lists.append(recorded.args[0])
    return argv_lists


class TestInstallPlugins:
    """Checks for ``install_plugins`` with ``subprocess.run`` and ``mkdtemp`` stubbed out."""

    def test_returns_none_when_no_plugins(self, fake_run):
        # Both an empty install list and a missing section mean "nothing to do".
        empty_list_config = {"plugins": {"install": []}}

        assert install_plugins(empty_list_config, "copilot") is None
        assert install_plugins({}, "claude") is None
        fake_run.assert_not_called()

    def test_returns_plugin_names(self, fake_run):
        config = {"plugins": {"install": [PLUGIN_SPEC]}}

        installed = install_plugins(config, "copilot")

        assert installed == ["awesome-copilot"]

    def test_clones_marketplace_pinned_to_sha(self, fake_run):
        config = {"plugins": {"install": [PLUGIN_SPEC]}}

        install_plugins(config, "copilot")

        # The first two commands are the clone and the checkout of the pinned SHA.
        assert _commands(fake_run)[:2] == [
            ["git", "clone", "--quiet", "https://github.com/github/awesome-copilot.git", "/tmp/market"],
            ["git", "-C", "/tmp/market", "checkout", "--quiet", "28c3a14af4e6232091071ddb40272f72d9d96b2f"],
        ]

    def test_always_adds_marketplace_then_installs(self, fake_run):
        config = {"plugins": {"install": [PLUGIN_SPEC]}}

        install_plugins(config, "claude")

        # After the two git commands come the marketplace registration and the install.
        assert _commands(fake_run)[2:4] == [
            ["claude", "plugin", "marketplace", "add", "/tmp/market"],
            ["claude", "plugin", "install", "awesome-copilot@awesome-copilot"],
        ]

    def test_uses_provided_cli_command(self, fake_run):
        config = {"plugins": {"install": [PLUGIN_SPEC]}}

        install_plugins(config, "copilot")

        executables = [argv[0] for argv in _commands(fake_run) if argv[0] != "git"]
        assert all(exe == "copilot" for exe in executables)

    def test_full_git_url_repo(self, fake_run):
        gitlab_spec = {**PLUGIN_SPEC, "repo": "https://gitlab.com/o/r.git"}
        config = {"plugins": {"install": [gitlab_spec]}}

        install_plugins(config, "copilot")

        first_command = _commands(fake_run)[0]
        assert first_command == ["git", "clone", "--quiet", "https://gitlab.com/o/r.git", "/tmp/market"]

    def test_raises_on_failure(self, monkeypatch):
        # Every command reports a non-zero exit code.
        failing_run = MagicMock(return_value=MagicMock(returncode=1, stderr="boom"))
        monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", failing_run)
        config = {"plugins": {"install": [PLUGIN_SPEC]}}

        with pytest.raises(AgentError):
            install_plugins(config, "copilot")