microsoft · Copilot · Jun 29, 2026 · Jun 30, 2026 · Jun 30, 2026
diff --git a/EXPERIMENT.md b/EXPERIMENT.md
@@ -24,6 +24,7 @@ All configurations live in [`config.yaml`](src/bcbench/agent/shared/config.yaml)
 | `skills.enabled` | `false` | Copy **only** `instructions/<owner>-<repo>/skills/` |
 | `agents.enabled` and `agents.name` | `false` | Copy **only** `instructions/<owner>-<repo>/agents/` and pass `--agent=<name>` to the CLI |
 | `mcp.servers` | _(none)_ | List of MCP servers to register |
+| `plugins.install` | _(none)_ | List of plugins to install (Copilot CLI **and** Claude Code). Each entry has `repo` (marketplace OWNER/REPO or git URL), `sha` (commit to pin the marketplace to), and `name` (`<plugin>@<marketplace>`). The marketplace is cloned at the pinned SHA and added locally before install |
 
 Note: `instructions.enabled: true` is a superset — you don't also need to enable `skills` or `agents` to get them. Use `skills`/`agents` when you want to isolate the effect of just that piece.
 
@@ -119,6 +120,7 @@ Each run uploads artifacts and updates a `leaderboard/<category>/<run_id>` branc
 - [ ] Skills (`skills.enabled: true`)
 - [ ] Custom agents (`agents.enabled: true`, name: ___)
 - [ ] MCP servers (list below)
+- [ ] Plugins (list below)
 - [ ] Other (describe)
 
 ### Agent & Model

diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py
@@ -6,7 +6,7 @@
 import yaml
 
 from bcbench.agent.claude.metrics import parse_metrics
-from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
+from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, install_plugins, parse_tool_usage_from_hooks
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
@@ -38,15 +38,21 @@ def run_claude_code(
 
     logger.info(f"Running Claude Code on: {entry.instance_id}")
 
+    claude_cmd = shutil.which("claude")
+    if not claude_cmd:
+        raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")
+
     prompt: str = build_prompt(entry, repo_path, claude_config, category, al_mcp=al_mcp)
     mcp_config_json, mcp_server_names = build_mcp_config(claude_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
+    plugins: list[str] | None = install_plugins(claude_config, claude_cmd)
     lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.CLAUDE, al_lsp=al_lsp, container_name=container_name)
     instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
     skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
     custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
     tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
     config = ExperimentConfiguration(
         mcp_servers=mcp_server_names,
+        plugins=plugins,
         al_lsp_enabled=lsp_plugin_dir is not None,
         custom_instructions=instructions_enabled,
         skills_enabled=skills_enabled,
@@ -56,10 +62,6 @@ def run_claude_code(
     logger.info(f"Executing Claude Code in directory: {repo_path}")
     logger.debug(f"Using prompt:\n{prompt}")
 
-    claude_cmd = shutil.which("claude")
-    if not claude_cmd:
-        raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")
-
     try:
         cmd_args = [
             claude_cmd,

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
@@ -9,7 +9,7 @@
 import yaml
 
 from bcbench.agent.copilot.metrics import parse_metrics
-from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
+from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, install_plugins, parse_tool_usage_from_hooks
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
@@ -41,15 +41,23 @@ def run_copilot_agent(
 
     logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")
 
+    # Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
+    # which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples).
+    copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot")
+    if not copilot_cmd:
+        raise AgentError("Copilot CLI not found in PATH. Please ensure it is installed and available.")
+
     prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
     mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
+    plugins: list[str] | None = install_plugins(copilot_config, copilot_cmd)
     lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.COPILOT, al_lsp=al_lsp, container_name=container_name)
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
     skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
     tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
     config = ExperimentConfiguration(
         mcp_servers=mcp_server_names,
+        plugins=plugins,
         al_lsp_enabled=lsp_plugin_dir is not None,
         custom_instructions=instructions_enabled,
         skills_enabled=skills_enabled,
@@ -59,12 +67,6 @@ def run_copilot_agent(
     logger.info(f"Executing Copilot CLI in directory: {repo_path}")
     logger.debug(f"Using prompt:\n{prompt}")
 
-    # Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
-    # which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples).
-    copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot")
-    if not copilot_cmd:
-        raise AgentError("Copilot CLI not found in PATH. Please ensure it is installed and available.")
-
     try:
         cmd_args = [
             copilot_cmd,

diff --git a/src/bcbench/agent/shared/__init__.py b/src/bcbench/agent/shared/__init__.py
@@ -3,6 +3,7 @@
 from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
 from bcbench.agent.shared.lsp import build_al_lsp_plugin
 from bcbench.agent.shared.mcp import build_mcp_config
+from bcbench.agent.shared.plugins import install_plugins
 from bcbench.agent.shared.prompt import build_prompt
 
-__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
+__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "install_plugins", "parse_tool_usage_from_hooks"]
diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -117,3 +117,21 @@ mcp:
     #   type: "stdio"
     #   command: "npx"
     #   args: ["-y", "@modelcontextprotocol/server-filesystem", "{{repo_path}}"]
+
+# controls:
+# which plugins to install before running the agent (works for both Copilot CLI and Claude Code)
+#    https://docs.github.com/en/copilot/concepts/agents/about-plugins
+# Each entry pins a marketplace to a specific commit SHA for reproducibility: the marketplace
+# repo is cloned at that SHA and added locally (no marketplace is assumed to be registered by
+# default), then the plugin is installed via `<cli> plugin install <plugin>@<marketplace>`.
+#   - repo: marketplace GitHub repo as OWNER/REPO (or a full git URL)
+#   - sha:  commit SHA to pin the marketplace to
+#   - name: install spec, "<plugin-name>@<marketplace-name>"
+plugins:
+  install: []
+    # Example: the awesome-copilot plugin from github/awesome-copilot
+    # https://github.com/github/awesome-copilot/blob/main/plugins/awesome-copilot/README.md
+    # Uncomment to enable locally (replace `install: []` above with `install:` and update `sha` to the commit you want to pin):
+    # - repo: "github/awesome-copilot"
+    #   sha: "28c3a14af4e6232091071ddb40272f72d9d96b2f"
+    #   name: "awesome-copilot@awesome-copilot"
diff --git a/src/bcbench/agent/shared/plugins.py b/src/bcbench/agent/shared/plugins.py
@@ -0,0 +1,116 @@
+import subprocess
+import tempfile
+from typing import Any
+
+from bcbench.exceptions import AgentError
+from bcbench.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Fields every `plugins.install` entry must provide, each as a non-empty string.
+_REQUIRED_SPEC_FIELDS = ("repo", "sha", "name")
+
+
+def _marketplace_url(repo: str) -> str:
+    """Return the git URL to clone for a marketplace given as OWNER/REPO or as a git URL.
+
+    Values that already look like a git URL -- anything with a ``scheme://`` or scp-like SSH
+    syntax such as ``git@host:owner/repo.git`` -- are returned unchanged. Everything else is
+    treated as a GitHub ``OWNER/REPO`` shorthand.
+    """
+    user_host, colon, _ = repo.partition(":")
+    if "://" in repo or (colon and "@" in user_host):
+        return repo
+    return f"https://github.com/{repo}.git"
+
+
+def _plugin_name(spec: str) -> str:
+    """Return the plugin part of a ``<plugin>@<marketplace>`` install spec."""
+    return spec.split("@", 1)[0]
+
+
+def _spec_fields(spec: Any) -> tuple[str, str, str]:
+    """Return ``(repo, sha, name)`` from one `plugins.install` entry.
+
+    Raises:
+        AgentError: If the entry is not a mapping or a required field is missing, empty, or not
+            a string (e.g. an unquoted all-digit SHA that YAML parsed as a number).
+    """
+    if not isinstance(spec, dict):
+        raise AgentError(f"Invalid plugin entry, expected a mapping with repo/sha/name: {spec!r}")
+    invalid = [field for field in _REQUIRED_SPEC_FIELDS if not isinstance(spec.get(field), str) or not spec[field]]
+    if invalid:
+        raise AgentError(f"Plugin entry {spec!r} has missing or non-string field(s): {', '.join(invalid)}")
+    return spec["repo"], spec["sha"], spec["name"]
+
+
+def _clone_marketplace_at_sha(repo: str, sha: str) -> str:
+    """Clone the marketplace repo into a new temporary directory and check out `sha`.
+
+    Returns the clone directory. It is not removed here: the caller registers it with the CLI
+    as a local marketplace.
+
+    Raises:
+        AgentError: If the clone or the checkout fails.
+    """
+    clone_dir = tempfile.mkdtemp(prefix="bcbench-plugin-")
+    url = _marketplace_url(repo)
+    logger.info(f"Cloning marketplace {url} at {sha} into {clone_dir}")
+    _run(["git", "clone", "--quiet", url, clone_dir])
+    _run(["git", "-C", clone_dir, "checkout", "--quiet", sha])
+    return clone_dir
+
+
+def install_plugins(config: dict[str, Any], cli_cmd: str) -> list[str] | None:
+    """Install Copilot/Claude plugins declared in config, pinned to a specific marketplace SHA.
+
+    Each entry under `plugins.install` must provide `repo` (marketplace OWNER/REPO or git URL),
+    `sha` (commit to pin the marketplace to), and `name` (`<plugin>@<marketplace>` install spec).
+    The marketplace is cloned at the pinned SHA and added locally before installing, so no
+    marketplace is assumed to be registered by default.
+
+    Args:
+        config: Parsed agent configuration; only the `plugins.install` list is read.
+        cli_cmd: Executable of the agent CLI used for the `plugin ...` subcommands.
+
+    Returns:
+        Installed plugin names, or None when no plugins are configured.
+
+    Raises:
+        AgentError: If an entry is malformed or any git/CLI command fails.
+    """
+    # `plugins:` (or `install:`) may be present but empty in YAML, which parses to None.
+    specs: list[dict[str, str]] = (config.get("plugins") or {}).get("install") or []
+
+    if not specs:
+        return None
+
+    # Validate every entry up front so a typo in a later entry fails before anything is installed.
+    entries = [_spec_fields(spec) for spec in specs]
+
+    plugin_names: list[str] = []
+    for repo, sha, name in entries:
+        marketplace_dir = _clone_marketplace_at_sha(repo, sha)
+
+        logger.info(f"Registering marketplace from {marketplace_dir}")
+        _run([cli_cmd, "plugin", "marketplace", "add", marketplace_dir])
+
+        logger.info(f"Installing plugin: {name}")
+        _run([cli_cmd, "plugin", "install", name])
+
+        plugin_names.append(_plugin_name(name))
+
+    logger.info(f"Installed plugins: {plugin_names}")
+    return plugin_names
+
+
+def _run(cmd: list[str]) -> None:
+    """Run `cmd`, raising AgentError when it cannot be started or exits with a non-zero code."""
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=False)
+    except OSError as exc:
+        # e.g. `git` is not installed; keep the module's contract of raising AgentError only.
+        raise AgentError(f"Plugin command could not be started: {' '.join(cmd)}\n{exc}") from exc
+    if result.returncode != 0:
+        # Fall back to stdout in case the command reported its error there and left stderr empty.
+        raise AgentError(f"Plugin command failed: {' '.join(cmd)}\n{result.stderr or result.stdout}")
diff --git a/src/bcbench/results/display.py b/src/bcbench/results/display.py
@@ -26,6 +26,7 @@ def create_console_summary(results: Sequence[BaseEvaluationResult], summary: Eva
     console.print(f"Total Processed: [bold]{len(results)}[/bold], using [bold]{results[0].agent_name}({results[0].model})[/bold]")
     console.print(f"Category: [bold]{results[0].category.value}[/bold]")
     console.print(f"MCP Servers: [bold]{', '.join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else 'None'}[/bold]")
+    console.print(f"Plugins: [bold]{', '.join(results[0].experiment.plugins) if results[0].experiment and results[0].experiment.plugins else 'None'}[/bold]")
     console.print(f"AL LSP: [bold]{'Yes' if results[0].experiment and results[0].experiment.al_lsp_enabled else 'No'}[/bold]")
     console.print(f"Custom Instructions: [bold]{'Yes' if results[0].experiment and results[0].experiment.custom_instructions else 'No'}[/bold]")
     console.print(f"Skills: [bold]{'Yes' if results[0].experiment and results[0].experiment.skills_enabled else 'No'}[/bold]")
@@ -93,6 +94,7 @@ def create_github_job_summary(results: Sequence[BaseEvaluationResult], summary:
             f"Total entries processed: {len(results)}, using **{results[0].agent_name} ({results[0].model})**",
             f"- Category: `{results[0].category.value}`",
             f"- MCP Servers used: {', '.join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else 'None'}",
+            f"- Plugins: {', '.join(results[0].experiment.plugins) if results[0].experiment and results[0].experiment.plugins else 'None'}",
             f"- AL LSP: {'Yes' if results[0].experiment and results[0].experiment.al_lsp_enabled else 'No'}",
             f"- Custom Instructions: {'Yes' if results[0].experiment and results[0].experiment.custom_instructions else 'No'}",
             f"- Skills: {'Yes' if results[0].experiment and results[0].experiment.skills_enabled else 'No'}",

diff --git a/src/bcbench/types.py b/src/bcbench/types.py
@@ -83,6 +83,9 @@ class ExperimentConfiguration(BaseModel):
     # MCP server names used in experiment (if any)
     mcp_servers: list[str] | None = None
 
+    # Plugin names installed for this experiment (if any)
+    plugins: list[str] | None = None
+
     # Whether the AL LSP server was enabled for this experiment
     al_lsp_enabled: bool = False
 
@@ -101,7 +104,7 @@ def is_empty(self) -> bool:
         An empty configuration means no special experiment settings were used.
         This is useful for comparing with None (no experiment) vs default experiment.
         """
-        return self.mcp_servers is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None
+        return self.mcp_servers is None and self.plugins is None and self.al_lsp_enabled is False and self.custom_instructions is False and self.skills_enabled is False and self.custom_agent is None
 
 
 class AgentType(StrEnum):

diff --git a/tests/test_experiment_configuration.py b/tests/test_experiment_configuration.py
@@ -8,11 +8,19 @@ def test_default_values(self):
         config = ExperimentConfiguration()
 
         assert config.mcp_servers is None
+        assert config.plugins is None
         assert config.al_lsp_enabled is False
         assert config.custom_instructions is False
         assert config.skills_enabled is False
         assert config.custom_agent is None
 
+    def test_with_plugins(self):
+        plugins = ["plugin-1", "plugin-2"]
+        config = ExperimentConfiguration(plugins=plugins)
+
+        assert config.plugins == plugins
+        assert not config.is_empty()
+
     def test_with_mcp_servers(self):
         mcp_servers = ["mcp-server-1", "mcp-server-2"]
         config = ExperimentConfiguration(mcp_servers=mcp_servers)

diff --git a/tests/test_plugins_config.py b/tests/test_plugins_config.py
@@ -0,0 +1,91 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from bcbench.agent.shared.plugins import install_plugins
+from bcbench.exceptions import AgentError
+
+PLUGIN_SPEC = {
+    "repo": "github/awesome-copilot",
+    "sha": "28c3a14af4e6232091071ddb40272f72d9d96b2f",
+    "name": "awesome-copilot@awesome-copilot",
+}
+
+
+@pytest.fixture
+def fake_run(monkeypatch):
+    """Replace subprocess.run in the plugins module with a mock that always succeeds."""
+    mock = MagicMock(return_value=MagicMock(returncode=0, stderr=""))
+    monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", mock)
+    return mock
+
+
+@pytest.fixture(autouse=True)
+def fake_mkdtemp(monkeypatch):
+    """Pin the clone directory so the expected command lines are deterministic."""
+    monkeypatch.setattr("bcbench.agent.shared.plugins.tempfile.mkdtemp", lambda **_: "/tmp/market")
+
+
+def _commands(fake_run) -> list[list[str]]:
+    """Return the argv list of every recorded subprocess.run call, in call order."""
+    return [c.args[0] for c in fake_run.call_args_list]
+
+
+class TestInstallPlugins:
+    def test_returns_none_when_no_plugins(self, fake_run):
+        assert install_plugins({"plugins": {"install": []}}, "copilot") is None
+        assert install_plugins({}, "claude") is None
+        fake_run.assert_not_called()
+
+    def test_returns_plugin_names(self, fake_run):
+        names = install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        assert names == ["awesome-copilot"]
+
+    def test_installs_every_configured_plugin_in_order(self, fake_run):
+        second = {**PLUGIN_SPEC, "name": "other-plugin@awesome-copilot"}
+
+        names = install_plugins({"plugins": {"install": [PLUGIN_SPEC, second]}}, "copilot")
+
+        assert names == ["awesome-copilot", "other-plugin"]
+        # clone + checkout + marketplace add + install, once per entry
+        assert len(_commands(fake_run)) == 8
+
+    def test_clones_marketplace_pinned_to_sha(self, fake_run):
+        install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        commands = _commands(fake_run)
+        assert commands[0] == ["git", "clone", "--quiet", "https://github.com/github/awesome-copilot.git", "/tmp/market"]
+        assert commands[1] == ["git", "-C", "/tmp/market", "checkout", "--quiet", "28c3a14af4e6232091071ddb40272f72d9d96b2f"]
+
+    def test_always_adds_marketplace_then_installs(self, fake_run):
+        install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "claude")
+
+        commands = _commands(fake_run)
+        assert commands[2] == ["claude", "plugin", "marketplace", "add", "/tmp/market"]
+        assert commands[3] == ["claude", "plugin", "install", "awesome-copilot@awesome-copilot"]
+
+    def test_uses_provided_cli_command(self, fake_run):
+        install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        cli_commands = [c for c in _commands(fake_run) if c[0] != "git"]
+        # Guard against all() passing vacuously when no CLI command was recorded.
+        assert cli_commands
+        assert all(c[0] == "copilot" for c in cli_commands)
+
+    def test_full_git_url_repo(self, fake_run):
+        spec = {**PLUGIN_SPEC, "repo": "https://gitlab.com/o/r.git"}
+
+        install_plugins({"plugins": {"install": [spec]}}, "copilot")
+
+        assert _commands(fake_run)[0] == ["git", "clone", "--quiet", "https://gitlab.com/o/r.git", "/tmp/market"]
+
+    def test_raises_on_failure(self, monkeypatch):
+        failing_run = MagicMock(return_value=MagicMock(returncode=1, stderr="boom"))
+        monkeypatch.setattr("bcbench.agent.shared.plugins.subprocess.run", failing_run)
+
+        with pytest.raises(AgentError, match="boom"):
+            install_plugins({"plugins": {"install": [PLUGIN_SPEC]}}, "copilot")
+
+        # The first failing command aborts the install; nothing else is attempted.
+        assert failing_run.call_count == 1