diff --git a/.gitignore b/.gitignore index db641a20..dabeec55 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ # env .env* trusted-server.toml +js-assets.toml # backup **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index 9252e52e..5ff9460f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,6 +182,23 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "async-tungstenite" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc405d38be14342132609f06f02acaf825ddccfe76c4824a69281e0458ebd4" +dependencies = [ + "atomic-waker", + "futures-core", + "futures-io", + "futures-task", + "futures-util", + "log", + "pin-project-lite", + "tokio", + "tungstenite", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -361,11 +378,20 @@ version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" +dependencies = [ + "serde", +] [[package]] name = "cast" @@ -421,6 +447,71 @@ dependencies = [ "zeroize", ] +[[package]] +name = "chromiumoxide" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26ed067eb6c1f660bdb87c05efb964421d2ca262bae0296cdfe38cf0cd949a3e" +dependencies = [ + "async-tungstenite", + "base64", + "bytes", + "chromiumoxide_cdp", + "chromiumoxide_types", + "dunce", + "fnv", + "futures", + "futures-timer", + "pin-project-lite", + "reqwest 0.13.4", + "serde", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "which", + "windows-registry", +] + +[[package]] 
+name = "chromiumoxide_cdp" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a6a03a7ebac4ea85308f285d6959a3e6b2ce32a0c9465dc7a7b1db0144eec7" +dependencies = [ + "chromiumoxide_pdl", + "chromiumoxide_types", + "serde", + "serde_json", +] + +[[package]] +name = "chromiumoxide_pdl" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c602dea92337bc4d824668d78c5b79c3b4ddb29b40dd7218282bbe8fd3fc2091" +dependencies = [ + "chromiumoxide_types", + "either", + "heck", + "once_cell", + "proc-macro2", + "quote", + "regex", + "serde_json", +] + +[[package]] +name = "chromiumoxide_types" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678d5146e74f16fc4a41978b275af572cd913de1f10270d2b93b6c276bc57d80" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "chrono" version = "0.4.45" @@ -735,6 +826,19 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e901edd733a1472f944a45116df3f846f54d37e67e68640ac8bb69689aca2aa" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.3", + "smallvec", +] + [[package]] name = "cssparser" version = "0.36.0" @@ -744,7 +848,7 @@ dependencies = [ "cssparser-macros", "dtoa-short", "itoa", - "phf", + "phf 0.13.1", "smallvec", ] @@ -830,6 +934,12 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "data-encoding" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8" + [[package]] name = "der" version = "0.7.10" @@ -1181,6 +1291,12 @@ dependencies = [ "validator", ] +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + [[package]] name = "either" version = "1.16.0" @@ -1409,6 +1525,16 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.32" @@ -1480,6 +1606,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" +[[package]] +name = "futures-timer" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af43fadb8a98512d547e37b4e92e0ced13e205c061b87b4623eff01d918d6968" + [[package]] name = "futures-util" version = "0.3.32" @@ -1497,6 +1629,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1508,6 +1649,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -1658,6 +1808,17 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "html5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" +dependencies = [ + 
"log", + "markup5ever", + "match_token", +] + [[package]] name = "http" version = "1.4.2" @@ -2215,14 +2376,14 @@ checksum = "00aad58f6ec3990e795943872f13651e7a5fa59dca2c8f31a74faf8a0e0fb652" dependencies = [ "bitflags 2.13.0", "cfg-if", - "cssparser", + "cssparser 0.36.0", "encoding_rs", "foldhash", "hashbrown 0.17.1", "memchr", "mime", "precomputed-hash", - "selectors", + "selectors 0.37.0", "thiserror 2.0.18", ] @@ -2232,6 +2393,34 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" +dependencies = [ + "log", + "tendril", + "web_atoms", +] + +[[package]] +name = "match_token" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + [[package]] name = "matchit" version = "0.7.3" @@ -2523,25 +2712,55 @@ dependencies = [ "sha2 0.10.9", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros 0.11.3", + "phf_shared 0.11.3", +] + [[package]] name = "phf" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ - "phf_macros", - "phf_shared", + "phf_macros 0.13.1", + "phf_shared 0.13.1", "serde", ] +[[package]] 
+name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + [[package]] name = "phf_codegen" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.6", ] [[package]] @@ -2551,7 +2770,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ "fastrand", - "phf_shared", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn 2.0.118", ] [[package]] @@ -2560,13 +2792,22 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.13.1", + "phf_shared 0.13.1", "proc-macro2", "quote", "syn 2.0.118", ] +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "phf_shared" 
version = "0.13.1" @@ -3192,6 +3433,21 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f3a24d916e78954af99281a455168d4a9515d65eca99a18da1b813689c4ad9" +dependencies = [ + "cssparser 0.35.0", + "ego-tree", + "getopts", + "html5ever", + "precomputed-hash", + "selectors 0.31.0", + "tendril", +] + [[package]] name = "sec1" version = "0.7.3" @@ -3228,6 +3484,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5685b6ae43bfcf7d2e7dfcfb5d8e8f61b46442c902531e41a32a9a8bf0ee0fb6" +dependencies = [ + "bitflags 2.13.0", + "cssparser 0.35.0", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.11.3", + "phf_codegen 0.11.3", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "selectors" version = "0.37.0" @@ -3235,12 +3510,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cfaaa6035167f0e604e42723c7650d59ee269ef220d7bbe0565602c8a0173b9" dependencies = [ "bitflags 2.13.0", - "cssparser", + "cssparser 0.36.0", "derive_more", "log", "new_debug_unreachable", - "phf", - "phf_codegen", + "phf 0.13.1", + "phf_codegen 0.13.1", "precomputed-hash", "rustc-hash", "servo_arc", @@ -3372,6 +3647,17 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest 0.10.7", +] + [[package]] name = "sha2" version = "0.9.9" @@ -3540,6 +3826,31 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" @@ -3637,6 +3948,17 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -4033,11 +4355,20 @@ dependencies = [ name = "trusted-server-cli" version = "0.1.0" dependencies = [ + "chromiumoxide", "clap", "edgezero-cli", + "futures", "log", + "regex", + "scraper", + "serde", "tempfile", + "tokio", + "toml", "trusted-server-core", + "url", + "which", ] [[package]] @@ -4111,6 +4442,23 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tungstenite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8628dcc84e5a09eb3d8423d6cb682965dea9133204e8fb3efee74c2a0c259442" +dependencies = [ + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand 0.9.4", + "sha1", + "thiserror 2.0.18", + "utf-8", +] + [[package]] name = "typeid" version = "1.0.3" @@ -4141,6 +4489,12 @@ 
version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.6" @@ -4181,6 +4535,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -4415,6 +4775,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache", + "string_cache_codegen", +] + [[package]] name = "webpki-root-certs" version = "1.0.8" @@ -4492,6 +4864,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" diff --git a/Cargo.toml b/Cargo.toml index 1ce05197..4e5c278d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ build-print = "1.0.1" bytes = "1.11" chacha20poly1305 = "0.10" chrono = "0.4.44" 
+chromiumoxide = "0.9.1" clap = { version = "4", features = ["derive"] } config = "0.15.19" cookie = "0.18.1" @@ -73,6 +74,7 @@ mime = "0.3" rand = "0.8" regex = "1.12.3" reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +scraper = "0.24.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.149" simple_logger = "5" diff --git a/README.md b/README.md index ca52799d..e56937e8 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,9 @@ ts config init # Edit trusted-server.toml ts config validate +# Audit a public page with Chrome/Chromium to bootstrap a draft config +ts audit https://publisher.example + # Run tests (Fastly/WASM crates — requires Viceroy) cargo test-fastly diff --git a/crates/trusted-server-cli/Cargo.toml b/crates/trusted-server-cli/Cargo.toml index 17cfba9c..189c5405 100644 --- a/crates/trusted-server-cli/Cargo.toml +++ b/crates/trusted-server-cli/Cargo.toml @@ -14,10 +14,20 @@ path = "src/main.rs" workspace = true [target.'cfg(not(target_arch = "wasm32"))'.dependencies] +chromiumoxide = { workspace = true } clap = { workspace = true } edgezero-cli = { workspace = true } +futures = { workspace = true } log = { workspace = true } +regex = { workspace = true } +scraper = { workspace = true } +serde = { workspace = true } +tempfile = { workspace = true } +tokio = { workspace = true } +toml = { workspace = true } trusted-server-core = { workspace = true } +url = { workspace = true } +which = { workspace = true } [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] tempfile = { workspace = true } diff --git a/crates/trusted-server-cli/src/audit.rs b/crates/trusted-server-cli/src/audit.rs new file mode 100644 index 00000000..5870b0de --- /dev/null +++ b/crates/trusted-server-cli/src/audit.rs @@ -0,0 +1,689 @@ +mod analyzer; +pub(crate) mod browser_collector; +pub(crate) mod collector; + +use std::collections::BTreeSet; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; + +use 
serde::Serialize; +use url::Url; + +use crate::audit::collector::AuditCollector; +use crate::config_init::EXAMPLE_CONFIG; +use crate::error::{cli_error, report_error, CliResult}; +use crate::run::AuditArgs; + +use analyzer::{analyze_collected_page, extract_gtm_container_id}; + +const DEFAULT_JS_ASSETS_PATH: &str = "js-assets.toml"; +const DEFAULT_CONFIG_PATH: &str = "trusted-server.toml"; + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +pub(crate) enum AssetParty { + FirstParty, + ThirdParty, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub(crate) struct AuditedAsset { + pub(crate) kind: String, + pub(crate) url: String, + pub(crate) host: String, + pub(crate) party: AssetParty, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) integration: Option, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub(crate) struct DetectedIntegration { + pub(crate) id: String, + pub(crate) evidence: String, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub(crate) struct AuditArtifact { + pub(crate) audited_url: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) page_title: Option, + pub(crate) js_asset_count: usize, + pub(crate) third_party_asset_count: usize, + pub(crate) detected_integrations: Vec, + pub(crate) assets: Vec, + pub(crate) warnings: Vec, +} + +#[derive(Debug, Clone)] +pub(crate) struct AuditOutputs { + pub(crate) artifact: AuditArtifact, + pub(crate) js_assets_toml: String, + pub(crate) draft_config_toml: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct AuditOutputPlan { + js_assets_path: Option, + config_path: Option, +} + +pub(crate) fn run_audit( + args: &AuditArgs, + collector: &dyn AuditCollector, + out: &mut dyn Write, +) -> CliResult<()> { + let target_url = parse_audit_url(&args.url)?; + let plan = resolve_output_plan(args)?; + let collected = collector.collect_page(&target_url)?; + let outputs = build_audit_outputs(&collected)?; + let 
wrote_config = plan.config_path.is_some(); + let written = write_audit_outputs(&outputs, &plan)?; + write_success_summary(&outputs, &written, wrote_config, out) +} + +fn parse_audit_url(value: &str) -> CliResult { + let url = Url::parse(value) + .map_err(|error| report_error(format!("invalid audit URL `{value}`: {error}")))?; + if !matches!(url.scheme(), "http" | "https") { + return cli_error(format!( + "`ts audit` only supports http/https URLs, got `{}`", + url.scheme() + )); + } + Ok(url) +} + +fn resolve_output_plan(args: &AuditArgs) -> CliResult { + if args.no_js_assets && args.no_config { + return cli_error("nothing to do: both --no-js-assets and --no-config were set"); + } + + let js_assets_path = if args.no_js_assets { + None + } else { + Some(resolve_output_path( + args.js_assets.as_deref(), + DEFAULT_JS_ASSETS_PATH, + )?) + }; + let config_path = if args.no_config { + None + } else { + Some(resolve_output_path( + args.config.as_deref(), + DEFAULT_CONFIG_PATH, + )?) + }; + + if js_assets_path.is_some() && js_assets_path == config_path { + return cli_error("audit output paths must be distinct"); + } + + for path in [&js_assets_path, &config_path].into_iter().flatten() { + if path.exists() && !args.force { + return cli_error(format!( + "refusing to overwrite existing file `{}`; re-run with --force", + path.display() + )); + } + } + + Ok(AuditOutputPlan { + js_assets_path, + config_path, + }) +} + +fn resolve_output_path(path: Option<&Path>, default: &str) -> CliResult { + let candidate = path.unwrap_or_else(|| Path::new(default)); + if candidate.is_absolute() { + Ok(candidate.to_path_buf()) + } else { + Ok(std::env::current_dir() + .map_err(|error| report_error(format!("failed to read current directory: {error}")))? 
+ .join(candidate)) + } +} + +fn build_audit_outputs(collected: &collector::CollectedPage) -> CliResult { + let artifact = analyze_collected_page(collected)?; + let final_url = collected + .final_url() + .map_err(|error| report_error(format!("invalid final URL: {error}")))?; + let js_assets_toml = toml::to_string_pretty(&artifact) + .map_err(|error| report_error(format!("failed to serialize audit artifact: {error}")))?; + let draft_config_toml = build_draft_config(&final_url, &artifact)?; + + Ok(AuditOutputs { + artifact, + js_assets_toml, + draft_config_toml, + }) +} + +fn write_audit_outputs(outputs: &AuditOutputs, plan: &AuditOutputPlan) -> CliResult> { + let selected_paths = [&plan.js_assets_path, &plan.config_path] + .into_iter() + .flatten() + .collect::>(); + for path in &selected_paths { + if let Some(parent) = path + .parent() + .filter(|parent| !parent.as_os_str().is_empty()) + { + fs::create_dir_all(parent).map_err(|error| { + report_error(format!( + "failed to create parent directory {}: {error}", + parent.display() + )) + })?; + } + } + + let mut written_paths = Vec::new(); + if let Some(path) = &plan.js_assets_path { + fs::write(path, &outputs.js_assets_toml).map_err(|error| { + report_error(format!( + "failed to write JS asset audit {}: {error}", + path.display() + )) + })?; + written_paths.push(path.display().to_string()); + } + if let Some(path) = &plan.config_path { + fs::write(path, &outputs.draft_config_toml).map_err(|error| { + report_error(format!( + "failed to write draft config {}: {error}", + path.display() + )) + })?; + written_paths.push(path.display().to_string()); + } + + Ok(written_paths) +} + +fn write_success_summary( + outputs: &AuditOutputs, + written: &[String], + wrote_config: bool, + out: &mut dyn Write, +) -> CliResult<()> { + let integrations = outputs + .artifact + .detected_integrations + .iter() + .map(|integration| integration.id.as_str()) + .collect::>(); + let draft_note = if wrote_config { + "\nDraft config: review 
before validation and push" + } else { + "" + }; + writeln!( + out, + "Audited {}\nTitle: {}\nJS assets: {}\nThird-party assets: {}\nDetected integrations: {}\nWrote: {}{}", + outputs.artifact.audited_url, + outputs + .artifact + .page_title + .as_deref() + .unwrap_or(""), + outputs.artifact.js_asset_count, + outputs.artifact.third_party_asset_count, + if integrations.is_empty() { + "none".to_string() + } else { + integrations.join(", ") + }, + if written.is_empty() { + "none".to_string() + } else { + written.join(", ") + }, + draft_note + ) + .map_err(|error| report_error(format!("failed to write command output: {error}"))) +} + +fn build_draft_config(target_url: &Url, artifact: &AuditArtifact) -> CliResult { + let host = target_url + .host_str() + .ok_or_else(|| report_error("audited URL is missing a host"))?; + let origin = target_url.origin().ascii_serialization(); + let mut draft = EXAMPLE_CONFIG.to_string(); + + draft = replace_key_in_section( + &draft, + "publisher", + "domain", + &format!("domain = \"{host}\""), + )?; + draft = replace_key_in_section( + &draft, + "publisher", + "cookie_domain", + &format!("cookie_domain = \".{host}\""), + )?; + draft = replace_key_in_section( + &draft, + "publisher", + "origin_url", + &format!("origin_url = \"{origin}\""), + )?; + + let detected = artifact + .detected_integrations + .iter() + .map(|integration| integration.id.as_str()) + .collect::>(); + + if detected.contains("gpt") { + draft = replace_key_in_section(&draft, "integrations.gpt", "enabled", "enabled = true")?; + } + if detected.contains("didomi") { + draft = replace_key_in_section(&draft, "integrations.didomi", "enabled", "enabled = true")?; + } + if detected.contains("datadome") { + draft = + replace_key_in_section(&draft, "integrations.datadome", "enabled", "enabled = true")?; + } + + let mut manual_review = Vec::new(); + if detected.contains("google_tag_manager") { + if let Some(gtm_id) = extract_gtm_container_id(artifact) { + draft = 
replace_key_in_section( + &draft, + "integrations.google_tag_manager", + "enabled", + "enabled = true", + )?; + draft = replace_key_in_section( + &draft, + "integrations.google_tag_manager", + "container_id", + &format!("container_id = \"{gtm_id}\""), + )?; + } else { + manual_review.push("google_tag_manager"); + } + } + + for integration in detected { + if !matches!( + integration, + "gpt" | "didomi" | "datadome" | "google_tag_manager" + ) { + manual_review.push(integration); + } + } + + if !manual_review.is_empty() { + if !draft.ends_with('\n') { + draft.push('\n'); + } + draft.push_str("\n# Audit findings requiring manual review\n"); + for integration in manual_review { + draft.push_str(&format!( + "# - Detected {integration}; review the corresponding [integrations.{integration}] section before enabling it.\n" + )); + } + } + + Ok(draft) +} + +fn replace_key_in_section( + document: &str, + section: &str, + key: &str, + replacement_line: &str, +) -> CliResult { + let section_header = format!("[{section}]"); + let mut in_section = false; + let mut replaced = false; + let mut saw_section = false; + let mut lines = Vec::new(); + + for line in document.lines() { + let trimmed = line.trim(); + if trimmed.starts_with('[') && trimmed.ends_with(']') { + in_section = trimmed == section_header; + saw_section |= in_section; + } + + if in_section && !replaced && is_key_line(trimmed, key) { + lines.push(replacement_line.to_string()); + replaced = true; + } else { + lines.push(line.to_string()); + } + } + + if !saw_section { + return cli_error(format!( + "failed to update starter config because section `{section_header}` was not found" + )); + } + if !replaced { + return cli_error(format!( + "failed to update starter config because key `{key}` was not found in `{section_header}`" + )); + } + + let mut output = lines.join("\n"); + if document.ends_with('\n') { + output.push('\n'); + } + Ok(output) +} + +fn is_key_line(trimmed_line: &str, key: &str) -> bool { + trimmed_line + 
.strip_prefix(key) + .and_then(|remaining| remaining.trim_start().strip_prefix('=')) + .is_some() +} + +#[cfg(test)] +mod tests { + use std::cell::Cell; + + use tempfile::TempDir; + + use super::*; + use crate::audit::collector::{CollectedPage, CollectedRequest, CollectedScriptTag}; + + struct FakeCollector { + collected: CollectedPage, + calls: Cell, + } + + impl FakeCollector { + fn new(collected: CollectedPage) -> Self { + Self { + collected, + calls: Cell::new(0), + } + } + } + + impl AuditCollector for FakeCollector { + fn collect_page(&self, _target_url: &Url) -> CliResult { + self.calls.set(self.calls.get() + 1); + Ok(self.collected.clone()) + } + } + + fn collected_page() -> CollectedPage { + CollectedPage { + requested_url: "https://publisher.example/page".to_string(), + final_url: "https://publisher.example/page".to_string(), + page_title: Some("Example Publisher".to_string()), + html: r#"Example Publisher"#.to_string(), + script_tags: vec![ + CollectedScriptTag { + src: Some("https://www.googletagmanager.com/gtm.js?id=GTM-ABC123".to_string()), + inline_text: None, + }, + CollectedScriptTag { + src: Some("https://securepubads.g.doubleclick.net/tag/js/gpt.js".to_string()), + inline_text: None, + }, + ], + network_requests: vec![CollectedRequest { + url: "https://cdn.publisher.example/app.js".to_string(), + resource_type: Some("script".to_string()), + }], + warnings: Vec::new(), + } + } + + fn audit_args(url: &str) -> AuditArgs { + AuditArgs { + url: url.to_string(), + js_assets: None, + config: None, + no_js_assets: false, + no_config: false, + force: false, + } + } + + #[test] + fn parse_audit_url_accepts_http_and_https() { + assert!(parse_audit_url("http://publisher.example").is_ok()); + assert!(parse_audit_url("https://publisher.example").is_ok()); + } + + #[test] + fn parse_audit_url_rejects_non_http_schemes() { + for url in [ + "file:///etc/passwd", + "data:text/html,hello", + "chrome://version", + ] { + let error = 
parse_audit_url(url).expect_err("should reject non-http URL"); + assert!( + format!("{error:?}").contains("only supports http/https"), + "should explain scheme restriction" + ); + } + } + + #[test] + fn resolve_output_plan_rejects_no_outputs() { + let mut args = audit_args("https://publisher.example"); + args.no_js_assets = true; + args.no_config = true; + + let error = resolve_output_plan(&args).expect_err("should reject empty output set"); + + assert!( + format!("{error:?}").contains("nothing to do"), + "should explain no-output error" + ); + } + + #[test] + fn resolve_output_plan_rejects_existing_files_without_force() { + let temp = TempDir::new().expect("should create temp dir"); + let path = temp.path().join("js-assets.toml"); + fs::write(&path, "existing").expect("should write existing file"); + let mut args = audit_args("https://publisher.example"); + args.js_assets = Some(path); + args.no_config = true; + + let error = resolve_output_plan(&args).expect_err("should reject overwrite"); + + assert!( + format!("{error:?}").contains("refusing to overwrite"), + "should explain overwrite refusal" + ); + } + + #[test] + fn resolve_output_plan_allows_existing_files_with_force() { + let temp = TempDir::new().expect("should create temp dir"); + let path = temp.path().join("js-assets.toml"); + fs::write(&path, "existing").expect("should write existing file"); + let mut args = audit_args("https://publisher.example"); + args.js_assets = Some(path.clone()); + args.no_config = true; + args.force = true; + + let plan = resolve_output_plan(&args).expect("should allow forced overwrite"); + + assert_eq!(plan.js_assets_path.as_deref(), Some(path.as_path())); + } + + #[test] + fn run_audit_writes_selected_outputs_and_summary() { + let temp = TempDir::new().expect("should create temp dir"); + let js_assets = temp.path().join("audit/js-assets.toml"); + let config = temp.path().join("audit/trusted-server.toml"); + let args = AuditArgs { + url: 
"https://publisher.example/page".to_string(), + js_assets: Some(js_assets.clone()), + config: Some(config.clone()), + no_js_assets: false, + no_config: false, + force: false, + }; + let collector = FakeCollector::new(collected_page()); + let mut out = Vec::new(); + + run_audit(&args, &collector, &mut out).expect("should run audit"); + + assert_eq!(collector.calls.get(), 1, "should collect page once"); + assert!(js_assets.exists(), "should write JS assets"); + assert!(config.exists(), "should write draft config"); + let summary = String::from_utf8(out).expect("summary should be UTF-8"); + assert!(summary.contains("Audited https://publisher.example/page")); + assert!(summary.contains("Detected integrations: google_tag_manager, gpt")); + assert!(summary.contains("Draft config: review before validation and push")); + } + + #[test] + fn run_audit_respects_no_config() { + let temp = TempDir::new().expect("should create temp dir"); + let js_assets = temp.path().join("js-assets.toml"); + let mut args = audit_args("https://publisher.example/page"); + args.js_assets = Some(js_assets.clone()); + args.no_config = true; + let collector = FakeCollector::new(collected_page()); + + run_audit(&args, &collector, &mut Vec::new()).expect("should run audit"); + + assert!(js_assets.exists(), "should write assets"); + assert!( + !temp.path().join("trusted-server.toml").exists(), + "should not write config" + ); + } + + #[test] + fn run_audit_respects_no_js_assets() { + let temp = TempDir::new().expect("should create temp dir"); + let config = temp.path().join("trusted-server.toml"); + let mut args = audit_args("https://publisher.example/page"); + args.config = Some(config.clone()); + args.no_js_assets = true; + let collector = FakeCollector::new(collected_page()); + let mut out = Vec::new(); + + run_audit(&args, &collector, &mut out).expect("should run audit"); + + assert!(config.exists(), "should write config"); + assert!( + !temp.path().join("js-assets.toml").exists(), + "should not 
write JS assets" + ); + let summary = String::from_utf8(out).expect("summary should be UTF-8"); + assert!(summary.contains("Draft config: review before validation and push")); + } + + #[test] + fn run_audit_writes_collector_warnings_to_asset_artifact() { + let temp = TempDir::new().expect("should create temp dir"); + let js_assets = temp.path().join("js-assets.toml"); + let mut args = audit_args("https://publisher.example/page"); + args.js_assets = Some(js_assets.clone()); + args.no_config = true; + let mut collected = collected_page(); + collected.warnings.push( + "browser audit timed out while waiting for the page to settle; results may be partial" + .to_string(), + ); + let collector = FakeCollector::new(collected); + + run_audit(&args, &collector, &mut Vec::new()).expect("should run audit"); + + let artifact = fs::read_to_string(js_assets).expect("should read artifact"); + assert!( + artifact.contains("results may be partial"), + "should persist collector warning" + ); + } + + #[test] + fn run_audit_conflict_prevents_collection() { + let temp = TempDir::new().expect("should create temp dir"); + let js_assets = temp.path().join("js-assets.toml"); + fs::write(&js_assets, "existing").expect("should write existing file"); + let mut args = audit_args("https://publisher.example/page"); + args.js_assets = Some(js_assets); + args.no_config = true; + let collector = FakeCollector::new(collected_page()); + + let error = run_audit(&args, &collector, &mut Vec::new()) + .expect_err("should reject existing output"); + + assert_eq!(collector.calls.get(), 0, "should not collect page"); + assert!( + format!("{error:?}").contains("refusing to overwrite"), + "should report overwrite conflict" + ); + } + + #[test] + fn build_draft_config_uses_final_url_and_detected_integrations() { + let url = Url::parse("https://www.publisher.example:8443/path").expect("should parse URL"); + let artifact = AuditArtifact { + audited_url: url.to_string(), + page_title: Some("Example".to_string()), 
+ js_asset_count: 2, + third_party_asset_count: 2, + detected_integrations: vec![ + DetectedIntegration { + id: "google_tag_manager".to_string(), + evidence: "GTM-ABC123".to_string(), + }, + DetectedIntegration { + id: "gpt".to_string(), + evidence: "https://securepubads.g.doubleclick.net/tag/js/gpt.js".to_string(), + }, + DetectedIntegration { + id: "prebid".to_string(), + evidence: "inline script matched `prebid`".to_string(), + }, + ], + assets: Vec::new(), + warnings: Vec::new(), + }; + + let draft = build_draft_config(&url, &artifact).expect("should build draft config"); + + assert!(draft.contains("domain = \"www.publisher.example\"")); + assert!(draft.contains("cookie_domain = \".www.publisher.example\"")); + assert!(draft.contains("origin_url = \"https://www.publisher.example:8443\"")); + assert!(draft.contains("[integrations.gpt]\nenabled = true")); + assert!(draft.contains("[integrations.google_tag_manager]\nenabled = true")); + assert!(draft.contains("container_id = \"GTM-ABC123\"")); + assert!(draft.contains("Detected prebid")); + toml::from_str::(&draft).expect("draft should parse as TOML"); + } + + #[test] + fn build_draft_config_does_not_enable_gtm_without_container_id() { + let url = Url::parse("https://publisher.example/path").expect("should parse URL"); + let artifact = AuditArtifact { + audited_url: url.to_string(), + page_title: None, + js_asset_count: 1, + third_party_asset_count: 1, + detected_integrations: vec![DetectedIntegration { + id: "google_tag_manager".to_string(), + evidence: "https://www.googletagmanager.com/gtm.js".to_string(), + }], + assets: Vec::new(), + warnings: Vec::new(), + }; + + let draft = build_draft_config(&url, &artifact).expect("should build draft config"); + + assert!(draft.contains("[integrations.google_tag_manager]\nenabled = false")); + assert!(draft.contains("Detected google_tag_manager")); + } +} diff --git a/crates/trusted-server-cli/src/audit/analyzer.rs b/crates/trusted-server-cli/src/audit/analyzer.rs new file 
mode 100644 index 00000000..e38bddfa --- /dev/null +++ b/crates/trusted-server-cli/src/audit/analyzer.rs @@ -0,0 +1,583 @@ +use std::collections::BTreeMap; +use std::sync::LazyLock; + +use regex::Regex; +use scraper::{Html, Selector}; +use url::Url; + +use crate::audit::collector::CollectedPage; +use crate::audit::{AssetParty, AuditArtifact, AuditedAsset, DetectedIntegration}; +use crate::error::{report_error, CliResult}; + +static GTM_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r"\bGTM-[A-Z0-9]+\b").expect("should compile GTM regex")); +static GPT_INLINE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\b(?:googletag|gpt\.js|googletagservices|securepubads)\b") + .expect("should compile GPT inline regex") +}); +static DIDOMI_INLINE_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\bdidomi\b").expect("should compile Didomi inline regex")); +static DATADOME_INLINE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\bdatadome\b").expect("should compile DataDome inline regex") +}); +static PERMUTIVE_INLINE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\bpermutive\b").expect("should compile Permutive inline regex") +}); +static LOCKR_INLINE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)(?:\blockr\b|\bloc\.kr\b)").expect("should compile Lockr inline regex") +}); +static PREBID_INLINE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)(?:\bprebid\b|\bpbjs\b)").expect("should compile Prebid inline regex") +}); + +pub(crate) fn analyze_collected_page(collected: &CollectedPage) -> CliResult { + let final_url = collected + .final_url() + .map_err(|error| report_error(format!("invalid final URL: {error}")))?; + let requested_url = collected + .requested_url() + .map_err(|error| report_error(format!("invalid requested URL: {error}")))?; + + let document = Html::parse_document(&collected.html); + let title_selector = Selector::parse("title").expect("should parse title selector"); + let derived_title = document + 
.select(&title_selector) + .next() + .map(|element| { + element + .text() + .collect::>() + .join(" ") + .trim() + .to_string() + }) + .filter(|title| !title.is_empty()); + + let mut assets_by_url = BTreeMap::::new(); + let mut integrations = BTreeMap::::new(); + let mut warnings = collected.warnings.clone(); + + if requested_url != final_url { + warnings.push(format!( + "page redirected from `{requested_url}` to `{final_url}`" + )); + } + + for tag in &collected.script_tags { + if let Some(src) = &tag.src { + if let Ok(asset_url) = final_url.join(src) { + let integration = detect_integration_from_url(&asset_url); + record_integration(&mut integrations, &integration, asset_url.as_str()); + insert_asset(&mut assets_by_url, &final_url, &asset_url, integration); + } else { + warnings.push(format!("could not resolve script URL `{src}`")); + } + } + + if let Some(inline_text) = &tag.inline_text { + for (integration_id, evidence) in detect_integrations_from_inline_script(inline_text) { + integrations.entry(integration_id).or_insert(evidence); + } + } + } + + for request in &collected.network_requests { + let is_script = request + .resource_type + .as_deref() + .is_some_and(|resource_type| resource_type.eq_ignore_ascii_case("script")); + if !is_script { + continue; + } + if let Ok(asset_url) = Url::parse(&request.url) { + let integration = detect_integration_from_url(&asset_url); + record_integration(&mut integrations, &integration, asset_url.as_str()); + insert_asset(&mut assets_by_url, &final_url, &asset_url, integration); + } + } + + let assets = assets_by_url.into_values().collect::>(); + let third_party_asset_count = assets + .iter() + .filter(|asset| asset.party == AssetParty::ThirdParty) + .count(); + + let page_title = collected + .page_title + .as_deref() + .map(str::trim) + .filter(|title| !title.is_empty()) + .map(ToOwned::to_owned) + .or(derived_title); + + Ok(AuditArtifact { + audited_url: final_url.to_string(), + page_title, + js_asset_count: assets.len(), + 
third_party_asset_count, + detected_integrations: integrations + .into_iter() + .map(|(id, evidence)| DetectedIntegration { id, evidence }) + .collect(), + assets, + warnings, + }) +} + +fn insert_asset( + assets_by_url: &mut BTreeMap, + page_url: &Url, + asset_url: &Url, + integration: Option, +) { + let asset = assets_by_url + .entry(asset_url.to_string()) + .or_insert_with(|| AuditedAsset { + kind: "script".to_string(), + url: asset_url.to_string(), + host: asset_url.host_str().unwrap_or_default().to_string(), + party: classify_party(page_url, asset_url), + integration: None, + }); + + if asset.integration.is_none() { + asset.integration = integration; + } +} + +fn record_integration( + integrations: &mut BTreeMap, + integration: &Option, + evidence: &str, +) { + if let Some(integration_id) = integration { + integrations + .entry(integration_id.clone()) + .or_insert_with(|| evidence.to_string()); + } +} + +pub(crate) fn classify_party(page_url: &Url, asset_url: &Url) -> AssetParty { + let page_host = page_url.host_str().unwrap_or_default(); + let asset_host = asset_url.host_str().unwrap_or_default(); + + if host_matches(page_host, asset_host) { + AssetParty::FirstParty + } else { + AssetParty::ThirdParty + } +} + +fn host_matches(page_host: &str, asset_host: &str) -> bool { + // This is an advisory heuristic, not public-suffix-aware eTLD+1 classification. 
+ asset_host == page_host + || asset_host + .strip_suffix(page_host) + .is_some_and(|prefix| prefix.ends_with('.')) + || page_host + .strip_suffix(asset_host) + .is_some_and(|prefix| prefix.ends_with('.')) +} + +pub(crate) fn detect_integration_from_url(url: &Url) -> Option { + let host = url.host_str().unwrap_or_default(); + let path = url.path(); + let value = format!("{host}{path}").to_ascii_lowercase(); + + if value.contains("googletagmanager.com") { + Some("google_tag_manager".to_string()) + } else if value.contains("securepubads.g.doubleclick.net") + || value.contains("googletagservices.com") + || value.contains("doubleclick.net/tag/js/gpt") + { + Some("gpt".to_string()) + } else if value.contains("privacy-center.org") { + Some("didomi".to_string()) + } else if value.contains("datadome.co") { + Some("datadome".to_string()) + } else if value.contains("permutive") { + Some("permutive".to_string()) + } else if value.contains("loc.kr") { + Some("lockr".to_string()) + } else if value.contains("prebid") { + Some("prebid".to_string()) + } else { + None + } +} + +pub(crate) fn detect_integrations_from_inline_script(script: &str) -> Vec<(String, String)> { + let mut matches = Vec::new(); + + if let Some(container_id) = GTM_REGEX.find(script) { + matches.push(( + "google_tag_manager".to_string(), + container_id.as_str().to_string(), + )); + } + + for (integration, regex) in [ + ("gpt", &*GPT_INLINE_REGEX), + ("didomi", &*DIDOMI_INLINE_REGEX), + ("datadome", &*DATADOME_INLINE_REGEX), + ("permutive", &*PERMUTIVE_INLINE_REGEX), + ("lockr", &*LOCKR_INLINE_REGEX), + ("prebid", &*PREBID_INLINE_REGEX), + ] { + if regex.is_match(script) { + matches.push(( + integration.to_string(), + format!("inline script matched `{integration}`"), + )); + } + } + + matches +} + +pub(crate) fn extract_gtm_container_id(artifact: &AuditArtifact) -> Option { + for integration in &artifact.detected_integrations { + if integration.id == "google_tag_manager" && 
GTM_REGEX.is_match(&integration.evidence) { + return Some(integration.evidence.clone()); + } + } + + for asset in &artifact.assets { + if asset.integration.as_deref() == Some("google_tag_manager") { + if let Some(matched) = GTM_REGEX.find(asset.url.as_str()) { + return Some(matched.as_str().to_string()); + } + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::audit::collector::{CollectedRequest, CollectedScriptTag}; + + fn page_url() -> Url { + Url::parse("https://publisher.example/page").expect("should parse URL") + } + + #[test] + fn analyze_collected_page_merges_dom_and_network_scripts() { + let collected = CollectedPage { + requested_url: "https://publisher.example/page".to_string(), + final_url: "https://publisher.example/page".to_string(), + page_title: Some("Browser Title".to_string()), + html: r#"HTML Title"#.to_string(), + script_tags: vec![ + CollectedScriptTag { + src: Some("https://www.googletagmanager.com/gtm.js?id=GTM-ABCD123".to_string()), + inline_text: None, + }, + CollectedScriptTag { + src: Some("https://securepubads.g.doubleclick.net/tag/js/gpt.js".to_string()), + inline_text: None, + }, + ], + network_requests: vec![CollectedRequest { + url: "https://cdn.example.com/dynamic.js".to_string(), + resource_type: Some("Script".to_string()), + }], + warnings: vec!["partial settle".to_string()], + }; + + let artifact = analyze_collected_page(&collected).expect("should analyze collected page"); + + assert_eq!(artifact.page_title.as_deref(), Some("Browser Title")); + assert_eq!( + artifact.js_asset_count, 3, + "should merge all script evidence" + ); + assert_eq!(artifact.warnings, vec!["partial settle".to_string()]); + assert!( + artifact + .detected_integrations + .iter() + .any(|integration| integration.id == "google_tag_manager"), + "should preserve GTM detection" + ); + assert!( + artifact + .detected_integrations + .iter() + .any(|integration| integration.id == "gpt"), + "should detect GPT from browser collected scripts" + 
); + } + + #[test] + fn analyze_collected_page_uses_html_title_when_browser_title_absent() { + let collected = CollectedPage { + requested_url: "https://publisher.example/page".to_string(), + final_url: "https://publisher.example/page".to_string(), + page_title: None, + html: "HTML Title".to_string(), + script_tags: Vec::new(), + network_requests: Vec::new(), + warnings: Vec::new(), + }; + + let artifact = analyze_collected_page(&collected).expect("should analyze collected page"); + + assert_eq!(artifact.page_title.as_deref(), Some("HTML Title")); + } + + #[test] + fn analyze_collected_page_uses_html_title_when_browser_title_is_empty() { + let collected = CollectedPage { + requested_url: "https://publisher.example/page".to_string(), + final_url: "https://publisher.example/page".to_string(), + page_title: Some(" ".to_string()), + html: "HTML Title".to_string(), + script_tags: Vec::new(), + network_requests: Vec::new(), + warnings: Vec::new(), + }; + + let artifact = analyze_collected_page(&collected).expect("should analyze collected page"); + + assert_eq!(artifact.page_title.as_deref(), Some("HTML Title")); + } + + #[test] + fn analyze_collected_page_deduplicates_scripts_and_updates_integration() { + let collected = CollectedPage { + requested_url: "https://publisher.example/page".to_string(), + final_url: "https://publisher.example/page".to_string(), + page_title: None, + html: r#""# + .to_string(), + script_tags: vec![CollectedScriptTag { + src: Some("https://cdn.example.com/prebid.js".to_string()), + inline_text: None, + }], + network_requests: vec![CollectedRequest { + url: "https://cdn.example.com/prebid.js".to_string(), + resource_type: Some("script".to_string()), + }], + warnings: Vec::new(), + }; + + let artifact = analyze_collected_page(&collected).expect("should analyze collected page"); + + assert_eq!( + artifact.js_asset_count, 1, + "should deduplicate identical script URLs" + ); + assert_eq!( + artifact.assets[0].integration.as_deref(), + 
Some("prebid"), + "should preserve detected integration on deduped asset" + ); + } + + #[test] + fn analyze_collected_page_resolves_relative_scripts_and_warns_on_invalid_src() { + let collected = CollectedPage { + requested_url: "https://publisher.example/page".to_string(), + final_url: "https://publisher.example/path/page".to_string(), + page_title: None, + html: "".to_string(), + script_tags: vec![ + CollectedScriptTag { + src: Some("/static/app.js".to_string()), + inline_text: None, + }, + CollectedScriptTag { + src: Some("http://[invalid".to_string()), + inline_text: None, + }, + ], + network_requests: Vec::new(), + warnings: Vec::new(), + }; + + let artifact = analyze_collected_page(&collected).expect("should analyze collected page"); + + assert!( + artifact + .assets + .iter() + .any(|asset| asset.url == "https://publisher.example/static/app.js"), + "should resolve relative URL against final URL" + ); + assert!( + artifact + .warnings + .iter() + .any(|warning| warning.contains("could not resolve script URL")), + "should warn about malformed script URL" + ); + } + + #[test] + fn analyze_collected_page_uses_final_url_and_records_redirect_warning() { + let collected = CollectedPage { + requested_url: "http://publisher.example/page".to_string(), + final_url: "https://www.publisher.example/landing".to_string(), + page_title: Some("Example Publisher".to_string()), + html: "".to_string(), + script_tags: Vec::new(), + network_requests: Vec::new(), + warnings: Vec::new(), + }; + + let artifact = analyze_collected_page(&collected).expect("should analyze collected page"); + + assert_eq!( + artifact.audited_url, "https://www.publisher.example/landing", + "should report the final audited URL" + ); + assert!( + artifact.warnings.iter().any(|warning| warning.contains( + "page redirected from `http://publisher.example/page` to `https://www.publisher.example/landing`" + )), + "should preserve redirect context in warnings" + ); + } + + #[test] + fn 
classify_party_uses_host_relationship() { + let page = page_url(); + let exact = Url::parse("https://publisher.example/app.js").expect("should parse URL"); + let subdomain = + Url::parse("https://cdn.publisher.example/app.js").expect("should parse URL"); + let parent = Url::parse("https://example/app.js").expect("should parse URL"); + let unrelated = Url::parse("https://cdn.example.com/app.js").expect("should parse URL"); + + assert_eq!(classify_party(&page, &exact), AssetParty::FirstParty); + assert_eq!(classify_party(&page, &subdomain), AssetParty::FirstParty); + assert_eq!(classify_party(&page, &parent), AssetParty::FirstParty); + assert_eq!(classify_party(&page, &unrelated), AssetParty::ThirdParty); + } + + #[test] + fn detect_integrations_from_inline_script_reads_standard_gtm_snippet() { + let matches = detect_integrations_from_inline_script( + r#"(function(w,d,s,l,i){w[l]=w[l]||[];})(window,document,'script','dataLayer','GTM-ABC123');"#, + ); + + assert!( + matches.iter().any( + |(integration, evidence)| integration == "google_tag_manager" + && evidence == "GTM-ABC123" + ), + "should detect GTM IDs followed by snippet punctuation" + ); + } + + #[test] + fn detect_integrations_from_inline_script_reads_case_insensitive_markers() { + let matches = detect_integrations_from_inline_script("window.PREBID = window.Didomi;"); + + assert!(matches + .iter() + .any(|(integration, _)| integration == "prebid")); + assert!(matches + .iter() + .any(|(integration, _)| integration == "didomi")); + } + + #[test] + fn detect_integrations_from_inline_script_avoids_short_substring_matches() { + let matches = detect_integrations_from_inline_script("const svgptimize = blockrResult;"); + + assert!( + !matches.iter().any(|(integration, _)| integration == "gpt"), + "should not match incidental GPT substrings" + ); + assert!( + !matches + .iter() + .any(|(integration, _)| integration == "lockr"), + "should not match lockr inside a larger token" + ); + } + + #[test] + fn 
detect_integration_from_url_recognizes_known_patterns() { + let cases = [ + ( + "https://www.googletagmanager.com/gtm.js?id=GTM-ABC123", + "google_tag_manager", + ), + ( + "https://securepubads.g.doubleclick.net/tag/js/gpt.js", + "gpt", + ), + ("https://sdk.privacy-center.org/sdk.js", "didomi"), + ("https://js.datadome.co/tags.js", "datadome"), + ("https://cdn.permutive.com/sdk.js", "permutive"), + ("https://identity.loc.kr/sdk.js", "lockr"), + ("https://cdn.example.com/prebid.js", "prebid"), + ]; + + for (url, expected) in cases { + let parsed = Url::parse(url).expect("should parse URL"); + assert_eq!( + detect_integration_from_url(&parsed).as_deref(), + Some(expected), + "should detect {expected}" + ); + } + } + + #[test] + fn extract_gtm_container_id_reads_query_parameter_urls() { + let artifact = AuditArtifact { + audited_url: "https://publisher.example".to_string(), + page_title: None, + js_asset_count: 1, + third_party_asset_count: 1, + detected_integrations: Vec::new(), + assets: vec![AuditedAsset { + kind: "script".to_string(), + url: "https://www.googletagmanager.com/gtm.js?id=GTM-ABC123&l=dataLayer" + .to_string(), + host: "www.googletagmanager.com".to_string(), + party: AssetParty::ThirdParty, + integration: Some("google_tag_manager".to_string()), + }], + warnings: Vec::new(), + }; + + assert_eq!( + extract_gtm_container_id(&artifact).as_deref(), + Some("GTM-ABC123"), + "should extract GTM container IDs before query separators" + ); + } + + #[test] + fn artifact_serialization_uses_expected_shape() { + let artifact = AuditArtifact { + audited_url: "https://publisher.example".to_string(), + page_title: None, + js_asset_count: 1, + third_party_asset_count: 1, + detected_integrations: vec![DetectedIntegration { + id: "gpt".to_string(), + evidence: "https://securepubads.g.doubleclick.net/tag/js/gpt.js".to_string(), + }], + assets: vec![AuditedAsset { + kind: "script".to_string(), + url: "https://securepubads.g.doubleclick.net/tag/js/gpt.js".to_string(), + 
host: "securepubads.g.doubleclick.net".to_string(), + party: AssetParty::ThirdParty, + integration: Some("gpt".to_string()), + }], + warnings: Vec::new(), + }; + + let toml = toml::to_string_pretty(&artifact).expect("should serialize artifact"); + + assert!(toml.contains("audited_url = \"https://publisher.example\"")); + assert!(toml.contains("party = \"third-party\"")); + assert!(!toml.contains("page_title")); + } +} diff --git a/crates/trusted-server-cli/src/audit/browser_collector.rs b/crates/trusted-server-cli/src/audit/browser_collector.rs new file mode 100644 index 00000000..add851bf --- /dev/null +++ b/crates/trusted-server-cli/src/audit/browser_collector.rs @@ -0,0 +1,375 @@ +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use chromiumoxide::browser::{Browser, BrowserConfig}; +use chromiumoxide::ArcHttpRequest; +use futures::StreamExt as _; +use serde::Deserialize; +use tempfile::TempDir; +use tokio::runtime::Builder; +use tokio::time::{sleep, timeout}; +use url::Url; +use which::which; + +use crate::audit::collector::{ + AuditCollector, CollectedPage, CollectedRequest, CollectedScriptTag, +}; +use crate::error::{report_error, CliResult}; + +const SETTLE_QUIET_PERIOD: Duration = Duration::from_millis(750); +const SETTLE_POLL_INTERVAL: Duration = Duration::from_millis(250); +const SETTLE_MAX_WAIT: Duration = Duration::from_secs(6); +const NAVIGATION_TIMEOUT: Duration = Duration::from_secs(30); +const BROWSER_CLOSE_TIMEOUT: Duration = Duration::from_secs(5); +const RESOURCE_TIMING_BUFFER_WARNING_THRESHOLD: usize = 250; + +#[derive(Default)] +pub(crate) struct BrowserAuditCollector; + +impl AuditCollector for BrowserAuditCollector { + fn collect_page(&self, target_url: &Url) -> CliResult { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .map_err(|error| { + report_error(format!( + "failed to build Tokio runtime for browser audit: {error}" + )) + })?; + + runtime.block_on(collect_page_via_browser_async(target_url)) + } +} 
+ +async fn collect_page_via_browser_async(target_url: &Url) -> CliResult { + let chrome_executable = find_browser_executable()?; + let user_data_dir = TempDir::new().map_err(|error| { + report_error(format!( + "failed to create temporary browser profile for audit: {error}" + )) + })?; + let config = BrowserConfig::builder() + .chrome_executable(chrome_executable) + .user_data_dir(user_data_dir.path()) + .new_headless_mode() + .build() + .map_err(|error| { + report_error(format!( + "failed to build Chromium configuration for audit: {error}" + )) + })?; + + let (mut browser, mut handler) = Browser::launch(config).await.map_err(|error| { + report_error(format!( + "failed to launch Chrome/Chromium for audit: {error}" + )) + })?; + + let handler_task = tokio::spawn(async move { + while let Some(event) = handler.next().await { + if event.is_err() { + break; + } + } + }); + + let result = collect_page_from_browser(&mut browser, target_url).await; + + let close_result = timeout(BROWSER_CLOSE_TIMEOUT, browser.close()) + .await + .map_err(|_| report_error("timed out closing browser after audit")) + .and_then(|result| { + result.map_err(|error| { + report_error(format!("failed to close browser after audit: {error}")) + }) + }); + if close_result.is_err() { + handler_task.abort(); + } + let _ = handler_task.await; + + match (result, close_result) { + (Ok(collected), Ok(_)) => Ok(collected), + (Ok(_), Err(error)) | (Err(error), _) => Err(error), + } +} + +async fn collect_page_from_browser( + browser: &mut Browser, + target_url: &Url, +) -> CliResult { + let page = browser.new_page("about:blank").await.map_err(|error| { + report_error(format!("failed to create browser page for audit: {error}")) + })?; + + timeout(NAVIGATION_TIMEOUT, page.goto(target_url.as_str())) + .await + .map_err(|_| report_error(format!("timed out navigating to `{target_url}`")))? 
+ .map_err(|error| report_error(format!("failed to navigate to `{target_url}`: {error}")))?; + + let navigation_response = timeout(NAVIGATION_TIMEOUT, page.wait_for_navigation_response()) + .await + .map_err(|_| { + report_error(format!( + "timed out waiting for main document navigation response from `{target_url}`" + )) + })? + .map_err(|error| { + report_error(format!( + "failed to read main document navigation response: {error}" + )) + })?; + + let mut warnings = Vec::new(); + if let Some(warning) = validate_navigation_response(navigation_response)? { + warnings.push(warning); + } + if !wait_for_page_settle(&page).await? { + warnings.push( + "browser audit timed out while waiting for the page to settle; results may be partial" + .to_string(), + ); + } + + let final_url = page + .url() + .await + .map_err(|error| report_error(format!("failed to read final page URL: {error}")))? + .ok_or_else(|| report_error("browser page URL was empty after navigation"))?; + let page_title = page + .get_title() + .await + .map_err(|error| report_error(format!("failed to read page title: {error}")))?; + let html = page + .content() + .await + .map_err(|error| report_error(format!("failed to read rendered page HTML: {error}")))?; + + let script_tags: Vec = page + .evaluate( + r#"() => Array.from(document.scripts).map((script) => ({ + src: script.src || null, + inline_text: script.src ? null : (script.textContent || null), + }))"#, + ) + .await + .map_err(|error| report_error(format!("failed to read rendered script tags: {error}")))? + .into_value() + .map_err(|error| { + report_error(format!( + "failed to decode rendered script tag data: {error}" + )) + })?; + + let network_requests: Vec = page + .evaluate( + r#"() => performance.getEntriesByType('resource').map((entry) => ({ + url: entry.name, + initiator_type: entry.initiatorType || null, + }))"#, + ) + .await + .map_err(|error| { + report_error(format!( + "failed to read browser performance resource entries: {error}" + )) + })? 
+ .into_value() + .map_err(|error| { + report_error(format!( + "failed to decode browser performance resource data: {error}" + )) + })?; + + if network_requests.len() >= RESOURCE_TIMING_BUFFER_WARNING_THRESHOLD { + warnings.push( + "browser resource timing buffer reached its default size; some network assets may be missing" + .to_string(), + ); + } + + Ok(CollectedPage { + requested_url: target_url.to_string(), + final_url, + page_title: page_title.filter(|title| !title.trim().is_empty()), + html, + script_tags: script_tags + .into_iter() + .map(|script| CollectedScriptTag { + src: script.src, + inline_text: script.inline_text.filter(|text| !text.trim().is_empty()), + }) + .collect(), + network_requests: network_requests + .into_iter() + .map(|entry| CollectedRequest { + url: entry.url, + resource_type: entry.initiator_type, + }) + .collect(), + warnings, + }) +} + +async fn wait_for_page_settle(page: &chromiumoxide::Page) -> CliResult { + let mut elapsed = Duration::ZERO; + let mut previous_count = None; + let mut stable_for = Duration::ZERO; + + while elapsed < SETTLE_MAX_WAIT { + let ready_state: String = page + .evaluate("document.readyState") + .await + .map_err(|error| report_error(format!("failed to read document ready state: {error}")))? + .into_value() + .map_err(|error| { + report_error(format!("failed to decode document ready state: {error}")) + })?; + let resource_count: usize = page + .evaluate("performance.getEntriesByType('resource').length") + .await + .map_err(|error| report_error(format!("failed to read resource count: {error}")))? 
+ .into_value() + .map_err(|error| report_error(format!("failed to decode resource count: {error}")))?; + + if ready_state == "complete" { + if previous_count == Some(resource_count) { + stable_for += SETTLE_POLL_INTERVAL; + } else { + stable_for = Duration::ZERO; + } + + if stable_for >= SETTLE_QUIET_PERIOD { + return Ok(true); + } + } + + previous_count = Some(resource_count); + sleep(SETTLE_POLL_INTERVAL).await; + elapsed += SETTLE_POLL_INTERVAL; + } + + Ok(false) +} + +fn validate_navigation_response(navigation_response: ArcHttpRequest) -> CliResult> { + let request = navigation_response + .ok_or_else(|| report_error("browser audit did not capture the main document response"))?; + + if let Some(failure_text) = &request.failure_text { + return Err(report_error(format!( + "main document request failed: {failure_text}" + ))); + } + + let response = request.response.as_ref().ok_or_else(|| { + report_error("browser audit did not capture the main document HTTP response") + })?; + + if is_successful_navigation_status(response.status) { + return Ok(None); + } + + Ok(Some(format!( + "audit request returned HTTP {} {} for `{}`; results may be partial", + response.status, response.status_text, response.url + ))) +} + +fn is_successful_navigation_status(status: i64) -> bool { + (200..400).contains(&status) +} + +fn find_browser_executable() -> CliResult { + for candidate in browser_executable_path_candidates() { + if let Ok(path) = which(candidate) { + return Ok(path); + } + } + + for candidate in browser_executable_fallbacks() { + let candidate_path = Path::new(candidate); + if candidate_path.is_file() { + return Ok(candidate_path.to_path_buf()); + } + } + + Err(report_error( + "Chrome/Chromium was not found on PATH or in the standard local install locations checked by `ts audit`. 
Install a local Chrome or Chromium binary before running `ts audit`.", + )) +} + +fn browser_executable_path_candidates() -> &'static [&'static str] { + &[ + "google-chrome", + "google-chrome-stable", + "chromium", + "chromium-browser", + "chrome", + "Google Chrome", + "Google Chrome for Testing", + ] +} + +fn browser_executable_fallbacks() -> &'static [&'static str] { + #[cfg(target_os = "macos")] + { + &[ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Chromium.app/Contents/MacOS/Chromium", + "/Applications/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing", + ] + } + + #[cfg(target_os = "linux")] + { + &[ + "/usr/bin/google-chrome", + "/usr/bin/google-chrome-stable", + "/usr/bin/chromium", + "/usr/bin/chromium-browser", + "/snap/bin/chromium", + ] + } + + #[cfg(not(any(target_os = "macos", target_os = "linux")))] + { + &[] + } +} + +#[derive(Debug, Deserialize)] +struct BrowserScriptTag { + src: Option, + inline_text: Option, +} + +#[derive(Debug, Deserialize)] +struct BrowserPerformanceEntry { + url: String, + initiator_type: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn successful_navigation_status_allows_redirects_but_rejects_errors() { + assert!(is_successful_navigation_status(200)); + assert!(is_successful_navigation_status(302)); + assert!(is_successful_navigation_status(399)); + assert!(!is_successful_navigation_status(199)); + assert!(!is_successful_navigation_status(400)); + assert!(!is_successful_navigation_status(500)); + } + + #[test] + fn browser_path_candidates_include_common_names() { + let candidates = browser_executable_path_candidates(); + + assert!(candidates.contains(&"google-chrome")); + assert!(candidates.contains(&"chromium")); + assert!(candidates.contains(&"Google Chrome for Testing")); + } +} diff --git a/crates/trusted-server-cli/src/audit/collector.rs b/crates/trusted-server-cli/src/audit/collector.rs new file mode 100644 index 00000000..314ae54f --- 
/dev/null +++ b/crates/trusted-server-cli/src/audit/collector.rs @@ -0,0 +1,41 @@ +use serde::{Deserialize, Serialize}; +use url::Url; + +use crate::error::CliResult; + +pub(crate) trait AuditCollector { + fn collect_page(&self, target_url: &Url) -> CliResult; +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub(crate) struct CollectedPage { + pub(crate) requested_url: String, + pub(crate) final_url: String, + pub(crate) page_title: Option, + pub(crate) html: String, + pub(crate) script_tags: Vec, + pub(crate) network_requests: Vec, + pub(crate) warnings: Vec, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub(crate) struct CollectedScriptTag { + pub(crate) src: Option, + pub(crate) inline_text: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub(crate) struct CollectedRequest { + pub(crate) url: String, + pub(crate) resource_type: Option, +} + +impl CollectedPage { + pub(crate) fn requested_url(&self) -> Result { + Url::parse(&self.requested_url) + } + + pub(crate) fn final_url(&self) -> Result { + Url::parse(&self.final_url) + } +} diff --git a/crates/trusted-server-cli/src/config_init.rs b/crates/trusted-server-cli/src/config_init.rs index 1db00d7c..f1fa1c0a 100644 --- a/crates/trusted-server-cli/src/config_init.rs +++ b/crates/trusted-server-cli/src/config_init.rs @@ -2,7 +2,7 @@ use std::fs; use std::io::Write; use std::path::PathBuf; -const EXAMPLE_CONFIG: &str = include_str!(concat!( +pub(crate) const EXAMPLE_CONFIG: &str = include_str!(concat!( env!("CARGO_MANIFEST_DIR"), "/../../trusted-server.example.toml" )); diff --git a/crates/trusted-server-cli/src/error.rs b/crates/trusted-server-cli/src/error.rs new file mode 100644 index 00000000..cdb0a2a9 --- /dev/null +++ b/crates/trusted-server-cli/src/error.rs @@ -0,0 +1,11 @@ +pub(crate) type CliResult = Result; + +pub(crate) fn cli_error(message: impl Into) -> CliResult { + Err(message.into()) +} + +pub(crate) fn report_error(message: 
impl Into) -> String { + let message = message.into(); + log::error!("{message}"); + message +} diff --git a/crates/trusted-server-cli/src/lib.rs b/crates/trusted-server-cli/src/lib.rs index eab19b7f..e0e54f2b 100644 --- a/crates/trusted-server-cli/src/lib.rs +++ b/crates/trusted-server-cli/src/lib.rs @@ -10,9 +10,13 @@ ) )] +#[cfg(not(target_arch = "wasm32"))] +mod audit; #[cfg(not(target_arch = "wasm32"))] mod config_init; #[cfg(not(target_arch = "wasm32"))] +mod error; +#[cfg(not(target_arch = "wasm32"))] mod run; #[cfg(not(target_arch = "wasm32"))] diff --git a/crates/trusted-server-cli/src/run.rs b/crates/trusted-server-cli/src/run.rs index b335fdad..83491022 100644 --- a/crates/trusted-server-cli/src/run.rs +++ b/crates/trusted-server-cli/src/run.rs @@ -7,6 +7,7 @@ use edgezero_cli::args::{ }; use trusted_server_core::config::TrustedServerAppConfig; +use crate::audit::browser_collector::BrowserAuditCollector; use crate::config_init::{run_config_init, ConfigInitArgs}; #[derive(Debug, Parser)] @@ -18,6 +19,8 @@ struct Args { #[derive(Debug, Subcommand)] enum Command { + /// Audit a public page and write draft Trusted Server artifacts. + Audit(AuditArgs), /// Sign in / out / status against an `EdgeZero` adapter. Auth(AuthArgs), /// Build the project for a target adapter. @@ -33,6 +36,27 @@ enum Command { Serve(ServeArgs), } +#[derive(Debug, clap::Args)] +pub(crate) struct AuditArgs { + /// Public HTTP(S) URL to audit. + pub(crate) url: String, + /// JavaScript asset audit output path. + #[arg(long)] + pub(crate) js_assets: Option, + /// Draft Trusted Server config output path. + #[arg(long)] + pub(crate) config: Option, + /// Do not write the JavaScript asset audit file. + #[arg(long)] + pub(crate) no_js_assets: bool, + /// Do not write the draft Trusted Server config file. + #[arg(long)] + pub(crate) no_config: bool, + /// Overwrite existing output files. 
+ #[arg(long)] + pub(crate) force: bool, +} + #[derive(Debug, Subcommand)] enum ConfigCommand { /// Initialize a Trusted Server config file from the example template. @@ -50,13 +74,19 @@ enum ConfigCommand { /// # Errors /// /// Returns an error when command parsing, config validation, `EdgeZero` -/// delegation, or config initialization fails. +/// delegation, audit collection, or config initialization fails. pub fn run_from_env() -> Result<(), String> { dispatch(Args::parse()) } fn dispatch(args: Args) -> Result<(), String> { match args.command { + Command::Audit(args) => { + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + let collector = BrowserAuditCollector; + crate::audit::run_audit(&args, &collector, &mut out) + } Command::Auth(args) => edgezero_cli::run_auth(&args), Command::Build(args) => edgezero_cli::run_build(&args), Command::Config(ConfigCommand::Init(args)) => run_config_init(&args), @@ -92,6 +122,64 @@ mod tests { Args::try_parse_from(args).expect("should parse args") } + #[test] + fn parses_audit_with_default_outputs() { + let args = parse(&["ts", "audit", "https://publisher.example"]); + let Command::Audit(audit) = args.command else { + panic!("expected audit command"); + }; + assert_eq!(audit.url, "https://publisher.example"); + assert_eq!(audit.js_assets, None); + assert_eq!(audit.config, None); + assert!(!audit.no_js_assets); + assert!(!audit.no_config); + assert!(!audit.force); + } + + #[test] + fn parses_audit_with_custom_outputs() { + let args = parse(&[ + "ts", + "audit", + "https://publisher.example", + "--js-assets", + "audit/js-assets.toml", + "--config", + "audit/trusted-server.toml", + "--no-js-assets", + "--no-config", + "--force", + ]); + let Command::Audit(audit) = args.command else { + panic!("expected audit command"); + }; + assert_eq!(audit.js_assets, Some(PathBuf::from("audit/js-assets.toml"))); + assert_eq!( + audit.config, + Some(PathBuf::from("audit/trusted-server.toml")) + ); + assert!(audit.no_js_assets); + 
assert!(audit.no_config); + assert!(audit.force); + } + + #[test] + fn audit_does_not_accept_adapter_option() { + let error = Args::try_parse_from([ + "ts", + "audit", + "https://publisher.example", + "--adapter", + "fastly", + ]) + .expect_err("should reject audit adapter option"); + assert!( + error.to_string().contains("unexpected argument") + || error.to_string().contains("Found argument"), + "error should explain unsupported option" + ); + } + #[test] fn parses_build_with_adapter_args() { let args = parse(&[ diff --git a/crates/trusted-server-integration-tests/Cargo.lock b/crates/trusted-server-integration-tests/Cargo.lock index b707b0ae..859cc799 100644 --- a/crates/trusted-server-integration-tests/Cargo.lock +++ b/crates/trusted-server-integration-tests/Cargo.lock @@ -18,19 +18,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "getrandom 0.3.4", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.4" @@ -690,9 +677,9 @@ dependencies = [ [[package]] name = "cssparser" -version = "0.34.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" +checksum = "4e901edd733a1472f944a45116df3f846f54d37e67e68640ac8bb69689aca2aa" dependencies = [ "cssparser-macros", "dtoa-short", @@ -871,17 +858,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "derive_more" -version = "0.99.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.118", -] - [[package]] name = "derive_more" version = "2.1.1" @@ -1109,9 +1085,9 @@ dependencies = [ [[package]] name 
= "ego-tree" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c6ba7d4eec39eaa9ab24d44a0e73a7949a1095a8b3f3abb11eddf27dbb56a53" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" [[package]] name = "either" @@ -1540,12 +1516,11 @@ dependencies = [ [[package]] name = "html5ever" -version = "0.29.1" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c" +checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" dependencies = [ "log", - "mac", "markup5ever", "match_token", ] @@ -2171,23 +2146,20 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.14.1" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18" +checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" dependencies = [ "log", - "phf 0.11.3", - "phf_codegen 0.11.3", - "string_cache", - "string_cache_codegen", "tendril", + "web_atoms", ] [[package]] name = "match_token" -version = "0.1.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" dependencies = [ "proc-macro2", "quote", @@ -3340,17 +3312,16 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.21.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0e749d29b2064585327af5038a5a8eb73aeebad4a3472e83531a436563f7208" +checksum = "e5f3a24d916e78954af99281a455168d4a9515d65eca99a18da1b813689c4ad9" dependencies = [ - "ahash", - 
"cssparser 0.34.0", + "cssparser 0.35.0", "ego-tree", "getopts", "html5ever", "precomputed-hash", - "selectors 0.26.0", + "selectors 0.31.0", "tendril", ] @@ -3392,13 +3363,13 @@ dependencies = [ [[package]] name = "selectors" -version = "0.26.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" +checksum = "5685b6ae43bfcf7d2e7dfcfb5d8e8f61b46442c902531e41a32a9a8bf0ee0fb6" dependencies = [ "bitflags 2.13.0", - "cssparser 0.34.0", - "derive_more 0.99.20", + "cssparser 0.35.0", + "derive_more", "fxhash", "log", "new_debug_unreachable", @@ -3417,7 +3388,7 @@ checksum = "2cfaaa6035167f0e604e42723c7650d59ee269ef220d7bbe0565602c8a0173b9" dependencies = [ "bitflags 2.13.0", "cssparser 0.36.0", - "derive_more 2.1.1", + "derive_more", "log", "new_debug_unreachable", "phf 0.13.1", @@ -4306,7 +4277,7 @@ dependencies = [ "chacha20poly1305", "chrono", "cookie", - "derive_more 2.1.1", + "derive_more", "ed25519-dalek", "edgezero-core", "error-stack", @@ -4344,7 +4315,7 @@ version = "0.1.0" dependencies = [ "axum", "bytes", - "derive_more 2.1.1", + "derive_more", "edgezero-adapter-axum", "edgezero-core", "env_logger", @@ -4740,6 +4711,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache", + "string_cache_codegen", +] + [[package]] name = "webpki-root-certs" version = "1.0.8" diff --git a/crates/trusted-server-integration-tests/Cargo.toml b/crates/trusted-server-integration-tests/Cargo.toml index 9f51a866..ad0a7af9 100644 --- a/crates/trusted-server-integration-tests/Cargo.toml +++ b/crates/trusted-server-integration-tests/Cargo.toml @@ -22,7 +22,7 @@ trusted-server-core = { path = "../trusted-server-core" } 
[dev-dependencies] testcontainers = { version = "0.25", features = ["blocking"] } reqwest = { version = "0.12", features = ["blocking", "cookies", "json"] } -scraper = "0.21" +scraper = "0.24.0" log = "0.4.33" error-stack = "0.6" derive_more = { version = "2.0", features = ["display"] } diff --git a/docs/guide/cli.md b/docs/guide/cli.md index 4438e613..d745c89d 100644 --- a/docs/guide/cli.md +++ b/docs/guide/cli.md @@ -1,7 +1,7 @@ # Trusted Server CLI The Trusted Server CLI binary is `ts`. It is a host-target operator tool for -configuration and EdgeZero-backed lifecycle commands. +configuration, page audits, and EdgeZero-backed lifecycle commands. ## Install from source @@ -68,3 +68,54 @@ ts provision --adapter fastly ts deploy --adapter fastly ts serve --adapter fastly ``` + +## Audit a public page + +`ts audit` loads a public page in a fresh headless Chrome/Chromium session, +collects rendered JavaScript asset evidence, detects known Trusted Server +integrations, and writes local draft artifacts. + +Chrome or Chromium must be installed locally. The command checks common PATH +names and standard macOS/Linux install locations. + +```bash +ts audit https://publisher.example +``` + +By default, the command writes: + +| File | Purpose | +| --------------------- | ------------------------------------------------------------------------ | +| `js-assets.toml` | JavaScript asset inventory, detected integrations, counts, and warnings. | +| `trusted-server.toml` | Draft Trusted Server config based on the starter template and final URL. | + +The generated config is a draft. 
Review it, replace placeholders/secrets, adjust +publisher-specific settings, then run: + +```bash +ts config validate +``` + +If a config already exists, avoid overwriting it: + +```bash +ts audit https://publisher.example --no-config +``` + +Use custom output paths when reviewing artifacts first: + +```bash +ts audit https://publisher.example \ + --js-assets audit/js-assets.toml \ + --config audit/trusted-server.toml +``` + +Use `--force` only when replacing existing output files is intentional: + +```bash +ts audit https://publisher.example --force +``` + +`ts audit` is not an EdgeZero adapter command. It has no `--adapter` option and +it does not provision resources, push config, build, deploy, or contact platform +APIs. diff --git a/docs/guide/getting-started.md b/docs/guide/getting-started.md index 91f02bec..893c5b18 100644 --- a/docs/guide/getting-started.md +++ b/docs/guide/getting-started.md @@ -13,6 +13,7 @@ Before you begin, ensure you have the following installed (versions are pinned i **For Fastly deployment** (optional for local dev): - Fastly {{FASTLY_VERSION}} CLI installed +- Chrome or Chromium, required for `ts audit` - A Fastly account and API key ## Installation @@ -109,6 +110,15 @@ Create a starter Trusted Server config with the `ts` CLI: ts config init ``` +To bootstrap from a public publisher page, run an audit first: + +```bash +ts audit https://publisher.example +``` + +The audit command writes `js-assets.toml` plus a draft `trusted-server.toml`. +Review the draft, replace placeholders/secrets, then validate it. 
+ Edit `trusted-server.toml` to configure: - Ad server integrations diff --git a/docs/superpowers/plans/2026-06-16-edgezero-based-ts-audit-implementation-plan.md b/docs/superpowers/plans/2026-06-16-edgezero-based-ts-audit-implementation-plan.md new file mode 100644 index 00000000..a91fb182 --- /dev/null +++ b/docs/superpowers/plans/2026-06-16-edgezero-based-ts-audit-implementation-plan.md @@ -0,0 +1,820 @@ +# EdgeZero-Based Trusted Server Audit CLI Implementation Plan + +**Date:** 2026-06-16 +**Status:** Approved implementation plan +**Spec:** `docs/superpowers/specs/2026-06-16-edgezero-based-ts-audit-design.md` +**Depends on:** base CLI pass from +`docs/superpowers/specs/2026-06-16-edgezero-based-ts-cli-design.md` + +## Current baseline + +The base CLI pass has added the host-target `trusted-server-cli` crate with: + +```text +crates/trusted-server-cli/ + Cargo.toml + src/args.rs + src/config_command.rs + src/edgezero_delegate.rs + src/error.rs + src/lib.rs + src/main.rs + src/run.rs +``` + +Important existing shapes to preserve: + +- The binary is `ts`. +- The implementation is gated to non-wasm targets in `lib.rs` and `main.rs`. +- `run_from_env()` parses process args and wires production services. +- `run_with_io()` supports testable invocation with injected writers. +- `run::dispatch()` currently injects an `EdgeZeroDelegate` for lifecycle/config + push tests. +- `config_command.rs` already embeds `trusted-server.example.toml` for + `config init`. +- `trusted-server.example.toml` now uses `example.com` sentinel values rather + than the old `test-publisher.com` values. +- `.gitignore` already ignores `trusted-server.toml`, but does not yet ignore + `js-assets.toml`. 
+ +The old implementation to port from is on `feature/ts-cli`: + +```text +crates/trusted-server-cli/src/audit.rs +crates/trusted-server-cli/src/audit/analyzer.rs +crates/trusted-server-cli/src/audit/browser_collector.rs +crates/trusted-server-cli/src/audit/collector.rs +``` + +This plan recreates that behavior on top of the new base CLI structure, while +applying the spec's tightening around output preflight, deterministic merge +behavior, and EdgeZero separation. + +## Decisions locked for this plan + +- `ts audit` is Trusted Server-owned, not an EdgeZero delegate. +- No `--adapter`, `--manifest`, `--store`, `--local`, `--dry-run`, or `--json` + options are added to audit v1. +- The command writes local draft artifacts only; it never provisions, pushes, + deploys, or contacts platform APIs. +- Preserve the old command surface: + - `ts audit <url>`; + - `--js-assets <path>`; + - `--config <path>`; + - `--no-js-assets`; + - `--no-config`; + - `--force`. +- Preserve the old artifact schema exactly enough that existing + `js-assets.toml` readers do not need a migration. +- Improve over the old implementation by preflighting selected output paths + before launching the browser and before writing any file. +- Use a fake collector in tests; unit tests must not require Chrome/Chromium. +- Browser smoke tests, if added, must be ignored by default or feature-gated. +- Generated `trusted-server.toml` is a draft. It may still fail production + validation until the operator replaces placeholders and reviews settings. +- Do not write rendered HTML, inline script bodies, cookies, storage, request + bodies, or response bodies to artifacts. +- Keep all browser automation dependencies host-only under + `trusted-server-cli`. +- Follow repository error/logging style: `error-stack::Report`, no `println!`, + output through injected `Write` handles in testable code. + +## Definition of done + +- `ts audit [options] <url>` appears in clap help and dispatches correctly.
+- URL validation accepts only `http` and `https` URLs. +- Default outputs are `js-assets.toml` and `trusted-server.toml`. +- `--no-js-assets` and `--no-config` work individually. +- Passing both no-output flags fails before browser collection. +- Existing outputs are rejected without `--force` before browser collection. +- If any selected output path conflicts, no selected file is written. +- Browser collector launches an isolated headless Chrome/Chromium session. +- Browser collector captures final URL, title, rendered HTML, DOM scripts, and + script resource timing entries. +- Navigation failures and non-`200..399` main-document statuses fail clearly. +- Page settle timeout continues with a warning. +- Analyzer merges HTML, DOM, and resource-timing script evidence. +- Assets and detected integrations are deduplicated and sorted deterministically. +- First-party/third-party classification matches the spec's host relationship + heuristic. +- Integration detectors match the old v1 detector set. +- `js-assets.toml` serializes the specified schema. +- Draft config generation patches current `trusted-server.example.toml` + sentinels, uses the final redirected URL, and appends manual-review comments. +- `ts audit` does not invoke any `EdgeZeroDelegate` or platform API. +- `.gitignore` ignores the default `js-assets.toml` artifact. +- CLI guide / getting-started docs mention the audit command and Chrome + requirement. +- Focused unit tests pass. +- Host-target CLI tests pass. +- Formatting passes. + +## Proposed module layout + +Add audit as an internal host-only module under the existing CLI crate: + +```text +crates/trusted-server-cli/src/ + audit.rs + audit/ + analyzer.rs + browser_collector.rs + collector.rs +``` + +Responsibilities: + +| File | Responsibility | +| ---------------------------- | ----------------------------------------------------------------------------- | +| `args.rs` | Add `Command::Audit(AuditArgs)` and parse audit flags. 
| +| `run.rs` | Dispatch audit via an injectable collector and stdout writer. | +| `audit.rs` | Command orchestration, output planning, file writes, draft config generation. | +| `audit/collector.rs` | `CollectedPage` data structs and `AuditCollector` trait. | +| `audit/analyzer.rs` | Convert `CollectedPage` to `AuditArtifact`; detection/classification. | +| `audit/browser_collector.rs` | Production Chrome/Chromium collector. | +| `Cargo.toml` | Add host-only audit dependencies. | +| `.gitignore` | Ignore default `js-assets.toml`. | +| docs | Document command usage and draft status. | + +## Data model sketch + +Port these old public/internal shapes with doc comments as needed for clippy: + +```rust +pub struct AuditArgs { + pub url: String, + pub js_assets: Option<PathBuf>, + pub config: Option<PathBuf>, + pub no_js_assets: bool, + pub no_config: bool, + pub force: bool, +} + +pub trait AuditCollector { + fn collect_page(&self, target_url: &Url) -> CliResult<CollectedPage>; +} + +pub struct CollectedPage { + pub requested_url: String, + pub final_url: String, + pub page_title: Option<String>, + pub html: String, + pub script_tags: Vec<CollectedScriptTag>, + pub network_requests: Vec<CollectedRequest>, + pub warnings: Vec<String>, +} + +pub struct AuditArtifact { + pub audited_url: String, + pub page_title: Option<String>, + pub js_asset_count: usize, + pub third_party_asset_count: usize, + pub detected_integrations: Vec<DetectedIntegration>, + pub assets: Vec<AuditedAsset>, + pub warnings: Vec<String>, +} +``` + +Keep serialization compatible with the old artifact: + +- `AssetParty` serializes with `#[serde(rename_all = "kebab-case")]`. +- `AuditedAsset.integration` remains `Option<String>`. +- `page_title` remains `Option<String>`. +- No `schema_version` in v1. + +## Service injection shape + +The current `dispatch()` injects only an `EdgeZeroDelegate`. To test audit +without launching Chrome, extend the dispatcher to inject both platform and audit +services.
+ +Preferred shape: + +```rust +struct CliServices<'a> { + edgezero: &'a mut dyn EdgeZeroDelegate, + audit: &'a dyn AuditCollector, +} + +fn dispatch( + args: Args, + services: &mut CliServices<'_>, + out: &mut dyn Write, + err: &mut dyn Write, +) -> CliResult<()>; +``` + +Production setup in `run_from_env()` and `run_with_io()`: + +```rust +let mut edgezero = ProductionEdgeZeroDelegate; +let audit = BrowserAuditCollector; +let mut services = CliServices { + edgezero: &mut edgezero, + audit: &audit, +}; +``` + +Tests can use: + +```rust +let mut edgezero = FakeEdgeZeroDelegate::default(); +let audit = FakeAuditCollector::new(collected_page); +let mut services = CliServices { + edgezero: &mut edgezero, + audit: &audit, +}; +``` + +This keeps the no-EdgeZero requirement testable: after dispatching `Command::Audit`, +assert fake EdgeZero lifecycle/push calls are empty. + +If introducing `CliServices` feels too large, an acceptable smaller alternative +is `dispatch_with_audit_collector(args, delegate, collector, out, err)` used by +production and tests. Avoid global mutable test hooks. + +## Dependencies + +Add only host-target CLI dependencies. + +Likely additions to root `[workspace.dependencies]`: + +```toml +chromiumoxide = "" +scraper = "0.21" # or current compatible version +``` + +Likely additions to `crates/trusted-server-cli/Cargo.toml` under +`target.'cfg(not(target_arch = "wasm32"))'.dependencies`: + +```toml +chromiumoxide = { workspace = true } +futures = { workspace = true } +regex = { workspace = true } +scraper = { workspace = true } +tokio = { workspace = true } +url = { workspace = true } +which = { workspace = true } +``` + +Existing workspace dependencies already include `futures`, `regex`, `tokio`, +`url`, and `which`. 
Confirm `tokio` features are sufficient for the browser +collector: + +- current workspace features include `rt`, `time`, `macros`, `io-util`, and + `sync`; +- browser collector needs current-thread runtime and timers; +- if `chromiumoxide` requires extra Tokio features, add only the minimum + host-safe features needed. + +Dependency constraints: + +- Do not add these dependencies to runtime crates. +- Do not make `trusted-server-core` depend on browser automation or HTML + scraping crates. +- Keep the CLI crate wasm stub compiling by leaving all real audit modules under + `#[cfg(not(target_arch = "wasm32"))]` via `lib.rs` module gating. + +## Stage 1 — Add CLI argument surface + +Files: + +- `crates/trusted-server-cli/src/args.rs` +- `crates/trusted-server-cli/src/run.rs` + +Steps: + +1. Add `Command::Audit(AuditArgs)` to `args.rs`. +2. Add `AuditArgs` with: + - positional `url: String`; + - `#[arg(long)] js_assets: Option<PathBuf>`; + - `#[arg(long)] config: Option<PathBuf>`; + - `#[arg(long)] no_js_assets: bool`; + - `#[arg(long)] no_config: bool`; + - `#[arg(long)] force: bool`. +3. Use clap's default kebab-case flag names, so the struct field `js_assets` + maps to `--js-assets`. +4. Add parser tests: + - parses default audit URL; + - parses all custom options; + - `--no-js-assets` and `--no-config` can each parse; + - audit does not accept `--adapter`. +5. Add a dispatch match arm that calls `audit::run_audit()` with the injected + collector. +6. Ensure existing delegate command parser tests remain unchanged. + +Do not implement browser collection in this stage. + +## Stage 2 — Add audit module scaffold and output planning + +Files: + +- `crates/trusted-server-cli/src/lib.rs` +- `crates/trusted-server-cli/src/audit.rs` +- `crates/trusted-server-cli/src/audit/collector.rs` + +Steps: + +1. Register `mod audit;` in `lib.rs` under the existing non-wasm module gate. +2. Add collector data structs and `AuditCollector` trait. +3.
Add `AuditOutputPlan` in `audit.rs`: + + ```rust + struct AuditOutputPlan { + js_assets_path: Option, + config_path: Option, + } + ``` + +4. Add `parse_audit_url(value: &str) -> CliResult`. +5. Add `resolve_output_plan(args: &AuditArgs) -> CliResult`. +6. Rules for `resolve_output_plan()`: + - reject both `no_js_assets` and `no_config`; + - default JS asset path to `js-assets.toml` unless disabled; + - default config path to `trusted-server.toml` unless disabled; + - resolve relative paths against `std::env::current_dir()`; + - preserve absolute paths; + - reject existing selected paths unless `force`; + - create no directories yet, or create only after all selected paths pass the + conflict check. +7. Add `prepare_output_paths(plan)` or integrate directory creation after + successful preflight. +8. Tests: + - URL parsing accepts HTTP/HTTPS; + - URL parsing rejects `file:`, `data:`, `chrome:`; + - both no-output flags reject with a clear message; + - default and custom paths resolve as expected; + - existing file fails without `--force`; + - existing file passes with `--force`; + - one conflicting path prevents all writes. + +Implementation note: keep path planning separate from browser collection so a +fake collector can record whether it was called. Use that to prove conflicts +short-circuit before collection. + +## Stage 3 — Port analyzer and artifact schema + +Files: + +- `crates/trusted-server-cli/src/audit.rs` +- `crates/trusted-server-cli/src/audit/analyzer.rs` + +Steps: + +1. Add serializable artifact structs in `audit.rs`: + - `AssetParty`; + - `AuditedAsset`; + - `DetectedIntegration`; + - `AuditArtifact`; + - `AuditOutputs`. +2. Port `analyze_collected_page()` from the old branch. +3. Preserve these analysis inputs: + - rendered HTML `