diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index ecec74fc21..33d6eec7cb 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -1044,17 +1044,22 @@ std::shared_ptr> make_with_new_trange( // build each target tile in one pass: a single source lookup per cell // sizes it and fills it together (no separate all-ranges walk). const auto outer_range = target_trange.make_tile_range(target_ord); - ArenaToTBuilder builder(outer_range); const std::size_t n = outer_range.volume(); + using InnerRange = typename Tile::value_type::range_type; + auto range_fn = [&](std::size_t o) -> InnerRange { + const auto* sc = source_cell_at(outer_range.idx(o)); + return (!sc || sc->empty()) ? InnerRange{} : sc->range(); + }; + Tile tile = arena_outer_init(outer_range, 1, range_fn); for (std::size_t o = 0; o < n; ++o) { + auto& cell = tile.data()[o]; + if (cell.empty()) continue; // deliberately-null cell const auto* sc = source_cell_at(outer_range.idx(o)); - if (!sc || sc->empty()) continue; // leaves a deliberately-null cell - auto& cell = builder.emplace(o, sc->range()); const auto* s = sc->data(); auto* d = cell.data(); for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p]; } - target_array.set(target_ord, std::move(builder).finish()); + target_array.set(target_ord, std::move(tile)); } target_array.world().gop.fence(); } else { diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h index 403de9ffd4..80e3373f84 100644 --- a/src/TiledArray/tensor/arena_kernels.h +++ b/src/TiledArray/tensor/arena_kernels.h @@ -17,7 +17,10 @@ #include "TiledArray/tensor/arena_tensor.h" #include +#include #include +#include +#include #include #include #include @@ -51,6 +54,62 @@ std::shared_ptr make_outer_data( } // namespace +/// Single-page invariant check for arena ToT outer tiles, env-gated by +/// `TA_ASSERT_SINGLE_PAGE` (zero overhead when unset). A size-determinable ToT +/// outer tile must occupy at most one arena page; `Arena::page_count() > 1` +/// means an incremental build spilled across pages and the strided-BLAS fast +/// path would silently revert to per-cell AXPY. On violation we print the +/// offending site and throw a TiledArray::Exception so the run fails loudly. +/// When the gate is on, a summary (tiles checked, violations) is printed at +/// process exit (on a clean exit; the throw path skips it). Using +/// `page_count()` -- not `classify_run` -- so the check is valid for tiles with +/// null or non-uniform inner cells (which are still single-page). +inline bool arena_single_page_assert_enabled() { + static const bool on = [] { + const char* e = std::getenv("TA_ASSERT_SINGLE_PAGE"); + return e != nullptr && e[0] != '\0' && e[0] != '0'; + }(); + return on; +} + +inline std::atomic& arena_single_page_check_count() { + static std::atomic c{0}; + return c; +} + +inline std::atomic& arena_single_page_violation_count() { + static std::atomic c{0}; + return c; +} + +inline void arena_assert_single_page(const Arena& arena, const char* where) { + if (!arena_single_page_assert_enabled()) return; + static const bool registered = [] { + std::atexit([] { + std::fprintf( + stderr, + "[TA_ASSERT_SINGLE_PAGE] checked %zu arena ToT outer tile(s), " + "%zu multi-page violation(s)\n", + arena_single_page_check_count().load(std::memory_order_relaxed), + arena_single_page_violation_count().load(std::memory_order_relaxed)); + }); + return true; + }(); + (void)registered; + arena_single_page_check_count().fetch_add(1, std::memory_order_relaxed); + const std::size_t pages = arena.page_count(); + if (pages > 1) { + arena_single_page_violation_count().fetch_add(1, std::memory_order_relaxed); + std::fprintf(stderr, + "[TA_ASSERT_SINGLE_PAGE] VIOLATION at %s: arena ToT outer tile " + "spans %zu pages (expected <= 1)\n", + where, pages); + TA_EXCEPTION( + "TA_ASSERT_SINGLE_PAGE: arena ToT outer tile spans multiple arena " + "pages -- a size-determinable ToT must be single-page"); + } +} + /// Allocate an arena-backed ToT outer tile with caller-provided inner ranges. /// /// `inner_range_fn(cell_ordinal)` -> inner `range_type` for each cell ordinal @@ -138,6 +197,7 @@ OuterTensor arena_outer_init( InnerT(r, std::shared_ptr(h, reinterpret_cast(h.get()))); } } + arena_assert_single_page(*arena_ptr, "arena_outer_init"); return result; } @@ -229,6 +289,7 @@ class ArenaToTBuilder { /// Finalize and hand back the assembled outer tile; the builder is spent. OuterTensor finish() && { + arena_assert_single_page(*arena_, "ArenaToTBuilder::finish"); return OuterTensor(outer_range_, batch_sz_, std::move(data_)); } @@ -250,21 +311,28 @@ class ArenaToTBuilder { /// to the next -- no separate all-ranges walk. A zero-volume inner range /// yields a deliberately-null cell, which `inner_fill_fn` is not invoked on. /// Cells are zero-initialized, so the default no-op fill still leaves zeroed -/// storage. Backed by `ArenaToTBuilder`. +/// storage. Backed by the up-front single-page `arena_outer_init`. template OuterTensor make_nested_tile( const typename OuterTensor::range_type& outer_range, InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) { - ArenaToTBuilder builder(outer_range, /*batch_sz=*/1, - /*zero_init=*/true); + // Up-front, single contiguous page: pre-walk ranges (cheap headers only), + // reserve one exact slab, fill in place. inner_range_fn/inner_fill_fn are + // random-access (idx-driven), so there is no single-pass data to buffer and + // no peak-memory doubling -- only the genuinely single-pass init_tiles path + // needs the incremental ArenaToTBuilder. + auto cell_range_fn = [&](std::size_t ord) { + return inner_range_fn(outer_range.idx(ord)); + }; + OuterTensor result = + arena_outer_init(outer_range, 1, cell_range_fn); const std::size_t N = outer_range.volume(); for (std::size_t ord = 0; ord < N; ++ord) { - const auto idx = outer_range.idx(ord); - auto& cell = builder.emplace(ord, inner_range_fn(idx)); - if (!cell.empty()) inner_fill_fn(cell, idx); + auto& cell = result.data()[ord]; + if (!cell.empty()) inner_fill_fn(cell, outer_range.idx(ord)); } - return std::move(builder).finish(); + return result; } /// Apply a unary fill op while preserving each source inner range. diff --git a/tests/tot_construction.cpp b/tests/tot_construction.cpp index 3084f7e875..9968318378 100644 --- a/tests/tot_construction.cpp +++ b/tests/tot_construction.cpp @@ -36,6 +36,17 @@ auto inner_range_2d(std::size_t d0, std::size_t d1) { return typename InnerTile::range_type(std::vector{d0, d1}); } +// Assert an arena ToT outer tile is a single contiguous page: uniform-size +// cells at constant page-jump-free stride. classify_run returns 0 when clean, +// 3 on a page jump (multi-page build). +template +static void check_single_page_uniform(const OuterTile& tile) { + const std::size_t n = tile.range().volume(); + const int code = TA::detail::classify_run( + [&](std::size_t i) -> decltype(auto) { return tile.data()[i]; }, n); + BOOST_CHECK_EQUAL(code, 0); // 0 == clean single-page constant stride +} + template void verify_cell(const InnerTile& cell, long e, bool expect_filled) { BOOST_REQUIRE(!cell.empty()); @@ -778,6 +789,25 @@ BOOST_AUTO_TEST_CASE(arena_tile_bipartite_permute) { test_arena_tile_permute(); } +BOOST_AUTO_TEST_CASE(make_nested_tile_arena_single_page) { + using Inner = TA::ArenaTensor; + using OuterTile = TA::Tensor; + // 256 cells x 64 doubles = 128 KiB > 64 KiB default page, so a multi-page + // builder spills; a single-page allocator does not. + const std::size_t ncells = 256, isize = 64; + TA::Range outer(std::array{ncells}); + auto range_fn = [isize](const TA::Range::index_type&) { + return Inner::range_type(std::vector{isize}); + }; + auto fill = [](Inner& cell, const TA::Range::index_type&) { + for (std::size_t p = 0; p < cell.size(); ++p) cell[p] = double(p); + }; + OuterTile tile = + TA::detail::make_nested_tile(outer, range_fn, fill); + BOOST_REQUIRE_EQUAL(ncells * isize * sizeof(double), std::size_t(128 * 1024)); + check_single_page_uniform(tile); +} + BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE(tot_construction_dist_suite, TA_UT_LABEL_DISTRIBUTED) @@ -786,4 +816,29 @@ BOOST_AUTO_TEST_CASE(arena_tot_remote_tile_transport) { test_distributed_arena_tot(); } +BOOST_AUTO_TEST_CASE(retile_arena_tot_single_page) { + using Inner = TA::ArenaTensor; + using OuterTile = TA::Tensor; + using Array = TA::DistArray; + auto& world = *GlobalFixture::world; + const std::size_t isize = 64; // 256 cells x 64 doubles = 128 KiB > page + TA::TiledRange src_tr{{0, 256}}; // 256 cells in one outer tile + Array src(world, src_tr); + src.init_tiles_nested( + [isize](const auto&) { + return Inner::range_type(std::vector{isize}); + }, + [](Inner& c, const auto&) { + for (std::size_t p = 0; p < c.size(); ++p) c[p] = double(p); + }); + world.gop.fence(); + TA::TiledRange dst_tr{{0, 128, 256}}; // ncells/2, ncells -> 2 tiles + Array dst = TA::retile(src, dst_tr); + world.gop.fence(); + for (const auto ord : *dst.pmap()) { + if (dst.is_zero(ord)) continue; + check_single_page_uniform(dst.find(ord).get()); + } +} + BOOST_AUTO_TEST_SUITE_END()