Skip to content

Commit 53f98d1

Browse files
committed
[GPU/D3D12] Memexport from anywhere in control flow + 8/16bpp memexport
There's no limit on the number of memory exports in a shader on the real Xenos, and exports can be done anywhere, including in loops. Now, instead of deferring the exports to the end of the shader and assuming that export allocs are executed only once, Xenia flushes exports when it reaches an alloc, when it reaches the end of the shader, or when a pixel with outstanding exports is killed. (On Xenos, memory exports are terminated by allocs as well as by individual ALU instructions with `serialize`; the latter case is not handled for simplicity, since it's only truly mandatory to flush memory exports before starting a new one.) To determine which eM# registers need to be flushed to memory, the successors of each exec that potentially writes any eM# are traversed, and each control flow instruction reached is marked with the eM# registers that might have been written before it, until a flush point or the end of the shader is reached. Also, some games export to sub-32bpp formats. These are now supported via an atomic AND that clears the bits of the dword to be replaced, followed by an atomic OR that inserts the new byte/short.
1 parent 8aaa6f1 commit 53f98d1

17 files changed

+1388
-800
lines changed

src/xenia/gpu/d3d12/d3d12_command_processor.cc

Lines changed: 23 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -2125,7 +2125,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
21252125
return false;
21262126
}
21272127
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
2128-
bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
2128+
bool memexport_used_vertex = vertex_shader->memexport_eM_written();
21292129

21302130
// Pixel shader analysis.
21312131
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
@@ -2154,7 +2154,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
21542154
}
21552155
}
21562156
bool memexport_used_pixel =
2157-
pixel_shader && pixel_shader->is_valid_memexport_used();
2157+
pixel_shader && pixel_shader->memexport_eM_written();
21582158
bool memexport_used = memexport_used_vertex || memexport_used_pixel;
21592159

21602160
if (!BeginSubmission(true)) {
@@ -2341,100 +2341,20 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
23412341
// Gather memexport ranges and ensure the heaps for them are resident, and
23422342
// also load the data surrounding the export and to fill the regions that
23432343
// won't be modified by the shaders.
2344-
struct MemExportRange {
2345-
uint32_t base_address_dwords;
2346-
uint32_t size_dwords;
2347-
};
2348-
MemExportRange memexport_ranges[512];
2349-
uint32_t memexport_range_count = 0;
2344+
memexport_ranges_.clear();
23502345
if (memexport_used_vertex) {
2351-
for (uint32_t constant_index :
2352-
vertex_shader->memexport_stream_constants()) {
2353-
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
2354-
XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
2355-
if (memexport_stream.index_count == 0) {
2356-
continue;
2357-
}
2358-
uint32_t memexport_format_size =
2359-
GetSupportedMemExportFormatSize(memexport_stream.format);
2360-
if (memexport_format_size == 0) {
2361-
XELOGE("Unsupported memexport format {}",
2362-
FormatInfo::Get(
2363-
xenos::TextureFormat(uint32_t(memexport_stream.format)))
2364-
->name);
2365-
return false;
2366-
}
2367-
uint32_t memexport_size_dwords =
2368-
memexport_stream.index_count * memexport_format_size;
2369-
// Try to reduce the number of shared memory operations when writing
2370-
// different elements into the same buffer through different exports
2371-
// (happens in 4D5307E6).
2372-
bool memexport_range_reused = false;
2373-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2374-
MemExportRange& memexport_range = memexport_ranges[i];
2375-
if (memexport_range.base_address_dwords ==
2376-
memexport_stream.base_address) {
2377-
memexport_range.size_dwords =
2378-
std::max(memexport_range.size_dwords, memexport_size_dwords);
2379-
memexport_range_reused = true;
2380-
break;
2381-
}
2382-
}
2383-
// Add a new range if haven't expanded an existing one.
2384-
if (!memexport_range_reused) {
2385-
MemExportRange& memexport_range =
2386-
memexport_ranges[memexport_range_count++];
2387-
memexport_range.base_address_dwords = memexport_stream.base_address;
2388-
memexport_range.size_dwords = memexport_size_dwords;
2389-
}
2390-
}
2346+
draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_);
23912347
}
23922348
if (memexport_used_pixel) {
2393-
for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
2394-
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
2395-
XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
2396-
if (memexport_stream.index_count == 0) {
2397-
continue;
2398-
}
2399-
uint32_t memexport_format_size =
2400-
GetSupportedMemExportFormatSize(memexport_stream.format);
2401-
if (memexport_format_size == 0) {
2402-
XELOGE("Unsupported memexport format {}",
2403-
FormatInfo::Get(
2404-
xenos::TextureFormat(uint32_t(memexport_stream.format)))
2405-
->name);
2406-
return false;
2407-
}
2408-
uint32_t memexport_size_dwords =
2409-
memexport_stream.index_count * memexport_format_size;
2410-
bool memexport_range_reused = false;
2411-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2412-
MemExportRange& memexport_range = memexport_ranges[i];
2413-
if (memexport_range.base_address_dwords ==
2414-
memexport_stream.base_address) {
2415-
memexport_range.size_dwords =
2416-
std::max(memexport_range.size_dwords, memexport_size_dwords);
2417-
memexport_range_reused = true;
2418-
break;
2419-
}
2420-
}
2421-
if (!memexport_range_reused) {
2422-
MemExportRange& memexport_range =
2423-
memexport_ranges[memexport_range_count++];
2424-
memexport_range.base_address_dwords = memexport_stream.base_address;
2425-
memexport_range.size_dwords = memexport_size_dwords;
2426-
}
2427-
}
2349+
draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_);
24282350
}
2429-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2430-
const MemExportRange& memexport_range = memexport_ranges[i];
2351+
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
24312352
if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
2432-
memexport_range.size_dwords << 2)) {
2353+
memexport_range.size_bytes)) {
24332354
XELOGE(
24342355
"Failed to request memexport stream at 0x{:08X} (size {}) in the "
24352356
"shared memory",
2436-
memexport_range.base_address_dwords << 2,
2437-
memexport_range.size_dwords << 2);
2357+
memexport_range.base_address_dwords << 2, memexport_range.size_bytes);
24382358
return false;
24392359
}
24402360
}
@@ -2594,17 +2514,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
25942514
// when memexports should be awaited?
25952515
shared_memory_->MarkUAVWritesCommitNeeded();
25962516
// Invalidate textures in memexported memory and watch for changes.
2597-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2598-
const MemExportRange& memexport_range = memexport_ranges[i];
2517+
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
25992518
shared_memory_->RangeWrittenByGpu(
2600-
memexport_range.base_address_dwords << 2,
2601-
memexport_range.size_dwords << 2, false);
2519+
memexport_range.base_address_dwords << 2, memexport_range.size_bytes,
2520+
false);
26022521
}
26032522
if (cvars::d3d12_readback_memexport) {
26042523
// Read the exported data on the CPU.
26052524
uint32_t memexport_total_size = 0;
2606-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2607-
memexport_total_size += memexport_ranges[i].size_dwords << 2;
2525+
for (const draw_util::MemExportRange& memexport_range :
2526+
memexport_ranges_) {
2527+
memexport_total_size += memexport_range.size_bytes;
26082528
}
26092529
if (memexport_total_size != 0) {
26102530
ID3D12Resource* readback_buffer =
@@ -2614,9 +2534,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
26142534
SubmitBarriers();
26152535
ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
26162536
uint32_t readback_buffer_offset = 0;
2617-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2618-
const MemExportRange& memexport_range = memexport_ranges[i];
2619-
uint32_t memexport_range_size = memexport_range.size_dwords << 2;
2537+
for (const draw_util::MemExportRange& memexport_range :
2538+
memexport_ranges_) {
2539+
uint32_t memexport_range_size = memexport_range.size_bytes;
26202540
deferred_command_list_.D3DCopyBufferRegion(
26212541
readback_buffer, readback_buffer_offset, shared_memory_buffer,
26222542
memexport_range.base_address_dwords << 2, memexport_range_size);
@@ -2629,14 +2549,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
26292549
void* readback_mapping;
26302550
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
26312551
&readback_mapping))) {
2632-
const uint32_t* readback_dwords =
2633-
reinterpret_cast<const uint32_t*>(readback_mapping);
2634-
for (uint32_t i = 0; i < memexport_range_count; ++i) {
2635-
const MemExportRange& memexport_range = memexport_ranges[i];
2552+
const uint8_t* readback_bytes =
2553+
reinterpret_cast<const uint8_t*>(readback_mapping);
2554+
for (const draw_util::MemExportRange& memexport_range :
2555+
memexport_ranges_) {
26362556
std::memcpy(memory_->TranslatePhysical(
26372557
memexport_range.base_address_dwords << 2),
2638-
readback_dwords, memexport_range.size_dwords << 2);
2639-
readback_dwords += memexport_range.size_dwords;
2558+
readback_bytes, memexport_range.size_bytes);
2559+
readback_bytes += memexport_range.size_bytes;
26402560
}
26412561
D3D12_RANGE readback_write_range = {};
26422562
readback_buffer->Unmap(0, &readback_write_range);
@@ -4510,36 +4430,6 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader,
45104430
return true;
45114431
}
45124432

4513-
uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize(
4514-
xenos::ColorFormat format) {
4515-
switch (format) {
4516-
case xenos::ColorFormat::k_8_8_8_8:
4517-
case xenos::ColorFormat::k_2_10_10_10:
4518-
// TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the
4519-
// texture cache currently.
4520-
// case xenos::ColorFormat::k_8_8_8_8_A:
4521-
case xenos::ColorFormat::k_10_11_11:
4522-
case xenos::ColorFormat::k_11_11_10:
4523-
case xenos::ColorFormat::k_16_16:
4524-
case xenos::ColorFormat::k_16_16_FLOAT:
4525-
case xenos::ColorFormat::k_32_FLOAT:
4526-
case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16:
4527-
case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16:
4528-
case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16:
4529-
case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16:
4530-
return 1;
4531-
case xenos::ColorFormat::k_16_16_16_16:
4532-
case xenos::ColorFormat::k_16_16_16_16_FLOAT:
4533-
case xenos::ColorFormat::k_32_32_FLOAT:
4534-
return 2;
4535-
case xenos::ColorFormat::k_32_32_32_32_FLOAT:
4536-
return 4;
4537-
default:
4538-
break;
4539-
}
4540-
return 0;
4541-
}
4542-
45434433
ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
45444434
if (size == 0) {
45454435
return nullptr;

src/xenia/gpu/d3d12/d3d12_command_processor.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <string>
1919
#include <unordered_map>
2020
#include <utility>
21+
#include <vector>
2122

2223
#include "xenia/base/assert.h"
2324
#include "xenia/gpu/command_processor.h"
@@ -378,13 +379,6 @@ class D3D12CommandProcessor : public CommandProcessor {
378379
ID3D12RootSignature* root_signature,
379380
bool shared_memory_is_uav);
380381

381-
// Returns dword count for one element for a memexport format, or 0 if it's
382-
// not supported by the D3D12 command processor (if it's smaller that 1 dword,
383-
// for instance).
384-
// TODO(Triang3l): Check if any game uses memexport with formats smaller than
385-
// 32 bits per element.
386-
static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format);
387-
388382
// Returns a buffer for reading GPU data back to the CPU. Assuming
389383
// synchronizing immediately after use. Always in COPY_DEST state.
390384
ID3D12Resource* RequestReadbackBuffer(uint32_t size);
@@ -684,6 +678,9 @@ class D3D12CommandProcessor : public CommandProcessor {
684678

685679
// Current primitive topology.
686680
D3D_PRIMITIVE_TOPOLOGY primitive_topology_;
681+
682+
// Temporary storage for memexport stream constants used in the draw.
683+
std::vector<draw_util::MemExportRange> memexport_ranges_;
687684
};
688685

689686
} // namespace d3d12

src/xenia/gpu/draw_util.cc

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
******************************************************************************
33
* Xenia : Xbox 360 Emulator Research Project *
44
******************************************************************************
5-
* Copyright 2022 Ben Vanik. All rights reserved. *
5+
* Copyright 2023 Ben Vanik. All rights reserved. *
66
* Released under the BSD license - see LICENSE in the root for more details. *
77
******************************************************************************
88
*/
@@ -141,7 +141,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
141141
//
142142
// Memory export is an obvious intentional side effect.
143143
if (shader.kills_pixels() || shader.writes_depth() ||
144-
shader.is_valid_memexport_used() ||
144+
shader.memexport_eM_written() ||
145145
(shader.writes_color_target(0) &&
146146
DoesCoverageDependOnAlpha(regs.Get<reg::RB_COLORCONTROL>()))) {
147147
return true;
@@ -651,6 +651,65 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
651651
return normalized_color_mask;
652652
}
653653

654+
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
655+
std::vector<MemExportRange>& ranges_out) {
656+
if (!shader.memexport_eM_written()) {
657+
// The shader has eA writes, but no real exports.
658+
return;
659+
}
660+
uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex
661+
? regs.Get<reg::SQ_VS_CONST>().base
662+
: regs.Get<reg::SQ_PS_CONST>().base;
663+
for (uint32_t constant_index : shader.memexport_stream_constants()) {
664+
const auto& stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
665+
XE_GPU_REG_SHADER_CONSTANT_000_X +
666+
(float_constants_base + constant_index) * 4);
667+
if (!stream.index_count) {
668+
continue;
669+
}
670+
const FormatInfo& format_info =
671+
*FormatInfo::Get(xenos::TextureFormat(stream.format));
672+
if (format_info.type != FormatType::kResolvable) {
673+
XELOGE("Unsupported memexport format {}", format_info.name);
674+
// Translated shaders shouldn't be performing exports with an unknown
675+
// format, the draw can still be performed.
676+
continue;
677+
}
678+
// TODO(Triang3l): Remove the unresearched format logging when it's known
679+
// how exactly these formats need to be handled (most importantly what
680+
// components need to be stored and in which order).
681+
switch (stream.format) {
682+
case xenos::ColorFormat::k_8_A:
683+
case xenos::ColorFormat::k_8_B:
684+
case xenos::ColorFormat::k_8_8_8_8_A:
685+
XELOGW(
686+
"Memexport done to an unresearched format {}, report the game to "
687+
"Xenia developers!",
688+
format_info.name);
689+
break;
690+
default:
691+
break;
692+
}
693+
uint32_t stream_size_bytes =
694+
stream.index_count * (format_info.bits_per_pixel >> 3);
695+
// Try to reduce the number of shared memory operations when writing
696+
// different elements into the same buffer through different exports
697+
// (happens in 4D5307E6).
698+
bool range_reused = false;
699+
for (MemExportRange& range : ranges_out) {
700+
if (range.base_address_dwords == stream.base_address) {
701+
range.size_bytes = std::max(range.size_bytes, stream_size_bytes);
702+
range_reused = true;
703+
break;
704+
}
705+
}
706+
// Add a new range if haven't expanded an existing one.
707+
if (!range_reused) {
708+
ranges_out.emplace_back(stream.base_address, stream_size_bytes);
709+
}
710+
}
711+
}
712+
654713
xenos::CopySampleSelect SanitizeCopySampleSelect(
655714
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
656715
bool is_depth) {

src/xenia/gpu/draw_util.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <cmath>
1414
#include <cstdint>
1515
#include <utility>
16+
#include <vector>
1617

1718
#include "xenia/base/assert.h"
1819
#include "xenia/gpu/register_file.h"
@@ -330,6 +331,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
330331
return guest_sample_index ? 3 : 0;
331332
}
332333

334+
struct MemExportRange {
335+
uint32_t base_address_dwords;
336+
uint32_t size_bytes;
337+
338+
explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes)
339+
: base_address_dwords(base_address_dwords), size_bytes(size_bytes) {}
340+
};
341+
342+
// Gathers memory ranges involved in memexports in the shader with the float
343+
// constants from the registers, adding them to ranges_out.
344+
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
345+
std::vector<MemExportRange>& ranges_out);
346+
333347
// To avoid passing values that the shader won't understand (even though
334348
// Direct3D 9 shouldn't pass them anyway).
335349
xenos::CopySampleSelect SanitizeCopySampleSelect(

0 commit comments

Comments
 (0)