Skip to content

Commit 95fd45d

Browse files
committed
Betsy: Implement BC4 compression
1 parent e4e024a commit 95fd45d

File tree

7 files changed

+221
-26
lines changed

7 files changed

+221
-26
lines changed

core/io/image.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2751,7 +2751,7 @@ Error Image::compress_from_channels(CompressMode p_mode, UsedChannels p_channels
27512751

27522752
case COMPRESS_S3TC: {
27532753
// BC3 is unsupported currently.
2754-
if ((p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) {
2754+
if ((p_channels == USED_CHANNELS_R || p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) {
27552755
Error result = _image_compress_bc_rd_func(this, p_channels);
27562756

27572757
// If the image was compressed successfully, we return here. If not, we fall back to the default compression scheme.

doc/classes/ProjectSettings.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2903,8 +2903,8 @@
29032903
</member>
29042904
<member name="rendering/textures/vram_compression/compress_with_gpu" type="bool" setter="" getter="" default="true">
29052905
If [code]true[/code], the texture importer will utilize the GPU for compressing textures, improving the import time of large images.
2906-
[b]Note:[/b] This setting requires either Vulkan or D3D12 available as a rendering backend.
2907-
[b]Note:[/b] Currently this only affects BC1 and BC6H compression, which are used on Desktop and Console for fully opaque and HDR images respectively.
2906+
[b]Note:[/b] This setting only takes effect on devices that support Vulkan, D3D12, or Metal as a rendering backend.
2907+
[b]Note:[/b] Currently this only affects certain compressed formats (BC1, BC4, and BC6H), all of which are exclusive to desktop platforms and consoles.
29082908
</member>
29092909
<member name="rendering/textures/vram_compression/import_etc2_astc" type="bool" setter="" getter="" default="false">
29102910
If [code]true[/code], the texture importer will import VRAM-compressed textures using the Ericsson Texture Compression 2 algorithm for lower quality textures and normal maps and Adaptable Scalable Texture Compression algorithm for high quality textures (in 4×4 block size).

modules/betsy/SCsub

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Import("env_modules")
55
env_betsy = env_modules.Clone()
66
env_betsy.GLSL_HEADER("bc6h.glsl")
77
env_betsy.GLSL_HEADER("bc1.glsl")
8+
env_betsy.GLSL_HEADER("bc4.glsl")
89
env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"])
910

1011
# Thirdparty source files

modules/betsy/bc4.glsl

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#[versions]
2+
3+
unsigned = "";
4+
signed = "#define SNORM";
5+
6+
#[compute]
7+
#version 450
8+
9+
#include "CrossPlatformSettings_piece_all.glsl"
10+
#include "UavCrossPlatform_piece_all.glsl"
11+
12+
#VERSION_DEFINES
13+
14+
// Per-workgroup scratch for the min/max reduction: main() indexes it as
// (local z << 4) + (local y << 2) + local x, i.e. 4 slots (one per thread)
// for each of the 4x4 blocks handled by a workgroup.
shared float2 g_minMaxValues[4u * 4u * 4u];
15+
// Per-workgroup index-bit accumulators: one uint2 (64 bits) per block,
// indexed in main() as (local z << 2) + local y.
shared uint2 g_mask[4u * 4u];
16+
17+
// Source image sampled by main() through OGRE_Load2D.
layout(binding = 0) uniform sampler2D srcTex;
18+
// Destination image; main() stores one uint2 (64-bit compressed block) per texel.
layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
19+
20+
// Push constants. p_channelIdx selects which source channel main() compresses
// (0 -> x, 1 -> y, anything else -> w); p_padding is not read by this shader.
layout(push_constant, std430) uniform Params {
21+
uint p_channelIdx;
22+
uint p_padding[3];
23+
}
24+
params;
25+
26+
// Workgroup shape: x is the thread index inside a block (4 threads per block),
// y/z select the block, so one workgroup covers 4x4 blocks.
layout(local_size_x = 4, //
27+
local_size_y = 4, //
28+
local_size_z = 4) in;
30+
/// Each block is 16 pixels
31+
/// Each thread works on 4 pixels
32+
/// Therefore each block needs 4 threads, generating 8 masks
33+
/// At the end these 8 masks get merged into 2 and results written to output
34+
///
35+
/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
36+
///
37+
/// A: It's a sweet spot.
38+
/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
39+
/// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
40+
/// overhead, and also more LDS usage which reduces occupancy.
41+
/// - Long threads (e.g. 1 thread per block) miss parallelism opportunities
42+
// Compresses one channel of srcTex into 64-bit blocks of 4x4 pixels.
// Four threads cooperate on each block (one row of 4 pixels per thread):
//   1. load the row and compute its local min/max,
//   2. reduce min/max across the block's 4 threads through shared memory,
//   3. compute a 3-bit index per pixel and OR the bits into a shared 64-bit mask,
//   4. thread 0 packs the two endpoints into the low 16 bits, merges the mask
//      and writes the block to dstTexture.
// NOTE(review): there is no bounds check against the image size here; behavior
// for dispatches that overhang the image depends on OGRE_Load2D / imageStore
// semantics not visible in this file - confirm.
void main() {
43+
float minVal, maxVal;
44+
// The 4 pixel values (one row of the block) owned by this thread.
float4 srcPixel;
45+
46+
// Index of this thread within its block (0..3); also the row it processes.
const uint blockThreadId = gl_LocalInvocationID.x;
47+
48+
// Global y/z invocation IDs are block coordinates; x4 gives the block's first pixel.
const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;
49+
50+
for (uint i = 0u; i < 4u; ++i) {
51+
// Column i of row blockThreadId inside the block.
const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i, blockThreadId);
52+
53+
const float4 value = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyzw;
54+
// Channel select: 0 -> x, 1 -> y, any other index -> w.
srcPixel[i] = params.p_channelIdx == 0 ? value.x : (params.p_channelIdx == 1 ? value.y : value.w);
55+
// Scale to the 0..255 range used by the index math below
// (assumes the sampled value is normalized to [0, 1] - TODO confirm).
srcPixel[i] *= 255.0f;
56+
}
57+
58+
// Min/max over this thread's 4 pixels.
minVal = min3(srcPixel.x, srcPixel.y, srcPixel.z);
59+
maxVal = max3(srcPixel.x, srcPixel.y, srcPixel.z);
60+
minVal = min(minVal, srcPixel.w);
61+
maxVal = max(maxVal, srcPixel.w);
62+
63+
// First of the 4 consecutive g_minMaxValues slots belonging to this block.
const uint minMaxIdxBase = (gl_LocalInvocationID.z << 4u) + (gl_LocalInvocationID.y << 2u);
64+
// The single g_mask slot belonging to this block.
const uint maskIdxBase = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y;
65+
66+
g_minMaxValues[minMaxIdxBase + blockThreadId] = float2(minVal, maxVal);
67+
// All 4 threads of the block write the same zero here, clearing the mask
// before the atomicOr merges below.
g_mask[maskIdxBase] = uint2(0u, 0u);
68+
69+
// Make the shared-memory writes above visible to the whole workgroup before reading them.
memoryBarrierShared();
70+
barrier();
71+
72+
// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
73+
for (uint i = 0u; i < 4u; ++i) {
74+
minVal = min(g_minMaxValues[minMaxIdxBase + i].x, minVal);
75+
maxVal = max(g_minMaxValues[minMaxIdxBase + i].y, maxVal);
76+
}
77+
78+
// determine bias and emit color indices
79+
// given the choice of maxVal/minVal, these indices are optimal:
80+
// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
81+
float dist = maxVal - minVal;
82+
float dist4 = dist * 4.0f;
83+
float dist2 = dist * 2.0f;
84+
float bias = (dist < 8) ? (dist - 1) : (trunc(dist * 0.5f) + 2);
85+
bias -= minVal * 7;
86+
87+
// This thread's contribution to the low (mask0) and high (mask1) 32 bits of the block.
uint mask0 = 0u, mask1 = 0u;
88+
89+
for (uint i = 0u; i < 4u; ++i) {
90+
float a = srcPixel[i] * 7.0f + bias;
91+
92+
int ind = 0;
93+
94+
// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
95+
// The three tests below build the 3-bit value one bit at a time (4, then 2, then 1).
if (a >= dist4) {
96+
ind = 4;
97+
a -= dist4;
98+
}
99+
100+
if (a >= dist2) {
101+
ind += 2;
102+
a -= dist2;
103+
}
104+
105+
if (a >= dist)
106+
ind += 1;
107+
108+
// turn linear scale into DXT index (0/1 are extremal pts)
109+
// e.g. linear 7 (max) -> (-7 & 7) = 1 -> 1 ^ 1 = 0, and linear 0 (min) -> 0 -> 0 ^ 1 = 1,
// which matches the endpoint order written below (maxVal first, minVal second).
ind = -ind & 7;
110+
ind ^= (2 > ind) ? 1 : 0;
111+
112+
// write index
113+
// Bit offset inside the 64-bit block: the first 16 bits are reserved for the two
// endpoints, then 3 bits per pixel in row-major order (pixel = row * 4 + column).
const uint bits = 16u + ((blockThreadId << 2u) + i) * 3u;
114+
if (bits < 32u) {
115+
mask0 |= uint(ind) << bits;
116+
// An index starting at bit 31 straddles the 32-bit boundary; its upper bits go to mask1.
if (bits + 3u > 32u) {
117+
mask1 |= uint(ind) >> (32u - bits);
118+
}
119+
} else {
120+
mask1 |= uint(ind) << (bits - 32u);
121+
}
122+
}
123+
124+
// Merge this thread's bits into the block's shared mask; skipped when there is nothing to set.
if (mask0 != 0u)
125+
atomicOr(g_mask[maskIdxBase].x, mask0);
126+
if (mask1 != 0u)
127+
atomicOr(g_mask[maskIdxBase].y, mask1);
128+
129+
// Wait until all 4 threads of the block have merged their index bits.
memoryBarrierShared();
130+
barrier();
131+
132+
// Only one thread per block writes the result.
if (blockThreadId == 0u) {
133+
// Save data
134+
uint2 outputBytes;
135+
136+
// Pack the endpoints into the low two bytes (maxVal in byte 0, minVal in byte 1);
// the two zero components leave the upper 16 bits clear for the index bits.
#ifdef SNORM
137+
// Signed variant: remap the 0..255 endpoints to [-1, 1] before packing.
outputBytes.x =
138+
packSnorm4x8(float4(maxVal * (1.0f / 255.0f) * 2.0f - 1.0f,
139+
minVal * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f));
140+
#else
141+
// Unsigned variant: rescale the 0..255 endpoints back to [0, 1] before packing.
outputBytes.x = packUnorm4x8(
142+
float4(maxVal * (1.0f / 255.0f), minVal * (1.0f / 255.0f), 0.0f, 0.0f));
143+
#endif
144+
145+
outputBytes.x |= g_mask[maskIdxBase].x;
146+
outputBytes.y = g_mask[maskIdxBase].y;
147+
148+
// One destination texel per block, addressed by the block coordinates.
uint2 dstUV = gl_GlobalInvocationID.yz;
149+
imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
150+
}
151+
}

modules/betsy/image_compress_betsy.cpp

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "betsy_bc1.h"
3636

3737
#include "bc1.glsl.gen.h"
38+
#include "bc4.glsl.gen.h"
3839
#include "bc6h.glsl.gen.h"
3940

4041
static Mutex betsy_mutex;
@@ -165,6 +166,10 @@ static String get_shader_name(BetsyFormat p_format) {
165166
case BETSY_FORMAT_BC3:
166167
return "BC3";
167168

169+
case BETSY_FORMAT_BC4_SIGNED:
170+
case BETSY_FORMAT_BC4_UNSIGNED:
171+
return "BC4";
172+
168173
case BETSY_FORMAT_BC6_SIGNED:
169174
case BETSY_FORMAT_BC6_UNSIGNED:
170175
return "BC6";
@@ -202,6 +207,12 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
202207
dest_format = Image::FORMAT_DXT1;
203208
break;
204209

210+
case BETSY_FORMAT_BC4_UNSIGNED:
211+
version = "unsigned";
212+
dst_rd_format = RD::DATA_FORMAT_R32G32_UINT;
213+
dest_format = Image::FORMAT_RGTC_R;
214+
break;
215+
205216
case BETSY_FORMAT_BC6_SIGNED:
206217
version = "signed";
207218
dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT;
@@ -235,8 +246,13 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
235246
err = source->parse_versions_from_text(bc1_shader_glsl);
236247
break;
237248

238-
case BETSY_FORMAT_BC6_UNSIGNED:
249+
case BETSY_FORMAT_BC4_SIGNED:
250+
case BETSY_FORMAT_BC4_UNSIGNED:
251+
err = source->parse_versions_from_text(bc4_shader_glsl);
252+
break;
253+
239254
case BETSY_FORMAT_BC6_SIGNED:
255+
case BETSY_FORMAT_BC6_UNSIGNED:
240256
err = source->parse_versions_from_text(bc6h_shader_glsl);
241257
break;
242258

@@ -430,26 +446,45 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
430446
compress_rd->compute_list_bind_compute_pipeline(compute_list, shader.pipeline);
431447
compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
432448

433-
if (dest_format == Image::FORMAT_BPTC_RGBFU || dest_format == Image::FORMAT_BPTC_RGBF) {
434-
BC6PushConstant push_constant;
435-
push_constant.sizeX = 1.0f / width;
436-
push_constant.sizeY = 1.0f / height;
437-
push_constant.padding[0] = 0;
438-
push_constant.padding[1] = 0;
439-
440-
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
441-
442-
} else {
443-
BC1PushConstant push_constant;
444-
push_constant.num_refines = 2;
445-
push_constant.padding[0] = 0;
446-
push_constant.padding[1] = 0;
447-
push_constant.padding[2] = 0;
448-
449-
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
449+
switch (dest_format) {
450+
case Image::FORMAT_BPTC_RGBFU:
451+
case Image::FORMAT_BPTC_RGBF: {
452+
BC6PushConstant push_constant;
453+
push_constant.sizeX = 1.0f / width;
454+
push_constant.sizeY = 1.0f / height;
455+
push_constant.padding[0] = 0;
456+
push_constant.padding[1] = 0;
457+
458+
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
459+
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
460+
} break;
461+
462+
case Image::FORMAT_DXT1: {
463+
BC1PushConstant push_constant;
464+
push_constant.num_refines = 2;
465+
push_constant.padding[0] = 0;
466+
push_constant.padding[1] = 0;
467+
push_constant.padding[2] = 0;
468+
469+
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
470+
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
471+
} break;
472+
473+
case Image::FORMAT_RGTC_R: {
474+
BC4PushConstant push_constant;
475+
push_constant.channel_idx = 0;
476+
push_constant.padding[0] = 0;
477+
push_constant.padding[1] = 0;
478+
push_constant.padding[2] = 0;
479+
480+
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant));
481+
compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16);
482+
} break;
483+
484+
default: {
485+
} break;
450486
}
451487

452-
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
453488
compress_rd->compute_list_end();
454489

455490
compress_rd->submit();
@@ -511,13 +546,14 @@ Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) {
511546

512547
switch (p_channels) {
513548
case Image::USED_CHANNELS_RGB:
514-
result = betsy->compress(BETSY_FORMAT_BC1_DITHER, r_img);
515-
break;
516-
517549
case Image::USED_CHANNELS_L:
518550
result = betsy->compress(BETSY_FORMAT_BC1, r_img);
519551
break;
520552

553+
case Image::USED_CHANNELS_R:
554+
result = betsy->compress(BETSY_FORMAT_BC4_UNSIGNED, r_img);
555+
break;
556+
521557
default:
522558
break;
523559
}

modules/betsy/image_compress_betsy.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ enum BetsyFormat {
5050
BETSY_FORMAT_BC1,
5151
BETSY_FORMAT_BC1_DITHER,
5252
BETSY_FORMAT_BC3,
53+
BETSY_FORMAT_BC4_SIGNED,
54+
BETSY_FORMAT_BC4_UNSIGNED,
5355
BETSY_FORMAT_BC6_SIGNED,
5456
BETSY_FORMAT_BC6_UNSIGNED,
5557
};
@@ -65,6 +67,11 @@ struct BC1PushConstant {
6567
uint32_t padding[3];
6668
};
6769

70+
// Push-constant payload for the BC4 compute shader. The layout mirrors the
// shader's `Params` block in bc4.glsl (one uint followed by three uints of padding).
struct BC4PushConstant {
71+
// Source channel to compress; bc4.glsl maps 0 -> x, 1 -> y, any other value -> w.
uint32_t channel_idx;
72+
// Unused by the shader; pads the struct to 16 bytes.
uint32_t padding[3];
73+
};
74+
6875
void free_device();
6976

7077
Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels);

thirdparty/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ fix build with our own copy of zstd (patch in `patches`).
7878

7979
Files extracted from upstream source:
8080

81-
- `bc6h.glsl`, `bc1.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`.
81+
- `bc6h.glsl`, `bc1.glsl`, `bc4.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`.
8282
- `LICENSE.md`
8383

8484

0 commit comments

Comments
 (0)