Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions src/pal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ const LIQ_WEIGHT_MSE: f64 = 0.45;
/// ARGB layout is important for x86 SIMD.
/// I've created the newtype wrapper to try a 16-byte alignment, but it didn't improve perf :(
#[cfg_attr(
any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64")),
any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"), all(target_arch = "wasm32", target_feature = "simd128")),
repr(C, align(16))
)]
#[derive(Debug, Copy, Clone, Default, PartialEq)]
#[allow(non_camel_case_types)]
pub struct f_pixel(pub ARGBF);

impl f_pixel {
#[cfg(not(any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"))))]
#[cfg(not(any(target_arch = "x86_64", all(target_feature = "neon", target_arch = "aarch64"), all(target_arch = "wasm32", target_feature = "simd128"))))]
#[inline(always)]
pub fn diff(&self, other: &f_pixel) -> f32 {
let alphas = other.0.a - self.0.a;
Expand All @@ -51,6 +51,31 @@ impl f_pixel {
(black.b * black.b).max(white.b * white.b)
}

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline(always)]
pub fn diff(&self, other: &f_pixel) -> f32 {
use core::arch::wasm32::*;

let px = f32x4(self.0.a, self.0.r, self.0.g, self.0.b);
let py = f32x4(other.0.a, other.0.r, other.0.g, other.0.b);

// y.a - x.a, then broadcast lane 0 to all four
let alpha_diff = f32x4_sub(py, px);
let alphas = f32x4_splat(f32x4_extract_lane::<0>(alpha_diff));

let mut onblack = f32x4_sub(px, py); // x - y
let mut onwhite = f32x4_add(onblack, alphas); // x - y + (y.a - x.a)

onblack = f32x4_mul(onblack, onblack);
onwhite = f32x4_mul(onwhite, onwhite);
let max = f32x4_max(onwhite, onblack);

// add rgb (lanes 1,2,3), not a (lane 0)
f32x4_extract_lane::<1>(max)
+ f32x4_extract_lane::<2>(max)
+ f32x4_extract_lane::<3>(max)
}

#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
#[inline(always)]
pub fn diff(&self, other: &Self) -> f32 {
Expand Down
Loading