Skip to content

Commit

Permalink
vertexfilter: Remove redundant WASM instructions
Browse files Browse the repository at this point in the history
After a fast float->int conversion for signed numbers, we were adjusting
the result to get the byte representation.

First, for values in the range where the conversion works (-2^22..2^22), we
already know the resulting exponent, so we can subtract 0x4B40_0000
without masking first.

Second, because we only need low 8/16 bits anyway, we can skip that as
well.

LLVM was already eliding the `and` + `sub` for us, since it knows we only
need the low 8/16 bits, so this doesn't affect performance/codegen.
  • Loading branch information
zeux committed Feb 22, 2020
1 parent 1cd4ee8 commit 5281361
Showing 1 changed file with 13 additions and 16 deletions.
29 changes: 13 additions & 16 deletions src/vertexfilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,12 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), l);

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
const v128_t fmask = wasm_i32x4_splat(0x7fffff);
const v128_t fbase = wasm_i32x4_splat(0x400000);

v128_t xr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap), fmask), fbase);
v128_t yr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap), fmask), fbase);
v128_t zr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap), fmask), fbase);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);

// combine xr/yr/zr into final value
v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
Expand Down Expand Up @@ -180,13 +179,12 @@ static void decodeFilterOctSimd(short* data, size_t count)
v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), l);

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
const v128_t fmask = wasm_i32x4_splat(0x7fffff);
const v128_t fbase = wasm_i32x4_splat(0x400000);

v128_t xr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap), fmask), fbase);
v128_t yr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap), fmask), fbase);
v128_t zr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap), fmask), fbase);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);

// mix x/z and y/0 to make 16-bit unpack easier
v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
Expand Down Expand Up @@ -235,14 +233,13 @@ static void decodeFilterQuatSimd(short* data, size_t count)
v128_t s = wasm_f32x4_splat(32767.f);

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
const v128_t fmask = wasm_i32x4_splat(0x7fffff);
const v128_t fbase = wasm_i32x4_splat(0x400000);

v128_t xr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap), fmask), fbase);
v128_t yr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap), fmask), fbase);
v128_t zr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap), fmask), fbase);
v128_t wr = wasm_i32x4_sub(wasm_v128_and(wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap), fmask), fbase);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);

// mix x/z and w/y to make 16-bit unpack easier
v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
Expand Down

0 comments on commit 5281361

Please sign in to comment.