vertexfilter: Implement a floating-point exponent filter

In some cases we can't quantize the floating-point data because the range of the data is unknown. While it's possible to use meshopt_quantizeFloat to reduce the precision and gain some compression back, this is often insufficient and suboptimal. For inputs that represent a vector in 3D space, such as a position or scale, a good alternative is to use a shared-exponent encoding - it's a reasonable assumption that we are content with the same (absolute) precision in all three components. To be able to encode with a shared exponent, we use a modified floating-point-like format, where we store a 24-bit signed integer mantissa (without the implicit 1) and an 8-bit exponent. This is less precise than a floating-point number - we lose 1 bit - but we gain the ability to individually select the exponent and mantissa at any desired level of mantissa precision. Additionally, this moves the exponent into a single byte and stores the mantissa as a two's-complement integer - both of these are much friendlier for the vertex codec than a basic float encoding. While ideally the shared exponent would be stored just once, this complicates the SIMD decoding and is actually redundant if the output of the filter is compressed with the vertex encoder *and* a general-purpose LZ, because the stream of exponent bytes will be exactly the same between all three components. The resulting decoder runs at ~13 GB/s using WASM SIMD and ~2.5 GB/s using scalar WASM.
zeux · Mar 31, 2020 · 8c9bb5a · 8c9bb5a
1 parent adb3b47
commit 8c9bb5a
Show file tree

Hide file tree

Showing 7 changed files with 118 additions and 6 deletions.
diff --git a/Makefile b/Makefile
@@ -26,7 +26,7 @@ LDFLAGS=
 WASM_SOURCES=src/vertexcodec.cpp src/indexcodec.cpp src/vertexfilter.cpp
 WASM_EXPORTS="__start","_sbrk"
 WASM_EXPORTS+=,"_meshopt_decodeVertexBuffer","_meshopt_decodeIndexBuffer"
-WASM_EXPORTS+=,"_meshopt_decodeFilterOct","_meshopt_decodeFilterQuat"
+WASM_EXPORTS+=,"_meshopt_decodeFilterOct","_meshopt_decodeFilterQuat","_meshopt_decodeFilterExp"
 WASM_FLAGS=-O3 -DNDEBUG -s EXPORTED_FUNCTIONS='[$(WASM_EXPORTS)]' -s ALLOW_MEMORY_GROWTH=1 -s TOTAL_STACK=24576 -s TOTAL_MEMORY=65536
 
 ifeq ($(config),iphone)

diff --git a/demo/tests.cpp b/demo/tests.cpp
@@ -429,6 +429,27 @@ static void decodeFilterQuat12()
 	assert(memcmp(data, expected, sizeof(data)) == 0);
 }
 
+static void decodeFilterExp() // checks meshopt_decodeFilterExp against four hand-computed 32-bit words
+{
+	unsigned int data[4] = {
+	    0, // all-zero word; expected to decode to 0
+	    0xff000003, // top byte 0xff, low 24 bits 3; expected 0x3fc00000 (bits of 1.5f)
+	    0x02fffff7, // top byte 0x02, low 24 bits 0xfffff7; expected 0xc2100000 (bits of -36.0f)
+	    0xfe7fffff, // clang-format :-/
+	};
+
+	meshopt_decodeFilterExp(data, 4, 4); // vertex_count = 4, vertex_size = 4 bytes; data is overwritten with the result
+
+	const unsigned int expected[4] = {
+	    0,
+	    0x3fc00000,
+	    0xc2100000,
+	    0x49fffffe, // clang-format :-/
+	};
+
+	assert(memcmp(data, expected, sizeof(data)) == 0); // bit-exact comparison of the decoded words
+}
+
 static void clusterBoundsDegenerate()
 {
 	const float vbd[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
@@ -584,6 +605,7 @@ static void runTestsOnce()
 	decodeFilterOct8();
 	decodeFilterOct12();
 	decodeFilterQuat12();
+	decodeFilterExp();
 
 	clusterBoundsDegenerate();
 

diff --git a/js/meshopt_decoder.js b/js/meshopt_decoder.js
diff --git a/js/meshopt_decoder.test.js b/js/meshopt_decoder.test.js
@@ -142,6 +142,27 @@ var tests = {
 
 		assert.deepStrictEqual(result, expected);
 	},
+
+	decodeFilterExp: function() { // decodes a small encoded buffer with filter 3 and compares the raw 32-bit output words
+		var encoded = new Uint8Array([
+			0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // NOTE(review): leading 0xa0 is presumably the vertex stream header - confirm
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff, 0x02, 0xff, 0xff, 0x7f,
+			0xfe, // the final 16 bytes spell the words 0, 0xff000003, 0x02fffff7, 0xfe7fffff in little-endian order
+		]);
+
+		var expected = new Uint32Array([
+			0,
+			0x3fc00000,
+			0xc2100000,
+			0x49fffffe,
+		]);
+
+		var result = new Uint32Array(expected.length); // output storage; handed to the decoder below as a byte view
+		decoder.decodeVertexBuffer(new Uint8Array(result.buffer), 1, 16, encoded, /* filter= */ 3); // NOTE(review): looks like (target, count = 1, size = 16, source, filter) - confirm against meshopt_decoder.js
+
+		assert.deepStrictEqual(result, expected);
+	},
 };
 
 decoder.ready.then(() => {

diff --git a/src/meshoptimizer.h b/src/meshoptimizer.h
@@ -219,9 +219,13 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte
  *
  * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
  * Each component is stored as an 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
  */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size);
 
 /**
  * Experimental: Mesh simplifier

diff --git a/src/vertexfilter.cpp b/src/vertexfilter.cpp
@@ -96,6 +96,29 @@ static void decodeFilterQuat(short* data, size_t count)
 		data[i * 4 + order[qc][3]] = short(wf);
 	}
 }
+
+static void decodeFilterExp(unsigned int* data, size_t count) // scalar path: rewrites each of the `count` packed words in place with float bits of 2^e * m
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int v = data[i];
+
+		// decode mantissa (low 24 bits) and exponent (top 8 bits); both are sign-extended by shifting through int
+		int m = int(v << 8) >> 8;
+		int e = int(v) >> 24; // not char(v >> 24): plain char may be unsigned, which would map negative exponents to 128..255
+
+		union {
+			float f;
+			unsigned int ui;
+		} u;
+
+		// optimized version of ldexp(float(m), e): write 2^e as float bits (biased exponent e + 127, zero fraction), then scale by m
+		u.ui = unsigned(e + 127) << 23;
+		u.f = u.f * float(m);
+
+		data[i] = u.ui;
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
@@ -277,6 +300,26 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		out[3] = __builtin_rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
 	}
 }
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count) // SIMD path: decodes four packed 32-bit words per iteration, in place
+{
+	for (size_t i = 0; i < count; i += 4) // each step touches data[i..i+3], so count is expected to be a multiple of 4
+	{
+		v128_t v = wasm_v128_load(&data[i]);
+
+		// decode exponent into 2^x directly
+		v128_t ef = wasm_i32x4_shr(v, 24); // top byte of each lane; NOTE(review): presumably an arithmetic (sign-extending) shift - confirm in wasm_simd128.h
+		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23); // add 127 and move to bit 23; the lanes are later used as float operands
+
+		// decode 24-bit mantissa into floating-point value
+		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8); // shift the top byte out, then shift back down by the same amount
+		v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+		v128_t r = wasm_f32x4_mul(es, m); // per-lane product of the exponent-derived scale and the converted mantissa
+
+		wasm_v128_store(&data[i], r);
+	}
+}
 #endif
 
 } // namespace meshopt
@@ -316,4 +359,18 @@ void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_s
 #endif
 }
 
+void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size) // public entry point: decodes buffer in place as an array of 32-bit words
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // keeps the total word count a multiple of 4, which the 4-wide SIMD loop steps by
+	assert(vertex_size % 4 == 0); // each vertex must consist of whole 32-bit words
+
+#if defined(SIMD_WASM)
+	decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); // second argument is the total number of 32-bit words
+#else
+	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); // second argument is the total number of 32-bit words
+#endif
+}
+
 #undef SIMD_WASM
diff --git a/tools/codecbench.cpp b/tools/codecbench.cpp
@@ -115,12 +115,17 @@ void benchFilters(size_t count)
 
 		double t3 = timestamp();
 
+		meshopt_decodeFilterExp(&d8[0], count4, 8);
+
+		double t4 = timestamp();
+
 		double GB = 1024 * 1024 * 1024;
 
-		printf("filter: oct8 %.2f ms (%.2f GB/sec), oct12 %.2f ms (%.2f GB/sec), quat12 %.2f ms (%.2f GB/sec)\n",
+		printf("filter: oct8 %.2f ms (%.2f GB/sec), oct12 %.2f ms (%.2f GB/sec), quat12 %.2f ms (%.2f GB/sec), exp %.2f ms (%.2f GB/sec)\n",
 			(t1 - t0) * 1000, double(d4.size()) / GB / (t1 - t0),
 			(t2 - t1) * 1000, double(d8.size()) / GB / (t2 - t1),
-			(t3 - t2) * 1000, double(d8.size()) / GB / (t3 - t2));
+			(t3 - t2) * 1000, double(d8.size()) / GB / (t3 - t2),
+			(t4 - t3) * 1000, double(d8.size()) / GB / (t4 - t3));
 	}
 }