| Summary: | Using WASM SIMD on https://squoosh.app breaks WebP decoding | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Product: | WebKit | Reporter: | Jake Archibald <jaffathecake> | ||||||||||
| Component: | WebAssembly | Assignee: | Justin Michaud <justin_michaud> | ||||||||||
| Status: | RESOLVED FIXED | ||||||||||||
| Severity: | Normal | CC: | d_degazio, justin_michaud, karlcow, mark.lam, webkit-bug-importer, ysuzuki | ||||||||||
| Priority: | P2 | Keywords: | BrowserCompat, InRadar | ||||||||||
| Version: | Safari Technology Preview | ||||||||||||
| Hardware: | Unspecified | ||||||||||||
| OS: | Unspecified | ||||||||||||
| Attachments: |
|
||||||||||||
|
Description
Jake Archibald
2023-03-03 04:30:13 PST
Created attachment 465279 [details]
Chrome Screenshot
I could throw together a demo with just the codec wasm if that would help. LMK 1. Please do provide a reduced test case if at all possible. 2. Please point out what is actually wrong. I can see some rendering artifacts in the images you attached but that doesn't tell me what is actually expected. Is the rendering supposed to ice exact and 100% reproducible every time? Or is there any randomness? This is where a reduced test case to show where values are not expected would be helpful instead of a large image with uncertainty as to what is actually wrong. In the meantime: yes, the output is expected to be the same each time. The output is identical between Chrome and Firefox. Only Safari produces the artifacts on the image, and only if SIMD is used. https://safari-simd-bug.glitch.me/without-simd.html - no SIMD. Behaves as expected in Safari, Chrome, Firefox. https://safari-simd-bug.glitch.me/with-simd.html - with SIMD. Behaves as expected in Chrome, Firefox. Fail as expected in stable Safari, due to lack of SIMD support. Produces image with weird artifacts in Safari TP. I know this isn't totally reduced, but it's the best I've got :) Created attachment 465324 [details]
cli version
Safari compiles this simplified test case incorrectly:
(func $main (export "main") (result i32)
v128.const i64x2 -27866447905751188 -27866447902605412
v128.const i32x4 0x00080008 0x00080008 0x00080008 0x00080008
i32x4.dot_i16x8_s
i64x2.extract_lane 0
i64.const -6867652708672
i64.eq
)
Thank you so much for this test case!
Let me explain how I ended up debugging this, just for future reference for any future WebKit contributors.
First, I extracted a CLI version (attached above). Then, I ran binaryen to instrument the code:
```
bin/wasm-opt ~/Desktop/case/webp_enc_simd.wasm -o ~/Desktop/case/webp_enc_simd-instrumented.wasm --enable-simd --disable-gc --log-execution --instrument-memory
```
And the following environment:
```
let START_DEBUG = 3200
let count = 0;
function createWasm() {
var info = { a: asmLibraryArg };
let memPrint = function(...args) {
if (count >= START_DEBUG)
print(...args)
}
let env = {
log_i32: function(x) {
memPrint('[LoggingExternalInterface logging ' + literal(x, 'i32') + ']');
},
log_i64: function(x, h) {
memPrint('[LoggingExternalInterface logging ' + literal(x, 'i32') + ' ' + literal(h, 'i32') + ']');
},
log_f32: function(x) {
memPrint('[LoggingExternalInterface logging ' + literal(x, 'f64') + ']');
},
log_f64: function(x) {
memPrint('[LoggingExternalInterface logging ' + literal(x, 'f64') + ']');
},
my_log_i32: function(i) {
print("My log: " + i)
return i
},
my_log_i64: function(i) {
print("My log 64: " + i)
return i
},
log_execution: function(loc) {
if (count >= START_DEBUG)
print('log_execution ' + loc + " count: " + count);
if (loc < 133713370)
++count;
if (count == 3230+1) {
print("******* " + loc)
$vm.breakpoint()
}
},
setTempRet0: function(x) {
tempRet0 = x;
},
getTempRet0: function() {
return x;
},
get_i32: function(loc, index, value) {
memPrint('get_i32 ' + [loc, index, value]);
return value;
},
get_i64: function(loc, index, low, high) {
memPrint('get_i64 ' + [loc, index, low, high]);
env['setTempRet0'](high);
return low;
},
get_f32: function(loc, index, value) {
memPrint('get_f32 ' + [loc, index, value]);
return value;
},
get_f64: function(loc, index, value) {
memPrint('get_f64 ' + [loc, index, value]);
return value;
},
set_i32: function(loc, index, value) {
memPrint('set_i32 ' + [loc, index, value]);
return value;
},
set_i64: function(loc, index, low, high) {
memPrint('set_i64 ' + [loc, index, low, high]);
env['setTempRet0'](high);
return low;
},
set_f32: function(loc, index, value) {
memPrint('set_f32 ' + [loc, index, value]);
return value;
},
set_f64: function(loc, index, value) {
memPrint('set_f64 ' + [loc, index, value]);
return value;
},
load_ptr: function(loc, bytes, offset, ptr) {
memPrint('load_ptr ' + [loc, bytes, offset, ptr]);
if (count >= START_DEBUG) {
print('*load_ptr ' + [loc, bytes, offset, ptr]);
let arr = new Uint32Array(wasmMemory.buffer)
for(let i = 0; i < 4; ++i)
print("Val: " + arr[(ptr + offset + i * 4) / 4])
}
return ptr;
},
load_val_i32: function(loc, value) {
memPrint('load_val_i32 ' + [loc, value]);
return value;
},
load_val_i64: function(loc, low, high) {
memPrint('load_val_i64 ' + [loc, low, high]);
env['setTempRet0'](high);
return low;
},
load_val_f32: function(loc, value) {
memPrint('load_val_f32 ' + [loc, value]);
return value;
},
load_val_f64: function(loc, value) {
memPrint('load_val_f64 ' + [loc, value]);
return value;
},
store_ptr: function(loc, bytes, offset, ptr) {
memPrint('store_ptr ' + [loc, bytes, offset, ptr]);
if (count >= START_DEBUG) {
print('*store_ptr ' + [loc, bytes, offset, ptr]);
let arr = new Uint32Array(wasmMemory.buffer)
for(let i = 0; i < 4; ++i)
print("Val: " + arr[(ptr + offset + i * 4) / 4])
}
if (loc == 12554 && count >= 3230) {
print("*********** end")
$vm.breakpoint()
}
return ptr;
},
store_val_i32: function(loc, value) {
memPrint('store_val_i32 ' + [loc, value]);
if (loc == 12552 && count >= 3230) {
print("*********** start")
// $vm.breakpoint()
}
return value;
},
store_val_i64: function(loc, low, high) {
memPrint('store_val_i64 ' + [loc, low, high]);
env['setTempRet0'](high);
return low;
},
store_val_f32: function(loc, value) {
memPrint('store_val_f32 ' + [loc, value]);
return value;
},
store_val_f64: function(loc, value) {
memPrint('store_val_f64 ' + [loc, value]);
return value;
},
struct_get_val_i32: function(loc, value) {
memPrint('struct_get_val_i32 ' + [loc, value]);
return value;
},
struct_get_val_i64: function(loc, value) {
memPrint('struct_get_val_i64 ' + [loc, value]);
return value;
},
struct_get_val_f32: function(loc, value) {
memPrint('struct_get_val_f32 ' + [loc, value]);
return value;
},
struct_get_val_f64: function(loc, value) {
memPrint('struct_get_val_f64 ' + [loc, value]);
return value;
},
struct_set_val_i32: function(loc, value) {
memPrint('struct_set_val_i32 ' + [loc, value]);
return value;
},
struct_set_val_i64: function(loc, value) {
memPrint('struct_set_val_i64 ' + [loc, value]);
return value;
},
struct_set_val_f32: function(loc, value) {
memPrint('struct_set_val_f32 ' + [loc, value]);
return value;
},
struct_set_val_f64: function(loc, value) {
memPrint('struct_set_val_f64 ' + [loc, value]);
return value;
},
array_get_val_i32: function(loc, value) {
memPrint('array_get_val_i32 ' + [loc, value]);
return value;
},
array_get_val_i64: function(loc, value) {
memPrint('array_get_val_i64 ' + [loc, value]);
return value;
},
array_get_val_f32: function(loc, value) {
memPrint('array_get_val_f32 ' + [loc, value]);
return value;
},
array_get_val_f64: function(loc, value) {
memPrint('array_get_val_f64 ' + [loc, value]);
return value;
},
array_set_val_i32: function(loc, value) {
memPrint('array_set_val_i32 ' + [loc, value]);
return value;
},
array_set_val_i64: function(loc, value) {
memPrint('array_set_val_i64 ' + [loc, value]);
return value;
},
array_set_val_f32: function(loc, value) {
memPrint('array_set_val_f32 ' + [loc, value]);
return value;
},
array_set_val_f64: function(loc, value) {
memPrint('array_set_val_f64 ' + [loc, value]);
return value;
},
array_get_index: function(loc, value) {
memPrint('array_get_index ' + [loc, value]);
return value;
},
array_set_index: function(loc, value) {
memPrint('array_set_index ' + [loc, value]);
return value;
},
};
info["env"] = env;
info["a"]["log_execution"] = env.log_execution
```
I ran it in both v8 and chrome:
```
/Volumes/WebKit/wabt/bin/wasm2wat --inline-exports --inline-imports --generate-names --enable-annotations --enable-exceptions --enable-threads --enable-code-metadata webp_enc_simd-instrumented.wasm -o webp_enc_simd-instrumented.wat
[ add any extra debug calls ]
wat2wasm webp_enc_simd-instrumented.wat -o webp_enc_simd-instrumented-2.wasm
jsc -m with-simd.js --useConcurrentJIT=0 --useOMGJIT=0 --useDollarVM=1 --dumpDisassembly=0 > ./jsc-cfg.txt
v8 --single-threaded --module with-simd.js > ./v8-cfg.txt
```
Then I used beyond compare to diff the output.
Another useful trick to see where you are is to add a $vm.breakpoint call, and then catch it in lldb from the jsc shell.
Finally, I added my own debug call to narrow in on the precise divergence in execution.
I have attached my working setup in case I need to debug a similar issue in the future.
Created attachment 465330 [details]
debugging
Pull request: https://github.com/WebKit/WebKit/pull/11153 Committed 261326@main (03e6b1ff539c): <https://commits.webkit.org/261326@main> Reviewed commits have been landed. Closing PR #11153 and removing active labels. |