//
// ESP32-S3 SIMD optimized code
// Written by Larry Bank
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
// Syntax: GNU as for Xtensa, passed through the C preprocessor.
// ABI:    Xtensa windowed ABI (entry / retw).
//
#ifdef ARDUINO_ARCH_ESP32

#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)

    .text
    .align 4

//
// Inverse DCT dequantization for JPEG decompression
//
// Call as void s3_dequant(int16_t *pMCU, int16_t *pQuant);
//
// In:     a2 = pMCU   - 64 int16 coefficients (8 rows x 8), overwritten
//                       in place with coefficient * quantization value
//         a3 = pQuant - 64 int16 quantization values (only read)
// Out:    nothing (void)
// Clobb:  a2, a3, a4, SAR, q0-q7
// Stack:  16 byte frame reserved by entry, otherwise unused
//
// NOTE(review): the 128-bit vector loads/stores are expected to need both
// buffers on a 16 byte boundary - confirm against the ESP32-S3 TRM and
// the callers.
//
    .global s3_dequant
    .type   s3_dequant,@function

s3_dequant:
// The .frequency directive below is left disabled; its purpose is unknown.
//  .frequency 1.000 0.000
    entry           a1,16

    // The vector multiply shifts every product right by SAR, so SAR has
    // to be 0 to get the plain coefficient * quant product.
    movi.n          a4,0            # a4 = 0 (shift amount)
    wsr.sar         a4              # SAR = 0 (must be a4, not the MCU pointer)

    // ---- rows 0-3 ----
    ee.vld.128.ip   q0,a2,16        # load MCU rows 0-3 into Q0,Q1,Q2,Q3
    ee.vld.128.ip   q4,a3,16        # load quantization rows 0-3 into Q4,Q5,Q6,Q7
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16

    ee.vmul.s16     q0,q0,q4        # de-quantize each row
    ee.vmul.s16     q1,q1,q5
    ee.vmul.s16     q2,q2,q6
    ee.vmul.s16     q3,q3,q7

    // The four post-incremented loads left a2 64 bytes past row 0, so
    // rewind it to store the results over the rows just read.
    addi            a2,a2,-64       # a2 = address of row 0
    ee.vst.128.ip   q0,a2,16        # write back dequantized rows 0-3
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16        # a2 now points at row 4

    // ---- rows 4-7 ----
    ee.vld.128.ip   q0,a2,16        # load MCU rows 4-7 into Q0,Q1,Q2,Q3
    ee.vld.128.ip   q4,a3,16        # load quantization rows 4-7 into Q4,Q5,Q6,Q7
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16

    ee.vmul.s16     q0,q0,q4        # de-quantize rows 4-7
    ee.vmul.s16     q1,q1,q5
    ee.vmul.s16     q2,q2,q6
    ee.vmul.s16     q3,q3,q7

    addi            a2,a2,-64       # rewind a2 to the address of row 4
    ee.vst.128.ip   q0,a2,16        # write back dequantized rows 4-7
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16

    retw.n                          # done

    .size   s3_dequant, . - s3_dequant

#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ESP32