Tasmota/lib/libesp32/JPEGDEC/src/s3_simd_idct.S_FUTURE
2025-09-07 20:16:53 +02:00

116 lines
4.5 KiB
Plaintext

//
// ESP32-S3 SIMD optimized code
// Written by Larry Bank
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
#ifdef ARDUINO_ESP32S3_DEV
#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)
.text
.align 4
//
// Inverse DCT dequantization for JPEG decompression
// A2 A3 A4 A5
// Call as void s3_idct(int16_t *pMCU, int16_t *pQuant, const int16_t *pConst, uint16_t u16MCUFlags);
.global s3_idct
.type s3_idct,@function
//
// Register roles (arguments arrive in a2..a5, see the prototype comment):
//   a2 = pMCU        coefficient rows, 16 bytes (8 x int16) per row
//   a3 = pQuant      quantization rows, same layout as pMCU
//   a4 = pConst      table of 16-byte constant vectors (sparse path only)
//   a5 = u16MCUFlags bit 13 (0x2000) set => lower rows are populated
//   a6 = scratch used to load SAR
//   a9 = walking copy of the stack pointer for spilling Q registers
//
// Current state: the code only de-quantizes the 8 rows in place. The
// sparse-path column math is work in progress and its results are not
// stored anywhere yet.
//
// NOTE(review): the 128-bit EE loads/stores are assumed to need 16-byte
// aligned pMCU, pQuant, pConst and stack pointer - confirm with callers.
//
s3_idct:
    // no idea what this frequency keyword does
    // .frequency 1.000 0.000
    entry       a1,64
    mov.n       a9,a1               # keep stack ptr copy in a9
    // Xtensa has no AND-immediate instruction, so test the flag bit
    // directly: bit 13 == mask 0x2000 (lower rows populated info).
    bbsi        a5,13,.full_calc
    // Lower 4 rows are empty, this simplifies the calculations
    // columns first, even part
    ee.vld.128.ip   q0,a2,32        # load MCU row 0 into Q0
    ee.vld.128.ip   q1,a2,0         # load row 2 into Q1
    ee.vld.128.ip   q2,a3,32        # load quant row 0 into Q2
    ee.vld.128.ip   q3,a3,0         # load quant row 2 into Q3
    movi.n      a6,0                # load the shift register with 0
    wsr.sar     a6                  # put it in the SAR (shift amount register)
    ee.vmul.s16 q0,q0,q2            # de-quantize row 0
    ee.vmul.s16 q1,q1,q3            # de-quantize row 2
    ee.vld.128.ip   q4,a4,16        # load 0.414 constants into Q4
    movi.n      a6,14               # load the shift register with 14
    wsr.sar     a6                  # put it in the SAR (shift amount register)
    ee.vmul.s16 q6,q4,q1            # temp12 = row2 * 0.414
    ee.vadds.s16    q4,q0,q1        # 0+2 tmp0
    ee.vsubs.s16    q7,q0,q1        # 0-2 tmp3
    ee.vadds.s16    q5,q0,q6        # 10+12 tmp1
    ee.vsubs.s16    q6,q0,q6        # 10-12 tmp2
    // odd part
    // a2/a3 sit on row 2 here; step back one 16-byte row. There is no
    // subtract-immediate instruction, so add a negative immediate.
    addi        a2,a2,-16           # point to row 1
    addi        a3,a3,-16           # point to quant vals for row 1
    ee.vld.128.ip   q0,a2,32        # load MCU row 1 into Q0
    ee.vld.128.ip   q1,a2,0         # load row 3 into Q1
    ee.vld.128.ip   q2,a3,32        # load quant row 1 into Q2
    ee.vld.128.ip   q3,a3,0         # load quant row 3 into Q3
    movi.n      a6,0                # load the shift register with 0
    wsr.sar     a6                  # put it in the SAR (shift amount register)
    ee.vmul.s16 q0,q0,q2            # de-quantize row 1
    ee.vmul.s16 q1,q1,q3            # de-quantize row 3
    ee.vadds.s16    q2,q0,q1        # tmp7 = tmp4+tmp5
    ee.vld.128.ip   q3,a4,16        # load 1.414 constants into Q3
    movi.n      a6,14               # load the shift register with 14
    wsr.sar     a6                  # put it in the SAR (shift amount register)
    ee.vst.128.ip   q4,a9,16        # need to stack these registers
    ee.vst.128.ip   q5,a9,16        # because 8 Q registers is not enough :(
    ee.vsubs.s16    q4,q0,q1        # tmp4-tmp5
    ee.vmul.s16 q5,q4,q3            # temp11 = (tmp4-tmp5) * 1.414
    ee.vld.128.ip   q3,a4,16        # load 1.8477 constant into Q3
    ee.vmul.s16 q4,q4,q3            # Z5 = (tmp4-tmp5) * 1.847
    ee.vld.128.ip   q3,a4,16        # load 2.613 constant into Q3
    movi.n      a6,13               # load the shift register with 13
    wsr.sar     a6                  # otherwise this constant would overflow
    ee.vmul.s16 q3,q3,q1            # tmp5 * 2.613
    // TODO: the sparse path is unfinished and still falls through into
    // the shared de-quantize code below. a2/a3 were advanced to row 3
    // (+48 bytes) above, so rewind them to row 0; otherwise the code
    // below would read and write the wrong rows.
    addi        a2,a2,-48
    addi        a3,a3,-48
.full_calc:
    // Need to do the full calculations
    ee.vld.128.ip   q0,a2,16        # load MCU rows 0-3 into Q0,Q1,Q2,Q3
    ee.vld.128.ip   q4,a3,16        # load quantization values into Q4,Q5,Q6,Q7
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16
    // SAR must be 0 for the de-quantize multiplies. Use the a6 scratch
    // register and write that same register to SAR.
    movi.n      a6,0                # load the shift register with 0
    wsr.sar     a6                  # put it in the SAR (shift amount register)
    ee.vmul.s16 q0,q0,q4            # de-quantize each row
    ee.vmul.s16 q1,q1,q5
    ee.vmul.s16 q2,q2,q6
    ee.vmul.s16 q3,q3,q7
    // The four post-incrementing loads moved a2 forward by 64 bytes, so
    // step back by 64 to overwrite the rows that were just loaded.
    addi        a2,a2,-64           # point to first row of MCUs to store dequantized values
    ee.vst.128.ip   q0,a2,16        # write back dequantized rows 0-3
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16
    // repeat for rows 4-7 (a2 and a3 now both point at row 4, SAR is still 0)
    ee.vld.128.ip   q0,a2,16        # load MCU rows 4-7 into Q0,Q1,Q2,Q3
    ee.vld.128.ip   q4,a3,16        # load quantization values into Q4,Q5,Q6,Q7
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16
    ee.vmul.s16 q0,q0,q4            # de-quantize rows 4-7
    ee.vmul.s16 q1,q1,q5
    ee.vmul.s16 q2,q2,q6
    ee.vmul.s16 q3,q3,q7
    addi        a2,a2,-64           # point back to the 4th row of MCUs
    ee.vst.128.ip   q0,a2,16        # write back dequantized rows 4-7
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16
    retw.n                          # done
.size s3_idct, .-s3_idct
#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ARDUINO_ESP32S3_DEV