116 lines
4.5 KiB
Plaintext
116 lines
4.5 KiB
Plaintext
//
|
|
// ESP32-S3 SIMD optimized code
|
|
// Written by Larry Bank
|
|
// Copyright (c) 2024 BitBank Software, Inc.
|
|
// Project started Jan 21, 2024
|
|
//
|
|
#ifdef ARDUINO_ESP32S3_DEV
|
|
|
|
#include "dsps_fft2r_platform.h"
|
|
#if (dsps_fft2r_sc16_aes3_enabled == 1)
|
|
.text
|
|
.align 4
|
|
//
|
|
// Inverse DCT dequantization for JPEG decompression
//
// Call as void s3_idct(int16_t *pMCU, int16_t *pQuant, const int16_t *pConst, uint16_t u16MCUFlags);
//
// ABI:   Xtensa windowed (entry / retw), leaf function, 64-byte frame
// In:    a2 = pMCU         8 rows of 8 x int16_t coefficients (16 bytes per row)
//        a3 = pQuant       8 rows of 8 x int16_t quantization values
//        a4 = pConst       consecutive 128-bit constant vectors, read in the order
//                          0.414, 1.414, 1.8477, 2.613
//                          (presumably fixed point scaled to match the SAR shifts of
//                          14/14/14/13 used below - TODO confirm against the caller)
//        a5 = u16MCUFlags  bit 13 (0x2000) set means the lower 4 rows hold data
// Out:   pMCU rows 0-7 are overwritten in place with coefficient * quantization value.
//        No inverse DCT result is written yet (see the NOTE in the sparse path).
// Modifies: a2-a6, a9-a11, q0-q7, SAR, and 32 bytes at the bottom of the stack frame
// Note:  ee.vmul.s16 shifts each product right by SAR, which is why SAR is
//        reprogrammed before every group of multiplies.
//
    .global s3_idct
    .type   s3_idct,@function

s3_idct:
// no idea what this frequency keyword does
//  .frequency 1.000 0.000
    entry       a1,64
    mov.n       a9,a1               # a9 = spill cursor into the stack frame
    extui       a5,a5,13,1          # a5 = bit 13 (0x2000) of the flags: lower rows populated
    bnez        a5,.Ls3_idct_full   # wide branch, the target is out of reach for bnez.n

// Lower 4 rows are empty, this simplifies the calculations
//
// NOTE(review): this path is unfinished. Nothing below stores q3-q7 or the two
// spilled vectors back to the MCU, and there is no retw before the next label,
// so execution continues into the full path. The row pointers are therefore
// walked through copies (a10/a11) to leave a2/a3 at row 0 for that fall-through.
    mov.n       a10,a2              # a10 = MCU row cursor for this path
    mov.n       a11,a3              # a11 = quant row cursor for this path

// columns first, even part
    ee.vld.128.ip q0,a10,32         # load MCU row 0 into Q0
    ee.vld.128.ip q1,a10,0          # load row 2 into Q1
    ee.vld.128.ip q2,a11,32         # load quant row 0 into Q2
    ee.vld.128.ip q3,a11,0          # load quant row 2 into Q3
    movi.n      a6,0
    wsr.sar     a6                  # SAR = 0: plain multiply for dequantization
    ee.vmul.s16 q0,q0,q2            # de-quantize row 0
    ee.vmul.s16 q1,q1,q3            # de-quantize row 2
    ee.vld.128.ip q4,a4,16          # load 0.414 constants into Q4
    movi.n      a6,14
    wsr.sar     a6                  # SAR = 14 for the constant multiply
    ee.vmul.s16 q6,q4,q1            # tmp12 = row2 * 0.414
    ee.vadds.s16 q4,q0,q1           # tmp0 = row0 + row2
    ee.vsubs.s16 q7,q0,q1           # tmp3 = row0 - row2
    ee.vadds.s16 q5,q0,q6           # tmp1 = row0 + tmp12
    ee.vsubs.s16 q6,q0,q6           # tmp2 = row0 - tmp12

// odd part
    addi        a10,a10,-16         # step back from row 2 to row 1
    addi        a11,a11,-16         # same for the quant values
    ee.vld.128.ip q0,a10,32         # load MCU row 1 into Q0
    ee.vld.128.ip q1,a10,0          # load row 3 into Q1
    ee.vld.128.ip q2,a11,32         # load quant row 1 into Q2
    ee.vld.128.ip q3,a11,0          # load quant row 3 into Q3
    movi.n      a6,0
    wsr.sar     a6                  # SAR = 0: plain multiply for dequantization
    ee.vmul.s16 q0,q0,q2            # tmp4 = de-quantized row 1
    ee.vmul.s16 q1,q1,q3            # tmp5 = de-quantized row 3
    ee.vadds.s16 q2,q0,q1           # tmp7 = tmp4 + tmp5
    ee.vld.128.ip q3,a4,16          # load 1.414 constants into Q3
    movi.n      a6,14
    wsr.sar     a6                  # SAR = 14 for the constant multiplies
    ee.vst.128.ip q4,a9,16          # spill tmp0 to the stack frame
    ee.vst.128.ip q5,a9,16          # spill tmp1, 8 Q registers are not enough

    ee.vsubs.s16 q4,q0,q1           # tmp4 - tmp5
    ee.vmul.s16 q5,q4,q3            # tmp11 = (tmp4 - tmp5) * 1.414
    ee.vld.128.ip q3,a4,16          # load 1.8477 constant into Q3
    ee.vmul.s16 q4,q4,q3            # z5 = (tmp4 - tmp5) * 1.847
    ee.vld.128.ip q3,a4,16          # load 2.613 constant into Q3
    movi.n      a6,13
    wsr.sar     a6                  # SAR = 13, otherwise this constant would overflow
    ee.vmul.s16 q3,q3,q1            # tmp5 * 2.613

.Ls3_idct_full:
// Need to do the full calculations
    ee.vld.128.ip q0,a2,16          # load MCU rows 0-3 into Q0,Q1,Q2,Q3
    ee.vld.128.ip q4,a3,16          # load quantization values into Q4,Q5,Q6,Q7
    ee.vld.128.ip q1,a2,16
    ee.vld.128.ip q5,a3,16
    ee.vld.128.ip q2,a2,16
    ee.vld.128.ip q6,a3,16
    ee.vld.128.ip q3,a2,16
    ee.vld.128.ip q7,a3,16
    movi.n      a6,0
    wsr.sar     a6                  # SAR = 0: plain multiply for dequantization
    ee.vmul.s16 q0,q0,q4            # de-quantize each row
    ee.vmul.s16 q1,q1,q5
    ee.vmul.s16 q2,q2,q6
    ee.vmul.s16 q3,q3,q7
    addi        a2,a2,-64           # the four loads moved a2 past row 3, rewind to row 0
    ee.vst.128.ip q0,a2,16          # write back dequantized rows 0-3
    ee.vst.128.ip q1,a2,16
    ee.vst.128.ip q2,a2,16
    ee.vst.128.ip q3,a2,16

// repeat for rows 4-7 (a2 and a3 now both point at row 4, SAR is still 0)
    ee.vld.128.ip q0,a2,16          # load MCU rows 4-7 into Q0,Q1,Q2,Q3
    ee.vld.128.ip q4,a3,16          # load quantization values into Q4,Q5,Q6,Q7
    ee.vld.128.ip q1,a2,16
    ee.vld.128.ip q5,a3,16
    ee.vld.128.ip q2,a2,16
    ee.vld.128.ip q6,a3,16
    ee.vld.128.ip q3,a2,16
    ee.vld.128.ip q7,a3,16

    ee.vmul.s16 q0,q0,q4            # de-quantize rows 4-7
    ee.vmul.s16 q1,q1,q5
    ee.vmul.s16 q2,q2,q6
    ee.vmul.s16 q3,q3,q7
    addi        a2,a2,-64           # rewind from past row 7 back to row 4
    ee.vst.128.ip q0,a2,16          # write back dequantized rows 4-7
    ee.vst.128.ip q1,a2,16
    ee.vst.128.ip q2,a2,16
    ee.vst.128.ip q3,a2,16
    retw.n                          # done

    .size   s3_idct, .-s3_idct
|
|
#endif // dsps_fft2r_sc16_aes3_enabled
|
|
#endif // ARDUINO_ESP32S3_DEV
|