Tasmota/lib/libesp32/JPEGDEC/src/s3_simd_idct.S_FUTURE

//
// ESP32-S3 SIMD optimized code
// Written by Larry Bank
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
#ifdef ARDUINO_ESP32S3_DEV

#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)
	.text
	.align 4
//
// s3_idct - JPEG dequantization / inverse DCT kernel (WORK IN PROGRESS)
//
// C prototype:
//   void s3_idct(int16_t *pMCU, int16_t *pQuant, const int16_t *pConst, uint16_t u16MCUFlags);
//
// ABI:  Xtensa windowed ABI (entry / retw), 64 byte stack frame
// In:   a2 = pMCU        8x8 block of int16 coefficients, one row = 16 bytes
//       a3 = pQuant      8x8 block of int16 quantization values, same layout
//       a4 = pConst      four consecutive 16 byte vectors of fixed point
//                        multipliers, read in this order by the reduced path:
//                        0.414, 1.414, 1.8477, 2.613
//       a5 = u16MCUFlags bit 13 (0x2000) set means rows 4-7 hold data
// Out:  pMCU is dequantized in place (each coefficient multiplied by the
//       matching pQuant entry with SAR = 0).
// Uses: a6 (scratch for SAR), a9 (stack cursor), q0-q7, SAR,
//       32 bytes of the stack frame in the reduced path
//
// All three pointers are expected to be 16 byte aligned; the 128 bit
// load/store instructions ignore the low 4 address bits (see the ESP32-S3
// TRM) - TODO confirm that every caller guarantees this.
//
// Status: only the dequantization is finished. The reduced path (rows 4-7
// empty) additionally starts the column pass in registers, but those
// results are not consumed yet; it then rewinds its pointers and shares
// the dequantization code below so that both paths leave pMCU in the same,
// well defined state.
//
#define S3_IDCT_LOWER_ROWS_BIT 13   /* bit index of 0x2000 in u16MCUFlags */

	.global s3_idct
    .type   s3_idct,@function

s3_idct:
	// the .frequency directive is intentionally left disabled
	//	.frequency 1.000 0.000
	entry	a1,64
  mov.n a9,a1              # a9 = cursor into the stack frame for Q register spills
  // Xtensa has no and-immediate; test the single flag bit directly
  bbsi  a5,S3_IDCT_LOWER_ROWS_BIT,.Ls3_idct_full_calc

// ---------------------------------------------------------------------
// Reduced path: lower 4 rows are empty, this simplifies the calculations
// ---------------------------------------------------------------------
  // columns first, even part
  ee.vld.128.ip q0,a2,32   # load MCU row 0 into Q0, a2 -> row 2
  ee.vld.128.ip q1,a2,0    # load row 2 into Q1
  ee.vld.128.ip q2,a3,32   # load quant row 0 into Q2, a3 -> quant row 2
  ee.vld.128.ip q3,a3,0    # load quant row 2 into Q3
  movi.n  a6,0             # shift of 0 for the de-quantize multiplies
  wsr.sar  a6              # put it in the SAR (shift amount register)
  ee.vmul.s16 q0,q0,q2     # de-quantize row 0
  ee.vmul.s16 q1,q1,q3     # de-quantize row 2
  ee.vld.128.ip q4,a4,16   # load 0.414 constants into Q4
  movi.n  a6,14            # products with the constants are shifted right by 14
  wsr.sar  a6              # put it in the SAR (shift amount register)
  ee.vmul.s16 q6,q4,q1     # tmp12 = row2 * 0.414
  ee.vadds.s16 q4,q0,q1    # tmp0 = row0 + row2
  ee.vsubs.s16 q7,q0,q1    # tmp3 = row0 - row2
  ee.vadds.s16 q5,q0,q6    # tmp1 = tmp10 + tmp12
  ee.vsubs.s16 q6,q0,q6    # tmp2 = tmp10 - tmp12
// odd part
  addi  a2,a2,-16          # row 2 -> row 1 (addi.n cannot encode -16)
  addi  a3,a3,-16          # quant row 2 -> quant row 1
  ee.vld.128.ip q0,a2,32   # load MCU row 1 into Q0, a2 -> row 3
  ee.vld.128.ip q1,a2,0    # load row 3 into Q1
  ee.vld.128.ip q2,a3,32   # load quant row 1 into Q2, a3 -> quant row 3
  ee.vld.128.ip q3,a3,0    # load quant row 3 into Q3
  movi.n  a6,0             # shift of 0 for the de-quantize multiplies
  wsr.sar  a6              # put it in the SAR (shift amount register)
  ee.vmul.s16 q0,q0,q2     # de-quantize row 1 (tmp4)
  ee.vmul.s16 q1,q1,q3     # de-quantize row 3 (tmp5)
  ee.vadds.s16 q2,q0,q1    # tmp7 = tmp4 + tmp5
  ee.vld.128.ip q3,a4,16   # load 1.414 constants into Q3
  movi.n  a6,14            # back to a shift of 14 for the constants
  wsr.sar  a6              # put it in the SAR (shift amount register)
  ee.vst.128.ip q4,a9,16   # spill tmp0 to the stack frame
  ee.vst.128.ip q5,a9,16   # spill tmp1; 8 Q registers are not enough

  ee.vsubs.s16 q4,q0,q1    # tmp4 - tmp5
  ee.vmul.s16 q5,q4,q3     # tmp11 = (tmp4 - tmp5) * 1.414
  ee.vld.128.ip q3,a4,16   # load 1.8477 constants into Q3
  ee.vmul.s16 q4,q4,q3     # z5 = (tmp4 - tmp5) * 1.8477
  ee.vld.128.ip q3,a4,16   # load 2.613 constants into Q3
  movi.n  a6,13            # shift of 13 here,
  wsr.sar  a6              # otherwise this constant would overflow
  ee.vmul.s16 q3,q3,q1     # tmp5 * 2.613

  // TODO: the remainder of the column pass and the whole row pass are not
  // written yet. Until then rewind the pointers (each is 48 bytes past its
  // base: +32, -16, +32) so the shared code below works on the real block
  // instead of running off into unrelated memory.
  addi  a2,a2,-48          # a2 = pMCU again
  addi  a3,a3,-48          # a3 = pQuant again
  // fall through

// ---------------------------------------------------------------------
// Full path: de-quantize all 8 rows in place
// ---------------------------------------------------------------------
.Ls3_idct_full_calc:
  ee.vld.128.ip q0,a2,16   # load MCU rows 0-3 into Q0,Q1,Q2,Q3
  ee.vld.128.ip q4,a3,16   # load quantization values into Q4,Q5,Q6,Q7
  ee.vld.128.ip q1,a2,16
  ee.vld.128.ip q5,a3,16
  ee.vld.128.ip q2,a2,16
  ee.vld.128.ip q6,a3,16
  ee.vld.128.ip q3,a2,16   # a2 = pMCU + 64 after this load
  ee.vld.128.ip q7,a3,16
  movi.n  a6,0             # shift of 0; use scratch a6, a2-a4 hold live pointers
  wsr.sar  a6              # put it in the SAR (shift amount register)
  ee.vmul.s16 q0,q0,q4     # de-quantize each row
  ee.vmul.s16 q1,q1,q5
  ee.vmul.s16 q2,q2,q6
  ee.vmul.s16 q3,q3,q7
  addi  a2,a2,-64          # step back to row 0 to store the results in place
  ee.vst.128.ip q0,a2,16   # write back dequantized rows 0-3
  ee.vst.128.ip q1,a2,16
  ee.vst.128.ip q2,a2,16
  ee.vst.128.ip q3,a2,16   # a2 = pMCU + 64 (row 4) after this store
// repeat for rows 4-7 (SAR is still 0)
  ee.vld.128.ip q0,a2,16   # load MCU rows 4-7 into Q0,Q1,Q2,Q3
  ee.vld.128.ip q4,a3,16   # load quantization values into Q4,Q5,Q6,Q7
  ee.vld.128.ip q1,a2,16
  ee.vld.128.ip q5,a3,16
  ee.vld.128.ip q2,a2,16
  ee.vld.128.ip q6,a3,16
  ee.vld.128.ip q3,a2,16   # a2 = pMCU + 128 after this load
  ee.vld.128.ip q7,a3,16

  ee.vmul.s16 q0,q0,q4     # de-quantize rows 4-7
  ee.vmul.s16 q1,q1,q5
  ee.vmul.s16 q2,q2,q6
  ee.vmul.s16 q3,q3,q7
  addi  a2,a2,-64          # step back to row 4 to store the results in place
  ee.vst.128.ip q0,a2,16   # write back dequantized rows 4-7
  ee.vst.128.ip q1,a2,16
  ee.vst.128.ip q2,a2,16
  ee.vst.128.ip q3,a2,16
  retw.n                   # done
	.size	s3_idct, .-s3_idct
#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ARDUINO_ESP32S3_DEV