Tasmota/lib/libesp32/JPEGDEC/src/s3_simd_444.S
Christian Baars 7cb8a3f968
Berry: add cam module, img class (#21743)
* cam module, img class
2024-07-07 19:50:33 +02:00

110 lines
4.7 KiB
ArmAsm

//
// ESP32-S3 SIMD optimized code
// Written by Larry Bank
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
#ifdef ARDUINO_ARCH_ESP32
#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)
.text
.align 4
// Convert 8 pixels of YCbCr 4:4:4 to RGB565 (little or big endian) or RGBA8888
//
// Call as void s3_ycbcr_convert_444(uint8_t *pY, uint8_t *pCB, uint8_t *pCR, uint16_t *pOut, int16_t *pConsts, uint8_t ucPixelType);
//
// Argument registers as used by the code below (after "entry"):
//   a2 = pY           8 luma bytes (a2 is reused as scratch for SAR values once Y is loaded)
//   a3 = pCB          8 Cb bytes
//   a4 = pCR          8 Cr bytes
//   a5 = pOut         destination: 16 bytes are written for RGB565, 32 bytes for RGBA8888
//   a6 = pConsts      table of int16_t constants, read sequentially (see below)
//   a7 = ucPixelType  output format selector
//
// supported pixel types: 0 = RGB565_LE, 1 = RGB565_BE, 2 = RGBA8888
// Only the values 0 and 2 are tested explicitly; any other value falls
// through to the big-endian RGB565 path.
//
// pConsts is consumed in this order, one int16_t per broadcast load:
//   [0] 0x80      chroma bias subtracted from Cb and Cr
//   [1] 1.77200   Cb -> blue coefficient
//   [2] 1.402     Cr -> red coefficient
//   [3] 0.34414   Cb -> green coefficient
//   [4] 0.71414   Cr -> green coefficient
//   [5] 1         multiplier used to turn vmul + SAR into a plain right shift
//   [6] 32        green << 5  (read on the RGB565 path only)
//   [7] 2048      red << 11   (read on the RGB565 path only)
// NOTE(review): the coefficients are presumably pre-scaled by 64, because every
// product is shifted right by 6 - confirm against the table built by the caller.
// NOTE(review): the 64-bit loads and 128-bit stores are understood to ignore the
// low address bits, so pY/pCB/pCR are assumed 8-byte aligned and pOut 16-byte
// aligned - confirm against the ESP32-S3 TRM and the callers.
.global s3_ycbcr_convert_444
.type s3_ycbcr_convert_444,@function
s3_ycbcr_convert_444:
# no idea what this frequency keyword does
# .frequency 1.000 0.000
entry a1,16
// ---- load the inputs and widen them from 8 to 16 bits per sample ----
ee.xorq q4,q4,q4 # load Q4 with 0's
ee.vldbc.16.ip q3,a6,2 # get constant 0x80 as 16-bits in all 128 bits of q3
ee.vld.l.64.ip q0,a2,0 # load 8 Y values into Q0
ee.vld.l.64.ip q1,a3,0 # load 8 Cb values into Q1
ee.vld.l.64.ip q2,a4,0 # load 8 Cr values into Q2
// Interleaving each sample byte with a zero byte produces zero-extended 16-bit
// lanes. vzip writes both of its operands, so q4 has to be cleared again
// before every further use as the zero source.
ee.vzip.8 q0,q4 # expand 8-bit Y data to 16-bits
ee.xorq q4,q4,q4 # need to reset to 0's
ee.vzip.8 q1,q4 # expand 8-bit Cb data to 16-bits
ee.xorq q4,q4,q4
ee.vzip.8 q2,q4 # expand 8-bit Cr data to 16-bits
// ---- remove the chroma bias: q1 = Cb - 0x80, q2 = Cr - 0x80 ----
ee.vsubs.s16 q1,q1,q3 # subtract 0x0080 from Cb's
ee.vsubs.s16 q2,q2,q3 # subtract 0x0080 from Cr's
// ---- blue = Y + ((Cb * 1.772) >> 6), red = Y + ((Cr * 1.402) >> 6) ----
// The Y pointer in a2 is no longer needed, so a2 becomes the scratch register
// used to program SAR. ee.vmul.s16 shifts each product right by SAR.
ee.vldbc.16.ip q3,a6,2 # get constant 1.77200 as 16-bits in all 128 bits of q3
movi.n a2,6 # load the shift register with 6
wsr.sar a2 # put it in the SAR (shift amount register)
ee.vmul.s16 q6,q1,q3 # (Cb *= 1.77200) >> 6
ee.vadds.s16 q6,q6,q0 # Cb += y (8 blue pixels in q6)
ee.vldbc.16.ip q3,a6,2 # get constant 1.402 as 16-bits in all 128 bits of q3
ee.vmul.s16 q7,q2,q3 # (Cr *= 1.402) >> 6
ee.vadds.s16 q7,q7,q0 # Cr += y (8 red pixels in q7)
// ---- green = Y - (((Cb * 0.34414) + (Cr * 0.71414)) >> 6) ----
// Both products are kept unshifted (SAR = 0) and summed first; the single
// shift by 6 is applied to the sum afterwards.
ee.vldbc.16.ip q3,a6,2 # get constant 0.34414 as 16-bits in all 128 bits of q3
movi.n a2,0 # load the shift register with 0
wsr.sar a2 # put it in the SAR (shift amount register)
ee.vmul.s16 q4,q1,q3 # (Cb * 0.34414) >> 0
ee.vldbc.16.ip q3,a6,2 # get constant 0.71414 as 16-bits in all 128 bits of q3
ee.vmul.s16 q3,q2,q3 # (Cr * 0.71414) >> 0
ee.vadds.s16 q3,q3,q4 # (Cb * 0.34414) + (Cr * 0.71414)
// Multiplying by the constant 1 with SAR = 6 acts as a 16-bit right shift.
// q4 keeps this constant 1 and is reused for the RGB565 shifts further down.
ee.vldbc.16.ip q4,a6,2 # get constant 1 (so we can do a 16-bit shift)
movi.n a2,6 # load the shift register with 6
wsr.sar a2 # put it in the SAR (shift amount register)
ee.vmul.s16 q3,q3,q4 # shift right by 6
ee.vsubs.s16 q3,q0,q3 # Y - ((Cb * 0.34414) + (Cr * 0.71414)) = green in Q3
// saturate to 8 bits
// Lower bound: max(value, 0) on green (q3), blue (q6) and red (q7).
ee.xorq q0,q0,q0
ee.vmax.s16 q3,q3,q0
ee.vmax.s16 q6,q6,q0
ee.vmax.s16 q7,q7,q0
// Upper bound: comparing q1 with itself sets every lane to all ones; zipping
// those 0xFF bytes with the zero bytes in q0 leaves 0x00FF (255) per 16-bit lane.
ee.vcmp.eq.s16 q1,q1,q1 # create 255
ee.vzip.8 q1,q0
ee.vmin.s16 q3,q3,q1 # clamp to 255
ee.vmin.s16 q6,q6,q1
ee.vmin.s16 q7,q7,q1
// Now we have RGB888, is that the output pixel type?
// At this point q7 = red, q3 = green, q6 = blue, one 0..255 value per 16-bit lane.
beqi a7,2,.rgb8888_output
// either RGB565 LE or BE from here
// Reduce to 5/6/5 bits, again using vmul by the constant 1 (still in q4) plus SAR.
movi.n a2,3 # load the shift register with 3 (for blue and red)
wsr.sar a2 # put it in the SAR (shift amount register)
ee.vmul.s16 q6,q4,q6 # shift blue right by 3
ee.vmul.s16 q7,q4,q7 # shift red right by 3
movi.n a2,2 # load the shift register with 2 (for green)
wsr.sar a2 # put it in the SAR (shift amount register)
ee.vmul.s16 q3,q4,q3 # shift green right by 2
// now combine to form RGB565 pixels
// Left shifts are done as multiplies by 32 and 2048 with SAR = 0.
movi.n a2,0
wsr.sar a2 # no shift after multiply
ee.vldbc.16.ip q4,a6,2 # get constant value 32 (to shift green left by 5 bits)
ee.vldbc.16.ip q5,a6,2 # get constant value 2048 (to shift red left by 11 bits)
ee.vmul.s16 q3,q4,q3 # shift green left by 5
ee.vmul.s16 q7,q5,q7 # shift red left by 11
ee.orq q6,q6,q3 # combine blue + green
ee.orq q6,q6,q7 # combine blue + green + red
// q5 is what gets stored. It starts as a copy of the little-endian pixels and
// is only rewritten by the byte swap below when the type is not 0.
mv.qr q5,q6 # in case we're generating little endian output
beqi a7,0,.rgb565_exit # RGB565 little endian?
// Byte swap: unzip separates the two bytes of every pixel into q6 and q5, and
// zipping them back in the opposite operand order exchanges them.
ee.vunzip.8 q6,q5 # swap the byte order to be big-endian
ee.vzip.8 q5,q6
.rgb565_exit:
ee.vst.128.ip q5,a5,0 # store the 8 RGB565 pixels
retw.n
// Create RGBA (32-bit) pixels
// Entered with q7 = red, q3 = green, q6 = blue (0..255 per 16-bit lane).
.rgb8888_output:
// The shift works on 32-bit lanes, but since every green value is at most 255
// an 8-bit left shift cannot spill into the neighbouring 16-bit lane.
movi.n a2,8 # shift 8 bits
wsr.sar a2
ee.vsl.32 q3,q3 # shift green over 8 bits
ee.orq q7,q7,q3 # combine red and green
ee.vcmp.eq.s16 q1,q1,q1 # create FFs
ee.xorq q2,q2,q2 # create 00s
ee.vzip.8 q2,q1 # create FF00 for Alpha
ee.orq q2,q2,q6 # combine blue + alpha
// Interleave the (red | green << 8) lanes with the (blue | 0xFF00) lanes to form
// 32-bit pixels: 4 pixels end up in q7 and 4 in q2.
// NOTE(review): presumably this gives the bytes R,G,B,A in memory order - confirm.
ee.vzip.16 q7,q2 # create RGB8888 pixels
// Two 128-bit stores; the first one post-increments a5 by 16 bytes.
ee.vst.128.ip q7,a5,16 # store 8 x RGB8888 pixels = 32 bytes
ee.vst.128.ip q2,a5,0
retw.n # done
#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ESP32