* `Sendmail` upgraded to ESP-Mail-Client v3.4.9 from v1.2.0, using BearSSL instead of MbedTLS * Fix compilation on ESP8266 * Fix compilation * fix compilation
242 lines
5.8 KiB
C
242 lines
5.8 KiB
C
/*
|
|
* Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining
|
|
* a copy of this software and associated documentation files (the
|
|
* "Software"), to deal in the Software without restriction, including
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
* the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
|
|
#include "../ESP_SSLClient_FS.h"
|
|
#if defined(USE_LIB_SSL_ENGINE)
|
|
|
|
#define BR_ENABLE_INTRINSICS 1
|
|
#include "inner.h"
|
|
|
|
#if BR_SSE2
|
|
|
|
/*
|
|
* This file contains a ChaCha20 implementation that leverages SSE2
|
|
* opcodes for better performance.
|
|
*/
|
|
|
|
/* see bearssl_block.h */
|
|
br_chacha20_run
|
|
br_chacha20_sse2_get(void)
|
|
{
|
|
/*
|
|
* If using 64-bit mode, then SSE2 opcodes should be automatically
|
|
* available, since they are part of the ABI.
|
|
*
|
|
* In 32-bit mode, we use CPUID to detect the SSE2 feature.
|
|
*/
|
|
|
|
#if BR_amd64
|
|
return &br_chacha20_sse2_run;
|
|
#else
|
|
|
|
/*
|
|
* SSE2 support is indicated by bit 26 in EDX.
|
|
*/
|
|
if (br_cpuid(0, 0, 0, 0x04000000)) {
|
|
return &br_chacha20_sse2_run;
|
|
} else {
|
|
return 0;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
BR_TARGETS_X86_UP
|
|
|
|
/* see bearssl_block.h */
|
|
BR_TARGET("sse2")
|
|
/*
 * ChaCha20 encryption/decryption with SSE2 opcodes (see bearssl_block.h).
 *
 *   key    32 bytes, read with two unaligned 128-bit loads
 *   iv     12 bytes, placed after the counter in the last state row
 *   cc     initial value of the 32-bit block counter
 *   data   buffer of 'len' bytes, XORed in place with the key stream
 *   len    buffer length in bytes; a trailing partial block is allowed
 *
 * Returned value is the low 32-bit lane of the counter row after
 * processing: 'cc' plus the number of key stream blocks generated (a
 * trailing partial block counts as one block), modulo 2^32.
 */
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	/* "expand 32-byte k", as four little-endian 32-bit words. */
	static const uint32_t sigma[4] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	const unsigned char *kbytes;
	unsigned char *out;
	uint32_t last_row[4];
	__m128i row_c, row_k0, row_k1, row_n;
	__m128i ctr_inc;
	uint32_t ctr_lo, ctr_hi;

/* Rotate every 32-bit lane of v left by the constant n (0 < n < 32). */
#define CC20_ROTL(v, n) \
	_mm_or_si128(_mm_slli_epi32((v), (n)), _mm_srli_epi32((v), 32 - (n)))

/*
 * One add/xor/rotate sequence (rotations by 16, 12, 8 and 7) applied
 * to the four state rows; each 32-bit lane is processed independently.
 */
#define CC20_QROUNDS(a, b, c, d)   do { \
		(a) = _mm_add_epi32((a), (b)); \
		(d) = _mm_xor_si128((d), (a)); \
		(d) = CC20_ROTL((d), 16); \
		(c) = _mm_add_epi32((c), (d)); \
		(b) = _mm_xor_si128((b), (c)); \
		(b) = CC20_ROTL((b), 12); \
		(a) = _mm_add_epi32((a), (b)); \
		(d) = _mm_xor_si128((d), (a)); \
		(d) = CC20_ROTL((d), 8); \
		(c) = _mm_add_epi32((c), (d)); \
		(b) = _mm_xor_si128((b), (c)); \
		(b) = CC20_ROTL((b), 7); \
	} while (0)

	out = data;
	kbytes = key;

	/*
	 * Initial state, one 128-bit register per row of four words:
	 * constants, key (two rows), then counter followed by the IV.
	 */
	row_c = _mm_loadu_si128((const void *)sigma);
	row_k0 = _mm_loadu_si128((const void *)kbytes);
	row_k1 = _mm_loadu_si128((const void *)(kbytes + 16));
	last_row[0] = cc;
	memcpy(&last_row[1], iv, 12);
	row_n = _mm_loadu_si128((const void *)last_row);

	/* Adding this bumps only the lowest lane, i.e. the block counter. */
	ctr_inc = _mm_set_epi32(0, 0, 0, 1);

	while (len != 0) {
		/* xj holds working state words 4*j to 4*j+3. */
		__m128i x0, x1, x2, x3;
		int dr;

		x0 = row_c;
		x1 = row_k0;
		x2 = row_k1;
		x3 = row_n;

		/* Ten double rounds (even round + odd round). */
		for (dr = 10; dr > 0; dr --) {
			/* Even round: applies directly on the rows. */
			CC20_QROUNDS(x0, x1, x2, x3);

			/*
			 * Odd round: rotate the words inside rows 1..3 so
			 * that the same lane-wise computation hits the
			 * right combinations of words...
			 */
			x1 = _mm_shuffle_epi32(x1, 0x39);
			x2 = _mm_shuffle_epi32(x2, 0x4E);
			x3 = _mm_shuffle_epi32(x3, 0x93);

			CC20_QROUNDS(x0, x1, x2, x3);

			/* ...then rotate back to the original layout. */
			x1 = _mm_shuffle_epi32(x1, 0x93);
			x2 = _mm_shuffle_epi32(x2, 0x4E);
			x3 = _mm_shuffle_epi32(x3, 0x39);
		}

		/* Feed-forward: add the initial state to get the key stream. */
		x0 = _mm_add_epi32(x0, row_c);
		x1 = _mm_add_epi32(x1, row_k0);
		x2 = _mm_add_epi32(x2, row_k1);
		x3 = _mm_add_epi32(x3, row_n);

		/* Next block will use the next counter value. */
		row_n = _mm_add_epi32(row_n, ctr_inc);

		if (len >= 64) {
			/* Full block: XOR 16 bytes at a time, in place. */
			_mm_storeu_si128((void *)(out + 0), _mm_xor_si128(
				_mm_loadu_si128((const void *)(out + 0)), x0));
			_mm_storeu_si128((void *)(out + 16), _mm_xor_si128(
				_mm_loadu_si128((const void *)(out + 16)), x1));
			_mm_storeu_si128((void *)(out + 32), _mm_xor_si128(
				_mm_loadu_si128((const void *)(out + 32)), x2));
			_mm_storeu_si128((void *)(out + 48), _mm_xor_si128(
				_mm_loadu_si128((const void *)(out + 48)), x3));
			out += 64;
			len -= 64;
		} else {
			/*
			 * Final partial block: spill the key stream to a
			 * temporary and XOR only the remaining bytes.
			 */
			unsigned char ks[64];
			size_t k;

			_mm_storeu_si128((void *)(ks + 0), x0);
			_mm_storeu_si128((void *)(ks + 16), x1);
			_mm_storeu_si128((void *)(ks + 32), x2);
			_mm_storeu_si128((void *)(ks + 48), x3);
			for (k = 0; k < len; k ++) {
				out[k] ^= ks[k];
			}
			len = 0;
		}
	}

#undef CC20_QROUNDS
#undef CC20_ROTL

	/*
	 * _mm_extract_epi32() requires SSE4.1; to remain on plain SSE2 the
	 * counter is rebuilt from its two 16-bit halves.
	 */
	ctr_lo = (uint32_t)_mm_extract_epi16(row_n, 0);
	ctr_hi = (uint32_t)_mm_extract_epi16(row_n, 1);
	return ctr_lo | (ctr_hi << 16);
}
|
|
|
|
BR_TARGETS_X86_DOWN
|
|
|
|
#else
|
|
|
|
/* see bearssl_block.h */
|
|
br_chacha20_run
|
|
br_chacha20_sse2_get(void)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
#endif
|
|
|
|
#endif |