From d0eb3bb6aed6ab38ec70a7ae46fe1e68b82e04ca Mon Sep 17 00:00:00 2001
From: Christian Baars
Date: Fri, 26 Sep 2025 13:25:33 +0200
Subject: [PATCH] use ROM bigint functions in BearSSL (#23949)

---
 .../bearssl-esp8266/src/ec/_ec_c25519_m15.c | 308 ++++++++++++
 .../bearssl-esp8266/src/ec/_ec_p256_m15.c   | 468 ++++++++++++++++++
 2 files changed, 776 insertions(+)
 create mode 100644 lib/lib_ssl/bearssl-esp8266/src/ec/_ec_c25519_m15.c
 create mode 100644 lib/lib_ssl/bearssl-esp8266/src/ec/_ec_p256_m15.c

diff --git a/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_c25519_m15.c b/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_c25519_m15.c
new file mode 100644
index 000000000..5aa467b92
--- /dev/null
+++ b/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_c25519_m15.c
@@ -0,0 +1,308 @@
+/*
+ * _ec_c25519_m15.c — BearSSL Curve25519 (X25519) implementation using ESP32 ROM-backed Montgomery arithmetic
+ *
+ * This file provides a fast Montgomery ladder implementation for Curve25519 scalar
+ * multiplication (X25519), leveraging the ESP32's ROM bigint accelerator for modular
+ * multiplication in the prime field p = 2^255 - 19.
+ *
+ * Key features:
+ * - Field arithmetic in the normal domain using single-step ROM-backed multiply/square.
+ * - 8×32-bit little-endian limb representation for all field elements.
+ * - Constant-time Montgomery ladder for scalar multiplication.
+ * - RFC 7748–compliant clamping of scalar inputs.
+ * - Supports multiplication by arbitrary u‑coordinates and basepoint generation.
+ * - Fully compatible with BearSSL's ec_impl API.
+ *
+ * Internal operations avoid heap allocation and use fixed-size buffers.
+ *
+ * Requires: ESP32-family target with SOC_MPI_SUPPORTED (the original ESP32 is excluded by the guards below).
+ *
+ * Author: Christian Baars
+ */
+
+#if defined(USE_SHA_ROM)
+#if defined(ESP_PLATFORM) && !defined(ESP8266) && !defined(CONFIG_IDF_TARGET_ESP32)
+
+#if __has_include("soc/sha_caps.h")
+# include "soc/sha_caps.h"
+#elif __has_include("soc/soc_caps.h")
+# include "soc/soc_caps.h"
+#else
+# error "No ESP capability header found"
+#endif
+
+#if SOC_MPI_SUPPORTED
+
+#include "rom/bigint.h"
+#include "t_inner.h"
+
+#define WORDS 8 /* 8×32-bit limbs */
+
+/* Prime p = 2^255 - 19 (little-endian 32-bit limbs) */
+static const uint32_t P_LE[WORDS] = {
+    0xFFFFFFED, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF
+};
+
+/* R^2 mod p, with R = 2^256; since 2^256 mod p = 38, R^2 mod p = 38^2 = 1444 */
+static const uint32_t RR_LE[WORDS] = { 1444, 0,0,0,0,0,0,0 };
+
+/* A24 = 121665 (normal-domain constant) */
+static const uint32_t A24_LE[WORDS] = { 121665, 0,0,0,0,0,0,0 };
+
+/* -p^{-1} mod 2^32 */
+static const uint32_t MPRIME = 0x286BCA1B;
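+
+/*
+ * Sanity sketch for the Montgomery parameters above (illustrative, compile-time
+ * only, not required by the build): MPRIME must satisfy MPRIME * p == -1 (mod 2^32),
+ * and because 2^255 == 19 implies 2^256 == 38 (mod p), R^2 mod p is 38^2 = 1444.
+ */
+#if 0
+_Static_assert((uint32_t)(0x286BCA1BU * 0xFFFFFFEDU) == 0xFFFFFFFFU,
+               "MPRIME * (p mod 2^32) must be -1 mod 2^32");
+_Static_assert(38U * 38U == 1444U, "R^2 mod p for R = 2^256, p = 2^255 - 19");
+#endif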
+
+/* ---------- limb utilities ---------- */
+
+static inline void zclear(uint32_t *a) { memset(a, 0, WORDS * sizeof(uint32_t)); }
+static inline void zcopy(uint32_t *dst, const uint32_t *src) { memcpy(dst, src, WORDS * sizeof(uint32_t)); }
+
+/* ct compare: return 1 if a >= b, else 0 */
+static inline uint32_t ge_ct(const uint32_t *a, const uint32_t *b) {
+    uint32_t gt = 0, eq = 1;
+    for (int i = WORDS - 1; i >= 0; i--) {
+        uint32_t ai = a[i], bi = b[i];
+        uint32_t gt_i = (ai > bi);
+        uint32_t lt_i = (ai < bi);
+        gt |= (eq & gt_i);
+        eq &= ~(gt_i | lt_i);
+    }
+    return gt | eq;
+}
+
+static inline void add_mod(uint32_t *d, const uint32_t *a, const uint32_t *b) {
+    uint64_t c = 0;
+    for (int i = 0; i < WORDS; i++) {
+        c = (uint64_t)a[i] + b[i] + (c >> 32);
+        d[i] = (uint32_t)c;
+    }
+    uint32_t need_sub = (uint32_t)(c >> 32);
+    need_sub |= ge_ct(d, P_LE);
+    uint32_t borrow = 0, tmp[WORDS];
+    for (int i = 0; i < WORDS; i++) {
+        uint64_t t = (uint64_t)d[i] - P_LE[i] - borrow;
+        tmp[i] = (uint32_t)t;
+        borrow = (uint32_t)(t >> 63);
+    }
+    for (int i = 0; i < WORDS; i++) d[i] = need_sub ? tmp[i] : d[i];
+}
+
+static inline void sub_mod(uint32_t *d, const uint32_t *a, const uint32_t *b) {
+    uint32_t borrow = 0;
+    for (int i = 0; i < WORDS; i++) {
+        uint64_t t = (uint64_t)a[i] - b[i] - borrow;
+        d[i] = (uint32_t)t;
+        borrow = (uint32_t)(t >> 63);
+    }
+    uint64_t c = 0;
+    uint32_t tmp[WORDS];
+    for (int i = 0; i < WORDS; i++) {
+        c = (uint64_t)d[i] + P_LE[i] + (c >> 32);
+        tmp[i] = (uint32_t)c;
+    }
+    for (int i = 0; i < WORDS; i++) d[i] = borrow ? tmp[i] : d[i];
+}
+
+static inline void field_mul(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
+    ets_bigint_enable();
+    ets_bigint_modmult(a, b, P_LE, MPRIME, RR_LE, WORDS);
+    ets_bigint_wait_finish();
+    ets_bigint_getz(dst, WORDS);
+    ets_bigint_disable();
+}
+
+static inline void field_sqr(uint32_t *dst, const uint32_t *a) {
+    field_mul(dst, a, a);
+}
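+
+/*
+ * Host-side cross-check sketch for field_mul() (not used by the build):
+ * a portable schoolbook 256x256-bit multiply reduced with 2^256 == 38 (mod p),
+ * handy for comparing the ROM-backed path against a reference on a PC.
+ * The function name is illustrative only.
+ */
+#if 0
+static void field_mul_portable(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
+    uint32_t prod[2 * WORDS] = { 0 };
+    for (int i = 0; i < WORDS; i++) {
+        uint64_t carry = 0;
+        for (int j = 0; j < WORDS; j++) {
+            uint64_t t = (uint64_t)a[i] * b[j] + prod[i + j] + carry;
+            prod[i + j] = (uint32_t)t;
+            carry = t >> 32;
+        }
+        prod[i + WORDS] = (uint32_t)carry;
+    }
+    /* Fold the upper 256 bits: 2^256 == 38 (mod p). */
+    uint64_t carry = 0;
+    for (int i = 0; i < WORDS; i++) {
+        uint64_t t = (uint64_t)prod[i] + (uint64_t)prod[i + WORDS] * 38U + carry;
+        prod[i] = (uint32_t)t;
+        carry = t >> 32;
+    }
+    /* Fold any remaining carry the same way until it is gone. */
+    while (carry) {
+        uint64_t extra = carry * 38U;
+        for (int i = 0; i < WORDS; i++) {
+            extra += prod[i];
+            prod[i] = (uint32_t)extra;
+            extra >>= 32;
+        }
+        carry = extra;
+    }
+    /* At most two conditional subtractions bring the result into [0, p). */
+    while (ge_ct(prod, P_LE)) {
+        uint32_t borrow = 0;
+        for (int i = 0; i < WORDS; i++) {
+            uint64_t t = (uint64_t)prod[i] - P_LE[i] - borrow;
+            prod[i] = (uint32_t)t;
+            borrow = (uint32_t)(t >> 63);
+        }
+    }
+    memcpy(dst, prod, WORDS * sizeof(uint32_t));
+}
+#endif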
+
+/* Fermat inversion: a^(p-2) in normal domain */
+static void field_inv(uint32_t *out, const uint32_t *a) {
+    static const uint32_t EXP_P_MINUS_2[WORDS] = {
+        0xFFFFFFEB, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF
+    };
+    uint32_t res[WORDS], base[WORDS];
+    zclear(res); res[0] = 1;
+    zcopy(base, a);
+    for (int wi = WORDS - 1; wi >= 0; wi--) {
+        uint32_t w = EXP_P_MINUS_2[wi];
+        for (int b = 31; b >= 0; b--) {
+            field_sqr(res, res);
+            if ((w >> b) & 1U) field_mul(res, res, base);
+        }
+    }
+    zcopy(out, res);
+}
+
+/* Conditional swap */
+static inline void cswap(uint32_t *a, uint32_t *b, uint32_t ctl) {
+    uint32_t mask = -ctl;
+    for (int i = 0; i < WORDS; i++) {
+        uint32_t t = (a[i] ^ b[i]) & mask;
+        a[i] ^= t; b[i] ^= t;
+    }
+}
+
+/* ---------- X25519 ladder (normal domain) ---------- */
+
+static const unsigned char GEN[] PROGMEM = {
+    0x09, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
+};
+
+static const unsigned char ORDER[] PROGMEM = {
+    0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+    (void)curve;
+    *len = 32;
+    return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+    (void)curve;
+    *len = 32;
+    return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+    (void)curve;
+    *len = 32;
+    return 0;
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+    const unsigned char *kb, size_t kblen, int curve)
+{
+    (void)curve;
+
+    if (Glen != 32 || kblen > 32) {
+        return 0;
+    }
+
+    /* Clamp scalar per RFC 7748 (scalar arrives big-endian, so byte indices are mirrored) */
+    unsigned char k[32];
+    memset(k, 0, 32 - kblen);
+    memcpy(k + (32 - kblen), kb, kblen);
+    k[31] &= 0xF8;
+    k[0] &= 0x7F;
+    k[0] |= 0x40;
+
+    /* Load u and clear high bit per RFC 7748 */
+    unsigned char u_bytes[32];
+    memcpy(u_bytes, G, 32);
+    u_bytes[31] &= 0x7F;
+
+    uint32_t x1[WORDS], x2[WORDS], z2[WORDS], x3[WORDS], z3[WORDS];
+    br_range_dec32le(x1, WORDS, u_bytes);
+
+    /* Initialize:
+     * (x2:z2) = (1:0)
+     * (x3:z3) = (u:1)
+     */
+    zclear(z2);
+    zclear(x2); x2[0] = 1;
+    zcopy(x3, x1);
+    zclear(z3); z3[0] = 1;
+
+    uint32_t a[WORDS], aa[WORDS], b[WORDS], bb[WORDS];
+    uint32_t c[WORDS], d[WORDS], e[WORDS], da[WORDS], cb[WORDS];
+    uint32_t t[WORDS];
+
+    uint32_t swap = 0;
+    for (int i = 254; i >= 0; i--) {
+        uint32_t kt = (k[31 - (i >> 3)] >> (i & 7)) & 1U;
+        swap ^= kt;
+        cswap(x2, x3, swap);
+        cswap(z2, z3, swap);
+        swap = kt;
+
+        /* Ladder step */
+        add_mod(a, x2, z2);      /* a = x2 + z2 */
+        sub_mod(b, x2, z2);      /* b = x2 - z2 */
+        field_sqr(aa, a);        /* aa = a^2 */
+        field_sqr(bb, b);        /* bb = b^2 */
+        sub_mod(e, aa, bb);      /* e = aa - bb */
+
+        add_mod(c, x3, z3);      /* c = x3 + z3 */
+        sub_mod(d, x3, z3);      /* d = x3 - z3 */
+        field_mul(da, d, a);     /* da = d * a */
+        field_mul(cb, c, b);     /* cb = c * b */
+
+        add_mod(x3, da, cb);     /* x3 = (da + cb)^2 */
+        field_sqr(x3, x3);
+        sub_mod(z3, da, cb);     /* z3 = (da - cb)^2 * x1 */
+        field_sqr(z3, z3);
+        field_mul(z3, z3, x1);
+
+        field_mul(x2, aa, bb);   /* x2 = aa * bb */
+
+        /* z2 = e * (aa + A24 * e) */
+        field_mul(t, A24_LE, e); /* t = A24 * e */
+        add_mod(t, t, aa);       /* t = aa + A24*e */
+        field_mul(z2, e, t);     /* z2 = e * t */
+    }
+
+    cswap(x2, x3, swap);
+    cswap(z2, z3, swap);
+
+    /* u = x2 / z2 */
+    uint32_t z2i[WORDS], unorm[WORDS];
+    field_inv(z2i, z2);
+    field_mul(unorm, x2, z2i);
+
+    /* Final reduction if needed and serialize */
+    if (ge_ct(unorm, P_LE)) {
+        sub_mod(unorm, unorm, P_LE);
+    }
+    br_range_enc32le(G, unorm, WORDS);
+    return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+    const unsigned char *x, size_t xlen, int curve)
+{
+    const unsigned char *G0;
+    size_t Glen;
+
+    G0 = api_generator(curve, &Glen);
+    memcpy_P(R, G0, Glen);
+    api_mul(R, Glen, x, xlen, curve);
+    return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+    const unsigned char *x, size_t xlen,
+    const unsigned char *y, size_t ylen, int curve)
+{
+    (void)A; (void)B; (void)len; (void)x; (void)xlen; (void)y; (void)ylen; (void)curve;
+    /* Not applicable for Curve25519 (no ECDSA). */
+    return 0;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_c25519_m15 PROGMEM = {
+    (uint32_t)0x20000000, /* 1 << BR_EC_curve25519 */
+    &api_generator,
+    &api_order,
+    &api_xoff,
+    &api_mul,
+    &api_mulgen,
+    &api_muladd
+};
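+
+/*
+ * Usage sketch (assumes bearssl_ec.h is in scope and 32-byte buffers from the
+ * caller; the helper name is illustrative only): mulgen() derives the public
+ * key for a private scalar, and mul() applied to the peer's public value
+ * yields the X25519 shared secret in place.
+ */
+#if 0
+static int x25519_shared_example(unsigned char shared[32], unsigned char pub[32],
+                                 const unsigned char sk[32], const unsigned char peer_pub[32]) {
+    br_ec_c25519_m15.mulgen(pub, sk, 32, BR_EC_curve25519);
+    memcpy(shared, peer_pub, 32);
+    return br_ec_c25519_m15.mul(shared, 32, sk, 32, BR_EC_curve25519) == 1;
+}
+#endif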
+
+#endif /* SOC_MPI_SUPPORTED */
+#endif /* ESP_PLATFORM && !ESP8266 && !CONFIG_IDF_TARGET_ESP32 */
+#endif /* USE_SHA_ROM */
diff --git a/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_p256_m15.c b/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_p256_m15.c
new file mode 100644
index 000000000..69d4f519f
--- /dev/null
+++ b/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_p256_m15.c
@@ -0,0 +1,468 @@
+/*
+ * _ec_p256_m15.c — BearSSL P-256 implementation using ESP32 ROM-backed Montgomery arithmetic
+ *
+ * This file provides a fast elliptic curve implementation for secp256r1 (P-256),
+ * leveraging the ESP32's ROM bigint accelerator for modular multiplication.
+ *
+ * Key features:
+ * - Field arithmetic in the normal domain using Montgomery-backed multiply/square.
+ * - Jacobian point representation with full group law (point add/double).
+ * - Scalar multiplication via double-and-add, supporting arbitrary base points.
+ * - Conversion between affine and Jacobian coordinates.
+ * - Encoding/decoding of uncompressed points (04 || X || Y).
+ * - Fully compatible with BearSSL's ec_impl API.
+ *
+ * All field elements are stored as 8×32-bit little-endian limbs.
+ * Internal operations avoid heap allocation and use fixed-size buffers.
+ *
+ * Requires: ESP32-family target with SOC_MPI_SUPPORTED (the original ESP32 is excluded by the guards below).
+ *
+ * Author: Christian Baars
+ */
+
+#if defined(USE_SHA_ROM)
+#if defined(ESP_PLATFORM) && !defined(ESP8266) && !defined(CONFIG_IDF_TARGET_ESP32)
+
+#if __has_include("soc/sha_caps.h")
+# include "soc/sha_caps.h"
+#elif __has_include("soc/soc_caps.h")
+# include "soc/soc_caps.h"
+#else
+# error "No ESP capability header found"
+#endif
+
+#if SOC_MPI_SUPPORTED
+
+#include <string.h>
+#include "rom/bigint.h"
+#include "t_inner.h"
+
+#define WORDS 8
+
+/* ESP32 ROM Montgomery parameters (little-endian); p = 2^256 - 2^224 + 2^192 + 2^96 - 1. */
+static const uint32_t P_LE[8] = {
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
+    0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF
+};
+
+static const uint32_t RR_LE[8] = {
+    0x03000000, 0x00000000, 0xFFFFFFFF, 0xFBFFFFFF,
+    0xFEFFFFFF, 0xFFFFFFFF, 0xFDFFFFFF, 0x04000000
+};
+
+/* -p^{-1} mod 2^32 */
+static const uint32_t MPRIME = 0x00000001;
+
+/* Factor to convert ROM Montgomery output back to normal domain (8 limbs) */
+static const uint32_t CINV2_LE[8] = {
+    0xB15F7DC9, 0x21BC7192, 0xF82DEBEB, 0xF2086906,
+    0x8AD3BB54, 0xE34453E4, 0xB2B4EF16, 0x5FF55809
+};
+
+/* Generator point G in little-endian 32-bit limbs (LSW first) */
+static const uint32_t Gx[WORDS] = {
+    0xD898C296, 0xF4A13945, 0x2DEB33A0, 0x77037D81,
+    0x63A440F2, 0xF8BCE6E5, 0xE12C4247, 0x6B17D1F2
+};
+
+static const uint32_t Gy[WORDS] = {
+    0x37BF51F5, 0xCBB64068, 0x6B315ECE, 0x2BCE3357,
+    0x7C0F9E16, 0x8EE7EB4A, 0xFE1A7F9B, 0x4FE342E2
+};
+
+typedef struct {
+    uint32_t X[WORDS];
+    uint32_t Y[WORDS];
+    uint32_t Z[WORDS];
+} p256_pt;
+
+/* ---------- small utilities ---------- */
+
+static inline void zclear(uint32_t *dst) {
+    memset(dst, 0, WORDS * sizeof(uint32_t));
+}
+
+static inline void zcopy(uint32_t *dst, const uint32_t *src) {
+    memcpy(dst, src, WORDS * sizeof(uint32_t));
+}
+
+static inline int is_zero(const uint32_t *a) {
+    uint32_t acc = 0;
+    for (int i = 0; i < WORDS; i++) acc |= a[i];
+    return acc == 0;
+}
+
+/* big-endian bytes -> internal little-endian limbs (reverse word order) */
+static void be32_to_le32(const uint8_t *src, uint32_t *dst) {
+    for (int i = 0; i < WORDS; i++) {
+        int j = WORDS - 1 - i;
+        dst[i] = ((uint32_t)src[4*j] << 24)
+               | ((uint32_t)src[4*j + 1] << 16)
+               | ((uint32_t)src[4*j + 2] << 8)
+               | ((uint32_t)src[4*j + 3]);
+    }
+}
+
+/* internal little-endian limbs -> big-endian bytes (reverse word order) */
+static void le32_to_be32(const uint32_t *src, uint8_t *dst) {
+    for (int i = 0; i < WORDS; i++) {
+        int j = WORDS - 1 - i;
+        dst[4*j]     = (uint8_t)(src[i] >> 24);
+        dst[4*j + 1] = (uint8_t)(src[i] >> 16);
+        dst[4*j + 2] = (uint8_t)(src[i] >> 8);
+        dst[4*j + 3] = (uint8_t)(src[i]);
+    }
+}
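+
+/*
+ * Round-trip sketch for the conversion helpers above (illustrative only, the
+ * function name is a placeholder): serializing the Gx limbs with le32_to_be32()
+ * yields the standard big-endian X coordinate of the P-256 base point
+ * (6B17D1F2 ... D898C296), and be32_to_le32() restores the limbs.
+ */
+#if 0
+static int check_generator_encoding(void) {
+    uint8_t be[32];
+    uint32_t round_trip[WORDS];
+    le32_to_be32(Gx, be);          /* limbs -> big-endian bytes */
+    be32_to_le32(be, round_trip);  /* bytes -> limbs */
+    return memcmp(round_trip, Gx, sizeof round_trip) == 0
+        && be[0] == 0x6B && be[31] == 0x96;
+}
+#endif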
+
+/* ---------- field arithmetic modulo p (normal domain) ---------- */
+
+static inline const uint32_t *Pmod(void) { return P_LE; }
+
+static int ge_mod_p(const uint32_t *a) {
+    const uint32_t *P = Pmod();
+    for (int i = WORDS - 1; i >= 0; i--) {
+        if (a[i] > P[i]) return 1;
+        if (a[i] < P[i]) return 0;
+    }
+    return 1; /* equal */
+}
+
+static void field_add_mod(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
+    const uint32_t *P = Pmod();
+    uint64_t carry = 0;
+    for (int i = 0; i < WORDS; i++) {
+        uint64_t sum = (uint64_t)a[i] + b[i] + carry;
+        dst[i] = (uint32_t)sum;
+        carry = sum >> 32;
+    }
+    if (carry || ge_mod_p(dst)) {
+        uint64_t borrow = 0;
+        for (int i = 0; i < WORDS; i++) {
+            uint64_t diff = (uint64_t)dst[i] - P[i] - borrow;
+            dst[i] = (uint32_t)diff;
+            borrow = (diff >> 63) & 1;
+        }
+    }
+}
+
+static void field_sub_mod(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
+    const uint32_t *P = Pmod();
+    uint64_t borrow = 0;
+    for (int i = 0; i < WORDS; i++) {
+        uint64_t diff = (uint64_t)a[i] - b[i] - borrow;
+        dst[i] = (uint32_t)diff;
+        borrow = (diff >> 63) & 1;
+    }
+    if (borrow) {
+        uint64_t carry = 0;
+        for (int i = 0; i < WORDS; i++) {
+            uint64_t sum = (uint64_t)dst[i] + P[i] + carry;
+            dst[i] = (uint32_t)sum;
+            carry = sum >> 32;
+        }
+    }
+}
+
+/* ROM-backed modular multiply returning normal-domain result (8 limbs) */
+static void rom_field_mul(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
+    uint32_t tmp[WORDS];
+
+    ets_bigint_enable();
+
+    /* Montgomery multiply in ROM (returns Montgomery residue) */
+    ets_bigint_modmult(a, b, P_LE, MPRIME, RR_LE, WORDS);
+    ets_bigint_wait_finish();
+    ets_bigint_getz(tmp, WORDS);
+
+    /* Convert out of Montgomery domain using the precomputed CINV2_LE */
+    ets_bigint_modmult(tmp, CINV2_LE, P_LE, MPRIME, RR_LE, WORDS);
+    ets_bigint_wait_finish();
+    ets_bigint_getz(dst, WORDS);
+
+    ets_bigint_disable();
+}
+
+static inline void field_mul(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
+    rom_field_mul(dst, a, b);
+}
+
+static inline void field_sqr(uint32_t *dst, const uint32_t *a) {
+    rom_field_mul(dst, a, a);
+}
+
+/* Square-and-multiply exponentiation for p-2 (normal domain throughout) */
+static void field_inv(uint32_t *out, const uint32_t *a) {
+    static const uint32_t EXP_P_MINUS_2[WORDS] = {
+        0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
+        0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF
+    };
+    uint32_t res[WORDS], base[WORDS];
+
+    zclear(res); res[0] = 1;  /* res = 1 */
+    zcopy(base, a);           /* base = a */
+
+    for (int wi = WORDS - 1; wi >= 0; wi--) {
+        uint32_t w = EXP_P_MINUS_2[wi];
+        for (int b = 31; b >= 0; b--) {
+            field_sqr(res, res);
+            if ((w >> b) & 1U) {
+                field_mul(res, res, base);
+            }
+        }
+    }
+    zcopy(out, res);
+}
+
+/* ---------- point utilities ---------- */
+
+static inline void load_generator(p256_pt *Pp) {
+    zcopy(Pp->X, Gx);
+    zcopy(Pp->Y, Gy);
+    zclear(Pp->Z);
+    Pp->Z[0] = 1;
+}
+
+static void to_affine(p256_pt *Pp) {
+    uint32_t zi[WORDS], zi2[WORDS], xi[WORDS], yi[WORDS];
+    uint32_t z0 = 0;
+    for (int i = 0; i < WORDS; i++) z0 |= Pp->Z[i];
+    if (z0 == 0) {
+        zclear(Pp->X); zclear(Pp->Y); zclear(Pp->Z);
+        return;
+    }
+    field_inv(zi, Pp->Z);
+    field_sqr(zi2, zi);
+    field_mul(xi, Pp->X, zi2);
+    field_mul(yi, Pp->Y, zi2);
+    field_mul(yi, yi, zi);
+    zcopy(Pp->X, xi);
+    zcopy(Pp->Y, yi);
+    zclear(Pp->Z); Pp->Z[0] = 1;
+}
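+
+/*
+ * Affine sanity sketch (illustrative only): after to_affine(), a valid point
+ * satisfies Y^2 == X^3 - 3*X + b (mod p). The curve coefficient b is not
+ * stored in this file, so the caller must supply it as 8 little-endian limbs;
+ * the names below are placeholders.
+ */
+#if 0
+static int point_on_curve_affine(const p256_pt *Pp, const uint32_t *b_le) {
+    static const uint32_t THREE[WORDS] = { 3, 0, 0, 0, 0, 0, 0, 0 };
+    uint32_t lhs[WORDS], rhs[WORDS], t[WORDS];
+    field_sqr(lhs, Pp->Y);            /* Y^2 */
+    field_sqr(t, Pp->X);              /* X^2 */
+    field_sub_mod(t, t, THREE);       /* X^2 - 3 */
+    field_mul(rhs, t, Pp->X);         /* X^3 - 3*X */
+    field_add_mod(rhs, rhs, b_le);    /* X^3 - 3*X + b */
+    return memcmp(lhs, rhs, sizeof lhs) == 0;
+}
+#endif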
+
+/* ---------- group law (Jacobian, a = -3) ---------- */
+
+__attribute__((noinline))
+static void p256_point_double(p256_pt *Q) {
+    if (is_zero(Q->Y)) { zclear(Q->X); zclear(Q->Y); zclear(Q->Z); return; }
+
+    uint32_t Z2[WORDS], M[WORDS], Mtmp[WORDS], S[WORDS], T1[WORDS], T2[WORDS], X3[WORDS], Y3[WORDS];
+
+    /* Z2 = Z^2 */
+    field_sqr(Z2, Q->Z);
+
+    /* T1 = X - Z^2 ; T2 = X + Z^2 */
+    field_sub_mod(T1, Q->X, Z2);
+    field_add_mod(T2, Q->X, Z2);
+
+    /* M = (X - Z^2) * (X + Z^2) */
+    field_mul(M, T1, T2);
+
+    /* M = 3 * M */
+    zcopy(Mtmp, M);
+    field_add_mod(M, M, M);     /* 2*M */
+    field_add_mod(M, M, Mtmp);  /* 3*M */
+
+    /* S = 4 * X * Y^2 */
+    field_sqr(S, Q->Y);         /* Y^2 */
+    field_mul(S, S, Q->X);      /* X*Y^2 */
+    field_add_mod(S, S, S);     /* 2*X*Y^2 */
+    field_add_mod(S, S, S);     /* 4*X*Y^2 */
+
+    /* X3 = M^2 - 2*S */
+    field_sqr(X3, M);
+    field_sub_mod(X3, X3, S);
+    field_sub_mod(X3, X3, S);
+
+    /* Y3 = M*(S - X3) - 8*Y^4 */
+    field_sub_mod(Y3, S, X3);
+    field_mul(Y3, Y3, M);
+
+    field_sqr(T1, Q->Y);        /* Y^2 */
+    field_sqr(T1, T1);          /* Y^4 */
+    field_add_mod(T1, T1, T1);  /* 2*Y^4 */
+    field_add_mod(T1, T1, T1);  /* 4*Y^4 */
+    field_add_mod(T1, T1, T1);  /* 8*Y^4 */
+
+    field_sub_mod(Y3, Y3, T1);
+
+    /* Z3 = 2*Y*Z */
+    field_mul(Q->Z, Q->Y, Q->Z);
+    field_add_mod(Q->Z, Q->Z, Q->Z);
+
+    zcopy(Q->X, X3);
+    zcopy(Q->Y, Y3);
+}
+
+__attribute__((noinline))
+static void p256_point_add(p256_pt *R, const p256_pt *Pp, const p256_pt *Qp) {
+    if (is_zero(Pp->Z)) { zcopy(R->X, Qp->X); zcopy(R->Y, Qp->Y); zcopy(R->Z, Qp->Z); return; }
+    if (is_zero(Qp->Z)) { zcopy(R->X, Pp->X); zcopy(R->Y, Pp->Y); zcopy(R->Z, Pp->Z); return; }
+
+    uint32_t Z1Z1[WORDS], Z2Z2[WORDS], U1[WORDS], U2[WORDS];
+    uint32_t S1[WORDS], S2[WORDS], H[WORDS], RR[WORDS];
+    uint32_t H2[WORDS], H3[WORDS], U1H2[WORDS], X3[WORDS], Y3[WORDS], Z3[WORDS], t[WORDS];
+
+    field_sqr(Z1Z1, Pp->Z);
+    field_sqr(Z2Z2, Qp->Z);
+
+    field_mul(U1, Pp->X, Z2Z2);
+    field_mul(U2, Qp->X, Z1Z1);
+
+    field_mul(t, Qp->Z, Z2Z2);  /* Z2^3 */
+    field_mul(S1, Pp->Y, t);
+
+    field_mul(t, Pp->Z, Z1Z1);  /* Z1^3 */
+    field_mul(S2, Qp->Y, t);
+
+    field_sub_mod(H, U2, U1);
+    field_sub_mod(RR, S2, S1);
+
+    if (is_zero(H)) {
+        if (is_zero(RR)) {
+            p256_pt D = *Pp;
+            p256_point_double(&D);
+            zcopy(R->X, D.X);
+            zcopy(R->Y, D.Y);
+            zcopy(R->Z, D.Z);
+        } else {
+            zclear(R->X); zclear(R->Y); zclear(R->Z); /* infinity */
+        }
+        return;
+    }
+
+    field_sqr(H2, H);
+    field_mul(H3, H, H2);
+    field_mul(U1H2, U1, H2);
+
+    field_sqr(X3, RR);
+    field_sub_mod(X3, X3, H3);
+    field_sub_mod(X3, X3, U1H2);
+    field_sub_mod(X3, X3, U1H2); /* -2*U1H2 */
+
+    field_sub_mod(Y3, U1H2, X3);
+    field_mul(Y3, Y3, RR);
+
+    field_mul(t, S1, H3);
+    field_sub_mod(Y3, Y3, t);
+
+    field_mul(t, Pp->Z, Qp->Z);
+    field_mul(Z3, t, H);
+
+    zcopy(R->X, X3);
+    zcopy(R->Y, Y3);
+    zcopy(R->Z, Z3);
+}
+
+/* ---------- shared scalar multiply helpers (reduce duplication) ---------- */
+
+static void scalar_mul_point(p256_pt *R, const p256_pt *Base, const uint8_t *k, size_t klen) {
+    zclear(R->X); zclear(R->Y); zclear(R->Z); /* R = O */
+    for (size_t bi = 0; bi < klen * 8; bi++) {
+        p256_point_double(R);
+        if ((k[bi >> 3] >> (7 - (bi & 7))) & 1) {
+            p256_point_add(R, R, Base);
+        }
+    }
+}
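+
+/*
+ * Sketch of a branch-free variant (illustrative only): always compute the
+ * addition and keep the result only when the scalar bit is set, so the
+ * sequence of point operations no longer depends on secret key bits. Note
+ * that p256_point_add()/p256_point_double() above still contain data-dependent
+ * branches of their own, so this alone is not fully constant-time.
+ */
+#if 0
+static void ct_select(uint32_t *d, const uint32_t *a, uint32_t ctl) {
+    uint32_t mask = (uint32_t)-ctl;
+    for (int i = 0; i < WORDS; i++) d[i] ^= (d[i] ^ a[i]) & mask;
+}
+
+static void scalar_mul_point_ct(p256_pt *R, const p256_pt *Base, const uint8_t *k, size_t klen) {
+    p256_pt T;
+    zclear(R->X); zclear(R->Y); zclear(R->Z); /* R = O */
+    for (size_t bi = 0; bi < klen * 8; bi++) {
+        uint32_t bit = (k[bi >> 3] >> (7 - (bi & 7))) & 1U;
+        p256_point_double(R);
+        p256_point_add(&T, R, Base);  /* dummy add when bit == 0 */
+        ct_select(R->X, T.X, bit);
+        ct_select(R->Y, T.Y, bit);
+        ct_select(R->Z, T.Z, bit);
+    }
+}
+#endif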
+
+/* Load uncompressed point (04 || X || Y) into Jacobian with Z=1 */
+static int load_point_uncompressed(p256_pt *Pp, const unsigned char *buf, size_t len) {
+    if (len != 65 || buf[0] != 0x04) return 0;
+    be32_to_le32(buf + 1, Pp->X);
+    be32_to_le32(buf + 33, Pp->Y);
+    zclear(Pp->Z); Pp->Z[0] = 1;
+    return 1;
+}
+
+static void store_point_uncompressed(unsigned char *buf, const p256_pt *Pp) {
+    buf[0] = 0x04;
+    le32_to_be32(Pp->X, buf + 1);
+    le32_to_be32(Pp->Y, buf + 33);
+}
+
+/* ---------- BearSSL ec_impl API ---------- */
+
+static const unsigned char *api_generator(int curve, size_t *len) {
+    (void)curve;
+    *len = br_secp256r1.generator_len;
+    return br_secp256r1.generator;
+}
+
+static const unsigned char *api_order(int curve, size_t *len) {
+    (void)curve;
+    *len = br_secp256r1.order_len;
+    return br_secp256r1.order;
+}
+
+static size_t api_xoff(int curve, size_t *len) {
+    (void)curve;
+    *len = 32;
+    return 1;
+}
+
+static uint32_t api_mul(unsigned char *G, size_t Glen,
+                        const unsigned char *x, size_t xlen,
+                        int curve) {
+    (void)curve;
+    p256_pt Pp, R;
+    if (!load_point_uncompressed(&Pp, G, Glen)) return 0;
+
+    scalar_mul_point(&R, &Pp, x, xlen);
+    to_affine(&R);
+    store_point_uncompressed(G, &R);
+    return 1;
+}
+
+static size_t api_mulgen(unsigned char *Rbuf,
+                         const unsigned char *x, size_t xlen,
+                         int curve) {
+    (void)curve;
+    p256_pt Gp, R;
+    load_generator(&Gp);
+
+    scalar_mul_point(&R, &Gp, x, xlen);
+    to_affine(&R);
+    store_point_uncompressed(Rbuf, &R);
+    return 65;
+}
+
+static uint32_t api_muladd(unsigned char *A, const unsigned char *B,
+                           size_t Glen,
+                           const unsigned char *x, size_t xlen,
+                           const unsigned char *y, size_t ylen,
+                           int curve) {
+    (void)curve;
+    p256_pt Pp, Qp, R, T;
+
+    if (!load_point_uncompressed(&Pp, A, Glen)) return 0;
+
+    scalar_mul_point(&R, &Pp, x, xlen);
+
+    if (B) {
+        if (!load_point_uncompressed(&Qp, B, Glen)) return 0;
+    } else {
+        load_generator(&Qp);
+    }
+
+    scalar_mul_point(&T, &Qp, y, ylen);
+    p256_point_add(&R, &R, &T);
+
+    to_affine(&R);
+    store_point_uncompressed(A, &R);
+    return 1;
+}
+
+const br_ec_impl br_ec_p256_m15 PROGMEM = {
+    (uint32_t)0x00800000, /* 1 << BR_EC_secp256r1 */
+    &api_generator,
+    &api_order,
+    &api_xoff,
+    &api_mul,
+    &api_mulgen,
+    &api_muladd
+};
+
+#endif // SOC_MPI_SUPPORTED
+#endif // defined(ESP_PLATFORM) && !defined(ESP8266) && !defined(CONFIG_IDF_TARGET_ESP32)
+#endif // USE_SHA_ROM
\ No newline at end of file
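
/*
 * Usage sketch: once compiled in, the ROM-backed implementations plug into
 * BearSSL like any other br_ec_impl, for example
 *
 *     br_ssl_engine_set_ec(&sc.eng, &br_ec_p256_m15);     (ECDHE/ECDSA on P-256)
 *     br_ssl_engine_set_ec(&sc.eng, &br_ec_c25519_m15);   (or X25519 key exchange)
 *
 * `sc` stands for whichever br_ssl_client_context / br_ssl_server_context the
 * calling TLS code already sets up; the name is a placeholder, not part of
 * this patch.
 */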