use ROM bigint functions in BearSSL (#23949)

2025-09-26 13:25:33 +02:00 · 2025-09-26 13:25:33 +02:00 · d0eb3bb6ae
commit d0eb3bb6ae
parent 95a85e9886
2 changed files with 776 additions and 0 deletions
--- a/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_c25519_m15.c
+++ b/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_c25519_m15.c
@ -0,0 +1,308 @@
 /*
 * _ec_c25519_m15.c — BearSSL Curve25519 (X25519) implementation using ESP32 ROM-backed Montgomery arithmetic
 *
 * This file provides a fast Montgomery ladder implementation for Curve25519 scalar
 * multiplication (X25519), leveraging the ESP32's ROM bigint accelerator for modular
 * multiplication in the prime field p = 2^255 - 19.
 *
 * Key features:
 *   - Field arithmetic in the normal domain using single-step ROM-backed multiply/square.
 *   - 8×32-bit little-endian limb representation for all field elements.
 *   - Constant-time Montgomery ladder for scalar multiplication.
 *   - RFC 7748–compliant clamping of scalar inputs.
 *   - Supports multiplication by arbitrary u‑coordinates and basepoint generation.
 *   - Fully compatible with BearSSL's ec_impl API.
 *
 * Internal operations avoid heap allocation and use fixed-size buffers.
 *
 * Requires: ESP32 platform with SOC_MPI_SUPPORTED enabled.
 *
 * Author: Christian Baars
 */
 #if defined(USE_SHA_ROM)
 #if defined(ESP_PLATFORM) && !defined(ESP8266) && !defined(CONFIG_IDF_TARGET_ESP32)
 #if __has_include("soc/sha_caps.h")
 # include "soc/sha_caps.h"
 #elif __has_include("soc/soc_caps.h")
 # include "soc/soc_caps.h"
 #else
 # error "No ESP capability header found"
 #endif
 #if SOC_MPI_SUPPORTED
 #include "rom/bigint.h"
 #include "t_inner.h"
 #define WORDS 8  /* 8×32-bit limbs */
 /* Prime p = 2^255 - 19 (little-endian 32-bit limbs) */
 static const uint32_t P_LE[WORDS] = {
    0xFFFFFFED, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF
 };
 /* R^2 mod p (with R = 2^256 mod p = 38) */
 static const uint32_t RR_LE[WORDS] = { 1444, 0,0,0,0,0,0,0 };
 /* A24 = 121665 (normal-domain constant) */
 static const uint32_t A24_LE[WORDS] = { 121665, 0,0,0,0,0,0,0 };
 static const uint32_t MPRIME = 0x286BCA1B;
 /* ---------- limb utilities ---------- */
 static inline void zclear(uint32_t *a) { memset(a, 0, WORDS * sizeof(uint32_t)); }
 static inline void zcopy(uint32_t *dst, const uint32_t *src) { memcpy(dst, src, WORDS * sizeof(uint32_t)); }
 /* ct compare: return 1 if a >= b, else 0 */
 static inline uint32_t ge_ct(const uint32_t *a, const uint32_t *b) {
    uint32_t gt = 0, eq = 1;
    for (int i = WORDS - 1; i >= 0; i--) {
        uint32_t ai = a[i], bi = b[i];
        uint32_t gt_i = (ai > bi);
        uint32_t lt_i = (ai < bi);
        gt |= (eq & gt_i);
        eq &= ~(gt_i | lt_i);
    }
    return gt | eq;
 }
 static inline void add_mod(uint32_t *d, const uint32_t *a, const uint32_t *b) {
    uint64_t c = 0;
    for (int i = 0; i < WORDS; i++) {
        c = (uint64_t)a[i] + b[i] + (c >> 32);
        d[i] = (uint32_t)c;
    }
    uint32_t need_sub = (uint32_t)(c >> 32);
    need_sub |= ge_ct(d, P_LE);
    uint32_t borrow = 0, tmp[WORDS];
    for (int i = 0; i < WORDS; i++) {
        uint64_t t = (uint64_t)d[i] - P_LE[i] - borrow;
        tmp[i] = (uint32_t)t;
        borrow = (uint32_t)(t >> 63);
    }
    for (int i = 0; i < WORDS; i++) d[i] = need_sub ? tmp[i] : d[i];
 }
 static inline void sub_mod(uint32_t *d, const uint32_t *a, const uint32_t *b) {
    uint32_t borrow = 0;
    for (int i = 0; i < WORDS; i++) {
        uint64_t t = (uint64_t)a[i] - b[i] - borrow;
        d[i] = (uint32_t)t;
        borrow = (uint32_t)(t >> 63);
    }
    uint64_t c = 0, tmp[WORDS];
    for (int i = 0; i < WORDS; i++) {
        c = (uint64_t)d[i] + P_LE[i] + (c >> 32);
        tmp[i] = (uint32_t)c;
    }
    for (int i = 0; i < WORDS; i++) d[i] = borrow ? tmp[i] : d[i];
 }
 static inline void field_mul(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
    ets_bigint_enable();
    ets_bigint_modmult(a, b, P_LE, MPRIME, RR_LE, WORDS);
    ets_bigint_wait_finish();
    ets_bigint_getz(dst, WORDS);
    ets_bigint_disable();
 }
 static inline void field_sqr(uint32_t *dst, const uint32_t *a) {
    field_mul(dst, a, a);
 }
 /* Fermat inversion: a^(p-2) in normal domain */
 static void field_inv(uint32_t *out, const uint32_t *a) {
    static const uint32_t EXP_P_MINUS_2[WORDS] = {
        0xFFFFFFEB, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF
    };
    uint32_t res[WORDS], base[WORDS];
    zclear(res); res[0] = 1;
    zcopy(base, a);
    for (int wi = WORDS - 1; wi >= 0; wi--) {
        uint32_t w = EXP_P_MINUS_2[wi];
        for (int b = 31; b >= 0; b--) {
            field_sqr(res, res);
            if ((w >> b) & 1U) field_mul(res, res, base);
        }
    }
    zcopy(out, res);
 }
 /* Conditional swap */
 static inline void cswap(uint32_t *a, uint32_t *b, uint32_t ctl) {
    uint32_t mask = -ctl;
    for (int i = 0; i < WORDS; i++) {
        uint32_t t = (a[i] ^ b[i]) & mask;
        a[i] ^= t; b[i] ^= t;
    }
 }
 /* ---------- X25519 ladder (normal domain) ---------- */
 static const unsigned char GEN[] PROGMEM = {
    0x09, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
 };
 static const unsigned char ORDER[] PROGMEM = {
    0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
 };
 static const unsigned char *
 api_generator(int curve, size_t *len)
 {
    (void)curve;
    *len = 32;
    return GEN;
 }
 static const unsigned char *
 api_order(int curve, size_t *len)
 {
    (void)curve;
    *len = 32;
    return ORDER;
 }
 static size_t
 api_xoff(int curve, size_t *len)
 {
    (void)curve;
    *len = 32;
    return 0;
 }
 static uint32_t
 api_mul(unsigned char *G, size_t Glen,
        const unsigned char *kb, size_t kblen, int curve)
 {
    (void)curve;
    if (Glen != 32 || kblen > 32) {
        return 0;
    }
    /* Clamp scalar per RFC 7748 */
    unsigned char k[32];
    memset(k, 0, 32 - kblen);
    memcpy(k + (32 - kblen), kb, kblen);
    k[31] &= 0xF8;
    k[0] &= 0x7F;
    k[0] |= 0x40;
    /* Load u and clear high bit per RFC 7748 */
    unsigned char u_bytes[32];
    memcpy(u_bytes, G, 32);
    u_bytes[31] &= 0x7F;
    uint32_t x1[WORDS], x2[WORDS], z2[WORDS], x3[WORDS], z3[WORDS];
    br_range_dec32le(x1, WORDS, u_bytes);
    /* Initialize:
     *   (x2:z2) = (1:0)
     *   (x3:z3) = (u:1)
     */
    zclear(z2);
    zclear(x2); x2[0] = 1;
    zcopy(x3, x1);
    zclear(z3); z3[0] = 1;
    uint32_t a[WORDS], aa[WORDS], b[WORDS], bb[WORDS];
    uint32_t c[WORDS], d[WORDS], e[WORDS], da[WORDS], cb[WORDS];
    uint32_t t[WORDS];
    uint32_t swap = 0;
    for (int i = 254; i >= 0; i--) {
        uint32_t kt = (k[31 - (i >> 3)] >> (i & 7)) & 1U;
        swap ^= kt;
        cswap(x2, x3, swap);
        cswap(z2, z3, swap);
        swap = kt;
        /* Ladder step */
        add_mod(a, x2, z2);        /* a = x2 + z2 */
        sub_mod(b, x2, z2);        /* b = x2 - z2 */
        field_sqr(aa, a);          /* aa = a^2 */
        field_sqr(bb, b);          /* bb = b^2 */
        sub_mod(e, aa, bb);        /* e = aa - bb */
        add_mod(c, x3, z3);        /* c = x3 + z3 */
        sub_mod(d, x3, z3);        /* d = x3 - z3 */
        field_mul(da, d, a);       /* da = d * a */
        field_mul(cb, c, b);       /* cb = c * b */
        add_mod(x3, da, cb);       /* x3 = (da + cb)^2 */
        field_sqr(x3, x3);
        sub_mod(z3, da, cb);       /* z3 = (da - cb)^2 * x1 */
        field_sqr(z3, z3);
        field_mul(z3, z3, x1);
        field_mul(x2, aa, bb);     /* x2 = aa * bb */
        /* z2 = e * (aa + A24 * e) */
        field_mul(t, A24_LE, e);   /* t = A24 * e */
        add_mod(t, t, aa);         /* t = aa + A24*e */
        field_mul(z2, e, t);       /* z2 = e * t */
    }
    cswap(x2, x3, swap);
    cswap(z2, z3, swap);
    /* u = x2 / z2 */
    uint32_t z2i[WORDS], unorm[WORDS];
    field_inv(z2i, z2);
    field_mul(unorm, x2, z2i);
    /* Final reduction if needed and serialize */
    if (ge_ct(unorm, P_LE)) {
        sub_mod(unorm, unorm, P_LE);
    }
    br_range_enc32le(G, unorm, WORDS);
    return 1;
 }
 static size_t
 api_mulgen(unsigned char *R,
           const unsigned char *x, size_t xlen, int curve)
 {
    const unsigned char *G0;
    size_t Glen;
    G0 = api_generator(curve, &Glen);
    memcpy_P(R, G0, Glen);
    api_mul(R, Glen, x, xlen, curve);
    return Glen;
 }
 static uint32_t
 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
           const unsigned char *x, size_t xlen,
           const unsigned char *y, size_t ylen, int curve)
 {
    (void)A; (void)B; (void)len; (void)x; (void)xlen; (void)y; (void)ylen; (void)curve;
    /* Not applicable for Curve25519 (no ECDSA). */
    return 0;
 }
 /* see bearssl_ec.h */
 const br_ec_impl br_ec_c25519_m15 PROGMEM = {
    (uint32_t)0x20000000,
    &api_generator,
    &api_order,
    &api_xoff,
    &api_mul,
    &api_mulgen,
    &api_muladd
 };
 #endif /* SOC_MPI_SUPPORTED */
 #endif /* ESP_PLATFORM && !ESP8266 */
 #endif /* USE_SHA_ROM */
--- a/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_p256_m15.c
+++ b/lib/lib_ssl/bearssl-esp8266/src/ec/_ec_p256_m15.c
@ -0,0 +1,468 @@
 /*
 * _ec_p256_m15.c — BearSSL P-256 implementation using ESP32 ROM-backed Montgomery arithmetic
 *
 * This file provides a fast elliptic curve implementation for secp256r1 (P-256),
 * leveraging the ESP32's ROM bigint accelerator for modular multiplication.
 *
 * Key features:
 *   - Field arithmetic in normal domain using Montgomery-backed multiply/square.
 *   - Jacobian point representation with full group law (point add/double).
 *   - Scalar multiplication via double-and-add, supporting arbitrary base points.
 *   - Conversion between affine and Jacobian coordinates.
 *   - Compact encoding/decoding of uncompressed points (04 || X || Y).
 *   - Fully compatible with BearSSL's ec_impl API.
 *
 * All field elements are stored as 8×32-bit little-endian limbs.
 * Internal operations avoid heap allocation and use fixed-size buffers.
 *
 * Requires: ESP32 platform with SOC_MPI_SUPPORTED enabled.
 * 
 * Author: Christian Baars
 */
 #if defined(USE_SHA_ROM)
 #if defined(ESP_PLATFORM) && !defined(ESP8266) && !defined(CONFIG_IDF_TARGET_ESP32)
 #if __has_include("soc/sha_caps.h")
 # include "soc/sha_caps.h"
 #elif __has_include("soc/soc_caps.h")
 # include "soc/soc_caps.h"
 #else
 # error "No ESP capability header found"
 #endif
 #if SOC_MPI_SUPPORTED
 #include <stdint.h>
 #include "rom/bigint.h"
 #include "t_inner.h"
 #define WORDS 8
 /* ESP32 ROM Montgomery parameters (little-endian).*/
 static const uint32_t P_LE[8] = {
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF
 };
 static const uint32_t RR_LE[8] = {
    0x03000000, 0x00000000, 0xFFFFFFFF, 0xFBFFFFFF,
    0xFEFFFFFF, 0xFFFFFFFF, 0xFDFFFFFF, 0x04000000
 };
 /* -p^{-1} mod 2^32 */
 static const uint32_t MPRIME = 0x00000001;
 /* Factor to convert ROM Montgomery output back to normal domain (8 limbs) */
 static const uint32_t CINV2_LE[8] = {
    0xB15F7DC9, 0x21BC7192, 0xF82DEBEB, 0xF2086906,
    0x8AD3BB54, 0xE34453E4, 0xB2B4EF16, 0x5FF55809
 };
 /* Generator point G in little-endian 32-bit limbs (LSW first) */
 static const uint32_t Gx[WORDS] = {
    0xD898C296, 0xF4A13945, 0x2DEB33A0, 0x77037D81,
    0x63A440F2, 0xF8BCE6E5, 0xE12C4247, 0x6B17D1F2
 };
 static const uint32_t Gy[WORDS] = {
    0x37BF51F5, 0xCBB64068, 0x6B315ECE, 0x2BCE3357,
    0x7C0F9E16, 0x8EE7EB4A, 0xFE1A7F9B, 0x4FE342E2
 };
 typedef struct {
    uint32_t X[WORDS];
    uint32_t Y[WORDS];
    uint32_t Z[WORDS];
 } p256_pt;
 /* ---------- small utilities ---------- */
 static inline void zclear(uint32_t *dst) {
    memset(dst, 0, WORDS * sizeof(uint32_t));
 }
 static inline void zcopy(uint32_t *dst, const uint32_t *src) {
    memcpy(dst, src, WORDS * sizeof(uint32_t));
 }
 static inline int is_zero(const uint32_t *a) {
    uint32_t acc = 0;
    for (int i = 0; i < WORDS; i++) acc |= a[i];
    return acc == 0;
 }
 /* big-endian bytes -> internal little-endian limbs (reverse word order) */
 static void be32_to_le32(const uint8_t *src, uint32_t *dst) {
    for (int i = 0; i < WORDS; i++) {
        int j = WORDS - 1 - i;
        dst[i] = ((uint32_t)src[4*j] << 24)
               | ((uint32_t)src[4*j + 1] << 16)
               | ((uint32_t)src[4*j + 2] << 8)
               | ((uint32_t)src[4*j + 3]);
    }
 }
 /* internal little-endian limbs -> big-endian bytes (reverse word order) */
 static void le32_to_be32(const uint32_t *src, uint8_t *dst) {
    for (int i = 0; i < WORDS; i++) {
        int j = WORDS - 1 - i;
        dst[4*j]     = (uint8_t)(src[i] >> 24);
        dst[4*j + 1] = (uint8_t)(src[i] >> 16);
        dst[4*j + 2] = (uint8_t)(src[i] >> 8);
        dst[4*j + 3] = (uint8_t)(src[i]);
    }
 }
 /* ---------- field arithmetic modulo p (normal domain) ---------- */
 static inline const uint32_t *Pmod(void) { return P_LE; }
 static int ge_mod_p(const uint32_t *a) {
    const uint32_t *P = Pmod();
    for (int i = WORDS - 1; i >= 0; i--) {
        if (a[i] > P[i]) return 1;
        if (a[i] < P[i]) return 0;
    }
    return 1; /* equal */
 }
 static void field_add_mod(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
    const uint32_t *P = Pmod();
    uint64_t carry = 0;
    for (int i = 0; i < WORDS; i++) {
        uint64_t sum = (uint64_t)a[i] + b[i] + carry;
        dst[i] = (uint32_t)sum;
        carry = sum >> 32;
    }
    if (carry || ge_mod_p(dst)) {
        uint64_t borrow = 0;
        for (int i = 0; i < WORDS; i++) {
            uint64_t diff = (uint64_t)dst[i] - P[i] - borrow;
            dst[i] = (uint32_t)diff;
            borrow = (diff >> 63) & 1;
        }
    }
 }
 static void field_sub_mod(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
    const uint32_t *P = Pmod();
    uint64_t borrow = 0;
    for (int i = 0; i < WORDS; i++) {
        uint64_t diff = (uint64_t)a[i] - b[i] - borrow;
        dst[i] = (uint32_t)diff;
        borrow = (diff >> 63) & 1;
    }
    if (borrow) {
        uint64_t carry = 0;
        for (int i = 0; i < WORDS; i++) {
            uint64_t sum = (uint64_t)dst[i] + P[i] + carry;
            dst[i] = (uint32_t)sum;
            carry = sum >> 32;
        }
    }
 }
 /* ROM-backed modular multiply returning normal-domain result (8 limbs) */
 static void rom_field_mul(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
    uint32_t tmp[WORDS];
    ets_bigint_enable();
    /* Montgomery multiply in ROM (returns Montgomery residue) */
    ets_bigint_modmult(a, b, P_LE, MPRIME, RR_LE, WORDS);
    ets_bigint_wait_finish();
    ets_bigint_getz(tmp, WORDS);
    /* Convert out of Montgomery domain using the proven CINV2_LE */
    ets_bigint_modmult(tmp, CINV2_LE, P_LE, MPRIME, RR_LE, WORDS);
    ets_bigint_wait_finish();
    ets_bigint_getz(dst, WORDS);
    ets_bigint_disable();
 }
 static inline void field_mul(uint32_t *dst, const uint32_t *a, const uint32_t *b) {
    rom_field_mul(dst, a, b);
 }
 static inline void field_sqr(uint32_t *dst, const uint32_t *a) {
    rom_field_mul(dst, a, a);
 }
 /* Square-and-multiply exponentiation for p-2 (normal domain throughout) */
 static void field_inv(uint32_t *out, const uint32_t *a) {
    static const uint32_t EXP_P_MINUS_2[WORDS] = {
        0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
        0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF
    };
    uint32_t res[WORDS], base[WORDS];
    zclear(res); res[0] = 1;     /* res = 1 */
    zcopy(base, a);              /* base = a */
    for (int wi = WORDS - 1; wi >= 0; wi--) {
        uint32_t w = EXP_P_MINUS_2[wi];
        for (int b = 31; b >= 0; b--) {
            field_sqr(res, res);
            if ((w >> b) & 1U) {
                field_mul(res, res, base);
            }
        }
    }
    zcopy(out, res);
 }
 /* ---------- point utilities ---------- */
 static inline void load_generator(p256_pt *Pp) {
    zcopy(Pp->X, Gx);
    zcopy(Pp->Y, Gy);
    zclear(Pp->Z);
    Pp->Z[0] = 1;
 }
 static void to_affine(p256_pt *Pp) {
    uint32_t zi[WORDS], zi2[WORDS], xi[WORDS], yi[WORDS];
    uint32_t z0 = 0;
    for (int i = 0; i < WORDS; i++) z0 |= Pp->Z[i];
    if (z0 == 0) {
        zclear(Pp->X); zclear(Pp->Y); zclear(Pp->Z);
        return;
    }
    field_inv(zi, Pp->Z);
    field_sqr(zi2, zi);
    field_mul(xi, Pp->X, zi2);
    field_mul(yi, Pp->Y, zi2);
    field_mul(yi, yi, zi);
    zcopy(Pp->X, xi);
    zcopy(Pp->Y, yi);
    zclear(Pp->Z); Pp->Z[0] = 1;
 }
 /* ---------- group law (Jacobian, a = -3) ---------- */
 __attribute__((noinline))
 static void p256_point_double(p256_pt *Q) {
    if (is_zero(Q->Y)) { zclear(Q->X); zclear(Q->Y); zclear(Q->Z); return; }
    uint32_t Z2[WORDS], M[WORDS], Mtmp[WORDS], S[WORDS], T1[WORDS], T2[WORDS], X3[WORDS], Y3[WORDS];
    /* Z2 = Z^2 */
    field_sqr(Z2, Q->Z);
    /* T1 = X - Z^2 ; T2 = X + Z^2 */
    field_sub_mod(T1, Q->X, Z2);
    field_add_mod(T2, Q->X, Z2);
    /* M = (X - Z^2) * (X + Z^2) */
    field_mul(M, T1, T2);
    /* M = 3 * M */
    zcopy(Mtmp, M);
    field_add_mod(M, M, M);    /* 2*M */
    field_add_mod(M, M, Mtmp); /* 3*M */
    /* S = 4 * X * Y^2 */
    field_sqr(S, Q->Y);        /* Y^2 */
    field_mul(S, S, Q->X);     /* X*Y^2 */
    field_add_mod(S, S, S);    /* 2*X*Y^2 */
    field_add_mod(S, S, S);    /* 4*X*Y^2 */
    /* X3 = M^2 - 2*S */
    field_sqr(X3, M);
    field_sub_mod(X3, X3, S);
    field_sub_mod(X3, X3, S);
    /* Y3 = M*(S - X3) - 8*Y^4 */
    field_sub_mod(Y3, S, X3);
    field_mul(Y3, Y3, M);
    field_sqr(T1, Q->Y);       /* Y^2 */
    field_sqr(T1, T1);         /* Y^4 */
    field_add_mod(T1, T1, T1); /* 2*Y^4 */
    field_add_mod(T1, T1, T1); /* 4*Y^4 */
    field_add_mod(T1, T1, T1); /* 8*Y^4 */
    field_sub_mod(Y3, Y3, T1);
    /* Z3 = 2*Y*Z */
    field_mul(Q->Z, Q->Y, Q->Z);
    field_add_mod(Q->Z, Q->Z, Q->Z);
    zcopy(Q->X, X3);
    zcopy(Q->Y, Y3);
 }
 __attribute__((noinline))
 static void p256_point_add(p256_pt *R, const p256_pt *Pp, const p256_pt *Qp) {
    if (is_zero(Pp->Z)) { zcopy(R->X, Qp->X); zcopy(R->Y, Qp->Y); zcopy(R->Z, Qp->Z); return; }
    if (is_zero(Qp->Z)) { zcopy(R->X, Pp->X); zcopy(R->Y, Pp->Y); zcopy(R->Z, Pp->Z); return; }
    uint32_t Z1Z1[WORDS], Z2Z2[WORDS], U1[WORDS], U2[WORDS];
    uint32_t S1[WORDS], S2[WORDS], H[WORDS], RR[WORDS];
    uint32_t H2[WORDS], H3[WORDS], U1H2[WORDS], X3[WORDS], Y3[WORDS], Z3[WORDS], t[WORDS];
    field_sqr(Z1Z1, Pp->Z);
    field_sqr(Z2Z2, Qp->Z);
    field_mul(U1, Pp->X, Z2Z2);
    field_mul(U2, Qp->X, Z1Z1);
    field_mul(t, Qp->Z, Z2Z2);  /* Z2^3 */
    field_mul(S1, Pp->Y, t);
    field_mul(t, Pp->Z, Z1Z1);  /* Z1^3 */
    field_mul(S2, Qp->Y, t);
    field_sub_mod(H, U2, U1);
    field_sub_mod(RR, S2, S1);
    if (is_zero(H)) {
        if (is_zero(RR)) {
            p256_pt D = *Pp;
            p256_point_double(&D);
            zcopy(R->X, D.X);
            zcopy(R->Y, D.Y);
            zcopy(R->Z, D.Z);
        } else {
            zclear(R->X); zclear(R->Y); zclear(R->Z); /* infinity */
        }
        return;
    }
    field_sqr(H2, H);
    field_mul(H3, H, H2);
    field_mul(U1H2, U1, H2);
    field_sqr(X3, RR);
    field_sub_mod(X3, X3, H3);
    field_sub_mod(X3, X3, U1H2);
    field_sub_mod(X3, X3, U1H2); /* -2*U1H2 */
    field_sub_mod(Y3, U1H2, X3);
    field_mul(Y3, Y3, RR);
    field_mul(t, S1, H3);
    field_sub_mod(Y3, Y3, t);
    field_mul(t, Pp->Z, Qp->Z);
    field_mul(Z3, t, H);
    zcopy(R->X, X3);
    zcopy(R->Y, Y3);
    zcopy(R->Z, Z3);
 }
 /* ---------- shared scalar multiply helpers (reduce duplication) ---------- */
 static void scalar_mul_point(p256_pt *R, const p256_pt *Base, const uint8_t *k, size_t klen) {
    zclear(R->X); zclear(R->Y); zclear(R->Z); /* R = O */
    for (size_t bi = 0; bi < klen * 8; bi++) {
        p256_point_double(R);
        if ((k[bi >> 3] >> (7 - (bi & 7))) & 1) {
            p256_point_add(R, R, Base);
        }
    }
 }
 /* Load uncompressed point (04 || X || Y) into Jacobian with Z=1 */
 static int load_point_uncompressed(p256_pt *Pp, const unsigned char *buf, size_t len) {
    if (len != 65 || buf[0] != 0x04) return 0;
    be32_to_le32(buf + 1,  Pp->X);
    be32_to_le32(buf + 33, Pp->Y);
    zclear(Pp->Z); Pp->Z[0] = 1;
    return 1;
 }
 static void store_point_uncompressed(unsigned char *buf, const p256_pt *Pp) {
    buf[0] = 0x04;
    le32_to_be32(Pp->X, buf + 1);
    le32_to_be32(Pp->Y, buf + 33);
 }
 /* ---------- BearSSL ec_impl API ---------- */
 static const unsigned char *api_generator(int curve, size_t *len) {
    (void)curve;
    *len = br_secp256r1.generator_len;
    return br_secp256r1.generator;
 }
 static const unsigned char *api_order(int curve, size_t *len) {
    (void)curve;
    *len = br_secp256r1.order_len;
    return br_secp256r1.order;
 }
 static size_t api_xoff(int curve, size_t *len) {
    (void)curve;
    *len = 32;
    return 1;
 }
 static uint32_t api_mul(unsigned char *G, size_t Glen,
                        const unsigned char *x, size_t xlen,
                        int curve) {
    (void)curve;
    p256_pt Pp, R;
    if (!load_point_uncompressed(&Pp, G, Glen)) return 0;
    scalar_mul_point(&R, &Pp, x, xlen);
    to_affine(&R);
    store_point_uncompressed(G, &R);
    return 1;
 }
 static size_t api_mulgen(unsigned char *Rbuf,
                         const unsigned char *x, size_t xlen,
                         int curve) {
    (void)curve;
    p256_pt Gp, R;
    load_generator(&Gp);
    scalar_mul_point(&R, &Gp, x, xlen);
    to_affine(&R);
    store_point_uncompressed(Rbuf, &R);
    return 65;
 }
 static uint32_t api_muladd(unsigned char *A, const unsigned char *B,
                           size_t Glen,
                           const unsigned char *x, size_t xlen,
                           const unsigned char *y, size_t ylen,
                           int curve) {
    (void)curve;
    p256_pt Pp, Qp, R, T;
    if (!load_point_uncompressed(&Pp, A, Glen)) return 0;
    scalar_mul_point(&R, &Pp, x, xlen);
    if (B) {
        if (!load_point_uncompressed(&Qp, B, Glen)) return 0;
    } else {
        load_generator(&Qp);
    }
    scalar_mul_point(&T, &Qp, y, ylen);
    p256_point_add(&R, &R, &T);
    to_affine(&R);
    store_point_uncompressed(A, &R);
    return 1;
 }
 const br_ec_impl br_ec_p256_m15 PROGMEM = {
    (uint32_t)0x00800000,
    &api_generator,
    &api_order,
    &api_xoff,
    &api_mul,
    &api_mulgen,
    &api_muladd
 };
 #endif // SOC_MPI_SUPPORTED
 #endif // defined(ESP_PLATFORM) && !defined(ESP8266)
 #endif // USE_SHA_ROM