/* KECCAK permutation and sponge construction * J. Welsh, May 2019 * * Based on Guido Bertoni, Joan Daemen, Michael Peeters and Gilles van Assche * (2011): The KECCAK reference, Version 3.0. */ #include #include "io.h" /* State is arranged in a 5 x 5 x W bit array, indexed as (x,y,z). * The 1D section for a given (x,y) is called a lane and packed in a lane_t. * The 1D section for a given (x,z) is called a column. * The 2D section for a given z is called a slice. */ typedef uint64_t lane_t; #define L 6 #define W (1 << L) /* lane size = 64 bits */ #define B 25*W /* permutation width = 1600 bits */ /* we require L be at least 3 (W=8, B=200) to avoid sub-byte lane size */ #define ROUNDS (12 + 2*L) #define mask(x) (((unsigned) (x)) & (W-1)) #define rot(v,r) (v = v<>mask(W-mask(r))) static lane_t state[5][5]; /* physical storage order can be swapped as a performance tweak */ #define a(x,y) (state[x][y]) /* Galois linear feedback shift register, output in low bit: * (x^t mod x^8 + x^6 + x^5 + x^4 + 1) mod x in GF(2)(x) * (I'd forgotten my grade-school algebra but the Handbook of Applied * Cryptography plus http://archive.is/hhvKKJ (nayuki.io) and some written * exercises cleared this up for me.) */ static uint8_t lfsr_step(uint8_t rstate) { /* polynomial xor'd in when high (overflowing) bit is set */ uint8_t gate = ((int8_t) rstate) >> 7; return (rstate << 1) ^ (gate & 0x71); /* bits 6,5,4,0 */ } /* Column parity diffusion */ static void theta(void) { /* LUT for x+1 mod 5 (not used on secret bits!) */ uint8_t const inc_mod5[8] = {1,2,3,4,0,1,2,3}; unsigned x, y; lane_t c[5], d; for (x=0; x<5; ++x) c[x] = a(x,0) ^ a(x,1) ^ a(x,2) ^ a(x,3) ^ a(x,4); for (x=0; x<5; ++x) { d = c[inc_mod5[x]]; rot(d,1); d ^= c[inc_mod5[x+3]]; /* x+3+1 == x-1 mod 5 */ for (y=0; y<5; ++y) a(x,y) ^= d; } } /* Inter-slice dispersion */ static void rho(void) { /* precomputed offsets from Table 2.1 */ rot(a(0,1), 36); rot(a(0,2), 3); rot(a(0,3), 105); rot(a(0,4), 210); rot(a(1,0), 1); rot(a(1,1), 300); rot(a(1,2), 10); rot(a(1,3), 45); rot(a(1,4), 66); rot(a(2,0), 190); rot(a(2,1), 6); rot(a(2,2), 171); rot(a(2,3), 15); rot(a(2,4), 253); rot(a(3,0), 28); rot(a(3,1), 55); rot(a(3,2), 153); rot(a(3,3), 21); rot(a(3,4), 120); rot(a(4,0), 91); rot(a(4,1), 276); rot(a(4,2), 231); rot(a(4,3), 136); rot(a(4,4), 78); } /* Slicewise transposition by matrix * [ 0 1 ] * [ 2 3 ] * For all indices x, y, z: * a'(y, 2x+3y mod 5, z) = a(x, y, z) */ static void pi(void) { /* shuffle in-place by swapping alternating temporaries with successive * destinations */ lane_t c, d; #define even(x,y) (c=a(x,y), a(x,y)=d) #define odd(x,y) (d=a(x,y), a(x,y)=c) /* a[0][0] doesn't move */ d=a(0,1); /* 2*0 + 3*1 = 3 -> 3 */ even(1,3); /* 2+9=11 -> 1 */ odd (3,1); /* 6+3=9 -> 4 */ even(1,4); /* 2+12=14 */ odd (4,4); /* 8+12=20 */ even(4,0); /* 8+0=8 */ odd (0,3); /* 0+9=9 */ even(3,4); /* 6+12=18 */ odd (4,3); /* 8+9=17 */ even(3,2); /* 6+6=12 */ odd (2,2); /* 4+6=10 */ even(2,0); /* 4+0=4 */ odd (0,4); /* 0+12=12 */ even(4,2); /* 8+6=14 */ odd (2,4); /* 4+12=16 */ even(4,1); /* 8+3=11 */ odd (1,1); /* 2+3=5 */ even(1,0); /* 2+0=2 */ odd (0,2); /* 0+6=6 */ even(2,1); /* 4+3=7 */ odd (1,2); /* 2+6=8 */ even(2,3); /* 4+9=13 */ odd (3,3); /* 6+9=15 */ even(3,0); /* 6+0=6 */ a(0,1)=c; /* And we're back home in 24 steps, lovely! */ } /* Nonlinear row mapping */ static void chi(void) { unsigned y; for (y=0; y<5; ++y) { lane_t c0=a(0,y), c1=a(1,y), c2=a(2,y), c3=a(3,y), c4=a(4,y); a(0,y) = c0 ^ (~c1 & c2); a(1,y) = c1 ^ (~c2 & c3); a(2,y) = c2 ^ (~c3 & c4); a(3,y) = c3 ^ (~c4 & c0); a(4,y) = c4 ^ (~c0 & c1); } } /* Round constants for disrupting z symmetry * * Mix LFSR bitstream sequentially into first lane with z index given by * 2^j - 1 * LFSR advances 7 steps per round regardless of how many "j" are needed to * fill the lane, corresponding to the maximum 64-bit lane size. */ static uint8_t iota(uint8_t rstate) { unsigned j; for (j=0; j<=L; ++j) { a(0,0) ^= ((lane_t) (rstate & 1)) << ((1 << j) - 1); rstate = lfsr_step(rstate); } for (; j<7; ++j) rstate = lfsr_step(rstate); return rstate; } #ifdef TEST #include #include "testvectors.h" static int test_fail; static unsigned test, test_step; static void checkpoint(unsigned round, char const *step) { int fail = 0; unsigned x, y, i=0; printf("test=%u round=%u %s ", test, round, step); for (y=0; y<5; ++y) for (x=0; x<5; ++x, ++i) { assert(i<25); assert(test_step> z; /* Digits within the byte rendered big-endian, * matching common practice but unfortunately * not bitstream order. */ buf[i++] = hex[byte>>4]; buf[i++] = hex[byte&15]; } } static void extract(unsigned len) { static char buf[2*B/8]; assert(2*len <= sizeof buf); store_hex(buf,len); write_all(1,buf,2*len); } /* Stream octets from stdin until EOF, then write hex to stdout. */ void sponge(unsigned capacity, size_t out_bits) { static uint8_t buf[B/8]; unsigned rate = B - capacity, len = rate/8, n; size_t out_len = out_bits/8; assert(W == 8*sizeof(lane_t)); assert(0 < rate && rate < B); assert(!(rate%8)); assert(len <= sizeof buf); /* absorb */ reset(); while ((n = read_all(0,buf,len)) == len) { load(buf,len); keccakf(); } /* pad (1000...1) */ if (n < len-1) { buf[n] = 0x01; /* first bit (little-endian) */ for (++n; n < len-1; ++n) buf[n] = 0; buf[n] = 0x80; /* last bit (little-endian) */ } else { assert(n == len-1); buf[n] = 0x81; /* first and last bits */ } load(buf,len); keccakf(); /* squeeze */ for (; out_len > len; out_len -= len) { extract(len); keccakf(); } extract(out_len); } #ifdef TEST int main() { for (test=0; test