/* spu_md5.c * * Please note that this release is obsolete. The new code will be made * available on www.security-assessment.com in a couple of weeks. * * v0.1 - Benchmark * (c)2007 Nick Breese * * Performance modifications and suppression of compiler dead-code stripping * added by Jonathan Taylor, Durham University, UK. 16.9.08 * http://www.dur.ac.uk/contactperson/?id=dph3jmt * Note this must be compiled with the option --param max-inline-insns-single=2000 * to force inlining of the two calls to md5_process * modified version 0.4 * * License: GPLv2 * * It's ugly as hell, but it works. * * Processes 4 values: 1234aaa, 2345aaa, * 3456aaa, 4567aaa * * Compile with -DSPEEDTEST to kick off 80,000,000 iterations * jmt: altered to 3,200,000 iterations * */ #include #include #include #include vec_uint4 vec_null = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; #define SPEEDTEST vec_uint4 rotate_left (const vec_uint4 * vec_x, unsigned int n) { // jmt: selection of the appropriate spu instruction here makes a 20% difference to execution speed! return spu_rl(*vec_x, n); } vec_uint4 f_round_1(const vec_uint4 * vec_x, const vec_uint4 * vec_y, const vec_uint4 * vec_z) { vec_uint4 vec_f; vec_uint4 vec_f1; vec_uint4 vec_f2; vec_f1 = spu_and(*vec_x,*vec_y); // vec_uint4 vec_comp_x = spu_nor (*vec_x,vec_null); // vec_f2 = spu_and ( vec_comp_x, *vec_z ); // jmt: better instruction selection here (and in equivalent places) gives 30% speed improvement! vec_f2 = spu_andc(*vec_z, *vec_x); vec_f = spu_or ( vec_f1, vec_f2 ); return(vec_f); } static void set_round_1 (vec_uint4 * vec_a, const vec_uint4 * vec_b, const vec_uint4 * vec_c, const vec_uint4 * vec_d, const vec_uint4 * vec_k, unsigned int s, const vec_uint4 * Ti) { vec_uint4 vec_t; vec_t = f_round_1(vec_b, vec_c, vec_d); // vec_t = spu_add(vec_t,*vec_a); // vec_t = spu_add(vec_t,*vec_k); // vec_t = spu_add(vec_t,*Ti); // jmt: the following slightly reduces dependencies. // not important if we're doing two calculations in parallel vec_uint4 vec_t2 = spu_add(spu_add(*vec_a, *vec_k), *Ti); vec_t = spu_add(vec_t, vec_t2); vec_t = rotate_left(&vec_t,s); vec_t = spu_add(vec_t,*vec_b); *vec_a = vec_t; } vec_uint4 f_round_2 (const vec_uint4 * vec_x, const vec_uint4 * vec_y, const vec_uint4 * vec_z) { vec_uint4 vec_f; vec_uint4 vec_f1; vec_uint4 vec_f2; vec_f1 = spu_and ( *vec_x, *vec_z ); // vec_uint4 vec_comp_z = spu_eqv (*vec_z,vec_null); // vec_f2 = spu_and ( *vec_y, vec_comp_z ); // jmt: better instruction selection here (and in equivalent places) gives 30% speed improvement! vec_f2 = spu_andc(*vec_y, *vec_z); vec_f = spu_or ( vec_f1, vec_f2 ); return(vec_f); } static void set_round_2 (vec_uint4 *vec_a, const vec_uint4 *vec_b, const vec_uint4 *vec_c, const vec_uint4 *vec_d, const vec_uint4 *vec_k, unsigned int s, const vec_uint4 * Ti) { vec_uint4 vec_t; vec_t = f_round_2(vec_b, vec_c, vec_d); // vec_t = spu_add(vec_t,*vec_a); // vec_t = spu_add(vec_t,*vec_k); // vec_t = spu_add(vec_t,*Ti); vec_uint4 vec_t2 = spu_add(spu_add(*vec_a, *vec_k), *Ti); vec_t = spu_add(vec_t, vec_t2); vec_t = rotate_left(&vec_t,s); vec_t = spu_add(vec_t,*vec_b); *vec_a = vec_t; } vec_uint4 f_round_3 (const vec_uint4 *vec_x, const vec_uint4 *vec_y, const vec_uint4 *vec_z) { vec_uint4 vec_f; vec_uint4 vec_f1; vec_f1 = spu_xor ( *vec_x, *vec_y ); vec_f = spu_xor ( vec_f1, *vec_z ); return(vec_f); } static void set_round_3 (vec_uint4 *vec_a, const vec_uint4 *vec_b, const vec_uint4 *vec_c, const vec_uint4 *vec_d, const vec_uint4 *vec_k, unsigned int s, const vec_uint4 * Ti) { vec_uint4 vec_t; vec_t = f_round_3(vec_b, vec_c, vec_d); // vec_t = spu_add(vec_t,*vec_a); // vec_t = spu_add(vec_t,*vec_k); // vec_t = spu_add(vec_t,*Ti); vec_uint4 vec_t2 = spu_add(spu_add(*vec_a, *vec_k), *Ti); vec_t = spu_add(vec_t, vec_t2); vec_t = rotate_left(&vec_t,s); vec_t = spu_add(vec_t,*vec_b); *vec_a = vec_t; } vec_uint4 f_round_4 (const vec_uint4 *vec_x, const vec_uint4 *vec_y, const vec_uint4 *vec_z) { vec_uint4 vec_f; vec_uint4 vec_f1; // vec_uint4 vec_comp_z = spu_eqv (*vec_z,vec_null); // vec_f1 = spu_or ( *vec_x, vec_comp_z ); // jmt: better instruction selection here (and in equivalent places) gives 30% speed improvement! vec_f1 = spu_orc(*vec_x, *vec_z); vec_f = spu_xor ( *vec_y ,vec_f1 ); return(vec_f); } static void set_round_4 (vec_uint4 *vec_a, const vec_uint4 *vec_b, const vec_uint4 *vec_c, const vec_uint4 *vec_d, const vec_uint4 *vec_k, unsigned int s, const vec_uint4 * Ti) { vec_uint4 vec_t; vec_t = f_round_4(vec_b, vec_c, vec_d); // vec_t = spu_add(vec_t,*vec_a); // vec_t = spu_add(vec_t,*vec_k); // vec_t = spu_add(vec_t,*Ti); vec_uint4 vec_t2 = spu_add(spu_add(*vec_a, *vec_k), *Ti); vec_t = spu_add(vec_t, vec_t2); vec_t = rotate_left(&vec_t,s); vec_t = spu_add(vec_t,*vec_b); *vec_a = vec_t; } inline vec_uint4 md5_process(const vec_uint4 *datachunk) { vec_uint4 a = (vec_uint4){0x67452301, 0x67452301, 0x67452301, 0x67452301}; vec_uint4 b = (vec_uint4){0xefcdab89, 0xefcdab89, 0xefcdab89, 0xefcdab89}; vec_uint4 c = (vec_uint4){0x98badcfe, 0x98badcfe, 0x98badcfe, 0x98badcfe}; vec_uint4 d = (vec_uint4){0x10325476, 0x10325476, 0x10325476, 0x10325476}; vec_uint4 orig_a = a; vec_uint4 orig_b = b; vec_uint4 orig_c = c; vec_uint4 orig_d = d; vec_uint4 T1 = {0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478}; vec_uint4 T2 = {0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756}; vec_uint4 T3 = {0x242070db, 0x242070db, 0x242070db, 0x242070db}; vec_uint4 T4 = {0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee}; vec_uint4 T5 = {0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf}; vec_uint4 T6 = {0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a}; vec_uint4 T7 = {0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613}; vec_uint4 T8 = {0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501}; vec_uint4 T9 = {0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8}; vec_uint4 T10 = {0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af}; vec_uint4 T11 = {0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1}; vec_uint4 T12 = {0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be}; vec_uint4 T13 = {0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122}; vec_uint4 T14 = {0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193}; vec_uint4 T15 = {0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e}; vec_uint4 T16 = {0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821}; vec_uint4 T17 = {0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562}; vec_uint4 T18 = {0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340}; vec_uint4 T19 = {0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51}; vec_uint4 T20 = {0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa}; vec_uint4 T21 = {0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d}; vec_uint4 T22 = {0x02441453, 0x02441453, 0x02441453, 0x02441453}; vec_uint4 T23 = {0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681}; vec_uint4 T24 = {0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8}; vec_uint4 T25 = {0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6}; vec_uint4 T26 = {0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6}; vec_uint4 T27 = {0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87}; vec_uint4 T28 = {0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed}; vec_uint4 T29 = {0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905}; vec_uint4 T30 = {0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8}; vec_uint4 T31 = {0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9}; vec_uint4 T32 = {0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a}; vec_uint4 T33 = {0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942}; vec_uint4 T34 = {0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681}; vec_uint4 T35 = {0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122}; vec_uint4 T36 = {0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c}; vec_uint4 T37 = {0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44}; vec_uint4 T38 = {0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9}; vec_uint4 T39 = {0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60}; vec_uint4 T40 = {0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70}; vec_uint4 T41 = {0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6}; vec_uint4 T42 = {0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa}; vec_uint4 T43 = {0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085}; vec_uint4 T44 = {0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05}; vec_uint4 T45 = {0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039}; vec_uint4 T46 = {0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5}; vec_uint4 T47 = {0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8}; vec_uint4 T48 = {0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665}; vec_uint4 T49 = {0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244}; vec_uint4 T50 = {0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97}; vec_uint4 T51 = {0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7}; vec_uint4 T52 = {0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039}; vec_uint4 T53 = {0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3}; vec_uint4 T54 = {0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92}; vec_uint4 T55 = {0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d}; vec_uint4 T56 = {0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1}; vec_uint4 T57 = {0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f}; vec_uint4 T58 = {0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0}; vec_uint4 T59 = {0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314}; vec_uint4 T60 = {0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1}; vec_uint4 T61 = {0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82}; vec_uint4 T62 = {0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235}; vec_uint4 T63 = {0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb}; vec_uint4 T64 = {0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391}; set_round_1(&a, &b, &c, &d, &datachunk[0], 7, &T1); set_round_1(&d, &a, &b, &c, &datachunk[1], 12, &T2); set_round_1(&c, &d, &a, &b, &datachunk[2], 17, &T3); set_round_1(&b, &c, &d, &a, &datachunk[3], 22, &T4); set_round_1(&a, &b, &c, &d, &datachunk[4], 7, &T5); set_round_1(&d, &a, &b, &c, &datachunk[5], 12, &T6); set_round_1(&c, &d, &a, &b, &datachunk[6], 17, &T7); set_round_1(&b, &c, &d, &a, &datachunk[7], 22, &T8); set_round_1(&a, &b, &c, &d, &datachunk[8], 7, &T9); set_round_1(&d, &a, &b, &c, &datachunk[9], 12, &T10); set_round_1(&c, &d, &a, &b, &datachunk[10], 17, &T11); set_round_1(&b, &c, &d, &a, &datachunk[11], 22, &T12); set_round_1(&a, &b, &c, &d, &datachunk[12], 7, &T13); set_round_1(&d, &a, &b, &c, &datachunk[13], 12, &T14); set_round_1(&c, &d, &a, &b, &datachunk[14], 17, &T15); set_round_1(&b, &c, &d, &a, &datachunk[15], 22, &T16); set_round_2(&a, &b, &c, &d, &datachunk[1], 5, &T17); set_round_2(&d, &a, &b, &c, &datachunk[6], 9, &T18); set_round_2(&c, &d, &a, &b, &datachunk[11], 14, &T19); set_round_2(&b, &c, &d, &a, &datachunk[0], 20, &T20); set_round_2(&a, &b, &c, &d, &datachunk[5], 5, &T21); set_round_2(&d, &a, &b, &c, &datachunk[10], 9, &T22); set_round_2(&c, &d, &a, &b, &datachunk[15], 14, &T23); set_round_2(&b, &c, &d, &a, &datachunk[4], 20, &T24); set_round_2(&a, &b, &c, &d, &datachunk[9], 5, &T25); set_round_2(&d, &a, &b, &c, &datachunk[14], 9, &T26); set_round_2(&c, &d, &a, &b, &datachunk[3], 14, &T27); set_round_2(&b, &c, &d, &a, &datachunk[8], 20, &T28); set_round_2(&a, &b, &c, &d, &datachunk[13], 5, &T29); set_round_2(&d, &a, &b, &c, &datachunk[2], 9, &T30); set_round_2(&c, &d, &a, &b, &datachunk[7], 14, &T31); set_round_2(&b, &c, &d, &a, &datachunk[12], 20, &T32); set_round_3(&a, &b, &c, &d, &datachunk[5], 4, &T33); set_round_3(&d, &a, &b, &c, &datachunk[8], 11, &T34); set_round_3(&c, &d, &a, &b, &datachunk[11], 16, &T35); set_round_3(&b, &c, &d, &a, &datachunk[14], 23, &T36); set_round_3(&a, &b, &c, &d, &datachunk[1], 4, &T37); set_round_3(&d, &a, &b, &c, &datachunk[4], 11, &T38); set_round_3(&c, &d, &a, &b, &datachunk[7], 16, &T39); set_round_3(&b, &c, &d, &a, &datachunk[10], 23, &T40); set_round_3(&a, &b, &c, &d, &datachunk[13], 4, &T41); set_round_3(&d, &a, &b, &c, &datachunk[0], 11, &T42); set_round_3(&c, &d, &a, &b, &datachunk[3], 16, &T43); set_round_3(&b, &c, &d, &a, &datachunk[6], 23, &T44); set_round_3(&a, &b, &c, &d, &datachunk[9], 4, &T45); set_round_3(&d, &a, &b, &c, &datachunk[12], 11, &T46); set_round_3(&c, &d, &a, &b, &datachunk[15], 16, &T47); set_round_3(&b, &c, &d, &a, &datachunk[2], 23, &T48); set_round_4(&a, &b, &c, &d, &datachunk[0], 6, &T49); set_round_4(&d, &a, &b, &c, &datachunk[7], 10, &T50); set_round_4(&c, &d, &a, &b, &datachunk[14], 15, &T51); set_round_4(&b, &c, &d, &a, &datachunk[5], 21, &T52); set_round_4(&a, &b, &c, &d, &datachunk[12], 6, &T53); set_round_4(&d, &a, &b, &c, &datachunk[3], 10, &T54); set_round_4(&c, &d, &a, &b, &datachunk[10], 15, &T55); set_round_4(&b, &c, &d, &a, &datachunk[1], 21, &T56); set_round_4(&a, &b, &c, &d, &datachunk[8], 6, &T57); set_round_4(&d, &a, &b, &c, &datachunk[15], 10, &T58); set_round_4(&c, &d, &a, &b, &datachunk[6], 15, &T59); set_round_4(&b, &c, &d, &a, &datachunk[13], 21, &T60); set_round_4(&a, &b, &c, &d, &datachunk[4], 6, &T61); set_round_4(&d, &a, &b, &c, &datachunk[11], 10, &T62); set_round_4(&c, &d, &a, &b, &datachunk[2], 15, &T63); set_round_4(&b, &c, &d, &a, &datachunk[9], 21, &T64); a=spu_add(a,orig_a); b=spu_add(b,orig_b); c=spu_add(c,orig_c); d=spu_add(d,orig_d); //NOTE - the digest is in big endian =D - The results are back-to-front. I haven't written a converter yet. #ifndef SPEEDTEST printf("big-endian digest: 1234aaa : 1.%x 2.%x 3.%x 4.%x\n",spu_extract(a,0),spu_extract(b,0),spu_extract(c,0),spu_extract(d,0)); printf("big-endian digest: 2345aaa : 1.%x 2.%x 3.%x 4.%x\n",spu_extract(a,1),spu_extract(b,1),spu_extract(c,1),spu_extract(d,1)); printf("big-endian digest: 3456aaa : 1.%x 2.%x 3.%x 4.%x\n",spu_extract(a,2),spu_extract(b,2),spu_extract(c,2),spu_extract(d,2)); printf("big-endian digest: 4567aaa : 1.%x 2.%x 3.%x 4.%x\n",spu_extract(a,3),spu_extract(b,3),spu_extract(c,3),spu_extract(d,3)); #endif // jmt: to prevent compiler optimizing out dead code, return a simple function of a, b, c, d // this adds negligible execution time compared to the meat of this function return spu_or(spu_or(a, b), spu_or(c, d)); } // jmt: it is crucial that this function not be inlined, or the compiler will make // optimizations which will spoil the benchmark by removing apparently redundant code. void md5_process_benchmark(const vec_uint4 *datachunk1, const vec_uint4 *datachunk2, int wrap) __attribute__((noinline)); void md5_process_benchmark(const vec_uint4 *datachunk1, const vec_uint4 *datachunk2, int wrapMask) { unsigned int i; // jmt: I find on a PS3 that 3.2M iterations takes 1 second ($ time ./md5) for (i = 0; i < 3200000; i++) { // jmt: a result must be returned, and stored in a volatile variable, // to prevent the entire md5_process function being optimized out as // having no side-effects // we access an offset in datachunk1 (which we secretly know will // always be 0) to force the compiler to reload it every iteration volatile vec_uint4 result1 = md5_process(datachunk1 + ((i * 16) & wrapMask)); // jmt: there are plenty of stalls in the md5_process function // in which more work could be done. We do a second md5 hash // (on different data) in order to fill those pipeline stalls. // This barely takes any longer than just doing a single hash. volatile vec_uint4 result2 = md5_process(datachunk2 + ((i * 16) & wrapMask)); } } int main() { vec_uint4 datachunk[16]; datachunk[0]=(vec_uint4){0x34333231,0x35343332,0x36353433,0x37363534}; datachunk[1]=(vec_uint4){0x80616161,0x80616161,0x80616161,0x80616161}; datachunk[2]=(vec_uint4){0x80616162,0x80616162,0x80616162,0x80616162}; datachunk[3]=(vec_uint4){0x80616163,0x80616163,0x80616163,0x80616163}; datachunk[4]=(vec_uint4){0x80616164,0x80616164,0x80616164,0x80616164}; datachunk[5]=(vec_uint4){0x80616165,0x80616165,0x80616165,0x80616165}; datachunk[6]=(vec_uint4){0x80616166,0x80616166,0x80616166,0x80616166}; datachunk[7]=(vec_uint4){0x80616167,0x80616167,0x80616167,0x80616167}; datachunk[8]=(vec_uint4){0x80616168,0x80616168,0x80616168,0x80616168}; datachunk[9]=(vec_uint4){0x80616169,0x80616169,0x80616169,0x80616169}; datachunk[10]=(vec_uint4){0x8061616a,0x8061616a,0x8061616a,0x8061616a}; datachunk[11]=(vec_uint4){0x8061616b,0x8061616b,0x8061616b,0x8061616b}; datachunk[12]=(vec_uint4){0x8061616c,0x8061616c,0x8061616c,0x8061616c}; datachunk[13]=(vec_uint4){0x8061616d,0x8061616d,0x8061616d,0x8061616d}; datachunk[14]=(vec_uint4){0x00000038,0x00000038,0x00000038,0x00000038}; datachunk[15]=(vec_uint4){0x8061616e,0x8061616e,0x8061616e,0x8061616e}; #ifdef SPEEDTEST md5_process_benchmark(datachunk, datachunk, 0xF); #else md5_process(datachunk); #endif return 0; }