Changeset 3621 in MondoRescue for branches/3.3/mindi-busybox/libbb/hash_md5_sha.c
- Timestamp: Dec 20, 2016, 4:07:32 PM
- Location: branches/3.3
- Files: 1 edited, 1 copied
branches/3.3/mindi-busybox/libbb/hash_md5_sha.c (r3232 → r3621)

 		t = bb_bswap_64(t);
 		/* wbuffer is suitably aligned for this */
-		*(uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
+		*(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
 	}
 	ctx->process_block(ctx);
…
 /* Before we start, one word to the strange constants.
    They are defined in RFC 1321 as
-   T[i] = (int)(4294967296.0 * fabs(sin(i))), i=1..64
+   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
  */
 static const uint32_t C_array[] = {
…
 			temp += FH(B, C, D);
 			break;
-		case 3:
+		default: /* case 3 */
 			temp += FI(B, C, D);
 		}
…
 #else /* MD5_SMALL == 0 or 1 */

-	uint32_t A_save = A;
-	uint32_t B_save = B;
-	uint32_t C_save = C;
-	uint32_t D_save = D;
 # if MD5_SMALL == 1
 	const uint32_t *pc;
…
 # endif
 	/* Add checksum to the starting values */
-	ctx->hash[0] = A_save + A;
-	ctx->hash[1] = B_save + B;
-	ctx->hash[2] = C_save + C;
-	ctx->hash[3] = D_save + D;
+	ctx->hash[0] += A;
+	ctx->hash[1] += B;
+	ctx->hash[2] += C;
+	ctx->hash[3] += D;
 #endif
 }
…
 		t = ctx->total64[0] << 3;
 		t = SWAP_BE64(t);
-		*(uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
+		*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
 		t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
 		t = SWAP_BE64(t);
-		*(uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
+		*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
 	}
 	sha512_process_block128(ctx);
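The uint64_t → bb__aliased_uint64_t hunks above fix a strict-aliasing hazard: wbuffer is a byte buffer, and a store through a plain uint64_t pointer lets the optimizer assume the store cannot touch it. A minimal sketch of the idea follows; the FIX_ALIASING definition and the helper name store_tail64 are modeled on busybox's platform.h, not quoted from this changeset:

#include <stdint.h>

/* may_alias exempts this type from type-based alias analysis */
#if defined(__GNUC__)
# define FIX_ALIASING __attribute__((__may_alias__))
#else
# define FIX_ALIASING
#endif
typedef uint64_t bb__aliased_uint64_t FIX_ALIASING;

/* Store the 64-bit message length into the buffer tail, as the
 * finalization code above does (the buffer is suitably aligned). */
static void store_tail64(unsigned char wbuffer[64], uint64_t t)
{
	*(bb__aliased_uint64_t *) (&wbuffer[64 - 8]) = t;
	/* fully portable alternative: memcpy(&wbuffer[64 - 8], &t, 8); */
}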
…
 #endif

+#define OPTIMIZE_SHA3_FOR_32 0
+/*
+ * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
+ * every 64-bit word of state[] can be split into two 32-bit words
+ * by even/odd bits. In this form, all rotations of sha3 round
+ * are 32-bit - and there are lots of them.
+ * However, it requires either splitting/combining state words
+ * before/after sha3 round (code does this now)
+ * or shuffling bits before xor'ing them into state and in sha3_end.
+ * Without shuffling, bit-slicing results in -130 bytes of code
+ * and marginal speedup (but of course it gives wrong result).
+ * With shuffling it works, but +260 code bytes, and slower.
+ * Disabled for now:
+ */
+#if 0 /* LONG_MAX == 0x7fffffff */
+# undef OPTIMIZE_SHA3_FOR_32
+# define OPTIMIZE_SHA3_FOR_32 1
+#endif
+
 enum {
 	SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
 };
+
+#if OPTIMIZE_SHA3_FOR_32
+/* This splits every 64-bit word into a pair of 32-bit words,
+ * even bits go into first word, odd bits go to second one.
+ * The conversion is done in-place.
+ */
+static void split_halves(uint64_t *state)
+{
+	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		x1 = s32[1];
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
+		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
+	}
+}
+/* The reverse operation */
+static void combine_halves(uint64_t *state)
+{
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		x1 = s32[1];
+		t = (x0 & 0x0000FFFF) | (x1 << 16);
+		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
+		x0 = t;
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		*s32++ = x0;
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		*s32++ = x1;
+	}
+}
+#endif

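split_halves applies the 32-bit perfect unshuffle from Hacker's Delight to both halves of each lane, then regroups the 16-bit quarters, leaving all even-numbered bits of the 64-bit word in one 32-bit word and all odd-numbered bits in the other; combine_halves inverts it. A standalone sketch for a single word (the helper names unshuffle32/shuffle32 are mine, not from the file), asserting the round trip:

#include <assert.h>
#include <stdint.h>

/* Perfect unshuffle: gather even-numbered bits of x into the low
 * 16 bits and odd-numbered bits into the high 16 bits. */
static uint32_t unshuffle32(uint32_t x)
{
	uint32_t t;
	t = (x ^ (x >> 1)) & 0x22222222; x = x ^ t ^ (t << 1);
	t = (x ^ (x >> 2)) & 0x0C0C0C0C; x = x ^ t ^ (t << 2);
	t = (x ^ (x >> 4)) & 0x00F000F0; x = x ^ t ^ (t << 4);
	t = (x ^ (x >> 8)) & 0x0000FF00; x = x ^ t ^ (t << 8);
	return x;
}

/* Perfect shuffle: the exact inverse (same masks, reverse order) */
static uint32_t shuffle32(uint32_t x)
{
	uint32_t t;
	t = (x ^ (x >> 8)) & 0x0000FF00; x = x ^ t ^ (t << 8);
	t = (x ^ (x >> 4)) & 0x00F000F0; x = x ^ t ^ (t << 4);
	t = (x ^ (x >> 2)) & 0x0C0C0C0C; x = x ^ t ^ (t << 2);
	t = (x ^ (x >> 1)) & 0x22222222; x = x ^ t ^ (t << 1);
	return x;
}

int main(void)
{
	uint64_t w = 0x123456789abcdef0ULL;
	uint32_t lo = unshuffle32((uint32_t)w);
	uint32_t hi = unshuffle32((uint32_t)(w >> 32));
	/* regroup as in split_halves: even-bit word, then odd-bit word */
	uint32_t even = (lo & 0x0000FFFF) | (hi << 16);
	uint32_t odd  = (lo >> 16) | (hi & 0xFFFF0000);

	/* combine_halves in miniature: ungroup, then shuffle back */
	uint32_t lo2 = shuffle32((even & 0x0000FFFF) | (odd << 16));
	uint32_t hi2 = shuffle32((even >> 16) | (odd & 0xFFFF0000));
	assert((((uint64_t)hi2 << 32) | lo2) == w);
	return 0;
}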
 /*
…
 	enum { NROUNDS = 24 };

-	/* Elements should be 64-bit, but top half is always zero or 0x80000000.
-	 * We encode 63rd bits in a separate word below.
-	 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit.
-	 * The speed penalty is lost in the noise.
-	 */
+#if OPTIMIZE_SHA3_FOR_32
+	/*
+	static const uint32_t IOTA_CONST_0[NROUNDS] = {
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+	};
+	** bits are in lsb: 0101 0000 1111 0100 1111 0001
+	*/
+	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
+	static const uint32_t IOTA_CONST_1[NROUNDS] = {
+		0x00000000UL,
+		0x00000089UL,
+		0x8000008bUL,
+		0x80008080UL,
+		0x0000008bUL,
+		0x00008000UL,
+		0x80008088UL,
+		0x80000082UL,
+		0x0000000bUL,
+		0x0000000aUL,
+		0x00008082UL,
+		0x00008003UL,
+		0x0000808bUL,
+		0x8000000bUL,
+		0x8000008aUL,
+		0x80000081UL,
+		0x80000081UL,
+		0x80000008UL,
+		0x00000083UL,
+		0x80008003UL,
+		0x80008088UL,
+		0x80000088UL,
+		0x00008000UL,
+		0x80008082UL,
+	};
+
+	uint32_t *const s32 = (uint32_t*)state;
+	unsigned round;
+
+	split_halves(state);
+
+	for (round = 0; round < NROUNDS; round++) {
+		unsigned x;
+
+		/* Theta */
+		{
+			uint32_t BC[20];
+			for (x = 0; x < 10; ++x) {
+				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
+			}
+			for (x = 0; x < 10; x += 2) {
+				uint32_t ta, tb;
+				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
+				tb = BC[x+9] ^ BC[x+2];
+				s32[x+0] ^= ta;
+				s32[x+1] ^= tb;
+				s32[x+10] ^= ta;
+				s32[x+11] ^= tb;
+				s32[x+20] ^= ta;
+				s32[x+21] ^= tb;
+				s32[x+30] ^= ta;
+				s32[x+31] ^= tb;
+				s32[x+40] ^= ta;
+				s32[x+41] ^= tb;
+			}
+		}
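The payoff of the split shows up in the rotations: rotating a 64-bit lane left by n becomes two 32-bit rotations of the slices by roughly n/2, with the slices swapping roles when n is odd. That is exactly the case analysis in the RhoPi macro that follows. A sketch of the identity (function names are mine), checked against a direct 64-bit rotate:

#include <assert.h>
#include <stdint.h>

static uint32_t my_rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n)); /* callers keep 0 < n < 32 */
}

/* 'even' holds bits 0,2,4,... of a 64-bit lane, 'odd' bits 1,3,5,...
 * A 64-bit rotate-left by n then acts on the slices like this: */
static void rotl64_sliced(uint32_t *even, uint32_t *odd, unsigned n)
{
	uint32_t e = *even, o = *odd;
	if (n & 1) {
		/* odd n: slices swap, odd bits cross into even positions */
		*even = my_rotl32(o, n / 2 + 1);
		*odd = (n == 1) ? e : my_rotl32(e, n / 2);
	} else {
		*even = my_rotl32(e, n / 2);
		*odd = my_rotl32(o, n / 2);
	}
}

int main(void)
{
	uint64_t w = 0x0123456789abcdefULL;
	unsigned n = 21; /* one of the rho rotation constants */
	uint32_t even = 0, odd = 0;
	uint64_t r = 0;
	unsigned i;

	/* slice w bit by bit (slow but obviously correct) */
	for (i = 0; i < 64; i++) {
		uint32_t b = (uint32_t)((w >> i) & 1);
		if (i & 1) odd |= b << (i / 2); else even |= b << (i / 2);
	}
	rotl64_sliced(&even, &odd, n);

	/* unslice and compare with the straight 64-bit rotate */
	for (i = 0; i < 64; i++) {
		uint32_t b = (i & 1) ? (odd >> (i / 2)) & 1 : (even >> (i / 2)) & 1;
		r |= (uint64_t)b << i;
	}
	assert(r == ((w << n) | (w >> (64 - n))));
	return 0;
}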
+		/* RhoPi */
+		{
+			uint32_t t0a,t0b, t1a,t1b;
+			t1a = s32[1*2+0];
+			t1b = s32[1*2+1];
+
+#define RhoPi(PI_LANE, ROT_CONST) \
+			t0a = s32[PI_LANE*2+0];\
+			t0b = s32[PI_LANE*2+1];\
+			if (ROT_CONST & 1) {\
+				s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
+				s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
+			} else {\
+				s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
+				s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
+			}\
+			t1a = t0a; t1b = t0b;
+
+			RhoPi(10, 1)
+			RhoPi( 7, 3)
+			RhoPi(11, 6)
+			RhoPi(17,10)
+			RhoPi(18,15)
+			RhoPi( 3,21)
+			RhoPi( 5,28)
+			RhoPi(16,36)
+			RhoPi( 8,45)
+			RhoPi(21,55)
+			RhoPi(24, 2)
+			RhoPi( 4,14)
+			RhoPi(15,27)
+			RhoPi(23,41)
+			RhoPi(19,56)
+			RhoPi(13, 8)
+			RhoPi(12,25)
+			RhoPi( 2,43)
+			RhoPi(20,62)
+			RhoPi(14,18)
+			RhoPi(22,39)
+			RhoPi( 9,61)
+			RhoPi( 6,20)
+			RhoPi( 1,44)
+#undef RhoPi
+		}
+		/* Chi */
+		for (x = 0; x <= 40;) {
+			uint32_t BC0, BC1, BC2, BC3, BC4;
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x++;
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x += 9;
+		}
+		/* Iota */
+		s32[0] ^= IOTA_CONST_0bits & 1;
+		IOTA_CONST_0bits >>= 1;
+		s32[1] ^= IOTA_CONST_1[round];
+	}
+
+	combine_halves(state);
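The 24 unrolled RhoPi(PI_LANE, ROT_CONST) steps above encode the Keccak pi traversal with the triangular-number rho offsets, with lanes indexed as x + 5*y; they can be regenerated rather than checked by eye. A sketch that prints the same schedule:

#include <stdio.h>

/* Walk the Keccak pi permutation starting from lane (1,0); the rho
 * rotation for step t is the triangular number (t+1)(t+2)/2 mod 64. */
int main(void)
{
	unsigned x = 1, y = 0, t;
	for (t = 0; t < 24; t++) {
		unsigned rot = ((t + 1) * (t + 2) / 2) % 64;
		unsigned nx = y, ny = (2 * x + 3 * y) % 5;
		x = nx; y = ny;
		printf("RhoPi(%2u,%2u)\n", x + 5 * y, rot);
	}
	return 0;
}

Its output reproduces the macro list exactly, from RhoPi(10, 1) down to RhoPi( 1,44).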
+#else
+	/* Native 64-bit algorithm */
 	static const uint16_t IOTA_CONST[NROUNDS] = {
+		/* Elements should be 64-bit, but top half is always zero
+		 * or 0x80000000. We encode 63rd bits in a separate word below.
+		 * Same is true for 31th bits, which lets us use 16-bit table
+		 * instead of 64-bit. The speed penalty is lost in the noise.
+		 */
 		0x0001,
 		0x8082,
…
 	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/

-	unsigned x, y;
+	unsigned x;
 	unsigned round;

…
 #undef RhoPi_twice
 	}
-
 	/* Chi */
-	for (y = 0; y <= 20; y += 5) {
+# if LONG_MAX > 0x7fffffff
+	for (x = 0; x <= 20; x += 5) {
 		uint64_t BC0, BC1, BC2, BC3, BC4;
-		BC0 = state[y + 0];
-		BC1 = state[y + 1];
-		BC2 = state[y + 2];
-		state[y + 0] = BC0 ^ ((~BC1) & BC2);
-		BC3 = state[y + 3];
-		state[y + 1] = BC1 ^ ((~BC2) & BC3);
-		BC4 = state[y + 4];
-		state[y + 2] = BC2 ^ ((~BC3) & BC4);
-		state[y + 3] = BC3 ^ ((~BC4) & BC0);
-		state[y + 4] = BC4 ^ ((~BC0) & BC1);
+		BC0 = state[x + 0];
+		BC1 = state[x + 1];
+		BC2 = state[x + 2];
+		state[x + 0] = BC0 ^ ((~BC1) & BC2);
+		BC3 = state[x + 3];
+		state[x + 1] = BC1 ^ ((~BC2) & BC3);
+		BC4 = state[x + 4];
+		state[x + 2] = BC2 ^ ((~BC3) & BC4);
+		state[x + 3] = BC3 ^ ((~BC4) & BC0);
+		state[x + 4] = BC4 ^ ((~BC0) & BC1);
 	}
+# else
+	/* Reduced register pressure version
+	 * for register-starved 32-bit arches
+	 * (i386: -95 bytes, and it is _faster_)
+	 */
+	for (x = 0; x <= 40;) {
+		uint32_t BC0, BC1, BC2, BC3, BC4;
+		uint32_t *const s32 = (uint32_t*)state;
+# if SHA3_SMALL
+ do_half:
+# endif
+		BC0 = s32[x + 0*2];
+		BC1 = s32[x + 1*2];
+		BC2 = s32[x + 2*2];
+		s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+		BC3 = s32[x + 3*2];
+		s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+		BC4 = s32[x + 4*2];
+		s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+		s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+		s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+		x++;
+# if SHA3_SMALL
+		if (x & 1)
+			goto do_half;
+		x += 8;
+# else
+		BC0 = s32[x + 0*2];
+		BC1 = s32[x + 1*2];
+		BC2 = s32[x + 2*2];
+		s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+		BC3 = s32[x + 3*2];
+		s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+		BC4 = s32[x + 4*2];
+		s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+		s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+		s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+		x += 9;
+# endif
+	}
+# endif /* long is 32-bit */
 	/* Iota */
 	state[0] ^= IOTA_CONST[round]
…
 	}
 	}
+#endif
 }
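The constants in the two Iota paths both derive from the published Keccak-f[1600] round constants: the 16-bit IOTA_CONST entries are their low halfwords, while IOTA_CONST_1 and IOTA_CONST_0bits are the odd-bit and even-bit slices used by the bit-sliced path. Every even-bit slice collapses to a single bit because the round constants only set bits at positions of the form 2^j - 1, and position 0 is the only even one. A sketch that rebuilds and checks them; the RC table below is the standard published one, not quoted from this file:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Canonical Keccak-f[1600] round constants */
static const uint64_t RC[24] = {
	0x0000000000000001ULL, 0x0000000000008082ULL,
	0x800000000000808AULL, 0x8000000080008000ULL,
	0x000000000000808BULL, 0x0000000080000001ULL,
	0x8000000080008081ULL, 0x8000000000008009ULL,
	0x000000000000008AULL, 0x0000000000000088ULL,
	0x0000000080008009ULL, 0x000000008000000AULL,
	0x000000008000808BULL, 0x800000000000008BULL,
	0x8000000000008089ULL, 0x8000000000008003ULL,
	0x8000000000008002ULL, 0x8000000000000080ULL,
	0x000000000000800AULL, 0x800000008000000AULL,
	0x8000000080008081ULL, 0x8000000000008080ULL,
	0x0000000080000001ULL, 0x8000000080008008ULL,
};

int main(void)
{
	uint32_t bits0 = 0;
	int r, i;
	for (r = 0; r < 24; r++) {
		uint32_t even = 0, odd = 0;
		for (i = 0; i < 64; i++) {
			uint32_t b = (uint32_t)((RC[r] >> i) & 1);
			if (i & 1) odd  |= b << (i / 2);
			else       even |= b << (i / 2);
		}
		/* even slice is at most bit 0, so it packs into one word */
		assert(even <= 1);
		bits0 |= even << r;
		printf("round %2d: IOTA_CONST_1 = 0x%08x, low16 = 0x%04x\n",
			r, (unsigned)odd, (unsigned)(uint16_t)RC[r]);
	}
	assert(bits0 == 0x0050f4f1); /* == IOTA_CONST_0bits above */
	return 0;
}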