gost-sse2.c
/**
 * Author......: Jens Steube <jens.steube@gmail.com>
 * License.....: MIT
 */

#include <stdint.h>    /* uint32_t */
#include <emmintrin.h> /* SSE2 intrinsics: __m128i, _mm_xor_si128, ... */

/* BYTESWAP() is used below but not defined in this file; it is assumed to be
 * a 32-bit byte-swap macro provided by the surrounding hashcat sources. */
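/* Precomputed lookup tables for the GOST 28147-89 round function used inside
 * the GOST R 34.11-94 compression function: each 256-entry table substitutes
 * one input byte through its pair of 4-bit S-boxes and already applies the
 * subsequent 11-bit left rotation, so a cipher round reduces to four
 * byte-indexed lookups XORed together (see the round() macro below). */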
  5. static const uint32_t tables[4][256] =
  6. {
  7. {
  8. 0x00072000, 0x00075000, 0x00074800, 0x00071000,
  9. 0x00076800, 0x00074000, 0x00070000, 0x00077000,
  10. 0x00073000, 0x00075800, 0x00070800, 0x00076000,
  11. 0x00073800, 0x00077800, 0x00072800, 0x00071800,
  12. 0x0005a000, 0x0005d000, 0x0005c800, 0x00059000,
  13. 0x0005e800, 0x0005c000, 0x00058000, 0x0005f000,
  14. 0x0005b000, 0x0005d800, 0x00058800, 0x0005e000,
  15. 0x0005b800, 0x0005f800, 0x0005a800, 0x00059800,
  16. 0x00022000, 0x00025000, 0x00024800, 0x00021000,
  17. 0x00026800, 0x00024000, 0x00020000, 0x00027000,
  18. 0x00023000, 0x00025800, 0x00020800, 0x00026000,
  19. 0x00023800, 0x00027800, 0x00022800, 0x00021800,
  20. 0x00062000, 0x00065000, 0x00064800, 0x00061000,
  21. 0x00066800, 0x00064000, 0x00060000, 0x00067000,
  22. 0x00063000, 0x00065800, 0x00060800, 0x00066000,
  23. 0x00063800, 0x00067800, 0x00062800, 0x00061800,
  24. 0x00032000, 0x00035000, 0x00034800, 0x00031000,
  25. 0x00036800, 0x00034000, 0x00030000, 0x00037000,
  26. 0x00033000, 0x00035800, 0x00030800, 0x00036000,
  27. 0x00033800, 0x00037800, 0x00032800, 0x00031800,
  28. 0x0006a000, 0x0006d000, 0x0006c800, 0x00069000,
  29. 0x0006e800, 0x0006c000, 0x00068000, 0x0006f000,
  30. 0x0006b000, 0x0006d800, 0x00068800, 0x0006e000,
  31. 0x0006b800, 0x0006f800, 0x0006a800, 0x00069800,
  32. 0x0007a000, 0x0007d000, 0x0007c800, 0x00079000,
  33. 0x0007e800, 0x0007c000, 0x00078000, 0x0007f000,
  34. 0x0007b000, 0x0007d800, 0x00078800, 0x0007e000,
  35. 0x0007b800, 0x0007f800, 0x0007a800, 0x00079800,
  36. 0x00052000, 0x00055000, 0x00054800, 0x00051000,
  37. 0x00056800, 0x00054000, 0x00050000, 0x00057000,
  38. 0x00053000, 0x00055800, 0x00050800, 0x00056000,
  39. 0x00053800, 0x00057800, 0x00052800, 0x00051800,
  40. 0x00012000, 0x00015000, 0x00014800, 0x00011000,
  41. 0x00016800, 0x00014000, 0x00010000, 0x00017000,
  42. 0x00013000, 0x00015800, 0x00010800, 0x00016000,
  43. 0x00013800, 0x00017800, 0x00012800, 0x00011800,
  44. 0x0001a000, 0x0001d000, 0x0001c800, 0x00019000,
  45. 0x0001e800, 0x0001c000, 0x00018000, 0x0001f000,
  46. 0x0001b000, 0x0001d800, 0x00018800, 0x0001e000,
  47. 0x0001b800, 0x0001f800, 0x0001a800, 0x00019800,
  48. 0x00042000, 0x00045000, 0x00044800, 0x00041000,
  49. 0x00046800, 0x00044000, 0x00040000, 0x00047000,
  50. 0x00043000, 0x00045800, 0x00040800, 0x00046000,
  51. 0x00043800, 0x00047800, 0x00042800, 0x00041800,
  52. 0x0000a000, 0x0000d000, 0x0000c800, 0x00009000,
  53. 0x0000e800, 0x0000c000, 0x00008000, 0x0000f000,
  54. 0x0000b000, 0x0000d800, 0x00008800, 0x0000e000,
  55. 0x0000b800, 0x0000f800, 0x0000a800, 0x00009800,
  56. 0x00002000, 0x00005000, 0x00004800, 0x00001000,
  57. 0x00006800, 0x00004000, 0x00000000, 0x00007000,
  58. 0x00003000, 0x00005800, 0x00000800, 0x00006000,
  59. 0x00003800, 0x00007800, 0x00002800, 0x00001800,
  60. 0x0003a000, 0x0003d000, 0x0003c800, 0x00039000,
  61. 0x0003e800, 0x0003c000, 0x00038000, 0x0003f000,
  62. 0x0003b000, 0x0003d800, 0x00038800, 0x0003e000,
  63. 0x0003b800, 0x0003f800, 0x0003a800, 0x00039800,
  64. 0x0002a000, 0x0002d000, 0x0002c800, 0x00029000,
  65. 0x0002e800, 0x0002c000, 0x00028000, 0x0002f000,
  66. 0x0002b000, 0x0002d800, 0x00028800, 0x0002e000,
  67. 0x0002b800, 0x0002f800, 0x0002a800, 0x00029800,
  68. 0x0004a000, 0x0004d000, 0x0004c800, 0x00049000,
  69. 0x0004e800, 0x0004c000, 0x00048000, 0x0004f000,
  70. 0x0004b000, 0x0004d800, 0x00048800, 0x0004e000,
  71. 0x0004b800, 0x0004f800, 0x0004a800, 0x00049800,
  72. },
  73. {
  74. 0x03a80000, 0x03c00000, 0x03880000, 0x03e80000,
  75. 0x03d00000, 0x03980000, 0x03a00000, 0x03900000,
  76. 0x03f00000, 0x03f80000, 0x03e00000, 0x03b80000,
  77. 0x03b00000, 0x03800000, 0x03c80000, 0x03d80000,
  78. 0x06a80000, 0x06c00000, 0x06880000, 0x06e80000,
  79. 0x06d00000, 0x06980000, 0x06a00000, 0x06900000,
  80. 0x06f00000, 0x06f80000, 0x06e00000, 0x06b80000,
  81. 0x06b00000, 0x06800000, 0x06c80000, 0x06d80000,
  82. 0x05280000, 0x05400000, 0x05080000, 0x05680000,
  83. 0x05500000, 0x05180000, 0x05200000, 0x05100000,
  84. 0x05700000, 0x05780000, 0x05600000, 0x05380000,
  85. 0x05300000, 0x05000000, 0x05480000, 0x05580000,
  86. 0x00a80000, 0x00c00000, 0x00880000, 0x00e80000,
  87. 0x00d00000, 0x00980000, 0x00a00000, 0x00900000,
  88. 0x00f00000, 0x00f80000, 0x00e00000, 0x00b80000,
  89. 0x00b00000, 0x00800000, 0x00c80000, 0x00d80000,
  90. 0x00280000, 0x00400000, 0x00080000, 0x00680000,
  91. 0x00500000, 0x00180000, 0x00200000, 0x00100000,
  92. 0x00700000, 0x00780000, 0x00600000, 0x00380000,
  93. 0x00300000, 0x00000000, 0x00480000, 0x00580000,
  94. 0x04280000, 0x04400000, 0x04080000, 0x04680000,
  95. 0x04500000, 0x04180000, 0x04200000, 0x04100000,
  96. 0x04700000, 0x04780000, 0x04600000, 0x04380000,
  97. 0x04300000, 0x04000000, 0x04480000, 0x04580000,
  98. 0x04a80000, 0x04c00000, 0x04880000, 0x04e80000,
  99. 0x04d00000, 0x04980000, 0x04a00000, 0x04900000,
  100. 0x04f00000, 0x04f80000, 0x04e00000, 0x04b80000,
  101. 0x04b00000, 0x04800000, 0x04c80000, 0x04d80000,
  102. 0x07a80000, 0x07c00000, 0x07880000, 0x07e80000,
  103. 0x07d00000, 0x07980000, 0x07a00000, 0x07900000,
  104. 0x07f00000, 0x07f80000, 0x07e00000, 0x07b80000,
  105. 0x07b00000, 0x07800000, 0x07c80000, 0x07d80000,
  106. 0x07280000, 0x07400000, 0x07080000, 0x07680000,
  107. 0x07500000, 0x07180000, 0x07200000, 0x07100000,
  108. 0x07700000, 0x07780000, 0x07600000, 0x07380000,
  109. 0x07300000, 0x07000000, 0x07480000, 0x07580000,
  110. 0x02280000, 0x02400000, 0x02080000, 0x02680000,
  111. 0x02500000, 0x02180000, 0x02200000, 0x02100000,
  112. 0x02700000, 0x02780000, 0x02600000, 0x02380000,
  113. 0x02300000, 0x02000000, 0x02480000, 0x02580000,
  114. 0x03280000, 0x03400000, 0x03080000, 0x03680000,
  115. 0x03500000, 0x03180000, 0x03200000, 0x03100000,
  116. 0x03700000, 0x03780000, 0x03600000, 0x03380000,
  117. 0x03300000, 0x03000000, 0x03480000, 0x03580000,
  118. 0x06280000, 0x06400000, 0x06080000, 0x06680000,
  119. 0x06500000, 0x06180000, 0x06200000, 0x06100000,
  120. 0x06700000, 0x06780000, 0x06600000, 0x06380000,
  121. 0x06300000, 0x06000000, 0x06480000, 0x06580000,
  122. 0x05a80000, 0x05c00000, 0x05880000, 0x05e80000,
  123. 0x05d00000, 0x05980000, 0x05a00000, 0x05900000,
  124. 0x05f00000, 0x05f80000, 0x05e00000, 0x05b80000,
  125. 0x05b00000, 0x05800000, 0x05c80000, 0x05d80000,
  126. 0x01280000, 0x01400000, 0x01080000, 0x01680000,
  127. 0x01500000, 0x01180000, 0x01200000, 0x01100000,
  128. 0x01700000, 0x01780000, 0x01600000, 0x01380000,
  129. 0x01300000, 0x01000000, 0x01480000, 0x01580000,
  130. 0x02a80000, 0x02c00000, 0x02880000, 0x02e80000,
  131. 0x02d00000, 0x02980000, 0x02a00000, 0x02900000,
  132. 0x02f00000, 0x02f80000, 0x02e00000, 0x02b80000,
  133. 0x02b00000, 0x02800000, 0x02c80000, 0x02d80000,
  134. 0x01a80000, 0x01c00000, 0x01880000, 0x01e80000,
  135. 0x01d00000, 0x01980000, 0x01a00000, 0x01900000,
  136. 0x01f00000, 0x01f80000, 0x01e00000, 0x01b80000,
  137. 0x01b00000, 0x01800000, 0x01c80000, 0x01d80000,
  138. },
  139. {
  140. 0x30000002, 0x60000002, 0x38000002, 0x08000002,
  141. 0x28000002, 0x78000002, 0x68000002, 0x40000002,
  142. 0x20000002, 0x50000002, 0x48000002, 0x70000002,
  143. 0x00000002, 0x18000002, 0x58000002, 0x10000002,
  144. 0xb0000005, 0xe0000005, 0xb8000005, 0x88000005,
  145. 0xa8000005, 0xf8000005, 0xe8000005, 0xc0000005,
  146. 0xa0000005, 0xd0000005, 0xc8000005, 0xf0000005,
  147. 0x80000005, 0x98000005, 0xd8000005, 0x90000005,
  148. 0x30000005, 0x60000005, 0x38000005, 0x08000005,
  149. 0x28000005, 0x78000005, 0x68000005, 0x40000005,
  150. 0x20000005, 0x50000005, 0x48000005, 0x70000005,
  151. 0x00000005, 0x18000005, 0x58000005, 0x10000005,
  152. 0x30000000, 0x60000000, 0x38000000, 0x08000000,
  153. 0x28000000, 0x78000000, 0x68000000, 0x40000000,
  154. 0x20000000, 0x50000000, 0x48000000, 0x70000000,
  155. 0x00000000, 0x18000000, 0x58000000, 0x10000000,
  156. 0xb0000003, 0xe0000003, 0xb8000003, 0x88000003,
  157. 0xa8000003, 0xf8000003, 0xe8000003, 0xc0000003,
  158. 0xa0000003, 0xd0000003, 0xc8000003, 0xf0000003,
  159. 0x80000003, 0x98000003, 0xd8000003, 0x90000003,
  160. 0x30000001, 0x60000001, 0x38000001, 0x08000001,
  161. 0x28000001, 0x78000001, 0x68000001, 0x40000001,
  162. 0x20000001, 0x50000001, 0x48000001, 0x70000001,
  163. 0x00000001, 0x18000001, 0x58000001, 0x10000001,
  164. 0xb0000000, 0xe0000000, 0xb8000000, 0x88000000,
  165. 0xa8000000, 0xf8000000, 0xe8000000, 0xc0000000,
  166. 0xa0000000, 0xd0000000, 0xc8000000, 0xf0000000,
  167. 0x80000000, 0x98000000, 0xd8000000, 0x90000000,
  168. 0xb0000006, 0xe0000006, 0xb8000006, 0x88000006,
  169. 0xa8000006, 0xf8000006, 0xe8000006, 0xc0000006,
  170. 0xa0000006, 0xd0000006, 0xc8000006, 0xf0000006,
  171. 0x80000006, 0x98000006, 0xd8000006, 0x90000006,
  172. 0xb0000001, 0xe0000001, 0xb8000001, 0x88000001,
  173. 0xa8000001, 0xf8000001, 0xe8000001, 0xc0000001,
  174. 0xa0000001, 0xd0000001, 0xc8000001, 0xf0000001,
  175. 0x80000001, 0x98000001, 0xd8000001, 0x90000001,
  176. 0x30000003, 0x60000003, 0x38000003, 0x08000003,
  177. 0x28000003, 0x78000003, 0x68000003, 0x40000003,
  178. 0x20000003, 0x50000003, 0x48000003, 0x70000003,
  179. 0x00000003, 0x18000003, 0x58000003, 0x10000003,
  180. 0x30000004, 0x60000004, 0x38000004, 0x08000004,
  181. 0x28000004, 0x78000004, 0x68000004, 0x40000004,
  182. 0x20000004, 0x50000004, 0x48000004, 0x70000004,
  183. 0x00000004, 0x18000004, 0x58000004, 0x10000004,
  184. 0xb0000002, 0xe0000002, 0xb8000002, 0x88000002,
  185. 0xa8000002, 0xf8000002, 0xe8000002, 0xc0000002,
  186. 0xa0000002, 0xd0000002, 0xc8000002, 0xf0000002,
  187. 0x80000002, 0x98000002, 0xd8000002, 0x90000002,
  188. 0xb0000004, 0xe0000004, 0xb8000004, 0x88000004,
  189. 0xa8000004, 0xf8000004, 0xe8000004, 0xc0000004,
  190. 0xa0000004, 0xd0000004, 0xc8000004, 0xf0000004,
  191. 0x80000004, 0x98000004, 0xd8000004, 0x90000004,
  192. 0x30000006, 0x60000006, 0x38000006, 0x08000006,
  193. 0x28000006, 0x78000006, 0x68000006, 0x40000006,
  194. 0x20000006, 0x50000006, 0x48000006, 0x70000006,
  195. 0x00000006, 0x18000006, 0x58000006, 0x10000006,
  196. 0xb0000007, 0xe0000007, 0xb8000007, 0x88000007,
  197. 0xa8000007, 0xf8000007, 0xe8000007, 0xc0000007,
  198. 0xa0000007, 0xd0000007, 0xc8000007, 0xf0000007,
  199. 0x80000007, 0x98000007, 0xd8000007, 0x90000007,
  200. 0x30000007, 0x60000007, 0x38000007, 0x08000007,
  201. 0x28000007, 0x78000007, 0x68000007, 0x40000007,
  202. 0x20000007, 0x50000007, 0x48000007, 0x70000007,
  203. 0x00000007, 0x18000007, 0x58000007, 0x10000007,
  204. },
  205. {
  206. 0x000000e8, 0x000000d8, 0x000000a0, 0x00000088,
  207. 0x00000098, 0x000000f8, 0x000000a8, 0x000000c8,
  208. 0x00000080, 0x000000d0, 0x000000f0, 0x000000b8,
  209. 0x000000b0, 0x000000c0, 0x00000090, 0x000000e0,
  210. 0x000007e8, 0x000007d8, 0x000007a0, 0x00000788,
  211. 0x00000798, 0x000007f8, 0x000007a8, 0x000007c8,
  212. 0x00000780, 0x000007d0, 0x000007f0, 0x000007b8,
  213. 0x000007b0, 0x000007c0, 0x00000790, 0x000007e0,
  214. 0x000006e8, 0x000006d8, 0x000006a0, 0x00000688,
  215. 0x00000698, 0x000006f8, 0x000006a8, 0x000006c8,
  216. 0x00000680, 0x000006d0, 0x000006f0, 0x000006b8,
  217. 0x000006b0, 0x000006c0, 0x00000690, 0x000006e0,
  218. 0x00000068, 0x00000058, 0x00000020, 0x00000008,
  219. 0x00000018, 0x00000078, 0x00000028, 0x00000048,
  220. 0x00000000, 0x00000050, 0x00000070, 0x00000038,
  221. 0x00000030, 0x00000040, 0x00000010, 0x00000060,
  222. 0x000002e8, 0x000002d8, 0x000002a0, 0x00000288,
  223. 0x00000298, 0x000002f8, 0x000002a8, 0x000002c8,
  224. 0x00000280, 0x000002d0, 0x000002f0, 0x000002b8,
  225. 0x000002b0, 0x000002c0, 0x00000290, 0x000002e0,
  226. 0x000003e8, 0x000003d8, 0x000003a0, 0x00000388,
  227. 0x00000398, 0x000003f8, 0x000003a8, 0x000003c8,
  228. 0x00000380, 0x000003d0, 0x000003f0, 0x000003b8,
  229. 0x000003b0, 0x000003c0, 0x00000390, 0x000003e0,
  230. 0x00000568, 0x00000558, 0x00000520, 0x00000508,
  231. 0x00000518, 0x00000578, 0x00000528, 0x00000548,
  232. 0x00000500, 0x00000550, 0x00000570, 0x00000538,
  233. 0x00000530, 0x00000540, 0x00000510, 0x00000560,
  234. 0x00000268, 0x00000258, 0x00000220, 0x00000208,
  235. 0x00000218, 0x00000278, 0x00000228, 0x00000248,
  236. 0x00000200, 0x00000250, 0x00000270, 0x00000238,
  237. 0x00000230, 0x00000240, 0x00000210, 0x00000260,
  238. 0x000004e8, 0x000004d8, 0x000004a0, 0x00000488,
  239. 0x00000498, 0x000004f8, 0x000004a8, 0x000004c8,
  240. 0x00000480, 0x000004d0, 0x000004f0, 0x000004b8,
  241. 0x000004b0, 0x000004c0, 0x00000490, 0x000004e0,
  242. 0x00000168, 0x00000158, 0x00000120, 0x00000108,
  243. 0x00000118, 0x00000178, 0x00000128, 0x00000148,
  244. 0x00000100, 0x00000150, 0x00000170, 0x00000138,
  245. 0x00000130, 0x00000140, 0x00000110, 0x00000160,
  246. 0x000001e8, 0x000001d8, 0x000001a0, 0x00000188,
  247. 0x00000198, 0x000001f8, 0x000001a8, 0x000001c8,
  248. 0x00000180, 0x000001d0, 0x000001f0, 0x000001b8,
  249. 0x000001b0, 0x000001c0, 0x00000190, 0x000001e0,
  250. 0x00000768, 0x00000758, 0x00000720, 0x00000708,
  251. 0x00000718, 0x00000778, 0x00000728, 0x00000748,
  252. 0x00000700, 0x00000750, 0x00000770, 0x00000738,
  253. 0x00000730, 0x00000740, 0x00000710, 0x00000760,
  254. 0x00000368, 0x00000358, 0x00000320, 0x00000308,
  255. 0x00000318, 0x00000378, 0x00000328, 0x00000348,
  256. 0x00000300, 0x00000350, 0x00000370, 0x00000338,
  257. 0x00000330, 0x00000340, 0x00000310, 0x00000360,
  258. 0x000005e8, 0x000005d8, 0x000005a0, 0x00000588,
  259. 0x00000598, 0x000005f8, 0x000005a8, 0x000005c8,
  260. 0x00000580, 0x000005d0, 0x000005f0, 0x000005b8,
  261. 0x000005b0, 0x000005c0, 0x00000590, 0x000005e0,
  262. 0x00000468, 0x00000458, 0x00000420, 0x00000408,
  263. 0x00000418, 0x00000478, 0x00000428, 0x00000448,
  264. 0x00000400, 0x00000450, 0x00000470, 0x00000438,
  265. 0x00000430, 0x00000440, 0x00000410, 0x00000460,
  266. 0x00000668, 0x00000658, 0x00000620, 0x00000608,
  267. 0x00000618, 0x00000678, 0x00000628, 0x00000648,
  268. 0x00000600, 0x00000650, 0x00000670, 0x00000638,
  269. 0x00000630, 0x00000640, 0x00000610, 0x00000660,
  270. }
  271. };
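/* One pass of round() performs two Feistel rounds of GOST 28147-89: t = k1 + r
 * is pushed through the four lookup tables above (S-box substitution plus the
 * 11-bit rotate) and XORed into l, then the same is done with k2 and l into r. */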
#define round(k1,k2)                  \
{                                     \
  uint32_t t;                         \
  t = (k1) + r;                       \
  l ^= tables[0][(t >>  0) & 0xff] ^  \
       tables[1][(t >>  8) & 0xff] ^  \
       tables[2][(t >> 16) & 0xff] ^  \
       tables[3][(t >> 24) & 0xff];   \
  t = (k2) + l;                       \
  r ^= tables[0][(t >>  0) & 0xff] ^  \
       tables[1][(t >>  8) & 0xff] ^  \
       tables[2][(t >> 16) & 0xff] ^  \
       tables[3][(t >> 24) & 0xff];   \
}
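/* R() encrypts one 64-bit chunk (h[i], h[i+1]) of the chaining value with the
 * full 32-round GOST 28147-89 schedule: the eight subkey words k[0..7] are
 * applied three times in forward order and once reversed, and the swapped
 * halves are stored to s[i], s[i+1]. This is the E(K_j, h_j) step of the
 * compression function. */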
#define R(k,h,s,i)      \
{                       \
  uint32_t r;           \
  uint32_t l;           \
  r = h[i + 0];         \
  l = h[i + 1];         \
  round (k[0], k[1]);   \
  round (k[2], k[3]);   \
  round (k[4], k[5]);   \
  round (k[6], k[7]);   \
  round (k[0], k[1]);   \
  round (k[2], k[3]);   \
  round (k[4], k[5]);   \
  round (k[6], k[7]);   \
  round (k[0], k[1]);   \
  round (k[2], k[3]);   \
  round (k[4], k[5]);   \
  round (k[6], k[7]);   \
  round (k[7], k[6]);   \
  round (k[5], k[4]);   \
  round (k[3], k[2]);   \
  round (k[1], k[0]);   \
  s[i + 0] = l;         \
  s[i + 1] = r;         \
}
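/* Key-generation helpers: X() XORs the two 256-bit working vectors U and V
 * word-by-word into W, and P() applies the standard's byte transposition,
 * collecting one byte from each of four words of W into every 32-bit subkey
 * word, which yields the eight k[0..7] words fed to R(). */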
  311. #define X(w,u,v) \
  312. w[0] = u[0] ^ v[0]; \
  313. w[1] = u[1] ^ v[1]; \
  314. w[2] = u[2] ^ v[2]; \
  315. w[3] = u[3] ^ v[3]; \
  316. w[4] = u[4] ^ v[4]; \
  317. w[5] = u[5] ^ v[5]; \
  318. w[6] = u[6] ^ v[6]; \
  319. w[7] = u[7] ^ v[7];
  320. #define P(k,w) \
  321. k[0] = ((w[0] & 0x000000ff) << 0) \
  322. | ((w[2] & 0x000000ff) << 8) \
  323. | ((w[4] & 0x000000ff) << 16) \
  324. | ((w[6] & 0x000000ff) << 24); \
  325. k[1] = ((w[0] & 0x0000ff00) >> 8) \
  326. | ((w[2] & 0x0000ff00) >> 0) \
  327. | ((w[4] & 0x0000ff00) << 8) \
  328. | ((w[6] & 0x0000ff00) << 16); \
  329. k[2] = ((w[0] & 0x00ff0000) >> 16) \
  330. | ((w[2] & 0x00ff0000) >> 8) \
  331. | ((w[4] & 0x00ff0000) << 0) \
  332. | ((w[6] & 0x00ff0000) << 8); \
  333. k[3] = ((w[0] & 0xff000000) >> 24) \
  334. | ((w[2] & 0xff000000) >> 16) \
  335. | ((w[4] & 0xff000000) >> 8) \
  336. | ((w[6] & 0xff000000) >> 0); \
  337. k[4] = ((w[1] & 0x000000ff) << 0) \
  338. | ((w[3] & 0x000000ff) << 8) \
  339. | ((w[5] & 0x000000ff) << 16) \
  340. | ((w[7] & 0x000000ff) << 24); \
  341. k[5] = ((w[1] & 0x0000ff00) >> 8) \
  342. | ((w[3] & 0x0000ff00) >> 0) \
  343. | ((w[5] & 0x0000ff00) << 8) \
  344. | ((w[7] & 0x0000ff00) << 16); \
  345. k[6] = ((w[1] & 0x00ff0000) >> 16) \
  346. | ((w[3] & 0x00ff0000) >> 8) \
  347. | ((w[5] & 0x00ff0000) << 0) \
  348. | ((w[7] & 0x00ff0000) << 8); \
  349. k[7] = ((w[1] & 0xff000000) >> 24) \
  350. | ((w[3] & 0xff000000) >> 16) \
  351. | ((w[5] & 0xff000000) >> 8) \
  352. | ((w[7] & 0xff000000) >> 0);
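/* A() is the linear transform of the key-generation procedure: treating x as
 * four 64-bit words y1..y4 (two uint32_t each), it produces
 * y2, y3, y4, (y1 ^ y2). AA() is A() applied twice, and C() XORs in C3, the
 * only nonzero constant C_j of GOST R 34.11-94; it enters in PASS2 below,
 * while the zero constants C2 and C4 are simply omitted. */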
  353. #define A(x) \
  354. { \
  355. uint32_t l; \
  356. uint32_t r; \
  357. l = x[0] ^ x[2]; \
  358. r = x[1] ^ x[3]; \
  359. x[0] = x[2]; \
  360. x[1] = x[3]; \
  361. x[2] = x[4]; \
  362. x[3] = x[5]; \
  363. x[4] = x[6]; \
  364. x[5] = x[7]; \
  365. x[6] = l; \
  366. x[7] = r; \
  367. }
  368. #define AA(x) \
  369. { \
  370. uint32_t l; \
  371. uint32_t r; \
  372. l = x[0]; \
  373. r = x[2]; \
  374. x[0] = x[4]; \
  375. x[2] = x[6]; \
  376. x[4] = l ^ r; \
  377. x[6] = x[0] ^ r; \
  378. l = x[1]; \
  379. r = x[3]; \
  380. x[1] = x[5]; \
  381. x[3] = x[7]; \
  382. x[5] = l ^ r; \
  383. x[7] = x[1] ^ r; \
  384. }
  385. #define C(x) \
  386. x[0] ^= 0xff00ff00; \
  387. x[1] ^= 0xff00ff00; \
  388. x[2] ^= 0x00ff00ff; \
  389. x[3] ^= 0x00ff00ff; \
  390. x[4] ^= 0x00ffff00; \
  391. x[5] ^= 0xff0000ff; \
  392. x[6] ^= 0x000000ff; \
  393. x[7] ^= 0xff00ffff;
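/* SHIFT12/SHIFT16/SHIFT61 implement the output transformation, commonly
 * written as H_new = psi^61(H ^ psi(M ^ psi^12(S))), where psi is the
 * LFSR-style shuffle of the state viewed as sixteen 16-bit words: SHIFT12
 * computes M ^ psi^12(S), SHIFT16 mixes in H with one further psi, and
 * SHIFT61 applies the remaining psi^61. The repeated psi applications are
 * flattened into fixed shift/mask/XOR expressions. */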
  394. #define SHIFT12(u,m,s) \
  395. u[0] = m[0] ^ s[6]; \
  396. u[1] = m[1] ^ s[7]; \
  397. u[2] = m[2] ^ (s[0] << 16) \
  398. ^ (s[0] >> 16) \
  399. ^ (s[0] & 0x0000ffff) \
  400. ^ (s[1] & 0x0000ffff) \
  401. ^ (s[1] >> 16) \
  402. ^ (s[2] << 16) \
  403. ^ s[6] \
  404. ^ (s[6] << 16) \
  405. ^ (s[7] & 0xffff0000) \
  406. ^ (s[7] >> 16); \
  407. u[3] = m[3] ^ (s[0] & 0x0000ffff) \
  408. ^ (s[0] << 16) \
  409. ^ (s[1] & 0x0000ffff) \
  410. ^ (s[1] << 16) \
  411. ^ (s[1] >> 16) \
  412. ^ (s[2] << 16) \
  413. ^ (s[2] >> 16) \
  414. ^ (s[3] << 16) \
  415. ^ s[6] \
  416. ^ (s[6] << 16) \
  417. ^ (s[6] >> 16) \
  418. ^ (s[7] & 0x0000ffff) \
  419. ^ (s[7] << 16) \
  420. ^ (s[7] >> 16); \
  421. u[4] = m[4] ^ (s[0] & 0xffff0000) \
  422. ^ (s[0] << 16) \
  423. ^ (s[0] >> 16) \
  424. ^ (s[1] & 0xffff0000) \
  425. ^ (s[1] >> 16) \
  426. ^ (s[2] << 16) \
  427. ^ (s[2] >> 16) \
  428. ^ (s[3] << 16) \
  429. ^ (s[3] >> 16) \
  430. ^ (s[4] << 16) \
  431. ^ (s[6] << 16) \
  432. ^ (s[6] >> 16) \
  433. ^ (s[7] & 0x0000ffff) \
  434. ^ (s[7] << 16) \
  435. ^ (s[7] >> 16); \
  436. u[5] = m[5] ^ (s[0] << 16) \
  437. ^ (s[0] >> 16) \
  438. ^ (s[0] & 0xffff0000) \
  439. ^ (s[1] & 0x0000ffff) \
  440. ^ s[2] \
  441. ^ (s[2] >> 16) \
  442. ^ (s[3] << 16) \
  443. ^ (s[3] >> 16) \
  444. ^ (s[4] << 16) \
  445. ^ (s[4] >> 16) \
  446. ^ (s[5] << 16) \
  447. ^ (s[6] << 16) \
  448. ^ (s[6] >> 16) \
  449. ^ (s[7] & 0xffff0000) \
  450. ^ (s[7] << 16) \
  451. ^ (s[7] >> 16); \
  452. u[6] = m[6] ^ s[0] \
  453. ^ (s[1] >> 16) \
  454. ^ (s[2] << 16) \
  455. ^ s[3] \
  456. ^ (s[3] >> 16) \
  457. ^ (s[4] << 16) \
  458. ^ (s[4] >> 16) \
  459. ^ (s[5] << 16) \
  460. ^ (s[5] >> 16) \
  461. ^ s[6] \
  462. ^ (s[6] << 16) \
  463. ^ (s[6] >> 16) \
  464. ^ (s[7] << 16); \
  465. u[7] = m[7] ^ (s[0] & 0xffff0000) \
  466. ^ (s[0] << 16) \
  467. ^ (s[1] & 0x0000ffff) \
  468. ^ (s[1] << 16) \
  469. ^ (s[2] >> 16) \
  470. ^ (s[3] << 16) \
  471. ^ s[4] \
  472. ^ (s[4] >> 16) \
  473. ^ (s[5] << 16) \
  474. ^ (s[5] >> 16) \
  475. ^ (s[6] >> 16) \
  476. ^ (s[7] & 0x0000ffff) \
  477. ^ (s[7] << 16) \
  478. ^ (s[7] >> 16);
  479. #define SHIFT16(h,v,u) \
  480. v[0] = h[0] ^ (u[1] << 16) \
  481. ^ (u[0] >> 16); \
  482. v[1] = h[1] ^ (u[2] << 16) \
  483. ^ (u[1] >> 16); \
  484. v[2] = h[2] ^ (u[3] << 16) \
  485. ^ (u[2] >> 16); \
  486. v[3] = h[3] ^ (u[4] << 16) \
  487. ^ (u[3] >> 16); \
  488. v[4] = h[4] ^ (u[5] << 16) \
  489. ^ (u[4] >> 16); \
  490. v[5] = h[5] ^ (u[6] << 16) \
  491. ^ (u[5] >> 16); \
  492. v[6] = h[6] ^ (u[7] << 16) \
  493. ^ (u[6] >> 16); \
  494. v[7] = h[7] ^ (u[0] & 0xffff0000) \
  495. ^ (u[0] << 16) \
  496. ^ (u[7] >> 16) \
  497. ^ (u[1] & 0xffff0000) \
  498. ^ (u[1] << 16) \
  499. ^ (u[6] << 16) \
  500. ^ (u[7] & 0xffff0000);
  501. #define SHIFT61(h,v) \
  502. h[0] = (v[0] & 0xffff0000) \
  503. ^ (v[0] << 16) \
  504. ^ (v[0] >> 16) \
  505. ^ (v[1] >> 16) \
  506. ^ (v[1] & 0xffff0000) \
  507. ^ (v[2] << 16) \
  508. ^ (v[3] >> 16) \
  509. ^ (v[4] << 16) \
  510. ^ (v[5] >> 16) \
  511. ^ v[5] \
  512. ^ (v[6] >> 16) \
  513. ^ (v[7] << 16) \
  514. ^ (v[7] >> 16) \
  515. ^ (v[7] & 0x0000ffff); \
  516. h[1] = (v[0] << 16) \
  517. ^ (v[0] >> 16) \
  518. ^ (v[0] & 0xffff0000) \
  519. ^ (v[1] & 0x0000ffff) \
  520. ^ v[2] \
  521. ^ (v[2] >> 16) \
  522. ^ (v[3] << 16) \
  523. ^ (v[4] >> 16) \
  524. ^ (v[5] << 16) \
  525. ^ (v[6] << 16) \
  526. ^ v[6] \
  527. ^ (v[7] & 0xffff0000) \
  528. ^ (v[7] >> 16); \
  529. h[2] = (v[0] & 0x0000ffff) \
  530. ^ (v[0] << 16) \
  531. ^ (v[1] << 16) \
  532. ^ (v[1] >> 16) \
  533. ^ (v[1] & 0xffff0000) \
  534. ^ (v[2] << 16) \
  535. ^ (v[3] >> 16) \
  536. ^ v[3] \
  537. ^ (v[4] << 16) \
  538. ^ (v[5] >> 16) \
  539. ^ v[6] \
  540. ^ (v[6] >> 16) \
  541. ^ (v[7] & 0x0000ffff) \
  542. ^ (v[7] << 16) \
  543. ^ (v[7] >> 16); \
  544. h[3] = (v[0] << 16) \
  545. ^ (v[0] >> 16) \
  546. ^ (v[0] & 0xffff0000) \
  547. ^ (v[1] & 0xffff0000) \
  548. ^ (v[1] >> 16) \
  549. ^ (v[2] << 16) \
  550. ^ (v[2] >> 16) \
  551. ^ v[2] \
  552. ^ (v[3] << 16) \
  553. ^ (v[4] >> 16) \
  554. ^ v[4] \
  555. ^ (v[5] << 16) \
  556. ^ (v[6] << 16) \
  557. ^ (v[7] & 0x0000ffff) \
  558. ^ (v[7] >> 16); \
  559. h[4] = (v[0] >> 16) \
  560. ^ (v[1] << 16) \
  561. ^ v[1] \
  562. ^ (v[2] >> 16) \
  563. ^ v[2] \
  564. ^ (v[3] << 16) \
  565. ^ (v[3] >> 16) \
  566. ^ v[3] \
  567. ^ (v[4] << 16) \
  568. ^ (v[5] >> 16) \
  569. ^ v[5] \
  570. ^ (v[6] << 16) \
  571. ^ (v[6] >> 16) \
  572. ^ (v[7] << 16); \
  573. h[5] = (v[0] << 16) \
  574. ^ (v[0] & 0xffff0000) \
  575. ^ (v[1] << 16) \
  576. ^ (v[1] >> 16) \
  577. ^ (v[1] & 0xffff0000) \
  578. ^ (v[2] << 16) \
  579. ^ v[2] \
  580. ^ (v[3] >> 16) \
  581. ^ v[3] \
  582. ^ (v[4] << 16) \
  583. ^ (v[4] >> 16) \
  584. ^ v[4] \
  585. ^ (v[5] << 16) \
  586. ^ (v[6] << 16) \
  587. ^ (v[6] >> 16) \
  588. ^ v[6] \
  589. ^ (v[7] << 16) \
  590. ^ (v[7] >> 16) \
  591. ^ (v[7] & 0xffff0000); \
  592. h[6] = v[0] \
  593. ^ v[2] \
  594. ^ (v[2] >> 16) \
  595. ^ v[3] \
  596. ^ (v[3] << 16) \
  597. ^ v[4] \
  598. ^ (v[4] >> 16) \
  599. ^ (v[5] << 16) \
  600. ^ (v[5] >> 16) \
  601. ^ v[5] \
  602. ^ (v[6] << 16) \
  603. ^ (v[6] >> 16) \
  604. ^ v[6] \
  605. ^ (v[7] << 16) \
  606. ^ v[7]; \
  607. h[7] = v[0] \
  608. ^ (v[0] >> 16) \
  609. ^ (v[1] << 16) \
  610. ^ (v[1] >> 16) \
  611. ^ (v[2] << 16) \
  612. ^ (v[3] >> 16) \
  613. ^ v[3] \
  614. ^ (v[4] << 16) \
  615. ^ v[4] \
  616. ^ (v[5] >> 16) \
  617. ^ v[5] \
  618. ^ (v[6] << 16) \
  619. ^ (v[6] >> 16) \
  620. ^ (v[7] << 16) \
  621. ^ v[7];
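/* One PASSn macro per 64-bit chunk of the chaining value: X() and P() derive
 * the 256-bit key K_(n/2+1) from the working vectors U and V, R() encrypts
 * chunk n into s, and A()/AA() (plus C() in PASS2) advance U and V for the
 * next key, following the standard's key-generation procedure. PASS6 is the
 * last pass and therefore skips the update. */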
  622. #define PASS0(h,s,u,v) \
  623. { \
  624. uint32_t k[8]; \
  625. uint32_t w[8]; \
  626. X (w, u, v); \
  627. P (k, w); \
  628. R (k, h, s, 0); \
  629. A (u); \
  630. AA (v); \
  631. }
  632. #define PASS2(h,s,u,v) \
  633. { \
  634. uint32_t k[8]; \
  635. uint32_t w[8]; \
  636. X (w, u, v); \
  637. P (k, w); \
  638. R (k, h, s, 2); \
  639. A (u); \
  640. C (u); \
  641. AA (v); \
  642. }
  643. #define PASS4(h,s,u,v) \
  644. { \
  645. uint32_t k[8]; \
  646. uint32_t w[8]; \
  647. X (w, u, v); \
  648. P (k, w); \
  649. R (k, h, s, 4); \
  650. A (u); \
  651. AA (v); \
  652. }
  653. #define PASS6(h,s,u,v) \
  654. { \
  655. uint32_t k[8]; \
  656. uint32_t w[8]; \
  657. X (w, u, v); \
  658. P (k, w); \
  659. R (k, h, s, 6); \
  660. }
  661. ////////////////////////////////////
  662. // FUCKING SMART XOR MACROS START //
  663. ////////////////////////////////////
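/* The XORn helpers below XOR n __m128i operands into `store`. They are built
 * as balanced trees of _mm_xor_si128() rather than a linear chain, which
 * keeps the dependency chain short; they are used by the SSE2 SHIFT macros
 * further down. */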
  664. #define XOR10(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
  665. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(x0, x1), \
  666. _mm_xor_si128(x2, x3)), \
  667. _mm_xor_si128(_mm_xor_si128(x4, x5), \
  668. _mm_xor_si128(x6, x7))), \
  669. _mm_xor_si128(x8, x9));
  670. #define XOR11(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) \
  671. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(x0, x1), \
  672. _mm_xor_si128(x2, x3)), \
  673. _mm_xor_si128(_mm_xor_si128(x4, x5), \
  674. _mm_xor_si128(x6, x7))), \
  675. _mm_xor_si128(_mm_xor_si128(x8, x9), \
  676. x10));
  677. #define XOR13(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) \
  678. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  679. _mm_xor_si128( x2, x3)), \
  680. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  681. _mm_xor_si128( x6, x7))), \
  682. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  683. _mm_xor_si128(x10, x11)), \
  684. x12))
  685. #define XOR14(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
  686. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  687. _mm_xor_si128( x2, x3)), \
  688. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  689. _mm_xor_si128( x6, x7))), \
  690. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  691. _mm_xor_si128(x10, x11)), \
  692. _mm_xor_si128(x12, x13)));
  693. #define XOR15(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
  694. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  695. _mm_xor_si128( x2, x3)), \
  696. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  697. _mm_xor_si128( x6, x7))), \
  698. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  699. _mm_xor_si128(x10, x11)), \
  700. _mm_xor_si128(_mm_xor_si128(x12, x13), \
  701. x14)));
  702. #define XOR16(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
  703. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  704. _mm_xor_si128( x2, x3)), \
  705. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  706. _mm_xor_si128( x6, x7))), \
  707. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  708. _mm_xor_si128(x10, x11)), \
  709. _mm_xor_si128(_mm_xor_si128(x12, x13), \
  710. _mm_xor_si128(x14, x15))));
  711. #define XOR17(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
  712. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  713. _mm_xor_si128( x2, x3)), \
  714. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  715. _mm_xor_si128( x6, x7))), \
  716. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  717. _mm_xor_si128(x10, x11)), \
  718. _mm_xor_si128(_mm_xor_si128(x12, x13), \
  719. _mm_xor_si128(x14, x15)))), \
  720. x16);
  721. #define XOR19(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18) \
  722. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  723. _mm_xor_si128( x2, x3)), \
  724. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  725. _mm_xor_si128( x6, x7))), \
  726. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  727. _mm_xor_si128(x10, x11)), \
  728. _mm_xor_si128(_mm_xor_si128(x12, x13), \
  729. _mm_xor_si128(x14, x15)))), \
  730. _mm_xor_si128(_mm_xor_si128(x16, x17), \
  731. x18))
  732. #define XOR20(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19) \
  733. store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0, x1), \
  734. _mm_xor_si128( x2, x3)), \
  735. _mm_xor_si128(_mm_xor_si128( x4, x5), \
  736. _mm_xor_si128( x6, x7))), \
  737. _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8, x9), \
  738. _mm_xor_si128(x10, x11)), \
  739. _mm_xor_si128(_mm_xor_si128(x12, x13), \
  740. _mm_xor_si128(x14, x15)))), \
  741. _mm_xor_si128(_mm_xor_si128(x16, x17), \
  742. _mm_xor_si128(x18, x19)));
  743. //////////////////////////////////
  744. // FUCKING SMART XOR MACROS END //
  745. //////////////////////////////////
  746. ////////////////////////
  747. // SSE2 DEFINES START //
  748. ////////////////////////
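/* The *_SSE2 macros keep four independent hash computations packed in the
 * four 32-bit lanes of each __m128i. All shift/mask/XOR work uses SSE2
 * intrinsics; only the table lookups of the cipher round cannot be
 * vectorized that way, so round0_SSE2..round3_SSE2 each handle one lane with
 * scalar code through uint32_t views of the vector registers. */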
  749. #define round0_SSE2(k1,k2) \
  750. { \
  751. uint32_t t; \
  752. uint32_t *_k1 = (uint32_t *)&k1; \
  753. uint32_t *_k2 = (uint32_t *)&k2; \
  754. uint32_t *_l = (uint32_t *)&l; \
  755. uint32_t *_r = (uint32_t *)&r; \
  756. t = (_k1[0]) + _r[0]; \
  757. _l[0] ^= tables[0][(t >> 0) & 0xff] ^ \
  758. tables[1][(t >> 8) & 0xff] ^ \
  759. tables[2][(t >> 16) & 0xff] ^ \
  760. tables[3][(t >> 24) & 0xff]; \
  761. t = (_k2[0]) + _l[0]; \
  762. _r[0] ^= tables[0][(t >> 0) & 0xff] ^ \
  763. tables[1][(t >> 8) & 0xff] ^ \
  764. tables[2][(t >> 16) & 0xff] ^ \
  765. tables[3][(t >> 24) & 0xff]; \
  766. }
  767. #define round1_SSE2(k1,k2) \
  768. { \
  769. uint32_t t; \
  770. uint32_t *_k1 = (uint32_t *)&k1; \
  771. uint32_t *_k2 = (uint32_t *)&k2; \
  772. uint32_t *_l = (uint32_t *)&l; \
  773. uint32_t *_r = (uint32_t *)&r; \
  774. t = (_k1[1]) + _r[1]; \
  775. _l[1] ^= tables[0][(t >> 0) & 0xff] ^ \
  776. tables[1][(t >> 8) & 0xff] ^ \
  777. tables[2][(t >> 16) & 0xff] ^ \
  778. tables[3][(t >> 24) & 0xff]; \
  779. t = (_k2[1]) + _l[1]; \
  780. _r[1] ^= tables[0][(t >> 0) & 0xff] ^ \
  781. tables[1][(t >> 8) & 0xff] ^ \
  782. tables[2][(t >> 16) & 0xff] ^ \
  783. tables[3][(t >> 24) & 0xff]; \
  784. }
  785. #define round2_SSE2(k1,k2) \
  786. { \
  787. uint32_t t; \
  788. uint32_t *_k1 = (uint32_t *)&k1; \
  789. uint32_t *_k2 = (uint32_t *)&k2; \
  790. uint32_t *_l = (uint32_t *)&l; \
  791. uint32_t *_r = (uint32_t *)&r; \
  792. t = (_k1[2]) + _r[2]; \
  793. _l[2] ^= tables[0][(t >> 0) & 0xff] ^ \
  794. tables[1][(t >> 8) & 0xff] ^ \
  795. tables[2][(t >> 16) & 0xff] ^ \
  796. tables[3][(t >> 24) & 0xff]; \
  797. t = (_k2[2]) + _l[2]; \
  798. _r[2] ^= tables[0][(t >> 0) & 0xff] ^ \
  799. tables[1][(t >> 8) & 0xff] ^ \
  800. tables[2][(t >> 16) & 0xff] ^ \
  801. tables[3][(t >> 24) & 0xff]; \
  802. }
  803. #define round3_SSE2(k1,k2) \
  804. { \
  805. uint32_t t; \
  806. uint32_t *_k1 = (uint32_t *)&k1; \
  807. uint32_t *_k2 = (uint32_t *)&k2; \
  808. uint32_t *_l = (uint32_t *)&l; \
  809. uint32_t *_r = (uint32_t *)&r; \
  810. t = (_k1[3]) + _r[3]; \
  811. _l[3] ^= tables[0][(t >> 0) & 0xff] ^ \
  812. tables[1][(t >> 8) & 0xff] ^ \
  813. tables[2][(t >> 16) & 0xff] ^ \
  814. tables[3][(t >> 24) & 0xff]; \
  815. t = (_k2[3]) + _l[3]; \
  816. _r[3] ^= tables[0][(t >> 0) & 0xff] ^ \
  817. tables[1][(t >> 8) & 0xff] ^ \
  818. tables[2][(t >> 16) & 0xff] ^ \
  819. tables[3][(t >> 24) & 0xff]; \
  820. }
  821. #define R_SSE2(k,h,s,i) \
  822. { \
  823. __m128i r; \
  824. __m128i l; \
  825. r = h[i + 0]; \
  826. l = h[i + 1]; \
  827. round0_SSE2 (k[0], k[1]); \
  828. round1_SSE2 (k[0], k[1]); \
  829. round2_SSE2 (k[0], k[1]); \
  830. round3_SSE2 (k[0], k[1]); \
  831. round0_SSE2 (k[2], k[3]); \
  832. round1_SSE2 (k[2], k[3]); \
  833. round2_SSE2 (k[2], k[3]); \
  834. round3_SSE2 (k[2], k[3]); \
  835. round0_SSE2 (k[4], k[5]); \
  836. round1_SSE2 (k[4], k[5]); \
  837. round2_SSE2 (k[4], k[5]); \
  838. round3_SSE2 (k[4], k[5]); \
  839. round0_SSE2 (k[6], k[7]); \
  840. round1_SSE2 (k[6], k[7]); \
  841. round2_SSE2 (k[6], k[7]); \
  842. round3_SSE2 (k[6], k[7]); \
  843. round0_SSE2 (k[0], k[1]); \
  844. round1_SSE2 (k[0], k[1]); \
  845. round2_SSE2 (k[0], k[1]); \
  846. round3_SSE2 (k[0], k[1]); \
  847. round0_SSE2 (k[2], k[3]); \
  848. round1_SSE2 (k[2], k[3]); \
  849. round2_SSE2 (k[2], k[3]); \
  850. round3_SSE2 (k[2], k[3]); \
  851. round0_SSE2 (k[4], k[5]); \
  852. round1_SSE2 (k[4], k[5]); \
  853. round2_SSE2 (k[4], k[5]); \
  854. round3_SSE2 (k[4], k[5]); \
  855. round0_SSE2 (k[6], k[7]); \
  856. round1_SSE2 (k[6], k[7]); \
  857. round2_SSE2 (k[6], k[7]); \
  858. round3_SSE2 (k[6], k[7]); \
  859. round0_SSE2 (k[0], k[1]); \
  860. round1_SSE2 (k[0], k[1]); \
  861. round2_SSE2 (k[0], k[1]); \
  862. round3_SSE2 (k[0], k[1]); \
  863. round0_SSE2 (k[2], k[3]); \
  864. round1_SSE2 (k[2], k[3]); \
  865. round2_SSE2 (k[2], k[3]); \
  866. round3_SSE2 (k[2], k[3]); \
  867. round0_SSE2 (k[4], k[5]); \
  868. round1_SSE2 (k[4], k[5]); \
  869. round2_SSE2 (k[4], k[5]); \
  870. round3_SSE2 (k[4], k[5]); \
  871. round0_SSE2 (k[6], k[7]); \
  872. round1_SSE2 (k[6], k[7]); \
  873. round2_SSE2 (k[6], k[7]); \
  874. round3_SSE2 (k[6], k[7]); \
  875. round0_SSE2 (k[7], k[6]); \
  876. round1_SSE2 (k[7], k[6]); \
  877. round2_SSE2 (k[7], k[6]); \
  878. round3_SSE2 (k[7], k[6]); \
  879. round0_SSE2 (k[5], k[4]); \
  880. round1_SSE2 (k[5], k[4]); \
  881. round2_SSE2 (k[5], k[4]); \
  882. round3_SSE2 (k[5], k[4]); \
  883. round0_SSE2 (k[3], k[2]); \
  884. round1_SSE2 (k[3], k[2]); \
  885. round2_SSE2 (k[3], k[2]); \
  886. round3_SSE2 (k[3], k[2]); \
  887. round0_SSE2 (k[1], k[0]); \
  888. round1_SSE2 (k[1], k[0]); \
  889. round2_SSE2 (k[1], k[0]); \
  890. round3_SSE2 (k[1], k[0]); \
  891. s[i + 0] = l; \
  892. s[i + 1] = r; \
  893. }
  894. #define X_SSE2(w,u,v) \
  895. w[0] = _mm_xor_si128(u[0], v[0]); \
  896. w[1] = _mm_xor_si128(u[1], v[1]); \
  897. w[2] = _mm_xor_si128(u[2], v[2]); \
  898. w[3] = _mm_xor_si128(u[3], v[3]); \
  899. w[4] = _mm_xor_si128(u[4], v[4]); \
  900. w[5] = _mm_xor_si128(u[5], v[5]); \
  901. w[6] = _mm_xor_si128(u[6], v[6]); \
  902. w[7] = _mm_xor_si128(u[7], v[7]);
  903. #define P_SSE2(k,w) \
  904. k[0] = _mm_or_si128(_mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0x000000ff)), 0), \
  905. _mm_slli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0x000000ff)), 8) ), \
  906. _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0x000000ff)), 16), \
  907. _mm_slli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0x000000ff)), 24) ) ); \
  908. k[1] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0x0000ff00)), 8), \
  909. _mm_srli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0x0000ff00)), 0) ), \
  910. _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0x0000ff00)), 8), \
  911. _mm_slli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0x0000ff00)), 16) ) ); \
  912. k[2] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0x00ff0000)), 16), \
  913. _mm_srli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0x00ff0000)), 8) ), \
  914. _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0x00ff0000)), 0), \
  915. _mm_slli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0x00ff0000)), 8) ) ); \
  916. k[3] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0xff000000)), 24), \
  917. _mm_srli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0xff000000)), 16) ), \
  918. _mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0xff000000)), 8), \
  919. _mm_srli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0xff000000)), 0) ) ); \
  920. k[4] = _mm_or_si128(_mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0x000000ff)), 0), \
  921. _mm_slli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0x000000ff)), 8) ), \
  922. _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0x000000ff)), 16), \
  923. _mm_slli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0x000000ff)), 24) ) ); \
  924. k[5] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0x0000ff00)), 8), \
  925. _mm_srli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0x0000ff00)), 0) ), \
  926. _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0x0000ff00)), 8), \
  927. _mm_slli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0x0000ff00)), 16) ) ); \
  928. k[6] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0x00ff0000)), 16), \
  929. _mm_srli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0x00ff0000)), 8) ), \
  930. _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0x00ff0000)), 0), \
  931. _mm_slli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0x00ff0000)), 8) ) ); \
  932. k[7] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0xff000000)), 24), \
  933. _mm_srli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0xff000000)), 16) ), \
  934. _mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0xff000000)), 8), \
  935. _mm_srli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0xff000000)), 0) ) );
  936. #define A_SSE2(x) \
  937. { \
  938. __m128i l; \
  939. __m128i r; \
  940. l = _mm_xor_si128(x[0], x[2]); \
  941. r = _mm_xor_si128(x[1], x[3]); \
  942. x[0] = x[2]; \
  943. x[1] = x[3]; \
  944. x[2] = x[4]; \
  945. x[3] = x[5]; \
  946. x[4] = x[6]; \
  947. x[5] = x[7]; \
  948. x[6] = l; \
  949. x[7] = r; \
  950. }
  951. #define AA_SSE2(x) \
  952. { \
  953. __m128i l; \
  954. __m128i r; \
  955. l = x[0]; \
  956. r = x[2]; \
  957. x[0] = x[4]; \
  958. x[2] = x[6]; \
  959. x[4] = _mm_xor_si128(l, r); \
  960. x[6] = _mm_xor_si128(x[0], r); \
  961. l = x[1]; \
  962. r = x[3]; \
  963. x[1] = x[5]; \
  964. x[3] = x[7]; \
  965. x[5] = _mm_xor_si128(l, r); \
  966. x[7] = _mm_xor_si128(x[1], r); \
  967. }
  968. #define C_SSE2(x) \
  969. x[0] = _mm_xor_si128(x[0], _mm_set1_epi32(0xff00ff00)); \
  970. x[1] = _mm_xor_si128(x[1], _mm_set1_epi32(0xff00ff00)); \
  971. x[2] = _mm_xor_si128(x[2], _mm_set1_epi32(0x00ff00ff)); \
  972. x[3] = _mm_xor_si128(x[3], _mm_set1_epi32(0x00ff00ff)); \
  973. x[4] = _mm_xor_si128(x[4], _mm_set1_epi32(0x00ffff00)); \
  974. x[5] = _mm_xor_si128(x[5], _mm_set1_epi32(0xff0000ff)); \
  975. x[6] = _mm_xor_si128(x[6], _mm_set1_epi32(0x000000ff)); \
  976. x[7] = _mm_xor_si128(x[7], _mm_set1_epi32(0xff00ffff));
  977. #define SHIFT12_SSE2(u,m,s) \
  978. u[0] = _mm_xor_si128(m[0], s[6]); \
  979. u[1] = _mm_xor_si128(m[1], s[7]); \
  980. XOR11(u[2], \
  981. m[2], \
  982. _mm_slli_epi32(s[0], 16), \
  983. _mm_srli_epi32(s[0], 16), \
  984. _mm_and_si128(s[0], _mm_set1_epi32(0x0000ffff)), \
  985. _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)), \
  986. _mm_srli_epi32(s[1], 16), \
  987. _mm_slli_epi32(s[2], 16), \
  988. s[6], \
  989. _mm_slli_epi32(s[6], 16), \
  990. _mm_and_si128(s[7], _mm_set1_epi32(0xffff0000)), \
  991. _mm_srli_epi32(s[7], 16)); \
  992. XOR15(u[3], \
  993. m[3], \
  994. _mm_and_si128(s[0], _mm_set1_epi32(0x0000ffff)), \
  995. _mm_slli_epi32(s[0], 16), \
  996. _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)), \
  997. _mm_slli_epi32(s[1], 16), \
  998. _mm_srli_epi32(s[1], 16), \
  999. _mm_slli_epi32(s[2], 16), \
  1000. _mm_srli_epi32(s[2], 16), \
  1001. _mm_slli_epi32(s[3], 16), \
  1002. s[6], \
  1003. _mm_slli_epi32(s[6], 16), \
  1004. _mm_srli_epi32(s[6], 16), \
  1005. _mm_and_si128(s[7], _mm_set1_epi32(0x0000ffff)), \
  1006. _mm_slli_epi32(s[7], 16), \
  1007. _mm_srli_epi32(s[7], 16)); \
  1008. XOR16(u[4], \
  1009. m[4], \
  1010. _mm_and_si128(s[0], _mm_set1_epi32(0xffff0000)), \
  1011. _mm_slli_epi32(s[0], 16), \
  1012. _mm_srli_epi32(s[0], 16), \
  1013. _mm_and_si128(s[1], _mm_set1_epi32(0xffff0000)), \
  1014. _mm_srli_epi32(s[1], 16), \
  1015. _mm_slli_epi32(s[2], 16), \
  1016. _mm_srli_epi32(s[2], 16), \
  1017. _mm_slli_epi32(s[3], 16), \
  1018. _mm_srli_epi32(s[3], 16), \
  1019. _mm_slli_epi32(s[4], 16), \
  1020. _mm_slli_epi32(s[6], 16), \
  1021. _mm_srli_epi32(s[6], 16), \
  1022. _mm_and_si128(s[7], _mm_set1_epi32(0x0000ffff)), \
  1023. _mm_slli_epi32(s[7], 16), \
  1024. _mm_srli_epi32(s[7], 16)); \
  1025. XOR17(u[5], \
  1026. m[5], \
  1027. _mm_slli_epi32(s[0], 16), \
  1028. _mm_srli_epi32(s[0], 16), \
  1029. _mm_and_si128(s[0], _mm_set1_epi32(0xffff0000)), \
  1030. _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)), \
  1031. s[2], \
  1032. _mm_srli_epi32(s[2], 16), \
  1033. _mm_slli_epi32(s[3], 16), \
  1034. _mm_srli_epi32(s[3], 16), \
  1035. _mm_slli_epi32(s[4], 16), \
  1036. _mm_srli_epi32(s[4], 16), \
  1037. _mm_slli_epi32(s[5], 16), \
  1038. _mm_slli_epi32(s[6], 16), \
  1039. _mm_srli_epi32(s[6], 16), \
  1040. _mm_and_si128(s[7], _mm_set1_epi32(0xffff0000)), \
  1041. _mm_slli_epi32(s[7], 16), \
  1042. _mm_srli_epi32(s[7], 16)); \
  1043. XOR14(u[6], \
  1044. m[6], \
  1045. s[0], \
  1046. _mm_srli_epi32(s[1], 16), \
  1047. _mm_slli_epi32(s[2], 16), \
  1048. s[3], \
  1049. _mm_srli_epi32(s[3], 16), \
  1050. _mm_slli_epi32(s[4], 16), \
  1051. _mm_srli_epi32(s[4], 16), \
  1052. _mm_slli_epi32(s[5], 16), \
  1053. _mm_srli_epi32(s[5], 16), \
  1054. s[6], \
  1055. _mm_slli_epi32(s[6], 16), \
  1056. _mm_srli_epi32(s[6], 16), \
  1057. _mm_slli_epi32(s[7], 16)); \
  1058. XOR15(u[7], \
  1059. m[7], \
  1060. _mm_and_si128(s[0], _mm_set1_epi32(0xffff0000)), \
  1061. _mm_slli_epi32(s[0], 16), \
  1062. _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)), \
  1063. _mm_slli_epi32(s[1], 16), \
  1064. _mm_srli_epi32(s[2], 16), \
  1065. _mm_slli_epi32(s[3], 16), \
  1066. s[4], \
  1067. _mm_srli_epi32(s[4], 16), \
  1068. _mm_slli_epi32(s[5], 16), \
  1069. _mm_srli_epi32(s[5], 16), \
  1070. _mm_srli_epi32(s[6], 16), \
  1071. _mm_and_si128(s[7], _mm_set1_epi32(0x0000ffff)), \
  1072. _mm_slli_epi32(s[7], 16), \
  1073. _mm_srli_epi32(s[7], 16));
  1074. #define SHIFT16_SSE2(h,v,u) \
  1075. v[0] = _mm_xor_si128( _mm_xor_si128(h[0],_mm_slli_epi32(u[1], 16)), \
  1076. _mm_srli_epi32(u[0], 16)); \
  1077. v[1] = _mm_xor_si128(_mm_xor_si128( h[1], _mm_slli_epi32(u[2], 16)), \
  1078. _mm_srli_epi32(u[1], 16)); \
  1079. v[2] = _mm_xor_si128(_mm_xor_si128( h[2], _mm_slli_epi32(u[3], 16)), \
  1080. _mm_srli_epi32(u[2], 16)); \
  1081. v[3] = _mm_xor_si128(_mm_xor_si128( h[3], _mm_slli_epi32(u[4], 16)), \
  1082. _mm_srli_epi32(u[3], 16)); \
  1083. v[4] = _mm_xor_si128(_mm_xor_si128( h[4], _mm_slli_epi32(u[5], 16)), \
  1084. _mm_srli_epi32(u[4], 16)); \
  1085. v[5] = _mm_xor_si128(_mm_xor_si128( h[5], _mm_slli_epi32(u[6], 16)), \
  1086. _mm_srli_epi32(u[5], 16)); \
  1087. v[6] = _mm_xor_si128(_mm_xor_si128( h[6], _mm_slli_epi32(u[7], 16)), \
  1088. _mm_srli_epi32(u[6], 16)); \
  1089. v[7] = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(h[7], \
  1090. _mm_and_si128(u[0], _mm_set1_epi32(0xffff0000))), \
  1091. _mm_xor_si128(_mm_slli_epi32(u[0], 16), \
  1092. _mm_srli_epi32(u[7], 16))), \
  1093. _mm_xor_si128(_mm_xor_si128(_mm_and_si128(u[1], _mm_set1_epi32(0xffff0000)), \
  1094. _mm_slli_epi32(u[1], 16)), \
  1095. _mm_xor_si128(_mm_slli_epi32(u[6], 16), \
  1096. _mm_and_si128(u[7], _mm_set1_epi32(0xffff0000)))));
  1097. #define SHIFT61_SSE2(h,v) \
  1098. XOR14(h[0], \
  1099. _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)), \
  1100. _mm_slli_epi32(v[0], 16), \
  1101. _mm_srli_epi32(v[0], 16), \
  1102. _mm_srli_epi32(v[1], 16), \
  1103. _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)), \
  1104. _mm_slli_epi32(v[2], 16), \
  1105. _mm_srli_epi32(v[3], 16), \
  1106. _mm_slli_epi32(v[4], 16), \
  1107. _mm_srli_epi32(v[5], 16), \
  1108. v[5], \
  1109. _mm_srli_epi32(v[6], 16), \
  1110. _mm_slli_epi32(v[7], 16), \
  1111. _mm_and_si128(v[7], _mm_set1_epi32(0x0000ffff)), \
  1112. _mm_srli_epi32(v[7], 16)); \
  1113. XOR13(h[1], \
  1114. _mm_slli_epi32(v[0], 16), \
  1115. _mm_srli_epi32(v[0], 16), \
  1116. _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)), \
  1117. _mm_and_si128(v[1], _mm_set1_epi32(0x0000ffff)), \
  1118. _mm_srli_epi32(v[2], 16), \
  1119. v[2], \
  1120. _mm_slli_epi32(v[3], 16), \
  1121. _mm_srli_epi32(v[4], 16), \
  1122. _mm_slli_epi32(v[5], 16), \
  1123. _mm_slli_epi32(v[6], 16), \
  1124. _mm_and_si128(v[7], _mm_set1_epi32(0xffff0000)), \
  1125. v[6], \
  1126. _mm_srli_epi32(v[7], 16)); \
  1127. XOR15(h[2], \
  1128. _mm_and_si128(v[0], _mm_set1_epi32(0x0000ffff)), \
  1129. _mm_slli_epi32(v[0], 16), \
  1130. _mm_slli_epi32(v[1], 16), \
  1131. _mm_srli_epi32(v[1], 16), \
  1132. _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)), \
  1133. _mm_slli_epi32(v[2], 16), \
  1134. _mm_srli_epi32(v[3], 16), \
  1135. v[3], \
  1136. _mm_slli_epi32(v[4], 16), \
  1137. _mm_srli_epi32(v[5], 16), \
  1138. _mm_srli_epi32(v[6], 16), \
  1139. v[6], \
  1140. _mm_and_si128(v[7], _mm_set1_epi32(0x0000ffff)), \
  1141. _mm_slli_epi32(v[7], 16), \
  1142. _mm_srli_epi32(v[7], 16)); \
  1143. XOR15(h[3], \
  1144. _mm_slli_epi32(v[0], 16), \
  1145. _mm_srli_epi32(v[0], 16), \
  1146. _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)), \
  1147. _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)), \
  1148. _mm_srli_epi32(v[1], 16), \
  1149. _mm_slli_epi32(v[2], 16), \
  1150. _mm_srli_epi32(v[2], 16), \
  1151. v[2], \
  1152. _mm_slli_epi32(v[3], 16), \
  1153. _mm_srli_epi32(v[4], 16), \
  1154. _mm_slli_epi32(v[5], 16), \
  1155. v[4], \
  1156. _mm_and_si128(v[7], _mm_set1_epi32(0x0000ffff)), \
  1157. _mm_slli_epi32(v[6], 16), \
  1158. _mm_srli_epi32(v[7], 16)); \
  1159. XOR14(h[4], \
  1160. _mm_srli_epi32(v[0], 16), \
  1161. _mm_slli_epi32(v[1], 16), \
  1162. _mm_srli_epi32(v[2], 16), \
  1163. v[1], \
  1164. _mm_slli_epi32(v[3], 16), \
  1165. v[2], \
  1166. _mm_srli_epi32(v[3], 16), \
  1167. v[3], \
  1168. _mm_slli_epi32(v[4], 16), \
  1169. _mm_srli_epi32(v[5], 16), \
  1170. _mm_slli_epi32(v[6], 16), \
  1171. v[5], \
  1172. _mm_srli_epi32(v[6], 16), \
  1173. _mm_slli_epi32(v[7], 16)); \
  1174. XOR19(h[5], \
  1175. _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)), \
  1176. _mm_slli_epi32(v[0], 16), \
  1177. _mm_slli_epi32(v[1], 16), \
  1178. _mm_srli_epi32(v[1], 16), \
  1179. _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)), \
  1180. _mm_slli_epi32(v[2], 16), \
  1181. _mm_srli_epi32(v[3], 16), \
  1182. v[2], \
  1183. _mm_slli_epi32(v[4], 16), \
  1184. v[3], \
  1185. _mm_srli_epi32(v[4], 16), \
  1186. v[4], \
  1187. _mm_slli_epi32(v[5], 16), \
  1188. _mm_slli_epi32(v[6], 16), \
  1189. _mm_srli_epi32(v[6], 16), \
  1190. v[6], \
  1191. _mm_slli_epi32(v[7], 16), \
  1192. _mm_srli_epi32(v[7], 16), \
  1193. _mm_and_si128(v[7], _mm_set1_epi32(0xffff0000))); \
  1194. XOR15(h[6], \
  1195. v[0], \
  1196. v[2], \
  1197. _mm_srli_epi32(v[2], 16), \
  1198. v[3], \
  1199. _mm_slli_epi32(v[3], 16), \
  1200. v[4], \
  1201. _mm_srli_epi32(v[4], 16), \
  1202. _mm_slli_epi32(v[5], 16), \
  1203. _mm_srli_epi32(v[5], 16), \
  1204. v[5], \
  1205. _mm_slli_epi32(v[6], 16), \
  1206. _mm_srli_epi32(v[6], 16), \
  1207. _mm_slli_epi32(v[7], 16), \
  1208. v[6], \
  1209. v[7]); \
  1210. XOR15(h[7], \
  1211. _mm_srli_epi32(v[0], 16), \
  1212. v[0], \
  1213. _mm_slli_epi32(v[1], 16), \
  1214. _mm_srli_epi32(v[1], 16), \
  1215. _mm_slli_epi32(v[2], 16), \
  1216. _mm_srli_epi32(v[3], 16), \
  1217. _mm_slli_epi32(v[4], 16), \
  1218. v[3], \
  1219. _mm_srli_epi32(v[5], 16), \
  1220. v[4], \
  1221. _mm_slli_epi32(v[6], 16), \
  1222. v[5], \
  1223. _mm_srli_epi32(v[6], 16), \
  1224. _mm_slli_epi32(v[7], 16), \
  1225. v[7]);
  1226. #define PASS0_SSE2(h,s,u,v) \
  1227. { \
  1228. __m128i k[8]; \
  1229. __m128i w[8]; \
  1230. X_SSE2 (w, u, v); \
  1231. P_SSE2 (k, w); \
  1232. R_SSE2 (k, h, s, 0); \
  1233. A_SSE2 (u); \
  1234. AA_SSE2 (v); \
  1235. }
  1236. #define PASS2_SSE2(h,s,u,v) \
  1237. { \
  1238. __m128i k[8]; \
  1239. __m128i w[8]; \
  1240. X_SSE2 (w, u, v); \
  1241. P_SSE2 (k, w); \
  1242. R_SSE2 (k, h, s, 2); \
  1243. A_SSE2 (u); \
  1244. C_SSE2 (u); \
  1245. AA_SSE2 (v); \
  1246. }
  1247. #define PASS4_SSE2(h,s,u,v) \
  1248. { \
  1249. __m128i k[8]; \
  1250. __m128i w[8]; \
  1251. X_SSE2 (w, u, v); \
  1252. P_SSE2 (k, w); \
  1253. R_SSE2 (k, h, s, 4); \
  1254. A_SSE2 (u); \
  1255. AA_SSE2 (v); \
  1256. }
  1257. #define PASS6_SSE2(h,s,u,v) \
  1258. { \
  1259. __m128i k[8]; \
  1260. __m128i w[8]; \
  1261. X_SSE2 (w, u, v); \
  1262. P_SSE2 (k, w); \
  1263. R_SSE2 (k, h, s, 6); \
  1264. }
  1265. //////////////////////
  1266. // SSE2 DEFINES END //
  1267. //////////////////////
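/* Scalar reference path: hashes four independent single-block inputs stored
 * column-wise (blocks[word][lane]). Per lane it runs the three compressions
 * labelled below: the 256-bit message block (gost1), the length block taken
 * from blocks[15] (gost2) and the running 256-bit checksum kept in
 * state[8..15] (gost3), then byte-swaps the digest words. blocks[8..14] are
 * never read. */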
  1268. void hashcat_gost_64 (uint32_t digests[8][4], uint32_t blocks[16][4])
  1269. {
  1270. /**
  1271. * base
  1272. */
  1273. int id;
  1274. for (id = 0; id < 4; id++)
  1275. {
  1276. uint32_t data[8];
  1277. data[0] = blocks[0][id];
  1278. data[1] = blocks[1][id];
  1279. data[2] = blocks[2][id];
  1280. data[3] = blocks[3][id];
  1281. data[4] = blocks[4][id];
  1282. data[5] = blocks[5][id];
  1283. data[6] = blocks[6][id];
  1284. data[7] = blocks[7][id];
  1285. uint32_t state[16];
  1286. state[ 0] = 0;
  1287. state[ 1] = 0;
  1288. state[ 2] = 0;
  1289. state[ 3] = 0;
  1290. state[ 4] = 0;
  1291. state[ 5] = 0;
  1292. state[ 6] = 0;
  1293. state[ 7] = 0;
  1294. state[ 8] = data[0];
  1295. state[ 9] = data[1];
  1296. state[10] = data[2];
  1297. state[11] = data[3];
  1298. state[12] = data[4];
  1299. state[13] = data[5];
  1300. state[14] = data[6];
  1301. state[15] = data[7];
  1302. uint32_t state_m[8];
  1303. uint32_t data_m[8];
  1304. /* gost1 */
  1305. state_m[0] = state[0];
  1306. state_m[1] = state[1];
  1307. state_m[2] = state[2];
  1308. state_m[3] = state[3];
  1309. state_m[4] = state[4];
  1310. state_m[5] = state[5];
  1311. state_m[6] = state[6];
  1312. state_m[7] = state[7];
  1313. data_m[0] = data[0];
  1314. data_m[1] = data[1];
  1315. data_m[2] = data[2];
  1316. data_m[3] = data[3];
  1317. data_m[4] = data[4];
  1318. data_m[5] = data[5];
  1319. data_m[6] = data[6];
  1320. data_m[7] = data[7];
  1321. uint32_t tmp[8];
  1322. PASS0 (state, tmp, state_m, data_m);
  1323. PASS2 (state, tmp, state_m, data_m);
  1324. PASS4 (state, tmp, state_m, data_m);
  1325. PASS6 (state, tmp, state_m, data_m);
  1326. SHIFT12 (state_m, data, tmp);
  1327. SHIFT16 (state, data_m, state_m);
  1328. SHIFT61 (state, data_m);
  1329. data[0] = blocks[15][id];
  1330. data[1] = 0;
  1331. data[2] = 0;
  1332. data[3] = 0;
  1333. data[4] = 0;
  1334. data[5] = 0;
  1335. data[6] = 0;
  1336. data[7] = 0;
  1337. /* gost2 */
  1338. state_m[0] = state[0];
  1339. state_m[1] = state[1];
  1340. state_m[2] = state[2];
  1341. state_m[3] = state[3];
  1342. state_m[4] = state[4];
  1343. state_m[5] = state[5];
  1344. state_m[6] = state[6];
  1345. state_m[7] = state[7];
  1346. data_m[0] = data[0];
  1347. data_m[1] = data[1];
  1348. data_m[2] = data[2];
  1349. data_m[3] = data[3];
  1350. data_m[4] = data[4];
  1351. data_m[5] = data[5];
  1352. data_m[6] = data[6];
  1353. data_m[7] = data[7];
  1354. PASS0 (state, tmp, state_m, data_m);
  1355. PASS2 (state, tmp, state_m, data_m);
  1356. PASS4 (state, tmp, state_m, data_m);
  1357. PASS6 (state, tmp, state_m, data_m);
  1358. SHIFT12 (state_m, data, tmp);
  1359. SHIFT16 (state, data_m, state_m);
  1360. SHIFT61 (state, data_m);
  1361. /* gost3 */
  1362. data[0] = state[ 8];
  1363. data[1] = state[ 9];
  1364. data[2] = state[10];
  1365. data[3] = state[11];
  1366. data[4] = state[12];
  1367. data[5] = state[13];
  1368. data[6] = state[14];
  1369. data[7] = state[15];
  1370. state_m[0] = state[0];
  1371. state_m[1] = state[1];
  1372. state_m[2] = state[2];
  1373. state_m[3] = state[3];
  1374. state_m[4] = state[4];
  1375. state_m[5] = state[5];
  1376. state_m[6] = state[6];
  1377. state_m[7] = state[7];
  1378. data_m[0] = data[0];
  1379. data_m[1] = data[1];
  1380. data_m[2] = data[2];
  1381. data_m[3] = data[3];
  1382. data_m[4] = data[4];
  1383. data_m[5] = data[5];
  1384. data_m[6] = data[6];
  1385. data_m[7] = data[7];
  1386. PASS0 (state, tmp, state_m, data_m);
  1387. PASS2 (state, tmp, state_m, data_m);
  1388. PASS4 (state, tmp, state_m, data_m);
  1389. PASS6 (state, tmp, state_m, data_m);
  1390. SHIFT12 (state_m, data, tmp);
  1391. SHIFT16 (state, data_m, state_m);
  1392. SHIFT61 (state, data_m);
  1393. /* store */
  1394. digests[0][id] = state[0];
  1395. digests[1][id] = state[1];
  1396. digests[2][id] = state[2];
  1397. digests[3][id] = state[3];
  1398. digests[4][id] = state[4];
  1399. digests[5][id] = state[5];
  1400. digests[6][id] = state[6];
  1401. digests[7][id] = state[7];
  1402. BYTESWAP (digests[0][id]);
  1403. BYTESWAP (digests[1][id]);
  1404. BYTESWAP (digests[2][id]);
  1405. BYTESWAP (digests[3][id]);
  1406. BYTESWAP (digests[4][id]);
  1407. BYTESWAP (digests[5][id]);
  1408. BYTESWAP (digests[6][id]);
  1409. BYTESWAP (digests[7][id]);
  1410. }
  1411. }
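/* SSE2 path: the same computation as hashcat_gost_64(), but the four lanes
 * live in the 32-bit slots of each __m128i, so one call hashes four
 * candidates at once. The trailing BYTESWAP block fixes the digest byte
 * order lane by lane through a uint32_t view of the vector registers. */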
  1412. void hashcat_gost_64_sse2 (__m128i digests[8], __m128i blocks[16])
  1413. {
  1414. __m128i data[8];
  1415. data[0] = blocks[0];
  1416. data[1] = blocks[1];
  1417. data[2] = blocks[2];
  1418. data[3] = blocks[3];
  1419. data[4] = blocks[4];
  1420. data[5] = blocks[5];
  1421. data[6] = blocks[6];
  1422. data[7] = blocks[7];
  1423. __m128i state[16];
  1424. state[ 0] = _mm_set1_epi32 (0);
  1425. state[ 1] = _mm_set1_epi32 (0);
  1426. state[ 2] = _mm_set1_epi32 (0);
  1427. state[ 3] = _mm_set1_epi32 (0);
  1428. state[ 4] = _mm_set1_epi32 (0);
  1429. state[ 5] = _mm_set1_epi32 (0);
  1430. state[ 6] = _mm_set1_epi32 (0);
  1431. state[ 7] = _mm_set1_epi32 (0);
  1432. state[ 8] = data[0];
  1433. state[ 9] = data[1];
  1434. state[10] = data[2];
  1435. state[11] = data[3];
  1436. state[12] = data[4];
  1437. state[13] = data[5];
  1438. state[14] = data[6];
  1439. state[15] = data[7];
  1440. __m128i state_m[8];
  1441. __m128i data_m[8];
  1442. /* gost1 */
  1443. state_m[0] = state[0];
  1444. state_m[1] = state[1];
  1445. state_m[2] = state[2];
  1446. state_m[3] = state[3];
  1447. state_m[4] = state[4];
  1448. state_m[5] = state[5];
  1449. state_m[6] = state[6];
  1450. state_m[7] = state[7];
  1451. data_m[0] = data[0];
  1452. data_m[1] = data[1];
  1453. data_m[2] = data[2];
  1454. data_m[3] = data[3];
  1455. data_m[4] = data[4];
  1456. data_m[5] = data[5];
  1457. data_m[6] = data[6];
  1458. data_m[7] = data[7];
  1459. __m128i tmp[8];
  1460. PASS0_SSE2 (state, tmp, state_m, data_m);
  1461. PASS2_SSE2 (state, tmp, state_m, data_m);
  1462. PASS4_SSE2 (state, tmp, state_m, data_m);
  1463. PASS6_SSE2 (state, tmp, state_m, data_m);
  1464. SHIFT12_SSE2 (state_m, data, tmp);
  1465. SHIFT16_SSE2 (state, data_m, state_m);
  1466. SHIFT61_SSE2 (state, data_m);
  1467. data[0] = blocks[15];
  1468. data[1] = _mm_set1_epi32 (0);
  1469. data[2] = _mm_set1_epi32 (0);
  1470. data[3] = _mm_set1_epi32 (0);
  1471. data[4] = _mm_set1_epi32 (0);
  1472. data[5] = _mm_set1_epi32 (0);
  1473. data[6] = _mm_set1_epi32 (0);
  1474. data[7] = _mm_set1_epi32 (0);
  1475. /* gost2 */
  1476. state_m[0] = state[0];
  1477. state_m[1] = state[1];
  1478. state_m[2] = state[2];
  1479. state_m[3] = state[3];
  1480. state_m[4] = state[4];
  1481. state_m[5] = state[5];
  1482. state_m[6] = state[6];
  1483. state_m[7] = state[7];
  1484. data_m[0] = data[0];
  1485. data_m[1] = data[1];
  1486. data_m[2] = data[2];
  1487. data_m[3] = data[3];
  1488. data_m[4] = data[4];
  1489. data_m[5] = data[5];
  1490. data_m[6] = data[6];
  1491. data_m[7] = data[7];
  1492. PASS0_SSE2 (state, tmp, state_m, data_m);
  1493. PASS2_SSE2 (state, tmp, state_m, data_m);
  1494. PASS4_SSE2 (state, tmp, state_m, data_m);
  1495. PASS6_SSE2 (state, tmp, state_m, data_m);
  1496. SHIFT12_SSE2 (state_m, data, tmp);
  1497. SHIFT16_SSE2 (state, data_m, state_m);
  1498. SHIFT61_SSE2 (state, data_m);
  1499. /* gost3 */
  1500. data[0] = state[ 8];
  1501. data[1] = state[ 9];
  1502. data[2] = state[10];
  1503. data[3] = state[11];
  1504. data[4] = state[12];
  1505. data[5] = state[13];
  1506. data[6] = state[14];
  1507. data[7] = state[15];
  1508. state_m[0] = state[0];
  1509. state_m[1] = state[1];
  1510. state_m[2] = state[2];
  1511. state_m[3] = state[3];
  1512. state_m[4] = state[4];
  1513. state_m[5] = state[5];
  1514. state_m[6] = state[6];
  1515. state_m[7] = state[7];
  1516. data_m[0] = data[0];
  1517. data_m[1] = data[1];
  1518. data_m[2] = data[2];
  1519. data_m[3] = data[3];
  1520. data_m[4] = data[4];
  1521. data_m[5] = data[5];
  1522. data_m[6] = data[6];
  1523. data_m[7] = data[7];
  1524. PASS0_SSE2 (state, tmp, state_m, data_m);
  1525. PASS2_SSE2 (state, tmp, state_m, data_m);
  1526. PASS4_SSE2 (state, tmp, state_m, data_m);
  1527. PASS6_SSE2 (state, tmp, state_m, data_m);
  1528. SHIFT12_SSE2 (state_m, data, tmp);
  1529. SHIFT16_SSE2 (state, data_m, state_m);
  1530. SHIFT61_SSE2 (state, data_m);
  1531. /* store */
  1532. uint32_t * tmpA;
  1533. digests[0] = state[0];
  1534. digests[1] = state[1];
  1535. digests[2] = state[2];
  1536. digests[3] = state[3];
  1537. digests[4] = state[4];
  1538. digests[5] = state[5];
  1539. digests[6] = state[6];
  1540. digests[7] = state[7];
  1541. tmpA = (uint32_t *)&digests[0];
  1542. BYTESWAP (tmpA[0]);
  1543. BYTESWAP (tmpA[1]);
  1544. BYTESWAP (tmpA[2]);
  1545. BYTESWAP (tmpA[3]);
  1546. tmpA = (uint32_t *)&digests[1];
  1547. BYTESWAP (tmpA[0]);
  1548. BYTESWAP (tmpA[1]);
  1549. BYTESWAP (tmpA[2]);
  1550. BYTESWAP (tmpA[3]);
  1551. tmpA = (uint32_t *)&digests[2];
  1552. BYTESWAP (tmpA[0]);
  1553. BYTESWAP (tmpA[1]);
  1554. BYTESWAP (tmpA[2]);
  1555. BYTESWAP (tmpA[3]);
  1556. tmpA = (uint32_t *)&digests[3];
  1557. BYTESWAP (tmpA[0]);
  1558. BYTESWAP (tmpA[1]);
  1559. BYTESWAP (tmpA[2]);
  1560. BYTESWAP (tmpA[3]);
  1561. tmpA = (uint32_t *)&digests[4];
  1562. BYTESWAP (tmpA[0]);
  1563. BYTESWAP (tmpA[1]);
  1564. BYTESWAP (tmpA[2]);
  1565. BYTESWAP (tmpA[3]);
  1566. tmpA = (uint32_t *)&digests[5];
  1567. BYTESWAP (tmpA[0]);
  1568. BYTESWAP (tmpA[1]);
  1569. BYTESWAP (tmpA[2]);
  1570. BYTESWAP (tmpA[3]);
  1571. tmpA = (uint32_t *)&digests[6];
  1572. BYTESWAP (tmpA[0]);
  1573. BYTESWAP (tmpA[1]);
  1574. BYTESWAP (tmpA[2]);
  1575. BYTESWAP (tmpA[3]);
  1576. tmpA = (uint32_t *)&digests[7];
  1577. BYTESWAP (tmpA[0]);
  1578. BYTESWAP (tmpA[1]);
  1579. BYTESWAP (tmpA[2]);
  1580. BYTESWAP (tmpA[3]);
  1581. }
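/* Hypothetical usage sketch (not part of hashcat): shows the calling
 * convention of hashcat_gost_64() for four identical single-block messages.
 * The packing of the message into blocks[0..7] and of the bit length into
 * blocks[15] is an assumption read off the code above; the real layout, like
 * the BYTESWAP() macro this file depends on, is defined by the surrounding
 * hashcat sources. Compile with -DGOST_SSE2_EXAMPLE to enable. */
#ifdef GOST_SSE2_EXAMPLE

#include <stdio.h>
#include <string.h>

int main (void)
{
  uint32_t blocks [16][4];
  uint32_t digests [8][4];

  memset (blocks, 0, sizeof (blocks));

  const char *msg = "abc";                    /* 3 bytes = 24 bits */

  for (int lane = 0; lane < 4; lane++)
  {
    uint32_t buf[8] = { 0 };                  /* one 256-bit message block */

    memcpy (buf, msg, strlen (msg));          /* assumed little-endian word packing */

    for (int i = 0; i < 8; i++) blocks[i][lane] = buf[i];

    blocks[15][lane] = (uint32_t) (strlen (msg) * 8);  /* message length in bits */
  }

  hashcat_gost_64 (digests, blocks);

  for (int lane = 0; lane < 4; lane++)
  {
    printf ("lane %d: ", lane);

    for (int i = 0; i < 8; i++) printf ("%08x", (unsigned) digests[i][lane]);

    printf ("\n");
  }

  return 0;
}

#endif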