crc32c-vpmsum_asm.S 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554
  1. /*
  2. * Calculate the checksum of data that is 16 byte aligned and a multiple of
  3. * 16 bytes.
  4. *
  5. * The first step is to reduce it to 1024 bits. We do this in 8 parallel
  6. * chunks in order to mask the latency of the vpmsum instructions. If we
  7. * have more than 32 kB of data to checksum we repeat this step multiple
  8. * times, passing in the previous 1024 bits.
  9. *
  10. * The next step is to reduce the 1024 bits to 64 bits. This step adds
  11. * 32 bits of 0s to the end - this matches what a CRC does. We just
  12. * calculate constants that land the data in this 32 bits.
  13. *
  14. * We then use fixed point Barrett reduction to compute a mod n over GF(2)
  15. * for n = CRC using POWER8 instructions. We use x = 32.
  16. *
  17. * http://en.wikipedia.org/wiki/Barrett_reduction
  18. *
  19. * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
  20. *
  21. * This program is free software; you can redistribute it and/or
  22. * modify it under the terms of the GNU General Public License
  23. * as published by the Free Software Foundation; either version
  24. * 2 of the License, or (at your option) any later version.
  25. */
  26. #include <asm/ppc_asm.h>
  27. #include <asm/ppc-opcode.h>
  28. .section .rodata
  29. .balign 16
  30. .byteswap_constant:
  31. /* byte reverse permute constant: one 16-byte value whose bytes run 0x0F (most significant) down to 0x00 (least significant); kept 16-byte aligned by the .balign above */
  32. .octa 0x0F0E0D0C0B0A09080706050403020100
  33. #define MAX_SIZE 32768 /* 32768 bytes = 32 kB = 262144 bits; presumably the per-pass limit described in the header comment - confirm at use site */
  34. .constants:
  35. /* Reduce 262144 bits to 1024 bits */
  36. /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
  37. .octa 0x00000000b6ca9e20000000009c37c408
  38. /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
  39. .octa 0x00000000350249a800000001b51df26c
  40. /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
  41. .octa 0x00000001862dac54000000000724b9d0
  42. /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
  43. .octa 0x00000001d87fb48c00000001c00532fe
  44. /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
  45. .octa 0x00000001f39b699e00000000f05a9362
  46. /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
  47. .octa 0x0000000101da11b400000001e1007970
  48. /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
  49. .octa 0x00000001cab571e000000000a57366ee
  50. /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
  51. .octa 0x00000000c7020cfe0000000192011284
  52. /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
  53. .octa 0x00000000cdaed1ae0000000162716d9a
  54. /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
  55. .octa 0x00000001e804effc00000000cd97ecde
  56. /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
  57. .octa 0x0000000077c3ea3a0000000058812bc0
  58. /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
  59. .octa 0x0000000068df31b40000000088b8c12e
  60. /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
  61. .octa 0x00000000b059b6c200000001230b234c
  62. /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
  63. .octa 0x0000000145fb8ed800000001120b416e
  64. /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
  65. .octa 0x00000000cbc0916800000001974aecb0
  66. /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
  67. .octa 0x000000005ceeedc2000000008ee3f226
  68. /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
  69. .octa 0x0000000047d74e8600000001089aba9a
  70. /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
  71. .octa 0x00000001407e9e220000000065113872
  72. /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
  73. .octa 0x00000001da967bda000000005c07ec10
  74. /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
  75. .octa 0x000000006c8983680000000187590924
  76. /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
  77. .octa 0x00000000f2d14c9800000000e35da7c6
  78. /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
  79. .octa 0x00000001993c6ad4000000000415855a
  80. /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
  81. .octa 0x000000014683d1ac0000000073617758
  82. /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
  83. .octa 0x00000001a7c93e6c0000000176021d28
  84. /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
  85. .octa 0x000000010211e90a00000001c358fd0a
  86. /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
  87. .octa 0x000000001119403e00000001ff7a2c18
  88. /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
  89. .octa 0x000000001c3261aa00000000f2d9f7e4
  90. /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
  91. .octa 0x000000014e37a634000000016cf1f9c8
  92. /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
  93. .octa 0x0000000073786c0c000000010af9279a
  94. /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
  95. .octa 0x000000011dc037f80000000004f101e8
  96. /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
  97. .octa 0x0000000031433dfc0000000070bcf184
  98. /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
  99. .octa 0x000000009cde8348000000000a8de642
  100. /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
  101. .octa 0x0000000038d3c2a60000000062ea130c
  102. /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
  103. .octa 0x000000011b25f26000000001eb31cbb2
  104. /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
  105. .octa 0x000000001629e6f00000000170783448
  106. /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
  107. .octa 0x0000000160838b4c00000001a684b4c6
  108. /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
  109. .octa 0x000000007a44011c00000000253ca5b4
  110. /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
  111. .octa 0x00000000226f417a0000000057b4b1e2
  112. /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
  113. .octa 0x0000000045eb2eb400000000b6bd084c
  114. /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
  115. .octa 0x000000014459d70c0000000123c2d592
  116. /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
  117. .octa 0x00000001d406ed8200000000159dafce
  118. /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
  119. .octa 0x0000000160c8e1a80000000127e1a64e
  120. /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
  121. .octa 0x0000000027ba80980000000056860754
  122. /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
  123. .octa 0x000000006d92d01800000001e661aae8
  124. /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
  125. .octa 0x000000012ed7e3f200000000f82c6166
  126. /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
  127. .octa 0x000000002dc8778800000000c4f9c7ae
  128. /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
  129. .octa 0x0000000018240bb80000000074203d20
  130. /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
  131. .octa 0x000000001ad381580000000198173052
  132. /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
  133. .octa 0x00000001396b78f200000001ce8aba54
  134. /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
  135. .octa 0x000000011a68133400000001850d5d94
  136. /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
  137. .octa 0x000000012104732e00000001d609239c
  138. /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
  139. .octa 0x00000000a140d90c000000001595f048
  140. /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
  141. .octa 0x00000001b7215eda0000000042ccee08
  142. /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
  143. .octa 0x00000001aaf1df3c000000010a389d74
  144. /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
  145. .octa 0x0000000029d15b8a000000012a840da6
  146. /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
  147. .octa 0x00000000f1a96922000000001d181c0c
  148. /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
  149. .octa 0x00000001ac80d03c0000000068b7d1f6
  150. /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
  151. .octa 0x000000000f11d56a000000005b0f14fc
  152. /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
  153. .octa 0x00000001f1c022a20000000179e9e730
  154. /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
  155. .octa 0x0000000173d00ae200000001ce1368d6
  156. /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
  157. .octa 0x00000001d4ffe4ac0000000112c3a84c
  158. /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
  159. .octa 0x000000016edc5ae400000000de940fee
  160. /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
  161. .octa 0x00000001f1a0214000000000fe896b7e
  162. /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
  163. .octa 0x00000000ca0b28a000000001f797431c
  164. /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
  165. .octa 0x00000001928e30a20000000053e989ba
  166. /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
  167. .octa 0x0000000097b1b002000000003920cd16
  168. /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
  169. .octa 0x00000000b15bf90600000001e6f579b8
  170. /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
  171. .octa 0x00000000411c5d52000000007493cb0a
  172. /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
  173. .octa 0x00000001c36f330000000001bdd376d8
  174. /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
  175. .octa 0x00000001119227e0000000016badfee6
  176. /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
  177. .octa 0x00000000114d47020000000071de5c58
  178. /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
  179. .octa 0x00000000458b5b9800000000453f317c
  180. /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
  181. .octa 0x000000012e31fb8e0000000121675cce
  182. /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
  183. .octa 0x000000005cf619d800000001f409ee92
  184. /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
  185. .octa 0x0000000063f4d8b200000000f36b9c88
  186. /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
  187. .octa 0x000000004138dc8a0000000036b398f4
  188. /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
  189. .octa 0x00000001d29ee8e000000001748f9adc
  190. /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
  191. .octa 0x000000006a08ace800000001be94ec00
  192. /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
  193. .octa 0x0000000127d4201000000000b74370d6
  194. /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
  195. .octa 0x0000000019d76b6200000001174d0b98
  196. /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
  197. .octa 0x00000001b1471f6e00000000befc06a4
  198. /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
  199. .octa 0x00000001f64c19cc00000001ae125288
  200. /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
  201. .octa 0x00000000003c0ea00000000095c19b34
  202. /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
  203. .octa 0x000000014d73abf600000001a78496f2
  204. /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
  205. .octa 0x00000001620eb84400000001ac5390a0
  206. /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
  207. .octa 0x0000000147655048000000002a80ed6e
  208. /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
  209. .octa 0x0000000067b5077e00000001fa9b0128
  210. /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
  211. .octa 0x0000000010ffe20600000001ea94929e
  212. /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
  213. .octa 0x000000000fee8f1e0000000125f4305c
  214. /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
  215. .octa 0x00000001da26fbae00000001471e2002
  216. /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
  217. .octa 0x00000001b3a8bd880000000132d2253a
  218. /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
  219. .octa 0x00000000e8f3898e00000000f26b3592
  220. /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
  221. .octa 0x00000000b0d0d28c00000000bc8b67b0
  222. /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
  223. .octa 0x0000000030f2a798000000013a826ef2
  224. /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
  225. .octa 0x000000000fba10020000000081482c84
  226. /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
  227. .octa 0x00000000bdb9bd7200000000e77307c2
  228. /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
  229. .octa 0x0000000075d3bf5a00000000d4a07ec8
  230. /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
  231. .octa 0x00000000ef1f98a00000000017102100
  232. /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
  233. .octa 0x00000000689c760200000000db406486
  234. /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
  235. .octa 0x000000016d5fa5fe0000000192db7f88
  236. /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
  237. .octa 0x00000001d0d2b9ca000000018bf67b1e
  238. /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
  239. .octa 0x0000000041e7b470000000007c09163e
  240. /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
  241. .octa 0x00000001cbb6495e000000000adac060
  242. /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
  243. .octa 0x000000010052a0b000000000bd8316ae
  244. /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
  245. .octa 0x00000001d8effb5c000000019f09ab54
  246. /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
  247. .octa 0x00000001d969853c0000000125155542
  248. /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
  249. .octa 0x00000000523ccce2000000018fdb5882
  250. /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
  251. .octa 0x000000001e2436bc00000000e794b3f4
  252. /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
  253. .octa 0x00000000ddd1c3a2000000016f9bb022
  254. /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
  255. .octa 0x0000000019fcfe3800000000290c9978
  256. /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
  257. .octa 0x00000001ce95db640000000083c0f350
  258. /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
  259. .octa 0x00000000af5828060000000173ea6628
  260. /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
  261. .octa 0x00000001006388f600000001c8b4e00a
  262. /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
  263. .octa 0x0000000179eca00a00000000de95d6aa
  264. /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
  265. .octa 0x0000000122410a6a000000010b7f7248
  266. /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
  267. .octa 0x000000004288e87c00000001326e3a06
  268. /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
  269. .octa 0x000000016c5490da00000000bb62c2e6
  270. /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
  271. .octa 0x00000000d1c71f6e0000000156a4b2c2
  272. /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
  273. .octa 0x00000001b4ce08a6000000011dfe763a
  274. /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
  275. .octa 0x00000001466ba60c000000007bcca8e2
  276. /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
  277. .octa 0x00000001f6c488a40000000186118faa
  278. /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
  279. .octa 0x000000013bfb06820000000111a65a88
  280. /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
  281. .octa 0x00000000690e9e54000000003565e1c4
  282. /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
  283. .octa 0x00000000281346b6000000012ed02a82
  284. /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
  285. .octa 0x000000015646402400000000c486ecfc
  286. /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
  287. .octa 0x000000016063a8dc0000000001b951b2
  288. /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
  289. .octa 0x0000000116a663620000000048143916
  290. /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
  291. .octa 0x000000017e8aa4d200000001dc2ae124
  292. /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
  293. .octa 0x00000001728eb10c00000001416c58d6
  294. /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
  295. .octa 0x00000001b08fd7fa00000000a479744a
  296. /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
  297. .octa 0x00000001092a16e80000000096ca3a26
  298. /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
  299. .octa 0x00000000a505637c00000000ff223d4e
  300. /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
  301. .octa 0x00000000d94869b2000000010e84da42
  302. /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
  303. .octa 0x00000001c8b203ae00000001b61ba3d0
  304. /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
  305. .octa 0x000000005704aea000000000680f2de8
  306. /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
  307. .octa 0x000000012e295fa2000000008772a9a8
  308. /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
  309. .octa 0x000000011d0908bc0000000155f295bc
  310. /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
  311. .octa 0x0000000193ed97ea00000000595f9282
  312. /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
  313. .octa 0x000000013a0f1c520000000164b1c25a
  314. /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
  315. .octa 0x000000010c2c40c000000000fbd67c50
  316. /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
  317. .octa 0x00000000ff6fac3e0000000096076268
  318. /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
  319. .octa 0x000000017b3609c000000001d288e4cc
  320. /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
  321. .octa 0x0000000088c8c92200000001eaac1bdc
  322. /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
  323. .octa 0x00000001751baae600000001f1ea39e2
  324. /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
  325. .octa 0x000000010795297200000001eb6506fc
  326. /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
  327. .octa 0x0000000162b00abe000000010f806ffe
  328. /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
  329. .octa 0x000000000d7b404c000000010408481e
  330. /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
  331. .octa 0x00000000763b13d40000000188260534
  332. /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
  333. .octa 0x00000000f6dc22d80000000058fc73e0
  334. /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
  335. .octa 0x000000007daae06000000000391c59b8
  336. /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
  337. .octa 0x000000013359ab7c000000018b638400
  338. /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
  339. .octa 0x000000008add438a000000011738f5c4
  340. /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
  341. .octa 0x00000001edbefdea000000008cf7c6da
  342. /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
  343. .octa 0x000000004104e0f800000001ef97fb16
  344. /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
  345. .octa 0x00000000b48a82220000000102130e20
  346. /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
  347. .octa 0x00000001bcb4684400000000db968898
  348. /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
  349. .octa 0x000000013293ce0a00000000b5047b5e
  350. /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
  351. .octa 0x00000001710d0844000000010b90fdb2
  352. /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
  353. .octa 0x0000000117907f6e000000004834a32e
  354. /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
  355. .octa 0x0000000087ddf93e0000000059c8f2b0
  356. /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
  357. .octa 0x000000005970e9b00000000122cec508
  358. /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
  359. .octa 0x0000000185b2b7d0000000000a330cda
  360. /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
  361. .octa 0x00000001dcee0efc000000014a47148c
  362. /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
  363. .octa 0x0000000030da27220000000042c61cb8
  364. /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
  365. .octa 0x000000012f925a180000000012fe6960
  366. /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
  367. .octa 0x00000000dd2e357c00000000dbda2c20
  368. /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
  369. .octa 0x00000000071c80de000000011122410c
  370. /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
  371. .octa 0x000000011513140a00000000977b2070
  372. /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
  373. .octa 0x00000001df876e8e000000014050438e
  374. /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
  375. .octa 0x000000015f81d6ce0000000147c840e8
  376. /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
  377. .octa 0x000000019dd94dbe00000001cc7c88ce
  378. /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
  379. .octa 0x00000001373d206e00000001476b35a4
  380. /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
  381. .octa 0x00000000668ccade000000013d52d508
  382. /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
  383. .octa 0x00000001b192d268000000008e4be32e
  384. /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
  385. .octa 0x00000000e30f3a7800000000024120fe
  386. /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
  387. .octa 0x000000010ef1f7bc00000000ddecddb4
  388. /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
  389. .octa 0x00000001f5ac738000000000d4d403bc
  390. /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
  391. .octa 0x000000011822ea7000000001734b89aa
  392. /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
  393. .octa 0x00000000c3a33848000000010e7a58d6
  394. /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
  395. .octa 0x00000001bd151c2400000001f9f04e9c
  396. /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
  397. .octa 0x0000000056002d7600000000b692225e
  398. /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
  399. .octa 0x000000014657c4f4000000019b8d3f3e
  400. /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
  401. .octa 0x0000000113742d7c00000001a874f11e
  402. /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
  403. .octa 0x000000019c5920ba000000010d5a4254
  404. /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
  405. .octa 0x000000005216d2d600000000bbb2f5d6
  406. /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
  407. .octa 0x0000000136f5ad8a0000000179cc0e36
  408. /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
  409. .octa 0x000000018b07beb600000001dca1da4a
  410. /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
  411. .octa 0x00000000db1e93b000000000feb1a192
  412. /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
  413. .octa 0x000000000b96fa3a00000000d1eeedd6
  414. /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
  415. .octa 0x00000001d9968af0000000008fad9bb4
  416. /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
  417. .octa 0x000000000e4a77a200000001884938e4
  418. /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
  419. .octa 0x00000000508c2ac800000001bc2e9bc0
  420. /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
  421. .octa 0x0000000021572a8000000001f9658a68
  422. /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
  423. .octa 0x00000001b859daf2000000001b9224fc
  424. /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
  425. .octa 0x000000016f7884740000000055b2fb84
  426. /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
  427. .octa 0x00000001b438810e000000018b090348
  428. /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
  429. .octa 0x0000000095ddc6f2000000011ccbd5ea
  430. /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
  431. .octa 0x00000001d977c20c0000000007ae47f8
  432. /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
  433. .octa 0x00000000ebedb99a0000000172acbec0
  434. /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
  435. .octa 0x00000001df9e9e9200000001c6e3ff20
  436. /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
  437. .octa 0x00000001a4a3f95200000000e1b38744
  438. /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
  439. .octa 0x00000000e2f5122000000000791585b2
  440. /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
  441. .octa 0x000000004aa01f3e00000000ac53b894
  442. /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
  443. .octa 0x00000000b3e90a5800000001ed5f2cf4
  444. /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
  445. .octa 0x000000000c9ca2aa00000001df48b2e0
  446. /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
  447. .octa 0x000000015168231600000000049c1c62
  448. /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
  449. .octa 0x0000000036fce78c000000017c460c12
  450. /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
  451. .octa 0x000000009037dc10000000015be4da7e
  452. /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
  453. .octa 0x00000000d3298582000000010f38f668
  454. /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
  455. .octa 0x00000001b42e8ad60000000039f40a00
  456. /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
  457. .octa 0x00000000142a983800000000bd4c10c4
  458. /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
  459. .octa 0x0000000109c7f1900000000042db1d98
  460. /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
  461. .octa 0x0000000056ff931000000001c905bae6
  462. /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
  463. .octa 0x00000001594513aa00000000069d40ea
  464. /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
  465. .octa 0x00000001e3b5b1e8000000008e4fbad0
  466. /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
  467. .octa 0x000000011dd5fc080000000047bedd46
  468. /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
  469. .octa 0x00000001675f0cc20000000026396bf8
  470. /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
  471. .octa 0x00000000d1c8dd4400000000379beb92
  472. /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
  473. .octa 0x0000000115ebd3d8000000000abae54a
  474. /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
  475. .octa 0x00000001ecbd0dac0000000007e6a128
  476. /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
  477. .octa 0x00000000cdf67af2000000000ade29d2
  478. /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
  479. .octa 0x000000004c01ff4c00000000f974c45c
  480. /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
  481. .octa 0x00000000f2d8657e00000000e77ac60a
  482. /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
  483. .octa 0x000000006bae74c40000000145895816
  484. /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
  485. .octa 0x0000000152af8aa00000000038e362be
  486. /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
  487. .octa 0x0000000004663802000000007f991a64
  488. /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
  489. .octa 0x00000001ab2f5afc00000000fa366d3a
  490. /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
  491. .octa 0x0000000074a4ebd400000001a2bb34f0
  492. /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
  493. .octa 0x00000001d7ab3a4c0000000028a9981e
  494. /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
  495. .octa 0x00000001a8da60c600000001dbc672be
  496. /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
  497. .octa 0x000000013cf6382000000000b04d77f6
  498. /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
  499. .octa 0x00000000bec12e1e0000000124400d96
  500. /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
  501. .octa 0x00000001c6368010000000014ca4b414
  502. /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
  503. .octa 0x00000001e6e78758000000012fe2c938
  504. /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
  505. .octa 0x000000008d7f2b3c00000001faed01e6
  506. /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
  507. .octa 0x000000016b4a156e000000007e80ecfe
  508. /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
  509. .octa 0x00000001c63cfeb60000000098daee94
  510. /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
  511. .octa 0x000000015f902670000000010a04edea
  512. /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
  513. .octa 0x00000001cd5de11e00000001c00b4524
  514. /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
  515. .octa 0x000000001acaec540000000170296550
  516. /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
  517. .octa 0x000000002bd0ca780000000181afaa48
  518. /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
  519. .octa 0x0000000032d63d5c0000000185a31ffa
  520. /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
  521. .octa 0x000000001c6d4e4c000000002469f608
  522. /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
  523. .octa 0x0000000106a60b92000000006980102a
  524. /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
  525. .octa 0x00000000d3855e120000000111ea9ca8
  526. /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
  527. .octa 0x00000000e312563600000001bd1d29ce
  528. /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
  529. .octa 0x000000009e8f7ea400000001b34b9580
  530. /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
  531. .octa 0x00000001c82e562c000000003076054e
  532. /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
  533. .octa 0x00000000ca9f09ce000000012a608ea4
  534. /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
  535. .octa 0x00000000c63764e600000000784d05fe
  536. /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
  537. .octa 0x0000000168d2e49e000000016ef0d82a
  538. /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
  539. .octa 0x00000000e986c1480000000075bda454
  540. /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
  541. .octa 0x00000000cfb65894000000003dc0a1c4
  542. /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
  543. .octa 0x0000000111cadee400000000e9a5d8be
  544. /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
  545. .octa 0x0000000171fb63ce00000001609bc4b4
  /*
   * Sixteen 16-byte entries (256 bytes), one per 16-byte chunk of input,
   * ordered from the highest powers of x down to the lowest. Each entry
   * packs the four values listed in the comment above it.
   *
   * .Lshort indexes this table from its end (offset 256 - len), so an input
   * of N chunks uses the last N entries.
   *
   * NOTE(review): the len >= 256 path does not reload r3 before its final
   * reduction; it appears to step off the end of .constants straight into
   * this table, so the two tables presumably must stay adjacent and in this
   * order - confirm before moving either.
   */
  546. .short_constants:
  547. /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
  548. /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
  549. .octa 0x7fec2963e5bf80485cf015c388e56f72
  550. /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
  551. .octa 0x38e888d4844752a9963a18920246e2e6
  552. /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
  553. .octa 0x42316c00730206ad419a441956993a31
  554. /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
  555. .octa 0x543d5c543e65ddf9924752ba2b830011
  556. /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
  557. .octa 0x78e87aaf56767c9255bd7f9518e4a304
  558. /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
  559. .octa 0x8f68fcec1903da7f6d76739fe0553f1e
  560. /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
  561. .octa 0x3f4840246791d588c133722b1fe0b5c3
  562. /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
  563. .octa 0x34c96751b04de25a64b67ee0e55ef1f3
  564. /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
  565. .octa 0x156c8e180b4a395b069db049b8fdb1e7
  566. /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
  567. .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
  568. /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
  569. .octa 0x041d37768cd75659817cdc5119b29a35
  570. /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
  571. .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
  572. /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
  573. .octa 0x0e148e8252377a554f256efcb82be955
  574. /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
  575. .octa 0x9c25531d19e65ddeec1631edb2dea967
  576. /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
  577. .octa 0x790606ff9957c0a65d27e147510ac59a
  578. /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
  579. .octa 0x82f63b786ea2d55ca66805eb18b8ea18
  /*
   * The two constants used by .Lbarrett_reduction for the final reduction
   * to 32 bits. The first entry is loaded into const1 (m) and the second,
   * 16 bytes further on, into const2 (n).
   */
  580. .barrett_constants:
  581. /* 33 bit reflected Barrett constant m, i.e. (4^32)/n */
  582. .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
  583. /* 33 bit reflected Barrett constant n */
  584. .octa 0x00000000000000000000000105ec76f1
  585. .text
  /*
   * BYTESWAP_DATA is defined only when building big endian. It selects
   * whether VPERM() below emits a real vperm or nothing, and whether
   * __crc32c_vpmsum loads the byteswap permute constant.
   */
  586. #if defined(__BIG_ENDIAN__)
  587. #define BYTESWAP_DATA
  588. #else
  589. #undef BYTESWAP_DATA
  590. #endif
  /*
   * GPR aliases. __crc32c_vpmsum loads each offN register with the constant
   * N and uses it as the index operand of lvx/stvx. These are the same
   * r25-r31 that the function saves on entry and restores at .Lout.
   */
  591. #define off16 r25
  592. #define off32 r26
  593. #define off48 r27
  594. #define off64 r28
  595. #define off80 r29
  596. #define off96 r30
  597. #define off112 r31
  /*
   * Vector register aliases:
   *   const1, const2  folding constants currently in use
   *   byteswap        permute control for VPERM (loaded only if BYTESWAP_DATA)
   *   mask_32bit      all ones in the low 32 bits, zero elsewhere
   *   mask_64bit      all ones in the low 64 bits, zero elsewhere
   *   zeroes          all zero
   */
  598. #define const1 v24
  599. #define const2 v25
  600. #define byteswap v26
  601. #define mask_32bit v27
  602. #define mask_64bit v28
  603. #define zeroes v29
  /* VPERM(A, B, C, D): vperm when BYTESWAP_DATA is defined, otherwise empty. */
  604. #ifdef BYTESWAP_DATA
  605. #define VPERM(A, B, C, D) vperm A, B, C, D
  606. #else
  607. #define VPERM(A, B, C, D)
  608. #endif
  /*
   * unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len)
   *
   * In:   r3 = crc (initial value), r4 = p, r5 = len
   * Out:  r3 = resulting crc; the incoming crc is returned untouched when
   *       len == 0
   *
   * Paths:
   *   len == 0    .Lzero   return the saved crc (r10).
   *   len <  256  .Lshort  each 16-byte chunk is multiplied (VPMSUMW) by its
   *                        entry in .short_constants; the products are xored
   *                        together and handed to .Lbarrett_reduction.
   *   len >= 256           the 128-byte-aligned part of len is consumed in
   *                        blocks of at most MAX_SIZE bytes, 128 bytes per
   *                        loop iteration, folding into the accumulators
   *                        v0-v7. The running 1024-bit state is carried
   *                        between blocks in v16-v23. Afterwards the state
   *                        and the (len & 127) tail are reduced 16 bytes at
   *                        a time, then .Lbarrett_reduction runs.
   *
   * Register use:
   *   r0       0 on the first block, 1 once v16-v23 hold carried state
   *   r3       pointer into the constant tables (return value at the end)
   *   r4       input pointer
   *   r6       bytes still to be fed to the main loop
   *   r10      copy of the incoming crc
   *   r25-r31  off16..off112, constant index offsets for lvx/stvx
   *
   * r25-r31 and v20-v29 are saved in the area below r1 and restored at
   * .Lout; r1 itself is never adjusted.
   *
   * NOTE(review): data is only ever read with lvx in whole 16-byte chunks
   * (chunk counts are len >> 4), so callers presumably pass a 16-byte
   * aligned p and a len that is a multiple of 16 - confirm against the
   * calling C code. In particular 0 < len < 16 loads CTR with 0, which the
   * bdz chain in .Lshort does not treat as "done".
   *
   * NOTE(review): "ori r2,r2,0" leaves r2 unchanged; these look like
   * scheduling no-ops - confirm before removing or moving them.
   */
FUNC_START(__crc32c_vpmsum)
	/* Save r25-r31; they become off16..off112 below */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0			/* first block: nothing carried yet */

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3			/* keep crc for the len == 0 return */

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
	vsldoi	v8,zeroes,v8,8		/* shift into bottom 32 bits */

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56		/* r6 = len with the low 7 bits cleared */

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6			/* less than MAX_SIZE left: take it all */
2:	subf	r6,r7,r6		/* r6 = bytes left after this block */

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	/* v16-v23 = state carried into the next block / final reduction */
	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * More data left for the main loop? If so go round again; the 128
	 * bytes now held in v16-v23 take the place of the first load of the
	 * next block, so they are counted in its size.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1

	vand	v0,v0,mask_64bit

	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	/* Restore v20-v29 and r25-r31 from where the entry code saved them */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/*
	 * Reached when CTR hits zero at the first bdz: no further 128-byte
	 * loads are needed before the last one, so multiply the carried
	 * state once and go straight to the second cool down pass.
	 */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19/v20 accumulate the odd/even numbered products */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8		/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* Second group of eight chunks */
	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/*
	 * Entering at .LvN folds products vN down to v0 into v19/v20 by
	 * falling through the remaining labels.
	 */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* len == 0: hand back the crc we were given */
	mr	r3,r10
	b	.Lout

	/* Must name the same symbol as FUNC_START above */
FUNC_END(__crc32c_vpmsum)