// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x,go1.11,!gccgo,!appengine

#include "textflag.h"

// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.

// constants
#define EX0   V1
#define EX1   V2
#define EX2   V3

// temporaries
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8
#define T_5 V9
#define T_6 V10
#define T_7 V11
#define T_8 V12
#define T_9 V13
#define T_10 V14

// r**2 & r**4
#define R_0  V15
#define R_1  V16
#define R_2  V17
#define R5_1 V18
#define R5_2 V19
// key (r)
#define RSAVE_0 R7
#define RSAVE_1 R8
#define RSAVE_2 R9
#define R5SAVE_1 R10
#define R5SAVE_2 R11

// message block
#define M0 V20
#define M1 V21
#define M2 V22
#define M3 V23
#define M4 V24
#define M5 V25

// accumulator
#define H0_0 V26
#define H1_0 V27
#define H2_0 V28
#define H0_1 V29
#define H1_1 V30
#define H2_1 V31

GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
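// keyMask clamps r as Poly1305 requires: it clears the top four bits of key
// bytes 3, 7, 11 and 15 and the bottom two bits of bytes 4, 8 and 12.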

GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100

GLOBL ·constants<>(SB), RODATA, $48
// EX0
DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+8(SB)/8, $0x0000050403020100
// EX1
DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+24(SB)/8, $0x00000a0908070605
// EX2
DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b

GLOBL ·c<>(SB), RODATA, $48
// EX0
DATA ·c<>+0(SB)/8, $0x0000050403020100
DATA ·c<>+8(SB)/8, $0x0000151413121110
// EX1
DATA ·c<>+16(SB)/8, $0x00000a0908070605
DATA ·c<>+24(SB)/8, $0x00001a1918171615
// EX2
DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b

GLOBL ·reduce<>(SB), RODATA, $32
// 44 bit
DATA ·reduce<>+0(SB)/8, $0x0
DATA ·reduce<>+8(SB)/8, $0xfffffffffff
// 42 bit
DATA ·reduce<>+16(SB)/8, $0x0
DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
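// The 130-bit values are held as three limbs of 44, 44 and 42 bits
// (44+44+42 = 130); these constants are the "clear everything above the
// limb width" masks used when propagating carries.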

// h = (f*g) % (2**130-5) [partial reduction]
// uses T_0...T_9 temporary registers
// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
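// Informally, with f = f0 + f1*2**44 + f2*2**88 and g = r likewise, and using
// 2**132 = 4*2**130 == 4*5 = 20 (mod 2**130-5), the partially reduced product is:
//   h0 = f0*g0 + 20*f1*g2 + 20*f2*g1
//   h1 = f0*g1 + f1*g0    + 20*f2*g2
//   h2 = f0*g2 + f1*g1    + f2*g0
// r5_1 and r5_2 hold 20*r_1 and 20*r_2, and the m4/m5 inputs are folded in as
// the VMSL addends. Each VMSLG multiplies both doubleword halves of its
// operands pairwise and sums the two products, so two interleaved
// accumulators are advanced per instruction.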
#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
	\ // Eliminate the dependency for the last 2 VMSLs
	VMSLG m02_0, r_2, m4_2, m4_2                       \
	VMSLG m13_0, r_2, m5_2, m5_2                       \ // 8 VMSLs pipelined
	VMSLG m02_0, r_0, m4_0, m4_0                       \
	VMSLG m02_1, r5_2, V0, T_0                         \
	VMSLG m02_0, r_1, m4_1, m4_1                       \
	VMSLG m02_1, r_0, V0, T_1                          \
	VMSLG m02_1, r_1, V0, T_2                          \
	VMSLG m02_2, r5_1, V0, T_3                         \
	VMSLG m02_2, r5_2, V0, T_4                         \
	VMSLG m13_0, r_0, m5_0, m5_0                       \
	VMSLG m13_1, r5_2, V0, T_5                         \
	VMSLG m13_0, r_1, m5_1, m5_1                       \
	VMSLG m13_1, r_0, V0, T_6                          \
	VMSLG m13_1, r_1, V0, T_7                          \
	VMSLG m13_2, r5_1, V0, T_8                         \
	VMSLG m13_2, r5_2, V0, T_9                         \
	VMSLG m02_2, r_0, m4_2, m4_2                       \
	VMSLG m13_2, r_0, m5_2, m5_2                       \
	VAQ   m4_0, T_0, m02_0                             \
	VAQ   m4_1, T_1, m02_1                             \
	VAQ   m5_0, T_5, m13_0                             \
	VAQ   m5_1, T_6, m13_1                             \
	VAQ   m02_0, T_3, m02_0                            \
	VAQ   m02_1, T_4, m02_1                            \
	VAQ   m13_0, T_8, m13_0                            \
	VAQ   m13_1, T_9, m13_1                            \
	VAQ   m4_2, T_2, m02_2                             \
	VAQ   m5_2, T_7, m13_2                             \

// SQUARE uses three limbs of r and 20*r_2 to output the square of r
// uses T_1, T_5 and T_7 temporary registers
// input: r_0, r_1, r_2, r5_2
// temp: TEMP0, TEMP1, TEMP2
// output: p0, p1, p2
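// Squaring makes the cross terms symmetric (r_i*r_j appears twice), which is
// why each TEMPx below is added twice. Informally:
//   p0 = r_0*r_0 + 2*(20*r_1*r_2)
//   p1 = 2*(r_0*r_1) + 20*r_2*r_2
//   p2 = 2*(r_0*r_2) + r_1*r_1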
#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
	VMSLG r_0, r_0, p0, p0     \
	VMSLG r_1, r5_2, V0, TEMP0 \
	VMSLG r_2, r5_2, p1, p1    \
	VMSLG r_0, r_1, V0, TEMP1  \
	VMSLG r_1, r_1, p2, p2     \
	VMSLG r_0, r_2, V0, TEMP2  \
	VAQ   TEMP0, p0, p0        \
	VAQ   TEMP1, p1, p1        \
	VAQ   TEMP2, p2, p2        \
	VAQ   TEMP0, p0, p0        \
	VAQ   TEMP1, p1, p1        \
	VAQ   TEMP2, p2, p2        \

// carry h0->h1->h2->h0 || h3->h4->h5->h3
// uses T_2, T_4, T_5, T_7, T_8, T_9
//       t6,  t7,  t8,  t9, t10, t11
// input: h0, h1, h2, h3, h4, h5
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
// output: h0, h1, h2, h3, h4, h5
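// After MULTIPLY each limb may exceed its 44- (or 42-) bit width. The excess
// bits are shifted out and added to the next limb; a carry out of the top
// limb has weight 2**130 == 5 (mod 2**130-5), so it is folded back into the
// bottom limb times 5, computed as x + 4*x (shift left by 2, then add).
// The second half of the macro packs the two accumulator halves side by side
// into h3, h4, h5 and runs one more carry pass over both at once.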
#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
	VLM    (R12), t6, t7  \ // 44 and 42 bit clear mask
	VLEIB  $7, $0x28, t10 \ // 5 byte shift mask
	VREPIB $4, t8         \ // 4 bit shift mask
	VREPIB $2, t11        \ // 2 bit shift mask
	VSRLB  t10, h0, t0    \ // h0 byte shift
	VSRLB  t10, h1, t1    \ // h1 byte shift
	VSRLB  t10, h2, t2    \ // h2 byte shift
	VSRLB  t10, h3, t3    \ // h3 byte shift
	VSRLB  t10, h4, t4    \ // h4 byte shift
	VSRLB  t10, h5, t5    \ // h5 byte shift
	VSRL   t8, t0, t0     \ // h0 bit shift
	VSRL   t8, t1, t1     \ // h1 bit shift
	VSRL   t11, t2, t2    \ // h2 bit shift
	VSRL   t8, t3, t3     \ // h3 bit shift
	VSRL   t8, t4, t4     \ // h4 bit shift
	VESLG  $2, t2, t9     \ // h2 carry x5
	VSRL   t11, t5, t5    \ // h5 bit shift
	VN     t6, h0, h0     \ // h0 clear carry
	VAQ    t2, t9, t2     \ // h2 carry x5
	VESLG  $2, t5, t9     \ // h5 carry x5
	VN     t6, h1, h1     \ // h1 clear carry
	VN     t7, h2, h2     \ // h2 clear carry
	VAQ    t5, t9, t5     \ // h5 carry x5
	VN     t6, h3, h3     \ // h3 clear carry
	VN     t6, h4, h4     \ // h4 clear carry
	VN     t7, h5, h5     \ // h5 clear carry
	VAQ    t0, h1, h1     \ // h0->h1
	VAQ    t3, h4, h4     \ // h3->h4
	VAQ    t1, h2, h2     \ // h1->h2
	VAQ    t4, h5, h5     \ // h4->h5
	VAQ    t2, h0, h0     \ // h2->h0
	VAQ    t5, h3, h3     \ // h5->h3
	VREPG  $1, t6, t6     \ // 44 and 42 bit masks across both halves
	VREPG  $1, t7, t7     \
	VSLDB  $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
	VSLDB  $8, h1, h1, h1 \
	VSLDB  $8, h2, h2, h2 \
	VO     h0, h3, h3     \
	VO     h1, h4, h4     \
	VO     h2, h5, h5     \
	VESRLG $44, h3, t0    \ // 44 bit shift right
	VESRLG $44, h4, t1    \
	VESRLG $42, h5, t2    \
	VN     t6, h3, h3     \ // clear carry bits
	VN     t6, h4, h4     \
	VN     t7, h5, h5     \
	VESLG  $2, t2, t9     \ // multiply carry by 5
	VAQ    t9, t2, t2     \
	VAQ    t0, h4, h4     \
	VAQ    t1, h5, h5     \
	VAQ    t2, h3, h3     \

// carry h0->h1->h2->h0
// input: h0, h1, h2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
// output: h0, h1, h2
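// Same carry chain as in REDUCE, but for a single set of three limbs; the
// propagation is run twice so the carries produced by the first pass are
// themselves absorbed.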
#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
	VLEIB  $7, $0x28, t3 \ // 5 byte shift mask
	VREPIB $4, t4        \ // 4 bit shift mask
	VREPIB $2, t7        \ // 2 bit shift mask
	VGBM   $0x003F, t5   \ // mask to clear carry bits
	VSRLB  t3, h0, t0    \
	VSRLB  t3, h1, t1    \
	VSRLB  t3, h2, t2    \
	VESRLG $4, t5, t5    \ // 44 bit clear mask
	VSRL   t4, t0, t0    \
	VSRL   t4, t1, t1    \
	VSRL   t7, t2, t2    \
	VESRLG $2, t5, t6    \ // 42 bit clear mask
	VESLG  $2, t2, t8    \
	VAQ    t8, t2, t2    \
	VN     t5, h0, h0    \
	VN     t5, h1, h1    \
	VN     t6, h2, h2    \
	VAQ    t0, h1, h1    \
	VAQ    t1, h2, h2    \
	VAQ    t2, h0, h0    \
	VSRLB  t3, h0, t0    \
	VSRLB  t3, h1, t1    \
	VSRLB  t3, h2, t2    \
	VSRL   t4, t0, t0    \
	VSRL   t4, t1, t1    \
	VSRL   t7, t2, t2    \
	VN     t5, h0, h0    \
	VN     t5, h1, h1    \
	VESLG  $2, t2, t8    \
	VN     t6, h2, h2    \
	VAQ    t0, h1, h1    \
	VAQ    t8, t2, t2    \
	VAQ    t1, h2, h2    \
	VAQ    t2, h0, h0    \

// expands two message blocks into the lower halves of the d registers,
// moving the previous contents of the d registers into the upper halves
// input: in1, in2, d0, d1, d2, d3, d4, d5
// temp: TEMP0, TEMP1, TEMP2, TEMP3
// output: d0, d1, d2, d3, d4, d5
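// The EX0/EX1/EX2 permute masks gather (and byte-reverse) bytes 0..5, 5..10
// and 11..15 of each little-endian block; together with the 4-bit shifts and
// the 44/40-bit masks below this yields the block's three limbs (the 2**128
// bit of a full block is set separately by the callers).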
#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
	VGBM   $0xff3f, TEMP0      \
	VGBM   $0xff1f, TEMP1      \
	VESLG  $4, d1, TEMP2       \
	VESLG  $4, d4, TEMP3       \
	VESRLG $4, TEMP0, TEMP0    \
	VPERM  in1, d0, EX0, d0    \
	VPERM  in2, d3, EX0, d3    \
	VPERM  in1, d2, EX2, d2    \
	VPERM  in2, d5, EX2, d5    \
	VPERM  in1, TEMP2, EX1, d1 \
	VPERM  in2, TEMP3, EX1, d4 \
	VN     TEMP0, d0, d0       \
	VN     TEMP0, d3, d3       \
	VESRLG $4, d1, d1          \
	VESRLG $4, d4, d4          \
	VN     TEMP1, d2, d2       \
	VN     TEMP1, d5, d5       \
	VN     TEMP0, d1, d1       \
	VN     TEMP0, d4, d4       \

// expands one message block into the lower halves of the d registers,
// moving the previous contents of the d registers into the upper halves
// input: in, d0, d1, d2
// temp: TEMP0, TEMP1, TEMP2
// output: d0, d1, d2
#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
	VGBM   $0xff3f, TEMP0     \
	VESLG  $4, d1, TEMP2      \
	VGBM   $0xff1f, TEMP1     \
	VPERM  in, d0, EX0, d0    \
	VESRLG $4, TEMP0, TEMP0   \
	VPERM  in, d2, EX2, d2    \
	VPERM  in, TEMP2, EX1, d1 \
	VN     TEMP0, d0, d0      \
	VN     TEMP1, d2, d2      \
	VESRLG $4, d1, d1         \
	VN     TEMP0, d1, d1      \

// pack h2:h0 into h1:h0 (no carry)
// input: h0, h1, h2
// output: h0, h1, h2
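// Packs h = h0 + h1*2**44 + h2*2**88 back into a plain 130-bit integer:
// afterwards h0 holds bits 0..127 and h1 holds the top two bits.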
#define PACK(h0, h1, h2) \
	VMRLG  h1, h2, h2  \ // copy h1 to upper half h2
	VESLG  $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
	VO     h0, h1, h0  \ // combine h0 with 20 bits from limb 1
	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
	VLEIG  $1, $0, h1  \ // clear h2 stuff from lower half of h1
	VO     h0, h1, h0  \ // h0 now has 88 bits (limb 0 and 1)
	VLEIG  $0, $0, h2  \ // clear upper half of h2
	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
	VLEIB  $7, $88, h1 \ // for byte shift (11 bytes)
	VSLB   h1, h2, h2  \ // shift h2 11 bytes to the left
	VO     h0, h2, h0  \ // combine h0 with the low 40 bits of limb 2
	VLEIG  $0, $0, h1  \ // clear upper half of h1

// if h >= 2**130-5 then h -= 2**130-5
// input: h0, h1
// temp: t0, t1, t2
// output: h0
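// Branch-free conditional subtraction: h + 5 is computed and the carry out of
// bit 129 is turned into an all-ones/all-zeros mask that selects either h or
// (h + 5) mod 2**128, the latter being h - (2**130-5) with the top bits dropped.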
#define MOD(h0, h1, t0, t1, t2) \
	VZERO t0          \
	VLEIG $1, $5, t0  \
	VACCQ h0, t0, t1  \
	VAQ   h0, t0, t0  \
	VONE  t2          \
	VLEIG $1, $-4, t2 \
	VAQ   t2, t1, t1  \
	VACCQ h1, t1, t1  \
	VONE  t2          \
	VAQ   t2, t1, t1  \
	VN    h0, t1, t2  \
	VNC   t0, t1, t1  \
	VO    t1, t2, h0  \

// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
TEXT ·poly1305vmsl(SB), $0-32
	// This code processes 6 + up to 4 blocks (64 bytes) per iteration
	// using the algorithm described in:
	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
	// and as modified for VMSL as described in:
	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
	// O'Farrell et al, CASCON 2017, p48-55
	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht

	LMG   out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
	VZERO V0                // c

	// load EX0, EX1 and EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), EX0, EX2        // c

	// setup r
	VL    (R4), T_0
	MOVD  $·keyMask<>(SB), R6
	VL    (R6), T_1
	VN    T_0, T_1, T_0
	VZERO T_2                 // limbs for r
	VZERO T_3
	VZERO T_4
	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)

	// T_2, T_3, T_4: [0, r]

	// setup r*20
	VLEIG $0, $0, T_0
	VLEIG $1, $20, T_0       // T_0: [0, 20]
	VZERO T_5
	VZERO T_6
	VMSLG T_0, T_3, T_5, T_5
	VMSLG T_0, T_4, T_6, T_6
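	// 20 rather than 5: the wrapped-around products involving r_1 and r_2
	// carry weight 2**132 = 4*2**130 == 4*5 = 20 (mod 2**130-5), so the upper
	// limbs are pre-scaled by 20.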

	// store r for final block in GR
	VLGVG $1, T_2, RSAVE_0  // c
	VLGVG $1, T_3, RSAVE_1  // c
	VLGVG $1, T_4, RSAVE_2  // c
	VLGVG $1, T_5, R5SAVE_1 // c
	VLGVG $1, T_6, R5SAVE_2 // c

	// initialize h
	VZERO H0_0
	VZERO H1_0
	VZERO H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	// initialize pointer for reduce constants
	MOVD $·reduce<>(SB), R12

	// calculate r**2 and 20*(r**2)
	VZERO R_0
	VZERO R_1
	VZERO R_2
	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
	VZERO R5_1
	VZERO R5_2
	VMSLG T_0, R_1, R5_1, R5_1
	VMSLG T_0, R_2, R5_2, R5_2

	// skip r**4 calculation if 3 blocks or less
	CMPBLE R3, $48, b4

	// calculate r**4 and 20*(r**4)
	VZERO T_8
	VZERO T_9
	VZERO T_10
	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
	VZERO T_2
	VZERO T_3
	VMSLG T_0, T_9, T_2, T_2
	VMSLG T_0, T_10, T_3, T_3

	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
	VSLDB $8, T_8, T_8, T_8
	VSLDB $8, T_9, T_9, T_9
	VSLDB $8, T_10, T_10, T_10
	VSLDB $8, T_2, T_2, T_2
	VSLDB $8, T_3, T_3, T_3

	VO T_8, R_0, R_0
	VO T_9, R_1, R_1
	VO T_10, R_2, R_2
	VO T_2, R5_1, R5_1
	VO T_3, R5_2, R5_2
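	// Each of R_0..R5_2 now holds an r**4 limb in its upper doubleword and
	// the matching r**2 limb in its lower doubleword. Since VMSLG multiplies
	// both doubleword halves pairwise and sums the two products, a single
	// instruction advances the upper accumulator half by r**4 and the lower
	// half by r**2.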

	CMPBLE R3, $80, load // less than or equal to 5 blocks in message

	// 6(or 5+1) blocks
	SUB    $81, R3
	VLM    (R2), M0, M4
	VLL    R3, 80(R2), M5
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBGE R3, $16, 2(PC)
	VLVGB  R3, R0, M5
	MOVD   $96(R2), R2
	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	VLEIB  $2, $1, H2_0
	VLEIB  $2, $1, H2_1
	VLEIB  $10, $1, H2_0
	VLEIB  $10, $1, H2_1
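	// The VLEIBs set the 2**128 bit that Poly1305 appends to every full
	// 16-byte block: setting byte 10 sets bit 40 of the low doubleword, i.e.
	// 2**40 * 2**88 = 2**128 in the limb-2 position; byte 2 does the same for
	// the block kept in the upper doubleword half.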

	VZERO  M0
	VZERO  M1
	VZERO  M2
	VZERO  M3
	VZERO  T_4
	VZERO  T_10
	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
	VLR    T_4, M4
	VLEIB  $10, $1, M2
	CMPBLT R3, $16, 2(PC)
	VLEIB  $10, $1, T_10
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1

	SUB    $16, R3
	CMPBLE R3, $0, square

load:
	// load EX0, EX1 and EX2
	MOVD $·c<>(SB), R5
	VLM  (R5), EX0, EX2

loop:
	CMPBLE R3, $64, add // last 4 or fewer blocks left

	// next 4 full blocks
	VLM  (R2), M2, M5
	SUB  $64, R3
	MOVD $64(R2), R2
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)

	// expacc in-lined to create [m2, m3] limbs
	VGBM   $0x3f3f, T_0     // 44 bit clear mask
	VGBM   $0x1f1f, T_1     // 40 bit clear mask
	VPERM  M2, M3, EX0, T_3
	VESRLG $4, T_0, T_0     // 44 bit clear mask ready
	VPERM  M2, M3, EX1, T_4
	VPERM  M2, M3, EX2, T_5
	VN     T_0, T_3, T_3
	VESRLG $4, T_4, T_4
	VN     T_1, T_5, T_5
	VN     T_0, T_4, T_4
	VMRHG  H0_1, T_3, H0_0
	VMRHG  H1_1, T_4, H1_0
	VMRHG  H2_1, T_5, H2_0
	VMRLG  H0_1, T_3, H0_1
	VMRLG  H1_1, T_4, H1_1
	VMRLG  H2_1, T_5, H2_1
	VLEIB  $10, $1, H2_0
	VLEIB  $10, $1, H2_1
	VPERM  M4, M5, EX0, T_3
	VPERM  M4, M5, EX1, T_4
	VPERM  M4, M5, EX2, T_5
	VN     T_0, T_3, T_3
	VESRLG $4, T_4, T_4
	VN     T_1, T_5, T_5
	VN     T_0, T_4, T_4
	VMRHG  V0, T_3, M0
	VMRHG  V0, T_4, M1
	VMRHG  V0, T_5, M2
	VMRLG  V0, T_3, M3
	VMRLG  V0, T_4, M4
	VMRLG  V0, T_5, M5
	VLEIB  $10, $1, M2
	VLEIB  $10, $1, M5

	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	CMPBNE R3, $0, loop
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1

	// load EX0, EX1, EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), EX0, EX2

	// sum vectors
	VAQ H0_0, H0_1, H0_0
	VAQ H1_0, H1_1, H1_0
	VAQ H2_0, H2_1, H2_0

	// h may still exceed 2**130-5 (it can be up to about 2*(2**130-5)), so reduce it again
	// M0...M4 are used as temps here
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

next:  // carry h1->h2
	VLEIB  $7, $0x28, T_1
	VREPIB $4, T_2
	VGBM   $0x003F, T_3
	VESRLG $4, T_3, T_3

	// byte shift
	VSRLB T_1, H1_0, T_4

	// bit shift
	VSRL T_2, T_4, T_4

	// clear h1 carry bits
	VN T_3, H1_0, H1_0

	// add carry
	VAQ T_4, H2_0, H2_0

	// h is now < 2*(2**130-5)
	// pack h into h1 (hi) and h0 (lo)
	PACK(H0_0, H1_0, H2_0)

	// if h >= 2**130-5 then h -= 2**130-5
	MOD(H0_0, H1_0, T_0, T_1, T_2)

	// h += s
	MOVD  $·bswapMask<>(SB), R5
	VL    (R5), T_1
	VL    16(R4), T_0
	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
	VAQ   T_0, H0_0, H0_0
	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
	VST   H0_0, (R1)
	RET

add:
	// load EX0, EX1, EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), EX0, EX2

	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1
	CMPBLE R3, $64, b4

b4:
	CMPBLE R3, $48, b3 // 3 blocks or less

	// 4(3+1) blocks remaining
	SUB    $49, R3
	VLM    (R2), M0, M2
	VLL    R3, 48(R2), M3
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M3
	MOVD   $64(R2), R2
	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	VLEIB  $10, $1, H2_0
	VLEIB  $10, $1, H2_1
	VZERO  M0
	VZERO  M1
	VZERO  M4
	VZERO  M5
	VZERO  T_4
	VZERO  T_10
	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
	VLR    T_4, M2
	VLEIB  $10, $1, M4
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, T_10
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1
	SUB    $16, R3
	CMPBLE R3, $0, square // this condition must always hold true!

b3:
	CMPBLE R3, $32, b2

	// 3 blocks remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)

	SUB    $33, R3
	VLM    (R2), M0, M1
	VLL    R3, 32(R2), M2
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M2

	// H += m0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
	VLEIB $10, $1, T_3
	VAG   H0_0, T_1, H0_0
	VAG   H1_0, T_2, H1_0
	VAG   H2_0, T_3, H2_0

	VZERO M0
	VZERO M3
	VZERO M4
	VZERO M5
	VZERO T_10

	// (H+m0)*r
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)

	// H += m1
	VZERO V0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
	VLEIB $10, $1, T_3
	VAQ   H0_0, T_1, H0_0
	VAQ   H1_0, T_2, H1_0
	VAQ   H2_0, T_3, H2_0
	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)

	// [H, m2] * [r**2, r]
	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, H2_0
	VZERO  M0
	VZERO  M1
	VZERO  M2
	VZERO  M3
	VZERO  M4
	VZERO  M5
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
	SUB    $16, R3
	CMPBLE R3, $0, next   // this condition must always hold true!

b2:
	CMPBLE R3, $16, b1

	// 2 blocks remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1

	// move h to the left and 0s at the right
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0

	// get message blocks and append 1 to start
	SUB    $17, R3
	VL     (R2), M0
	VLL    R3, 16(R2), M1
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M1
	VZERO  T_6
	VZERO  T_7
	VZERO  T_8
	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
	VLEIB  $2, $1, T_8
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, T_8

	// add [m0, m1] to h
	VAG H0_0, T_6, H0_0
	VAG H1_0, T_7, H1_0
	VAG H2_0, T_8, H2_0

	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	VZERO T_10
	VZERO M0

	// at this point R_0 .. R5_2 look like [r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
	SUB    $16, R3, R3
	CMPBLE R3, $0, next

b1:
	CMPBLE R3, $0, next

	// 1 block remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

	// set up [0, m0] limbs
	SUB    $1, R3
	VLL    R3, (R2), M0
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M0
	VZERO  T_1
	VZERO  T_2
	VZERO  T_3
	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6) // limbs: [0, m]
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, T_3

	// h+m0
	VAQ H0_0, T_1, H0_0
	VAQ H1_0, T_2, H1_0
	VAQ H2_0, T_3, H2_0

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

	BR next

square:
	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// (h0*r**2) + (h1*r)
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
	BR next

TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1
	AND   $0x01, R1
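	// Byte 16 of the facility list covers facility bits 128-135; the 0x01 bit
	// tested here should be facility 135 (vector-enhancements facility 1,
	// which adds VMSL), assuming the standard z/Architecture facility numbering.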
	BEQ   novmsl

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novmsl
	MOVB   $1, ret+0(FP)    // have vx
	RET

novmsl:
	MOVB $0, ret+0(FP) // no vx
	RET