...

Text file src/internal/chacha8rand/chacha8_loong64.s

Documentation: internal/chacha8rand

     1// Copyright 2025 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8DATA	·chachaConst+0x00(SB)/4, $0x61707865
     9DATA	·chachaConst+0x04(SB)/4, $0x3320646e
    10DATA	·chachaConst+0x08(SB)/4, $0x79622d32
    11DATA	·chachaConst+0x0c(SB)/4, $0x6b206574
    12GLOBL	·chachaConst(SB), NOPTR|RODATA, $32
    13
    14DATA	·chachaIncRot+0x00(SB)/4, $0x00000000
    15DATA	·chachaIncRot+0x04(SB)/4, $0x00000001
    16DATA	·chachaIncRot+0x08(SB)/4, $0x00000002
    17DATA	·chachaIncRot+0x0c(SB)/4, $0x00000003
    18GLOBL	·chachaIncRot(SB), NOPTR|RODATA, $32
    19
    20// QR is the ChaCha8 quarter-round on a, b, c, and d.
    21#define QR(a, b, c, d) \
    22	VADDW	a, b, a; \
    23	VXORV	d, a, d; \
    24	VROTRW	$16, d; \
    25	VADDW	c, d, c; \
    26	VXORV	b, c, b; \
    27	VROTRW	$20, b; \
    28	VADDW	a, b, a; \
    29	VXORV	d, a, d; \
    30	VROTRW	$24, d; \
    31	VADDW	c, d, c; \
    32	VXORV	b, c, b; \
    33	VROTRW	$25, b
    34
    35
    36// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
    37TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
    38	// seed in R4
    39	// blocks in R5
    40	// counter in R6
    41
    42	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
    43	BNE	R7, lsx_chacha8
    44	JMP	·block_generic<ABIInternal>(SB)
    45	RET
    46
    47lsx_chacha8:
    48	MOVV	$·chachaConst(SB), R10
    49	MOVV	$·chachaIncRot(SB), R11
    50
    51	// load contants
    52	VMOVQ	(R10), V0.W4
    53	VMOVQ	4(R10), V1.W4
    54	VMOVQ	8(R10), V2.W4
    55	VMOVQ	12(R10), V3.W4
    56
    57	// load 4-32bit data from incRotMatrix added to counter
    58	VMOVQ	(R11), V30
    59
    60	// load seed
    61	VMOVQ	(R4), V4.W4
    62	VMOVQ	4(R4), V5.W4
    63	VMOVQ	8(R4), V6.W4
    64	VMOVQ	12(R4), V7.W4
    65	VMOVQ	16(R4), V8.W4
    66	VMOVQ	20(R4), V9.W4
    67	VMOVQ	24(R4), V10.W4
    68	VMOVQ	28(R4), V11.W4
    69
    70	// load counter and update counter
    71	VMOVQ	R6, V12.W4
    72	VADDW	V12, V30, V12
    73
    74	// zeros for remaining three matrix entries
    75	VXORV	V13, V13, V13
    76	VXORV	V14, V14, V14
    77	VXORV	V15, V15, V15
    78
    79	// save seed state for adding back later
    80	VMOVQ	V4, V20
    81	VMOVQ	V5, V21
    82	VMOVQ	V6, V22
    83	VMOVQ	V7, V23
    84	VMOVQ	V8, V24
    85	VMOVQ	V9, V25
    86	VMOVQ	V10, V26
    87	VMOVQ	V11, V27
    88
    89	// 4 iterations. Each iteration is 8 quarter-rounds.
    90	MOVV	$4, R7
    91loop:
    92	QR(V0, V4, V8, V12)
    93	QR(V1, V5, V9, V13)
    94	QR(V2, V6, V10, V14)
    95	QR(V3, V7, V11, V15)
    96
    97	QR(V0, V5, V10, V15)
    98	QR(V1, V6, V11, V12)
    99	QR(V2, V7, V8, V13)
   100	QR(V3, V4, V9, V14)
   101
   102	SUBV	$1, R7
   103	BNE	R7, R0, loop
   104
   105	// add seed back
   106	VADDW	V4, V20, V4
   107	VADDW	V5, V21, V5
   108	VADDW	V6, V22, V6
   109	VADDW	V7, V23, V7
   110	VADDW	V8, V24, V8
   111	VADDW	V9, V25, V9
   112	VADDW	V10, V26, V10
   113	VADDW	V11, V27, V11
   114
   115	// store blocks back to output buffer
   116	VMOVQ	V0, (R5)
   117	VMOVQ	V1, 16(R5)
   118	VMOVQ	V2, 32(R5)
   119	VMOVQ	V3, 48(R5)
   120	VMOVQ	V4, 64(R5)
   121	VMOVQ	V5, 80(R5)
   122	VMOVQ	V6, 96(R5)
   123	VMOVQ	V7, 112(R5)
   124	VMOVQ	V8, 128(R5)
   125	VMOVQ	V9, 144(R5)
   126	VMOVQ	V10, 160(R5)
   127	VMOVQ	V11, 176(R5)
   128	VMOVQ	V12, 192(R5)
   129	VMOVQ	V13, 208(R5)
   130	VMOVQ	V14, 224(R5)
   131	VMOVQ	V15, 240(R5)
   132
   133	RET

View as plain text