...

Text file src/hash/crc32/crc32_amd64.s

Documentation: hash/crc32

     1// Copyright 2011 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6#include "go_asm.h"
     7
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// Uses the SSE4.2 CRC32{B,W,L,Q} instructions (Castagnoli polynomial).
// Strategy: consume 1/2/4 bytes as needed to reach 8-byte alignment of
// the data pointer, run the hot loop in 8-byte chunks, then finish the
// 0..7 leftover bytes with a 4/2/1-byte tail.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX  // BX = SI mod 8; zero means already 8-byte aligned
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	// For BX in 1..7, (BX-1) XOR 7 == 8-BX.
	SUBQ $1, BX
	XORQ $7, BX

	// The set bits of BX (values 1..7) select which of the 1-, 2- and
	// 4-byte steps below run; together they advance SI to alignment.
	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// CX is in 0..7 here, so its low three bits drive the tail exactly.
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	// Result slot: crc(+0) + slice header ptr/len/cap(+8/+16/+24) = +32.
	MOVL AX, ret+32(FP)
	RET
    89
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// Interleaving three independent CRC32Q streams lets the out-of-order
// core overlap the instruction's latency; each round folds 24 bytes
// (three 8-byte chunks) from each of the three buffers.
//
// NOTE(review): no length checks are performed here — callers must
// guarantee each slice holds at least 24*rounds bytes.
//
// func castagnoliSSE42Triple(
//     crc1, crc2, crc3 uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	// Slice headers start at +16 (after three uint32s plus padding);
	// each header is 24 bytes, so the data pointers are 24 apart.
	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	// Three independent dependency chains: AX/CX/DX never interact.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
   133
// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
// Splatted so it can be loaded with a single VMOVDQU64

// r2r1: fold-by-64-bytes constants, used by both the AVX-512 loop
// (loaded whole into Z0) and the SSE loop (first 16 bytes into X0) in
// ieeeCLMUL. The same 16-byte pair is repeated in all four 128-bit
// lanes so one VMOVDQU64 fills Z0.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r2r1<>+16(SB)/8, $0x154442bd4
DATA r2r1<>+24(SB)/8, $0x1c6e41596
DATA r2r1<>+32(SB)/8, $0x154442bd4
DATA r2r1<>+40(SB)/8, $0x1c6e41596
DATA r2r1<>+48(SB)/8, $0x154442bd4
DATA r2r1<>+56(SB)/8, $0x1c6e41596

// r4r3: fold-by-16-bytes constants (used at remain64/remain16).
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
// rupoly: Barrett reduction constants for the final 64->32 bit
// reduction in the finish section (presumably u and the reversed
// polynomial, per the naming — see the Intel paper referenced above).
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
// r5: constant for the 128->64 bit fold in the finish section.
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB), RODATA, $64
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
   160
   161// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
   162// len(p) must be at least 64, and must be a multiple of 16.
   163
   164// func ieeeCLMUL(crc uint32, p []byte) uint32
   165TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
   166	MOVL   crc+0(FP), X0             // Initial CRC value
   167	MOVQ   p+8(FP), SI  	         // data pointer
   168	MOVQ   p_len+16(FP), CX          // len(p)
   169
   170	// Check feature support and length to be >= 1024 bytes.
   171	CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
   172	JNE  useSSE42
   173	CMPQ CX, $1024
   174	JL   useSSE42
   175
   176	// Use AVX512. Zero upper and Z10 and load initial CRC into lower part of Z10.
   177	VPXORQ    Z10, Z10, Z10
   178	VMOVAPS   X0, X10
   179	VMOVDQU64 (SI), Z1
   180	VPXORQ    Z10, Z1, Z1 // Merge initial CRC value into Z1
   181	ADDQ      $64, SI    // buf+=64
   182	SUBQ      $64, CX    // len-=64
   183
   184	VMOVDQU64 r2r1<>+0(SB), Z0
   185
   186loopback64Avx512:
   187	VMOVDQU64  (SI), Z11          // Load next
   188	VPCLMULQDQ $0x11, Z0, Z1, Z5
   189	VPCLMULQDQ $0, Z0, Z1, Z1
   190	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
   191
   192	ADDQ $0x40, DI
   193	ADDQ $64, SI    // buf+=64
   194	SUBQ $64, CX    // len-=64
   195	CMPQ CX, $64    // Less than 64 bytes left?
   196	JGE  loopback64Avx512
   197
   198	// Unfold result into XMM1-XMM4 to match SSE4 code.
   199	VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
   200	VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
   201	VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
   202	VZEROUPPER
   203	JMP remain64
   204
   205	PCALIGN $16
   206useSSE42:
   207	MOVOU  (SI), X1
   208	MOVOU  16(SI), X2
   209	MOVOU  32(SI), X3
   210	MOVOU  48(SI), X4
   211	PXOR   X0, X1
   212	ADDQ   $64, SI                  // buf+=64
   213	SUBQ   $64, CX                  // len-=64
   214	CMPQ   CX, $64                  // Less than 64 bytes left
   215	JB     remain64
   216
   217	MOVOA  r2r1<>+0(SB), X0
   218loopback64:
   219	MOVOA  X1, X5
   220	MOVOA  X2, X6
   221	MOVOA  X3, X7
   222	MOVOA  X4, X8
   223
   224	PCLMULQDQ $0, X0, X1
   225	PCLMULQDQ $0, X0, X2
   226	PCLMULQDQ $0, X0, X3
   227	PCLMULQDQ $0, X0, X4
   228
   229	/* Load next early */
   230	MOVOU    (SI), X11
   231	MOVOU    16(SI), X12
   232	MOVOU    32(SI), X13
   233	MOVOU    48(SI), X14
   234
   235	PCLMULQDQ $0x11, X0, X5
   236	PCLMULQDQ $0x11, X0, X6
   237	PCLMULQDQ $0x11, X0, X7
   238	PCLMULQDQ $0x11, X0, X8
   239
   240	PXOR     X5, X1
   241	PXOR     X6, X2
   242	PXOR     X7, X3
   243	PXOR     X8, X4
   244
   245	PXOR     X11, X1
   246	PXOR     X12, X2
   247	PXOR     X13, X3
   248	PXOR     X14, X4
   249
   250	ADDQ    $0x40, DI
   251	ADDQ    $64, SI      // buf+=64
   252	SUBQ    $64, CX      // len-=64
   253	CMPQ    CX, $64      // Less than 64 bytes left?
   254	JGE     loopback64
   255
   256	PCALIGN $16
   257	/* Fold result into a single register (X1) */
   258remain64:
   259	MOVOA       r4r3<>+0(SB), X0
   260
   261	MOVOA       X1, X5
   262	PCLMULQDQ   $0, X0, X1
   263	PCLMULQDQ   $0x11, X0, X5
   264	PXOR        X5, X1
   265	PXOR        X2, X1
   266
   267	MOVOA       X1, X5
   268	PCLMULQDQ   $0, X0, X1
   269	PCLMULQDQ   $0x11, X0, X5
   270	PXOR        X5, X1
   271	PXOR        X3, X1
   272
   273	MOVOA       X1, X5
   274	PCLMULQDQ   $0, X0, X1
   275	PCLMULQDQ   $0x11, X0, X5
   276	PXOR        X5, X1
   277	PXOR        X4, X1
   278
   279	/* If there is less than 16 bytes left we are done */
   280	CMPQ        CX, $16
   281	JB          finish
   282
   283	/* Encode 16 bytes */
   284remain16:
   285	MOVOU       (SI), X10
   286	MOVOA       X1, X5
   287	PCLMULQDQ   $0, X0, X1
   288	PCLMULQDQ   $0x11, X0, X5
   289	PXOR        X5, X1
   290	PXOR        X10, X1
   291	SUBQ        $16, CX
   292	ADDQ        $16, SI
   293	CMPQ        CX, $16
   294	JGE         remain16
   295
   296finish:
   297	/* Fold final result into 32 bits and return it */
   298	PCMPEQB     X3, X3
   299	PCLMULQDQ   $1, X1, X0
   300	PSRLDQ      $8, X1
   301	PXOR        X0, X1
   302
   303	MOVOA       X1, X2
   304	MOVQ        r5<>+0(SB), X0
   305
   306	/* Creates 32 bit mask. Note that we don't care about upper half. */
   307	PSRLQ       $32, X3
   308
   309	PSRLDQ      $4, X2
   310	PAND        X3, X1
   311	PCLMULQDQ   $0, X0, X1
   312	PXOR        X2, X1
   313
   314	MOVOA       rupoly<>+0(SB), X0
   315
   316	MOVOA       X1, X2
   317	PAND        X3, X1
   318	PCLMULQDQ   $0x10, X0, X1
   319	PAND        X3, X1
   320	PCLMULQDQ   $0, X0, X1
   321	PXOR        X2, X1
   322
   323	PEXTRD	$1, X1, AX
   324	MOVL        AX, ret+32(FP)
   325
   326	RET

View as plain text