Text file src/internal/bytealg/index_amd64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Index(SB),NOSPLIT,$0-56
        	// Entry stub: copy the stack arguments into the registers that
        	// indexbody<> expects and jump there. No local frame is used ($0);
        	// arguments plus result occupy 56 bytes.
        	// NOTE(review): the 24-byte spacing between a and b suggests both are
        	// slices (base, len, cap) — confirm against the Go declaration.
     9  	LEAQ ret+48(FP), R11    // R11 = address of the result slot
    10  	MOVQ b_len+32(FP), AX   // AX  = length of b, the sequence searched for
    11  	MOVQ b_base+24(FP), R8  // R8  = pointer to b
    12  	MOVQ a_len+8(FP), DX    // DX  = length of a, the sequence searched in
    13  	MOVQ a_base+0(FP), R10  // R10 = pointer to a; indexbody<> subtracts it to form the offset
    14  	MOVQ R10, DI            // DI  = search cursor, starts at the beginning of a
    15  	JMP  indexbody<>(SB)
    16  
    17  TEXT ·IndexString(SB),NOSPLIT,$0-40
        	// Entry stub: copy the stack arguments into the registers that
        	// indexbody<> expects and jump there. No local frame is used ($0);
        	// arguments plus result occupy 40 bytes.
        	// NOTE(review): the 16-byte spacing between a and b suggests both are
        	// strings (base, len) — confirm against the Go declaration.
    18  	LEAQ ret+32(FP), R11    // R11 = address of the result slot
    19  	MOVQ b_len+24(FP), AX   // AX  = length of b, the string searched for
    20  	MOVQ b_base+16(FP), R8  // R8  = pointer to b
    21  	MOVQ a_len+8(FP), DX    // DX  = length of a, the string searched in
    22  	MOVQ a_base+0(FP), R10  // R10 = pointer to a; indexbody<> subtracts it to form the offset
    23  	MOVQ R10, DI            // DI  = search cursor, starts at the beginning of a
    24  	JMP  indexbody<>(SB)
    25  
    26  // indexbody<> finds the first occurrence of one byte sequence (the needle)
        // inside another (the haystack). It is entered by JMP from ·Index and
        // ·IndexString, so its RET returns straight to their caller. The result,
        // a byte offset into the haystack or -1 when there is no match, is stored
        // through R11.
        //
        // Registers on entry:
    27  // AX: length of the needle (the string we are searching for)
    28  // DX: length of the haystack (the string in which we are searching)
    29  // DI: pointer to the haystack; used as the moving search cursor
    30  // R8: pointer to the needle
        // R10: original haystack pointer; subtracted from DI on success
        // R11: address where the return value is stored
    31  // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
        //
        // Besides the registers above, the code overwrites BX, CX, SI, R9,
        // X0-X3 and (on the AVX2 paths) Y0-Y4.
    32  TEXT indexbody<>(SB),NOSPLIT,$0
    33  	CMPQ AX, DX
    34  	JA fail // needle is longer than the haystack (unsigned compare): no match possible
    35  	CMPQ DX, $16
    36  	JAE sse42 // haystack holds at least 16 bytes: consider the PCMPESTRI path first
        // Compare loops selected by needle length. Each one turns DX into an
        // exclusive upper bound for DI (one past the last position at which the
        // needle still fits) and then slides DI forward one byte at a time.
    37  no_sse42:
    38  	CMPQ AX, $2
    39  	JA   _3_or_more
        	// Needle length <= 2: compare one 16-bit word per position.
        	// NOTE(review): lengths 0 and 1 also land here and are compared as 2 bytes;
        	// presumably callers guarantee a needle of at least 2 bytes — confirm.
    40  	MOVW (R8), R8 // R8 now holds the needle's bytes, no longer its address
    41  	LEAQ -1(DI)(DX*1), DX // DX = haystack + len(haystack) - 1
    42  loop2:
    43  	MOVW (DI), SI
    44  	CMPW SI,R8
    45  	JZ success
    46  	ADDQ $1,DI
    47  	CMPQ DI,DX
    48  	JB loop2
    49  	JMP fail
    50  _3_or_more:
    51  	CMPQ AX, $3
    52  	JA   _4_or_more
        	// Needle length 3: two overlapping 16-bit words (bytes 0-1 and bytes 1-2).
    53  	MOVW 1(R8), BX // BX = needle bytes 1-2
    54  	MOVW (R8), R8 // R8 = needle bytes 0-1
    55  	LEAQ -2(DI)(DX*1), DX // DX = haystack + len(haystack) - 2
    56  loop3:
    57  	MOVW (DI), SI
    58  	CMPW SI,R8
    59  	JZ   partial_success3
    60  	ADDQ $1,DI
    61  	CMPQ DI,DX
    62  	JB loop3
    63  	JMP fail
    64  partial_success3:
        	// Bytes 0-1 matched; check bytes 1-2 before reporting success.
    65  	MOVW 1(DI), SI
    66  	CMPW SI,BX
    67  	JZ success
    68  	ADDQ $1,DI
    69  	CMPQ DI,DX
    70  	JB loop3
    71  	JMP fail
    72  _4_or_more:
    73  	CMPQ AX, $4
    74  	JA   _5_or_more
        	// Needle length 4: a single 32-bit compare per position.
    75  	MOVL (R8), R8
    76  	LEAQ -3(DI)(DX*1), DX // DX = haystack + len(haystack) - 3
    77  loop4:
    78  	MOVL (DI), SI
    79  	CMPL SI,R8
    80  	JZ   success
    81  	ADDQ $1,DI
    82  	CMPQ DI,DX
    83  	JB loop4
    84  	JMP fail
    85  _5_or_more:
    86  	CMPQ AX, $7
    87  	JA   _8_or_more
        	// Needle length 5-7: two overlapping 32-bit words, the first four and
        	// the last four bytes of the needle, together cover every byte.
    88  	LEAQ 1(DI)(DX*1), DX
    89  	SUBQ AX, DX // DX = haystack + len(haystack) - len(needle) + 1
    90  	MOVL -4(R8)(AX*1), BX // BX = last 4 bytes of the needle
    91  	MOVL (R8), R8 // R8 = first 4 bytes of the needle
    92  loop5to7:
    93  	MOVL (DI), SI
    94  	CMPL SI,R8
    95  	JZ   partial_success5to7
    96  	ADDQ $1,DI
    97  	CMPQ DI,DX
    98  	JB loop5to7
    99  	JMP fail
   100  partial_success5to7:
        	// Head matched; compare the 4 bytes ending at DI+AX against the needle's tail.
   101  	MOVL -4(AX)(DI*1), SI
   102  	CMPL SI,BX
   103  	JZ success
   104  	ADDQ $1,DI
   105  	CMPQ DI,DX
   106  	JB loop5to7
   107  	JMP fail
   108  _8_or_more:
   109  	CMPQ AX, $8
   110  	JA   _9_or_more
        	// Needle length 8: a single 64-bit compare per position.
   111  	MOVQ (R8), R8
   112  	LEAQ -7(DI)(DX*1), DX // DX = haystack + len(haystack) - 7
   113  loop8:
   114  	MOVQ (DI), SI
   115  	CMPQ SI,R8
   116  	JZ   success
   117  	ADDQ $1,DI
   118  	CMPQ DI,DX
   119  	JB loop8
   120  	JMP fail
        // Also reached directly from the sse42 entry check when the needle is
        // 12 bytes or longer.
   121  _9_or_more:
   122  	CMPQ AX, $15
   123  	JA   _16_or_more
        	// Needle length 9-15: two overlapping 64-bit words (first 8 and last 8 bytes).
   124  	LEAQ 1(DI)(DX*1), DX
   125  	SUBQ AX, DX // DX = haystack + len(haystack) - len(needle) + 1
   126  	MOVQ -8(R8)(AX*1), BX // BX = last 8 bytes of the needle
   127  	MOVQ (R8), R8 // R8 = first 8 bytes of the needle
   128  loop9to15:
   129  	MOVQ (DI), SI
   130  	CMPQ SI,R8
   131  	JZ   partial_success9to15
   132  	ADDQ $1,DI
   133  	CMPQ DI,DX
   134  	JB loop9to15
   135  	JMP fail
   136  partial_success9to15:
        	// Head matched; compare the 8 bytes ending at DI+AX against the needle's tail.
   137  	MOVQ -8(AX)(DI*1), SI
   138  	CMPQ SI,BX
   139  	JZ success
   140  	ADDQ $1,DI
   141  	CMPQ DI,DX
   142  	JB loop9to15
   143  	JMP fail
   144  _16_or_more:
   145  	CMPQ AX, $16
   146  	JA   _17_or_more
        	// Needle length 16: one 128-bit compare per position. PCMPEQB marks
        	// equal bytes and PMOVMSKB collects one bit per byte, so a mask of
        	// 0xffff means all 16 bytes are equal.
   147  	MOVOU (R8), X1
   148  	LEAQ -15(DI)(DX*1), DX // DX = haystack + len(haystack) - 15
   149  loop16:
   150  	MOVOU (DI), X2
   151  	PCMPEQB X1, X2
   152  	PMOVMSKB X2, SI
   153  	CMPQ  SI, $0xffff
   154  	JE   success
   155  	ADDQ $1,DI
   156  	CMPQ DI,DX
   157  	JB loop16
   158  	JMP fail
   159  _17_or_more:
   160  	CMPQ AX, $31
   161  	JA   _32_or_more
        	// Needle length 17-31: two overlapping 128-bit blocks (first 16 and last 16 bytes).
   162  	LEAQ 1(DI)(DX*1), DX
   163  	SUBQ AX, DX // DX = haystack + len(haystack) - len(needle) + 1
   164  	MOVOU -16(R8)(AX*1), X0 // X0 = last 16 bytes of the needle
   165  	MOVOU (R8), X1 // X1 = first 16 bytes of the needle
   166  loop17to31:
   167  	MOVOU (DI), X2
   168  	PCMPEQB X1,X2
   169  	PMOVMSKB X2, SI
   170  	CMPQ  SI, $0xffff
   171  	JE   partial_success17to31
   172  	ADDQ $1,DI
   173  	CMPQ DI,DX
   174  	JB loop17to31
   175  	JMP fail
   176  partial_success17to31:
        	// Head matched; compare the 16 bytes ending at DI+AX against the needle's tail.
   177  	MOVOU -16(AX)(DI*1), X3
   178  	PCMPEQB X0, X3
   179  	PMOVMSKB X3, SI
   180  	CMPQ  SI, $0xffff
   181  	JE success
   182  	ADDQ $1,DI
   183  	CMPQ DI,DX
   184  	JB loop17to31
   185  	JMP fail
   186  // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
   187  // So no need to check cpuid
        // NOTE(review): nothing in this routine enforces that; it relies on the
        // caller never passing a needle of 32+ bytes without AVX2 — confirm.
   188  _32_or_more:
   189  	CMPQ AX, $32
   190  	JA   _33_to_63
        	// Needle length 32: one 256-bit compare per position; a mask of
        	// 0xffffffff means all 32 bytes are equal.
   191  	VMOVDQU (R8), Y1
   192  	LEAQ -31(DI)(DX*1), DX // DX = haystack + len(haystack) - 31
   193  loop32:
   194  	VMOVDQU (DI), Y2
   195  	VPCMPEQB Y1, Y2, Y3
   196  	VPMOVMSKB Y3, SI
   197  	CMPL  SI, $0xffffffff
   198  	JE   success_avx2
   199  	ADDQ $1,DI
   200  	CMPQ DI,DX
   201  	JB loop32
   202  	JMP fail_avx2
   203  _33_to_63:
        	// Needle length above 32: two overlapping 256-bit blocks (first 32 and
        	// last 32 bytes). No upper bound is checked here; the two blocks only
        	// cover the whole needle while it is at most 64 bytes long.
   204  	LEAQ 1(DI)(DX*1), DX
   205  	SUBQ AX, DX // DX = haystack + len(haystack) - len(needle) + 1
   206  	VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 bytes of the needle
   207  	VMOVDQU (R8), Y1 // Y1 = first 32 bytes of the needle
   208  loop33to63:
   209  	VMOVDQU (DI), Y2
   210  	VPCMPEQB Y1, Y2, Y3
   211  	VPMOVMSKB Y3, SI
   212  	CMPL  SI, $0xffffffff
   213  	JE   partial_success33to63
   214  	ADDQ $1,DI
   215  	CMPQ DI,DX
   216  	JB loop33to63
   217  	JMP fail_avx2
   218  partial_success33to63:
        	// Head matched; compare the 32 bytes ending at DI+AX against the needle's tail.
   219  	VMOVDQU -32(AX)(DI*1), Y3
   220  	VPCMPEQB Y0, Y3, Y4
   221  	VPMOVMSKB Y4, SI
   222  	CMPL  SI, $0xffffffff
   223  	JE success_avx2
   224  	ADDQ $1,DI
   225  	CMPQ DI,DX
   226  	JB loop33to63
        	// Loop exhausted: fall through into the AVX2 failure exit.
        // Both AVX2 exits execute VZEROUPPER before leaving, since the Y
        // registers were used on these paths.
   227  fail_avx2:
   228  	VZEROUPPER
        // No match: store -1 as the result.
   229  fail:
   230  	MOVQ $-1, (R11)
   231  	RET
   232  success_avx2:
   233  	VZEROUPPER
   234  	JMP success
        // PCMPESTRI path, tried when the haystack has at least 16 bytes. It is
        // used only if the CPU flag below is set, the needle is shorter than 12
        // bytes, and the address check below passes.
   235  sse42:
   236  	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
   237  	JNE no_sse42
   238  	CMPQ AX, $12
   239  	// PCMPESTRI is slower than normal compare,
   240  	// so using it makes sense only if we advance 4+ bytes per compare
   241  	// This value was determined experimentally and is the ~same
   242  	// on Nehalem (first with SSE42) and Haswell.
   243  	JAE _9_or_more
        	// The MOVOU below always reads 16 bytes at R8 although the needle is
        	// shorter than 12. If bits 4-11 of R8+16 are all zero, R8+16 lies in
        	// the first 16 bytes of a 4096-byte-aligned region, so that read would
        	// reach the boundary; use the plain loops instead.
        	// NOTE(review): presumably this avoids touching a page beyond the
        	// needle — confirm.
   244  	LEAQ 16(R8), SI
   245  	TESTW $0xff0, SI
   246  	JEQ no_sse42
   247  	MOVOU (R8), X1
        	// SI = haystack + len(haystack) - 15: exclusive bound for the start of a
        	// 16-byte window. DX itself is left untouched on this path because
        	// PCMPESTRI reads AX and DX as its operand lengths.
   248  	LEAQ -15(DI)(DX*1), SI
   249  	MOVQ $16, R9
   250  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
   251  loop_sse42:
   252  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
   253  	// for equality (bits 2,3 are 11)
   254  	// result is not masked or inverted (bits 4,5 are 00)
   255  	// and corresponds to first matching byte (bit 6 is 0)
   256  	PCMPESTRI $0x0c, (DI), X1
   257  	// CX == 16 means no match,
   258  	// CX > R9 means partial match at the end of the string,
   259  	// otherwise sep is at offset CX from X1 start
   260  	CMPQ CX, R9
   261  	JBE sse42_success
   262  	ADDQ R9, DI
   263  	CMPQ DI, SI
   264  	JB loop_sse42
        	// Final probe: the window starting at SI-1 is the last 16 bytes of the haystack.
   265  	PCMPESTRI $0x0c, -1(SI), X1
   266  	CMPQ CX, R9
   267  	JA fail
   268  	LEAQ -1(SI), DI
   269  sse42_success:
   270  	ADDQ CX, DI // DI = window start + offset of the match inside the window
        // Common success exit: DI points at the match; convert it to an offset
        // from the original haystack pointer and store it as the result.
   271  success:
   272  	SUBQ R10, DI
   273  	MOVQ DI, (R11)
   274  	RET
   275  

View as plain text