// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

TEXT ·IndexByte(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	indexbytebody<>(SB)

TEXT ·IndexByteString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	indexbytebody<>(SB)

// input:
// R0: data
// R1: byte to search for
// R2: data length
// R8: address to place the result
TEXT indexbytebody<>(SB),NOSPLIT,$0
	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. For each tuple, bit 0 is set if the
	// relevant byte matched the requested character and bit 1 is
	// not used (faster than using a 32-bit syndrome). Since the bits
	// in the syndrome reflect exactly the order in which things occur
	// in the original string, counting trailing zeros allows us to
	// identify exactly which byte has matched.
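	//
	// A scalar Go model of the per-chunk syndrome (an illustrative
	// sketch only, not part of this package; the helper name
	// syndrome32 is made up):
	//
	//	func syndrome32(chunk *[32]byte, c byte) uint64 {
	//		var syn uint64
	//		for i, b := range chunk {
	//			if b == c {
	//				syn |= 1 << uint(2*i) // bit 0 of the byte's 2-bit tuple
	//			}
	//		}
	//		return syn
	//	}
	//
	// The index of the first match within the chunk is then
	// bits.TrailingZeros64(syn) / 2 (math/bits), which is what the
	// RBIT/CLZ/shift sequence in tail below computes.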

	CBZ	R2, fail
	MOVD	R0, R11
	// Magic constant 0x40100401 allows us to identify
	// which lane matches the requested byte.
	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
	// Different bytes have different bit masks (i.e. 1, 4, 16, 64)
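	// Replicated across a vector (the VMOV into V5.S4 below), the
	// constant gives byte lane i the mask 1<<(2*(i mod 4)). ANDing the
	// masks with the 0xFF/0x00 comparison results and folding adjacent
	// bytes twice with VADDP packs each group of four comparison bytes
	// into one syndrome byte without the mask bits ever colliding.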
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3
	VMOV	R5, V5.S4
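	// R9 = offset of the data within its aligned 32-byte block,
	// R10 = length mod 32; both are used to mask off syndrome bits
	// that fall outside the valid data.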
	ANDS	$0x1f, R0, R9
	AND	$0x1f, R2, R10
	BEQ	loop

	// Input string is not 32-byte aligned. We calculate the
	// syndrome value for the aligned 32-byte block containing
	// the first bytes and mask off the irrelevant part.
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUB	$0x20, R9, R4
	ADDS	R4, R2, R2
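	// R2 is now the number of bytes remaining after this first block;
	// the flags set by ADDS feed the BLS below, which detects the case
	// where the first block is also the last.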
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
	VMOV	V6.D[0], R6
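	// R6 now holds the 64-bit syndrome for this 32-byte block,
	// two bits per input byte.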
	// Clear the irrelevant lower bits
	LSL	$1, R9, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
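	// Main loop: examine the data in aligned 32-byte blocks.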
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUBS	$0x20, R2, R2
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// If we're out of data, we finish regardless of the result
	BLS	end
	// Use a fast check for the termination condition
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
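	// R6 is nonzero if and only if some byte in this block matched;
	// the precise syndrome is computed from V3/V4 at end.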
	// We're not out of data; loop if we haven't found the character
	CBZ	R6, loop

end:
	// Termination condition found; calculate the syndrome value
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Only do the clear for the last possible block with fewer than 32 bytes
	// Condition flags come from SUBS in the loop
	BHS	tail

masklast:
	// Clear the irrelevant upper bits
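	// (R9 + R10) mod 32 is the number of valid bytes in this last block,
	// so shift 2*(32 - that count) bits off the top of the syndrome.
	// Variable shifts use the amount modulo 64, so a completely valid
	// block (shift amount 64) leaves the syndrome untouched.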
	ADD	R9, R10, R4
	AND	$0x1f, R4, R4
	SUB	$0x20, R4, R4
	NEG	R4<<1, R4
	LSL	R4, R6, R6
	LSR	R4, R6, R6

tail:
	// Check that we have found a character
	CBZ	R6, fail
	// Count the trailing zeros using bit reversing
	RBIT	R6, R6
	// Compensate for the post-increment of the last VLD1.P, so that
	// R3 points at the start of the block containing the match
	SUB	$0x20, R3, R3
	// And count the leading zeros
	CLZ	R6, R6
	// R6 is twice the offset into the fragment
	ADD	R6>>1, R3, R0
	// Compute the offset from the original data pointer (saved in R11)
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET