blob: 9af177fa882d7e17376649395721e183332211e3 [file] [log] [blame]
Brent Austinba3052e2015-04-21 16:08:23 -07001// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package bytes implements functions for the manipulation of byte slices.
6// It is analogous to the facilities of the strings package.
7package bytes
8
9import (
10 "unicode"
11 "unicode/utf8"
12)
13
14func equalPortable(a, b []byte) bool {
15 if len(a) != len(b) {
16 return false
17 }
18 for i, c := range a {
19 if c != b[i] {
20 return false
21 }
22 }
23 return true
24}
25
Dan Willemsen09eb3b12015-09-16 14:34:17 -070026// explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
Brent Austinba3052e2015-04-21 16:08:23 -070027// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
28func explode(s []byte, n int) [][]byte {
29 if n <= 0 {
30 n = len(s)
31 }
32 a := make([][]byte, n)
33 var size int
34 na := 0
35 for len(s) > 0 {
36 if na+1 >= n {
37 a[na] = s
38 na++
39 break
40 }
41 _, size = utf8.DecodeRune(s)
Dan Willemsena3223282018-02-27 19:41:43 -080042 a[na] = s[0:size:size]
Brent Austinba3052e2015-04-21 16:08:23 -070043 s = s[size:]
44 na++
45 }
46 return a[0:na]
47}
48
Dan Willemsend2797482017-07-26 13:13:13 -070049// countGeneric actually implements Count
50func countGeneric(s, sep []byte) int {
51 // special case
52 if len(sep) == 0 {
Brent Austinba3052e2015-04-21 16:08:23 -070053 return utf8.RuneCount(s) + 1
54 }
Dan Willemsend2797482017-07-26 13:13:13 -070055 n := 0
56 for {
57 i := Index(s, sep)
58 if i == -1 {
59 return n
Brent Austinba3052e2015-04-21 16:08:23 -070060 }
Dan Willemsend2797482017-07-26 13:13:13 -070061 n++
62 s = s[i+len(sep):]
Brent Austinba3052e2015-04-21 16:08:23 -070063 }
Brent Austinba3052e2015-04-21 16:08:23 -070064}
65
66// Contains reports whether subslice is within b.
67func Contains(b, subslice []byte) bool {
68 return Index(b, subslice) != -1
69}
70
Dan Willemsena3223282018-02-27 19:41:43 -080071// ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
Dan Willemsen38f2dba2016-07-08 14:54:35 -070072func ContainsAny(b []byte, chars string) bool {
73 return IndexAny(b, chars) >= 0
74}
75
Dan Willemsena3223282018-02-27 19:41:43 -080076// ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
Dan Willemsen38f2dba2016-07-08 14:54:35 -070077func ContainsRune(b []byte, r rune) bool {
78 return IndexRune(b, r) >= 0
79}
80
Brent Austinba3052e2015-04-21 16:08:23 -070081func indexBytePortable(s []byte, c byte) int {
82 for i, b := range s {
83 if b == c {
84 return i
85 }
86 }
87 return -1
88}
89
90// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
91func LastIndex(s, sep []byte) int {
92 n := len(sep)
93 if n == 0 {
94 return len(s)
95 }
96 c := sep[0]
97 for i := len(s) - n; i >= 0; i-- {
98 if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
99 return i
100 }
101 }
102 return -1
103}
104
Dan Willemsen09eb3b12015-09-16 14:34:17 -0700105// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
106func LastIndexByte(s []byte, c byte) int {
107 for i := len(s) - 1; i >= 0; i-- {
108 if s[i] == c {
109 return i
110 }
111 }
112 return -1
113}
114
Dan Willemsena3223282018-02-27 19:41:43 -0800115// IndexRune interprets s as a sequence of UTF-8-encoded code points.
Brent Austinba3052e2015-04-21 16:08:23 -0700116// It returns the byte index of the first occurrence in s of the given rune.
117// It returns -1 if rune is not present in s.
Dan Willemsenebae3022017-01-13 23:01:08 -0800118// If r is utf8.RuneError, it returns the first instance of any
119// invalid UTF-8 byte sequence.
Brent Austinba3052e2015-04-21 16:08:23 -0700120func IndexRune(s []byte, r rune) int {
Dan Willemsenebae3022017-01-13 23:01:08 -0800121 switch {
122 case 0 <= r && r < utf8.RuneSelf:
123 return IndexByte(s, byte(r))
124 case r == utf8.RuneError:
125 for i := 0; i < len(s); {
126 r1, n := utf8.DecodeRune(s[i:])
127 if r1 == utf8.RuneError {
128 return i
129 }
130 i += n
Brent Austinba3052e2015-04-21 16:08:23 -0700131 }
Dan Willemsenebae3022017-01-13 23:01:08 -0800132 return -1
133 case !utf8.ValidRune(r):
134 return -1
135 default:
136 var b [utf8.UTFMax]byte
137 n := utf8.EncodeRune(b[:], r)
138 return Index(s, b[:n])
Brent Austinba3052e2015-04-21 16:08:23 -0700139 }
Brent Austinba3052e2015-04-21 16:08:23 -0700140}
141
142// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
143// It returns the byte index of the first occurrence in s of any of the Unicode
Dan Willemsen38f2dba2016-07-08 14:54:35 -0700144// code points in chars. It returns -1 if chars is empty or if there is no code
Brent Austinba3052e2015-04-21 16:08:23 -0700145// point in common.
146func IndexAny(s []byte, chars string) int {
Dan Willemsena3223282018-02-27 19:41:43 -0800147 if chars == "" {
148 // Avoid scanning all of s.
149 return -1
150 }
151 if len(s) > 8 {
152 if as, isASCII := makeASCIISet(chars); isASCII {
153 for i, c := range s {
154 if as.contains(c) {
Brent Austinba3052e2015-04-21 16:08:23 -0700155 return i
156 }
157 }
Dan Willemsena3223282018-02-27 19:41:43 -0800158 return -1
159 }
160 }
161 var width int
162 for i := 0; i < len(s); i += width {
163 r := rune(s[i])
164 if r < utf8.RuneSelf {
165 width = 1
166 } else {
167 r, width = utf8.DecodeRune(s[i:])
168 }
169 for _, ch := range chars {
170 if r == ch {
171 return i
172 }
Brent Austinba3052e2015-04-21 16:08:23 -0700173 }
174 }
175 return -1
176}
177
178// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
Dan Willemsen38f2dba2016-07-08 14:54:35 -0700179// points. It returns the byte index of the last occurrence in s of any of
180// the Unicode code points in chars. It returns -1 if chars is empty or if
Brent Austinba3052e2015-04-21 16:08:23 -0700181// there is no code point in common.
182func LastIndexAny(s []byte, chars string) int {
Dan Willemsena3223282018-02-27 19:41:43 -0800183 if chars == "" {
184 // Avoid scanning all of s.
185 return -1
186 }
187 if len(s) > 8 {
188 if as, isASCII := makeASCIISet(chars); isASCII {
189 for i := len(s) - 1; i >= 0; i-- {
190 if as.contains(s[i]) {
Brent Austinba3052e2015-04-21 16:08:23 -0700191 return i
192 }
193 }
Dan Willemsena3223282018-02-27 19:41:43 -0800194 return -1
195 }
196 }
197 for i := len(s); i > 0; {
198 r, size := utf8.DecodeLastRune(s[:i])
199 i -= size
200 for _, c := range chars {
201 if r == c {
202 return i
203 }
Brent Austinba3052e2015-04-21 16:08:23 -0700204 }
205 }
206 return -1
207}
208
209// Generic split: splits after each instance of sep,
210// including sepSave bytes of sep in the subslices.
211func genSplit(s, sep []byte, sepSave, n int) [][]byte {
212 if n == 0 {
213 return nil
214 }
215 if len(sep) == 0 {
216 return explode(s, n)
217 }
218 if n < 0 {
219 n = Count(s, sep) + 1
220 }
Dan Willemsend2797482017-07-26 13:13:13 -0700221
Brent Austinba3052e2015-04-21 16:08:23 -0700222 a := make([][]byte, n)
Dan Willemsend2797482017-07-26 13:13:13 -0700223 n--
224 i := 0
225 for i < n {
226 m := Index(s, sep)
227 if m < 0 {
228 break
Brent Austinba3052e2015-04-21 16:08:23 -0700229 }
Dan Willemsena3223282018-02-27 19:41:43 -0800230 a[i] = s[: m+sepSave : m+sepSave]
Dan Willemsend2797482017-07-26 13:13:13 -0700231 s = s[m+len(sep):]
232 i++
Brent Austinba3052e2015-04-21 16:08:23 -0700233 }
Dan Willemsend2797482017-07-26 13:13:13 -0700234 a[i] = s
235 return a[:i+1]
Brent Austinba3052e2015-04-21 16:08:23 -0700236}
237
238// SplitN slices s into subslices separated by sep and returns a slice of
239// the subslices between those separators.
240// If sep is empty, SplitN splits after each UTF-8 sequence.
241// The count determines the number of subslices to return:
242// n > 0: at most n subslices; the last subslice will be the unsplit remainder.
243// n == 0: the result is nil (zero subslices)
244// n < 0: all subslices
245func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
246
247// SplitAfterN slices s into subslices after each instance of sep and
248// returns a slice of those subslices.
249// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
250// The count determines the number of subslices to return:
251// n > 0: at most n subslices; the last subslice will be the unsplit remainder.
252// n == 0: the result is nil (zero subslices)
253// n < 0: all subslices
254func SplitAfterN(s, sep []byte, n int) [][]byte {
255 return genSplit(s, sep, len(sep), n)
256}
257
258// Split slices s into all subslices separated by sep and returns a slice of
259// the subslices between those separators.
260// If sep is empty, Split splits after each UTF-8 sequence.
261// It is equivalent to SplitN with a count of -1.
262func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
263
264// SplitAfter slices s into all subslices after each instance of sep and
265// returns a slice of those subslices.
266// If sep is empty, SplitAfter splits after each UTF-8 sequence.
267// It is equivalent to SplitAfterN with a count of -1.
268func SplitAfter(s, sep []byte) [][]byte {
269 return genSplit(s, sep, len(sep), -1)
270}
271
Dan Willemsena3223282018-02-27 19:41:43 -0800272var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
273
274// Fields interprets s as a sequence of UTF-8-encoded code points.
275// It splits the slice s around each instance of one or more consecutive white space
276// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
277// empty slice if s contains only white space.
Brent Austinba3052e2015-04-21 16:08:23 -0700278func Fields(s []byte) [][]byte {
Dan Willemsena3223282018-02-27 19:41:43 -0800279 // First count the fields.
280 // This is an exact count if s is ASCII, otherwise it is an approximation.
281 n := 0
282 wasSpace := 1
283 // setBits is used to track which bits are set in the bytes of s.
284 setBits := uint8(0)
285 for i := 0; i < len(s); i++ {
286 r := s[i]
287 setBits |= r
288 isSpace := int(asciiSpace[r])
289 n += wasSpace & ^isSpace
290 wasSpace = isSpace
291 }
292
293 if setBits >= utf8.RuneSelf {
294 // Some runes in the input slice are not ASCII.
295 return FieldsFunc(s, unicode.IsSpace)
296 }
297
298 // ASCII fast path
299 a := make([][]byte, n)
300 na := 0
301 fieldStart := 0
302 i := 0
303 // Skip spaces in the front of the input.
304 for i < len(s) && asciiSpace[s[i]] != 0 {
305 i++
306 }
307 fieldStart = i
308 for i < len(s) {
309 if asciiSpace[s[i]] == 0 {
310 i++
311 continue
312 }
313 a[na] = s[fieldStart:i:i]
314 na++
315 i++
316 // Skip spaces in between fields.
317 for i < len(s) && asciiSpace[s[i]] != 0 {
318 i++
319 }
320 fieldStart = i
321 }
322 if fieldStart < len(s) { // Last field might end at EOF.
323 a[na] = s[fieldStart:len(s):len(s)]
324 }
325 return a
Brent Austinba3052e2015-04-21 16:08:23 -0700326}
327
Dan Willemsena3223282018-02-27 19:41:43 -0800328// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
Brent Austinba3052e2015-04-21 16:08:23 -0700329// It splits the slice s at each run of code points c satisfying f(c) and
Dan Willemsen38f2dba2016-07-08 14:54:35 -0700330// returns a slice of subslices of s. If all code points in s satisfy f(c), or
Brent Austinba3052e2015-04-21 16:08:23 -0700331// len(s) == 0, an empty slice is returned.
332// FieldsFunc makes no guarantees about the order in which it calls f(c).
333// If f does not return consistent results for a given c, FieldsFunc may crash.
334func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
Dan Willemsena3223282018-02-27 19:41:43 -0800335 // A span is used to record a slice of s of the form s[start:end].
336 // The start index is inclusive and the end index is exclusive.
337 type span struct {
338 start int
339 end int
340 }
341 spans := make([]span, 0, 32)
342
343 // Find the field start and end indices.
344 wasField := false
345 fromIndex := 0
Brent Austinba3052e2015-04-21 16:08:23 -0700346 for i := 0; i < len(s); {
Dan Willemsena3223282018-02-27 19:41:43 -0800347 size := 1
348 r := rune(s[i])
349 if r >= utf8.RuneSelf {
350 r, size = utf8.DecodeRune(s[i:])
351 }
352 if f(r) {
353 if wasField {
354 spans = append(spans, span{start: fromIndex, end: i})
355 wasField = false
356 }
357 } else {
358 if !wasField {
359 fromIndex = i
360 wasField = true
361 }
Brent Austinba3052e2015-04-21 16:08:23 -0700362 }
363 i += size
364 }
365
Dan Willemsena3223282018-02-27 19:41:43 -0800366 // Last field might end at EOF.
367 if wasField {
368 spans = append(spans, span{fromIndex, len(s)})
Brent Austinba3052e2015-04-21 16:08:23 -0700369 }
Dan Willemsena3223282018-02-27 19:41:43 -0800370
371 // Create subslices from recorded field indices.
372 a := make([][]byte, len(spans))
373 for i, span := range spans {
374 a[i] = s[span.start:span.end:span.end]
375 }
376
377 return a
Brent Austinba3052e2015-04-21 16:08:23 -0700378}
379
380// Join concatenates the elements of s to create a new byte slice. The separator
381// sep is placed between elements in the resulting slice.
382func Join(s [][]byte, sep []byte) []byte {
383 if len(s) == 0 {
384 return []byte{}
385 }
386 if len(s) == 1 {
387 // Just return a copy.
388 return append([]byte(nil), s[0]...)
389 }
390 n := len(sep) * (len(s) - 1)
391 for _, v := range s {
392 n += len(v)
393 }
394
395 b := make([]byte, n)
396 bp := copy(b, s[0])
397 for _, v := range s[1:] {
398 bp += copy(b[bp:], sep)
399 bp += copy(b[bp:], v)
400 }
401 return b
402}
403
404// HasPrefix tests whether the byte slice s begins with prefix.
405func HasPrefix(s, prefix []byte) bool {
406 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
407}
408
409// HasSuffix tests whether the byte slice s ends with suffix.
410func HasSuffix(s, suffix []byte) bool {
411 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
412}
413
414// Map returns a copy of the byte slice s with all its characters modified
415// according to the mapping function. If mapping returns a negative value, the character is
Dan Willemsena3223282018-02-27 19:41:43 -0800416// dropped from the byte slice with no replacement. The characters in s and the
417// output are interpreted as UTF-8-encoded code points.
Brent Austinba3052e2015-04-21 16:08:23 -0700418func Map(mapping func(r rune) rune, s []byte) []byte {
419 // In the worst case, the slice can grow when mapped, making
Dan Willemsen38f2dba2016-07-08 14:54:35 -0700420 // things unpleasant. But it's so rare we barge in assuming it's
421 // fine. It could also shrink but that falls out naturally.
Brent Austinba3052e2015-04-21 16:08:23 -0700422 maxbytes := len(s) // length of b
423 nbytes := 0 // number of bytes encoded in b
424 b := make([]byte, maxbytes)
425 for i := 0; i < len(s); {
426 wid := 1
427 r := rune(s[i])
428 if r >= utf8.RuneSelf {
429 r, wid = utf8.DecodeRune(s[i:])
430 }
431 r = mapping(r)
432 if r >= 0 {
433 rl := utf8.RuneLen(r)
434 if rl < 0 {
435 rl = len(string(utf8.RuneError))
436 }
437 if nbytes+rl > maxbytes {
438 // Grow the buffer.
439 maxbytes = maxbytes*2 + utf8.UTFMax
440 nb := make([]byte, maxbytes)
441 copy(nb, b[0:nbytes])
442 b = nb
443 }
444 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
445 }
446 i += wid
447 }
448 return b[0:nbytes]
449}
450
451// Repeat returns a new byte slice consisting of count copies of b.
Dan Willemsenebae3022017-01-13 23:01:08 -0800452//
453// It panics if count is negative or if
454// the result of (len(b) * count) overflows.
Brent Austinba3052e2015-04-21 16:08:23 -0700455func Repeat(b []byte, count int) []byte {
Dan Willemsenebae3022017-01-13 23:01:08 -0800456 // Since we cannot return an error on overflow,
457 // we should panic if the repeat will generate
458 // an overflow.
459 // See Issue golang.org/issue/16237.
460 if count < 0 {
461 panic("bytes: negative Repeat count")
462 } else if count > 0 && len(b)*count/count != len(b) {
463 panic("bytes: Repeat count causes overflow")
464 }
465
Brent Austinba3052e2015-04-21 16:08:23 -0700466 nb := make([]byte, len(b)*count)
467 bp := copy(nb, b)
468 for bp < len(nb) {
469 copy(nb[bp:], nb[:bp])
470 bp *= 2
471 }
472 return nb
473}
474
Dan Willemsena3223282018-02-27 19:41:43 -0800475// ToUpper treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters within it mapped to their upper case.
Brent Austinba3052e2015-04-21 16:08:23 -0700476func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
477
Dan Willemsena3223282018-02-27 19:41:43 -0800478// ToLower treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their lower case.
Brent Austinba3052e2015-04-21 16:08:23 -0700479func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
480
Dan Willemsena3223282018-02-27 19:41:43 -0800481// ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
Brent Austinba3052e2015-04-21 16:08:23 -0700482func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
483
Dan Willemsena3223282018-02-27 19:41:43 -0800484// ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
Brent Austinba3052e2015-04-21 16:08:23 -0700485// upper case, giving priority to the special casing rules.
Dan Willemsenebae3022017-01-13 23:01:08 -0800486func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
487 return Map(func(r rune) rune { return c.ToUpper(r) }, s)
Brent Austinba3052e2015-04-21 16:08:23 -0700488}
489
Dan Willemsena3223282018-02-27 19:41:43 -0800490// ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
Brent Austinba3052e2015-04-21 16:08:23 -0700491// lower case, giving priority to the special casing rules.
Dan Willemsenebae3022017-01-13 23:01:08 -0800492func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
493 return Map(func(r rune) rune { return c.ToLower(r) }, s)
Brent Austinba3052e2015-04-21 16:08:23 -0700494}
495
Dan Willemsena3223282018-02-27 19:41:43 -0800496// ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
Brent Austinba3052e2015-04-21 16:08:23 -0700497// title case, giving priority to the special casing rules.
Dan Willemsenebae3022017-01-13 23:01:08 -0800498func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
499 return Map(func(r rune) rune { return c.ToTitle(r) }, s)
Brent Austinba3052e2015-04-21 16:08:23 -0700500}
501
502// isSeparator reports whether the rune could mark a word boundary.
503// TODO: update when package unicode captures more of the properties.
504func isSeparator(r rune) bool {
505 // ASCII alphanumerics and underscore are not separators
506 if r <= 0x7F {
507 switch {
508 case '0' <= r && r <= '9':
509 return false
510 case 'a' <= r && r <= 'z':
511 return false
512 case 'A' <= r && r <= 'Z':
513 return false
514 case r == '_':
515 return false
516 }
517 return true
518 }
519 // Letters and digits are not separators
520 if unicode.IsLetter(r) || unicode.IsDigit(r) {
521 return false
522 }
523 // Otherwise, all we can do for now is treat spaces as separators.
524 return unicode.IsSpace(r)
525}
526
Dan Willemsena3223282018-02-27 19:41:43 -0800527// Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
528// words mapped to their title case.
Brent Austinba3052e2015-04-21 16:08:23 -0700529//
Dan Willemsen09eb3b12015-09-16 14:34:17 -0700530// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
Brent Austinba3052e2015-04-21 16:08:23 -0700531func Title(s []byte) []byte {
532 // Use a closure here to remember state.
533 // Hackish but effective. Depends on Map scanning in order and calling
534 // the closure once per rune.
535 prev := ' '
536 return Map(
537 func(r rune) rune {
538 if isSeparator(prev) {
539 prev = r
540 return unicode.ToTitle(r)
541 }
542 prev = r
543 return r
544 },
545 s)
546}
547
Dan Willemsena3223282018-02-27 19:41:43 -0800548// TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
549// all leading UTF-8-encoded code points c that satisfy f(c).
Brent Austinba3052e2015-04-21 16:08:23 -0700550func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
551 i := indexFunc(s, f, false)
552 if i == -1 {
553 return nil
554 }
555 return s[i:]
556}
557
Dan Willemsena3223282018-02-27 19:41:43 -0800558// TrimRightFunc returns a subslice of s by slicing off all trailing
559// UTF-8-encoded code points c that satisfy f(c).
Brent Austinba3052e2015-04-21 16:08:23 -0700560func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
561 i := lastIndexFunc(s, f, false)
562 if i >= 0 && s[i] >= utf8.RuneSelf {
563 _, wid := utf8.DecodeRune(s[i:])
564 i += wid
565 } else {
566 i++
567 }
568 return s[0:i]
569}
570
571// TrimFunc returns a subslice of s by slicing off all leading and trailing
Dan Willemsena3223282018-02-27 19:41:43 -0800572// UTF-8-encoded code points c that satisfy f(c).
Brent Austinba3052e2015-04-21 16:08:23 -0700573func TrimFunc(s []byte, f func(r rune) bool) []byte {
574 return TrimRightFunc(TrimLeftFunc(s, f), f)
575}
576
577// TrimPrefix returns s without the provided leading prefix string.
578// If s doesn't start with prefix, s is returned unchanged.
579func TrimPrefix(s, prefix []byte) []byte {
580 if HasPrefix(s, prefix) {
581 return s[len(prefix):]
582 }
583 return s
584}
585
586// TrimSuffix returns s without the provided trailing suffix string.
587// If s doesn't end with suffix, s is returned unchanged.
588func TrimSuffix(s, suffix []byte) []byte {
589 if HasSuffix(s, suffix) {
590 return s[:len(s)-len(suffix)]
591 }
592 return s
593}
594
Dan Willemsena3223282018-02-27 19:41:43 -0800595// IndexFunc interprets s as a sequence of UTF-8-encoded code points.
Brent Austinba3052e2015-04-21 16:08:23 -0700596// It returns the byte index in s of the first Unicode
597// code point satisfying f(c), or -1 if none do.
598func IndexFunc(s []byte, f func(r rune) bool) int {
599 return indexFunc(s, f, true)
600}
601
Dan Willemsena3223282018-02-27 19:41:43 -0800602// LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
Brent Austinba3052e2015-04-21 16:08:23 -0700603// It returns the byte index in s of the last Unicode
604// code point satisfying f(c), or -1 if none do.
605func LastIndexFunc(s []byte, f func(r rune) bool) int {
606 return lastIndexFunc(s, f, true)
607}
608
609// indexFunc is the same as IndexFunc except that if
610// truth==false, the sense of the predicate function is
611// inverted.
612func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
613 start := 0
614 for start < len(s) {
615 wid := 1
616 r := rune(s[start])
617 if r >= utf8.RuneSelf {
618 r, wid = utf8.DecodeRune(s[start:])
619 }
620 if f(r) == truth {
621 return start
622 }
623 start += wid
624 }
625 return -1
626}
627
628// lastIndexFunc is the same as LastIndexFunc except that if
629// truth==false, the sense of the predicate function is
630// inverted.
631func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
632 for i := len(s); i > 0; {
633 r, size := rune(s[i-1]), 1
634 if r >= utf8.RuneSelf {
635 r, size = utf8.DecodeLastRune(s[0:i])
636 }
637 i -= size
638 if f(r) == truth {
639 return i
640 }
641 }
642 return -1
643}
644
Dan Willemsenebae3022017-01-13 23:01:08 -0800645// asciiSet is a 32-byte value, where each bit represents the presence of a
646// given ASCII character in the set. The 128-bits of the lower 16 bytes,
647// starting with the least-significant bit of the lowest word to the
648// most-significant bit of the highest word, map to the full range of all
649// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
650// ensuring that any non-ASCII character will be reported as not in the set.
651type asciiSet [8]uint32
652
653// makeASCIISet creates a set of ASCII characters and reports whether all
654// characters in chars are ASCII.
655func makeASCIISet(chars string) (as asciiSet, ok bool) {
656 for i := 0; i < len(chars); i++ {
657 c := chars[i]
658 if c >= utf8.RuneSelf {
659 return as, false
660 }
661 as[c>>5] |= 1 << uint(c&31)
662 }
663 return as, true
664}
665
666// contains reports whether c is inside the set.
667func (as *asciiSet) contains(c byte) bool {
668 return (as[c>>5] & (1 << uint(c&31))) != 0
669}
670
Brent Austinba3052e2015-04-21 16:08:23 -0700671func makeCutsetFunc(cutset string) func(r rune) bool {
Dan Willemsenebae3022017-01-13 23:01:08 -0800672 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
673 return func(r rune) bool {
674 return r == rune(cutset[0])
675 }
676 }
677 if as, isASCII := makeASCIISet(cutset); isASCII {
678 return func(r rune) bool {
679 return r < utf8.RuneSelf && as.contains(byte(r))
680 }
681 }
Brent Austinba3052e2015-04-21 16:08:23 -0700682 return func(r rune) bool {
683 for _, c := range cutset {
684 if c == r {
685 return true
686 }
687 }
688 return false
689 }
690}
691
692// Trim returns a subslice of s by slicing off all leading and
Dan Willemsena3223282018-02-27 19:41:43 -0800693// trailing UTF-8-encoded code points contained in cutset.
Brent Austinba3052e2015-04-21 16:08:23 -0700694func Trim(s []byte, cutset string) []byte {
695 return TrimFunc(s, makeCutsetFunc(cutset))
696}
697
698// TrimLeft returns a subslice of s by slicing off all leading
Dan Willemsena3223282018-02-27 19:41:43 -0800699// UTF-8-encoded code points contained in cutset.
Brent Austinba3052e2015-04-21 16:08:23 -0700700func TrimLeft(s []byte, cutset string) []byte {
701 return TrimLeftFunc(s, makeCutsetFunc(cutset))
702}
703
704// TrimRight returns a subslice of s by slicing off all trailing
Dan Willemsena3223282018-02-27 19:41:43 -0800705// UTF-8-encoded code points that are contained in cutset.
Brent Austinba3052e2015-04-21 16:08:23 -0700706func TrimRight(s []byte, cutset string) []byte {
707 return TrimRightFunc(s, makeCutsetFunc(cutset))
708}
709
710// TrimSpace returns a subslice of s by slicing off all leading and
711// trailing white space, as defined by Unicode.
712func TrimSpace(s []byte) []byte {
713 return TrimFunc(s, unicode.IsSpace)
714}
715
Dan Willemsena3223282018-02-27 19:41:43 -0800716// Runes interprets s as a sequence of UTF-8-encoded code points.
717// It returns a slice of runes (Unicode code points) equivalent to s.
Brent Austinba3052e2015-04-21 16:08:23 -0700718func Runes(s []byte) []rune {
719 t := make([]rune, utf8.RuneCount(s))
720 i := 0
721 for len(s) > 0 {
722 r, l := utf8.DecodeRune(s)
723 t[i] = r
724 i++
725 s = s[l:]
726 }
727 return t
728}
729
730// Replace returns a copy of the slice s with the first n
731// non-overlapping instances of old replaced by new.
732// If old is empty, it matches at the beginning of the slice
733// and after each UTF-8 sequence, yielding up to k+1 replacements
734// for a k-rune slice.
735// If n < 0, there is no limit on the number of replacements.
736func Replace(s, old, new []byte, n int) []byte {
737 m := 0
738 if n != 0 {
739 // Compute number of replacements.
740 m = Count(s, old)
741 }
742 if m == 0 {
743 // Just return a copy.
744 return append([]byte(nil), s...)
745 }
746 if n < 0 || m < n {
747 n = m
748 }
749
750 // Apply replacements to buffer.
751 t := make([]byte, len(s)+n*(len(new)-len(old)))
752 w := 0
753 start := 0
754 for i := 0; i < n; i++ {
755 j := start
756 if len(old) == 0 {
757 if i > 0 {
758 _, wid := utf8.DecodeRune(s[start:])
759 j += wid
760 }
761 } else {
762 j += Index(s[start:], old)
763 }
764 w += copy(t[w:], s[start:j])
765 w += copy(t[w:], new)
766 start = j + len(old)
767 }
768 w += copy(t[w:], s[start:])
769 return t[0:w]
770}
771
772// EqualFold reports whether s and t, interpreted as UTF-8 strings,
773// are equal under Unicode case-folding.
774func EqualFold(s, t []byte) bool {
775 for len(s) != 0 && len(t) != 0 {
776 // Extract first rune from each.
777 var sr, tr rune
778 if s[0] < utf8.RuneSelf {
779 sr, s = rune(s[0]), s[1:]
780 } else {
781 r, size := utf8.DecodeRune(s)
782 sr, s = r, s[size:]
783 }
784 if t[0] < utf8.RuneSelf {
785 tr, t = rune(t[0]), t[1:]
786 } else {
787 r, size := utf8.DecodeRune(t)
788 tr, t = r, t[size:]
789 }
790
791 // If they match, keep going; if not, return false.
792
793 // Easy case.
794 if tr == sr {
795 continue
796 }
797
798 // Make sr < tr to simplify what follows.
799 if tr < sr {
800 tr, sr = sr, tr
801 }
802 // Fast check for ASCII.
803 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
804 // ASCII, and sr is upper case. tr must be lower case.
805 if tr == sr+'a'-'A' {
806 continue
807 }
808 return false
809 }
810
Dan Willemsen38f2dba2016-07-08 14:54:35 -0700811 // General case. SimpleFold(x) returns the next equivalent rune > x
Brent Austinba3052e2015-04-21 16:08:23 -0700812 // or wraps around to smaller values.
813 r := unicode.SimpleFold(sr)
814 for r != sr && r < tr {
815 r = unicode.SimpleFold(r)
816 }
817 if r == tr {
818 continue
819 }
820 return false
821 }
822
Dan Willemsen38f2dba2016-07-08 14:54:35 -0700823 // One string is empty. Are both?
Brent Austinba3052e2015-04-21 16:08:23 -0700824 return len(s) == len(t)
825}
Dan Willemsena3223282018-02-27 19:41:43 -0800826
827func indexRabinKarp(s, sep []byte) int {
828 // Rabin-Karp search
829 hashsep, pow := hashStr(sep)
830 n := len(sep)
831 var h uint32
832 for i := 0; i < n; i++ {
833 h = h*primeRK + uint32(s[i])
834 }
835 if h == hashsep && Equal(s[:n], sep) {
836 return 0
837 }
838 for i := n; i < len(s); {
839 h *= primeRK
840 h += uint32(s[i])
841 h -= pow * uint32(s[i-n])
842 i++
843 if h == hashsep && Equal(s[i-n:i], sep) {
844 return i - n
845 }
846 }
847 return -1
848}
849
850// primeRK is the prime base used in Rabin-Karp algorithm.
851const primeRK = 16777619
852
853// hashStr returns the hash and the appropriate multiplicative
854// factor for use in Rabin-Karp algorithm.
855func hashStr(sep []byte) (uint32, uint32) {
856 hash := uint32(0)
857 for i := 0; i < len(sep); i++ {
858 hash = hash*primeRK + uint32(sep[i])
859 }
860 var pow, sq uint32 = 1, primeRK
861 for i := len(sep); i > 0; i >>= 1 {
862 if i&1 != 0 {
863 pow *= sq
864 }
865 sq *= sq
866 }
867 return hash, pow
868}