bytes.go 24.3 KB
Newer Older
1 2 3 4
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

5 6
// Package bytes implements functions for the manipulation of byte slices.
// It is analogous to the facilities of the strings package.
7 8 9
package bytes

import (
10
	"internal/bytealg"
11
	"unicode"
12
	"unicode/utf8"
13 14
)

15
func equalPortable(a, b []byte) bool {
16 17 18 19 20 21 22 23 24 25 26
	if len(a) != len(b) {
		return false
	}
	for i, c := range a {
		if c != b[i] {
			return false
		}
	}
	return true
}

27
// explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
28
// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
29 30 31 32 33 34 35 36 37 38 39 40 41 42
func explode(s []byte, n int) [][]byte {
	if n <= 0 {
		n = len(s)
	}
	a := make([][]byte, n)
	var size int
	na := 0
	for len(s) > 0 {
		if na+1 >= n {
			a[na] = s
			na++
			break
		}
		_, size = utf8.DecodeRune(s)
43
		a[na] = s[0:size:size]
44 45 46 47 48 49
		s = s[size:]
		na++
	}
	return a[0:na]
}

50 51 52
// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
func Count(s, sep []byte) int {
53 54
	// special case
	if len(sep) == 0 {
55 56
		return utf8.RuneCount(s) + 1
	}
57 58 59
	if len(sep) == 1 {
		return bytealg.Count(s, sep[0])
	}
60 61 62 63 64
	n := 0
	for {
		i := Index(s, sep)
		if i == -1 {
			return n
65
		}
66 67
		n++
		s = s[i+len(sep):]
68 69 70
	}
}

71
// Contains reports whether subslice is within b.
72 73 74 75
func Contains(b, subslice []byte) bool {
	return Index(b, subslice) != -1
}

76
// ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
77 78 79 80
func ContainsAny(b []byte, chars string) bool {
	return IndexAny(b, chars) >= 0
}

81
// ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
82 83 84 85
func ContainsRune(b []byte, r rune) bool {
	return IndexRune(b, r) >= 0
}

86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
func indexBytePortable(s []byte, c byte) int {
	for i, b := range s {
		if b == c {
			return i
		}
	}
	return -1
}

// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
func LastIndex(s, sep []byte) int {
	n := len(sep)
	if n == 0 {
		return len(s)
	}
	c := sep[0]
	for i := len(s) - n; i >= 0; i-- {
		if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
			return i
		}
	}
	return -1
}

110 111 112 113 114 115 116 117 118 119
// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
func LastIndexByte(s []byte, c byte) int {
	for i := len(s) - 1; i >= 0; i-- {
		if s[i] == c {
			return i
		}
	}
	return -1
}

120
// IndexRune interprets s as a sequence of UTF-8-encoded code points.
121 122
// It returns the byte index of the first occurrence in s of the given rune.
// It returns -1 if rune is not present in s.
123 124
// If r is utf8.RuneError, it returns the first instance of any
// invalid UTF-8 byte sequence.
125
func IndexRune(s []byte, r rune) int {
126 127 128 129 130 131 132 133 134 135
	switch {
	case 0 <= r && r < utf8.RuneSelf:
		return IndexByte(s, byte(r))
	case r == utf8.RuneError:
		for i := 0; i < len(s); {
			r1, n := utf8.DecodeRune(s[i:])
			if r1 == utf8.RuneError {
				return i
			}
			i += n
136
		}
137 138 139 140 141 142 143
		return -1
	case !utf8.ValidRune(r):
		return -1
	default:
		var b [utf8.UTFMax]byte
		n := utf8.EncodeRune(b[:], r)
		return Index(s, b[:n])
144 145 146 147 148
	}
}

// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
// It returns the byte index of the first occurrence in s of any of the Unicode
149
// code points in chars. It returns -1 if chars is empty or if there is no code
150 151
// point in common.
func IndexAny(s []byte, chars string) int {
152 153 154 155 156 157 158 159 160
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i, c := range s {
				if as.contains(c) {
					return i
161 162
				}
			}
163
			return -1
164
		}
165 166 167 168 169 170 171 172 173 174 175 176
	}
	var width int
	for i := 0; i < len(s); i += width {
		r := rune(s[i])
		if r < utf8.RuneSelf {
			width = 1
		} else {
			r, width = utf8.DecodeRune(s[i:])
		}
		for _, ch := range chars {
			if r == ch {
				return i
177 178 179 180 181 182
			}
		}
	}
	return -1
}

183
// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
184 185
// points. It returns the byte index of the last occurrence in s of any of
// the Unicode code points in chars. It returns -1 if chars is empty or if
186 187
// there is no code point in common.
func LastIndexAny(s []byte, chars string) int {
188 189 190 191 192 193 194 195 196
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := len(s) - 1; i >= 0; i-- {
				if as.contains(s[i]) {
					return i
197 198
				}
			}
199
			return -1
200
		}
201 202 203 204 205 206 207
	}
	for i := len(s); i > 0; {
		r, size := utf8.DecodeLastRune(s[:i])
		i -= size
		for _, c := range chars {
			if r == c {
				return i
208 209 210 211 212 213
			}
		}
	}
	return -1
}

214
// Generic split: splits after each instance of sep,
215
// including sepSave bytes of sep in the subslices.
216 217 218 219 220 221 222 223 224 225
func genSplit(s, sep []byte, sepSave, n int) [][]byte {
	if n == 0 {
		return nil
	}
	if len(sep) == 0 {
		return explode(s, n)
	}
	if n < 0 {
		n = Count(s, sep) + 1
	}
226

227
	a := make([][]byte, n)
228 229 230 231 232 233
	n--
	i := 0
	for i < n {
		m := Index(s, sep)
		if m < 0 {
			break
234
		}
235
		a[i] = s[: m+sepSave : m+sepSave]
236 237
		s = s[m+len(sep):]
		i++
238
	}
239 240
	a[i] = s
	return a[:i+1]
241 242
}

243
// SplitN slices s into subslices separated by sep and returns a slice of
244
// the subslices between those separators.
245
// If sep is empty, SplitN splits after each UTF-8 sequence.
246 247 248 249
// The count determines the number of subslices to return:
//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
//   n == 0: the result is nil (zero subslices)
//   n < 0: all subslices
250
func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
251

252
// SplitAfterN slices s into subslices after each instance of sep and
253
// returns a slice of those subslices.
254
// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
255 256 257 258
// The count determines the number of subslices to return:
//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
//   n == 0: the result is nil (zero subslices)
//   n < 0: all subslices
259
func SplitAfterN(s, sep []byte, n int) [][]byte {
260 261 262
	return genSplit(s, sep, len(sep), n)
}

263 264 265 266 267 268 269 270 271 272 273 274 275 276
// Split slices s into all subslices separated by sep and returns a slice of
// the subslices between those separators.
// If sep is empty, Split splits after each UTF-8 sequence.
// It is equivalent to SplitN with a count of -1.
func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }

// SplitAfter slices s into all subslices after each instance of sep and
// returns a slice of those subslices.
// If sep is empty, SplitAfter splits after each UTF-8 sequence.
// It is equivalent to SplitAfterN with a count of -1.
func SplitAfter(s, sep []byte) [][]byte {
	return genSplit(s, sep, len(sep), -1)
}

277 278 279 280 281 282
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// Fields interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
// empty slice if s contains only white space.
283
func Fields(s []byte) [][]byte {
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
	// First count the fields.
	// This is an exact count if s is ASCII, otherwise it is an approximation.
	n := 0
	wasSpace := 1
	// setBits is used to track which bits are set in the bytes of s.
	setBits := uint8(0)
	for i := 0; i < len(s); i++ {
		r := s[i]
		setBits |= r
		isSpace := int(asciiSpace[r])
		n += wasSpace & ^isSpace
		wasSpace = isSpace
	}

	if setBits >= utf8.RuneSelf {
		// Some runes in the input slice are not ASCII.
		return FieldsFunc(s, unicode.IsSpace)
	}

	// ASCII fast path
	a := make([][]byte, n)
	na := 0
	fieldStart := 0
	i := 0
	// Skip spaces in the front of the input.
	for i < len(s) && asciiSpace[s[i]] != 0 {
		i++
	}
	fieldStart = i
	for i < len(s) {
		if asciiSpace[s[i]] == 0 {
			i++
			continue
		}
		a[na] = s[fieldStart:i:i]
		na++
		i++
		// Skip spaces in between fields.
		for i < len(s) && asciiSpace[s[i]] != 0 {
			i++
		}
		fieldStart = i
	}
	if fieldStart < len(s) { // Last field might end at EOF.
		a[na] = s[fieldStart:len(s):len(s)]
	}
	return a
331 332
}

333
// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
334
// It splits the slice s at each run of code points c satisfying f(c) and
335
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
336
// len(s) == 0, an empty slice is returned.
337 338
// FieldsFunc makes no guarantees about the order in which it calls f(c).
// If f does not return consistent results for a given c, FieldsFunc may crash.
339
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
340 341 342 343 344
	// A span is used to record a slice of s of the form s[start:end].
	// The start index is inclusive and the end index is exclusive.
	type span struct {
		start int
		end   int
345
	}
346
	spans := make([]span, 0, 32)
347

348 349 350 351 352 353 354 355
	// Find the field start and end indices.
	wasField := false
	fromIndex := 0
	for i := 0; i < len(s); {
		size := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(s[i:])
356
		}
357 358 359 360 361 362 363 364 365 366
		if f(r) {
			if wasField {
				spans = append(spans, span{start: fromIndex, end: i})
				wasField = false
			}
		} else {
			if !wasField {
				fromIndex = i
				wasField = true
			}
367 368 369
		}
		i += size
	}
370 371 372 373 374 375 376 377 378 379 380 381 382

	// Last field might end at EOF.
	if wasField {
		spans = append(spans, span{fromIndex, len(s)})
	}

	// Create subslices from recorded field indices.
	a := make([][]byte, len(spans))
	for i, span := range spans {
		a[i] = s[span.start:span.end:span.end]
	}

	return a
383 384
}

385 386 387 388
// Join concatenates the elements of s to create a new byte slice. The separator
// sep is placed between elements in the resulting slice.
func Join(s [][]byte, sep []byte) []byte {
	if len(s) == 0 {
389 390
		return []byte{}
	}
391
	if len(s) == 1 {
392
		// Just return a copy.
393
		return append([]byte(nil), s[0]...)
394
	}
395 396 397
	n := len(sep) * (len(s) - 1)
	for _, v := range s {
		n += len(v)
398 399 400
	}

	b := make([]byte, n)
401 402
	bp := copy(b, s[0])
	for _, v := range s[1:] {
403
		bp += copy(b[bp:], sep)
404
		bp += copy(b[bp:], v)
405 406 407 408
	}
	return b
}

409
// HasPrefix tests whether the byte slice s begins with prefix.
410 411 412 413
func HasPrefix(s, prefix []byte) bool {
	return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
}

414
// HasSuffix tests whether the byte slice s ends with suffix.
415 416 417 418
func HasSuffix(s, suffix []byte) bool {
	return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
}

419
// Map returns a copy of the byte slice s with all its characters modified
420
// according to the mapping function. If mapping returns a negative value, the character is
421 422
// dropped from the byte slice with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded code points.
423
func Map(mapping func(r rune) rune, s []byte) []byte {
424
	// In the worst case, the slice can grow when mapped, making
425 426
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
427 428 429 430 431
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	b := make([]byte, maxbytes)
	for i := 0; i < len(s); {
		wid := 1
432 433 434
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
435
		}
436 437
		r = mapping(r)
		if r >= 0 {
438 439 440 441 442
			rl := utf8.RuneLen(r)
			if rl < 0 {
				rl = len(string(utf8.RuneError))
			}
			if nbytes+rl > maxbytes {
443 444 445 446 447 448
				// Grow the buffer.
				maxbytes = maxbytes*2 + utf8.UTFMax
				nb := make([]byte, maxbytes)
				copy(nb, b[0:nbytes])
				b = nb
			}
449
			nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
450 451 452 453 454 455 456
		}
		i += wid
	}
	return b[0:nbytes]
}

// Repeat returns a new byte slice consisting of count copies of b.
457 458 459
//
// It panics if count is negative or if
// the result of (len(b) * count) overflows.
460
func Repeat(b []byte, count int) []byte {
461 462 463 464 465 466 467 468 469 470
	// Since we cannot return an error on overflow,
	// we should panic if the repeat will generate
	// an overflow.
	// See Issue golang.org/issue/16237.
	if count < 0 {
		panic("bytes: negative Repeat count")
	} else if count > 0 && len(b)*count/count != len(b) {
		panic("bytes: Repeat count causes overflow")
	}

471
	nb := make([]byte, len(b)*count)
472 473 474 475
	bp := copy(nb, b)
	for bp < len(nb) {
		copy(nb[bp:], nb[:bp])
		bp *= 2
476 477 478 479
	}
	return nb
}

480
// ToUpper treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters within it mapped to their upper case.
481 482
func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }

483
// ToLower treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their lower case.
484 485
func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }

486
// ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
487 488
func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }

489
// ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
490
// upper case, giving priority to the special casing rules.
491 492
func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(func(r rune) rune { return c.ToUpper(r) }, s)
493 494
}

495
// ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
496
// lower case, giving priority to the special casing rules.
497 498
func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(func(r rune) rune { return c.ToLower(r) }, s)
499 500
}

501
// ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
502
// title case, giving priority to the special casing rules.
503 504
func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(func(r rune) rune { return c.ToTitle(r) }, s)
505 506 507 508
}

// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
509
func isSeparator(r rune) bool {
510
	// ASCII alphanumerics and underscore are not separators
511
	if r <= 0x7F {
512
		switch {
513
		case '0' <= r && r <= '9':
514
			return false
515
		case 'a' <= r && r <= 'z':
516
			return false
517
		case 'A' <= r && r <= 'Z':
518
			return false
519
		case r == '_':
520 521 522 523 524
			return false
		}
		return true
	}
	// Letters and digits are not separators
525
	if unicode.IsLetter(r) || unicode.IsDigit(r) {
526 527 528
		return false
	}
	// Otherwise, all we can do for now is treat spaces as separators.
529
	return unicode.IsSpace(r)
530 531
}

532 533
// Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
// words mapped to their title case.
534
//
535
// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
536 537 538 539
func Title(s []byte) []byte {
	// Use a closure here to remember state.
	// Hackish but effective. Depends on Map scanning in order and calling
	// the closure once per rune.
540
	prev := ' '
541
	return Map(
542
		func(r rune) rune {
543 544 545 546 547 548 549 550 551 552
			if isSeparator(prev) {
				prev = r
				return unicode.ToTitle(r)
			}
			prev = r
			return r
		},
		s)
}

553 554
// TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
// all leading UTF-8-encoded code points c that satisfy f(c).
555
func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
556 557 558 559 560 561 562
	i := indexFunc(s, f, false)
	if i == -1 {
		return nil
	}
	return s[i:]
}

563 564
// TrimRightFunc returns a subslice of s by slicing off all trailing
// UTF-8-encoded code points c that satisfy f(c).
565
func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
566 567 568 569 570 571 572 573 574 575 576
	i := lastIndexFunc(s, f, false)
	if i >= 0 && s[i] >= utf8.RuneSelf {
		_, wid := utf8.DecodeRune(s[i:])
		i += wid
	} else {
		i++
	}
	return s[0:i]
}

// TrimFunc returns a subslice of s by slicing off all leading and trailing
577
// UTF-8-encoded code points c that satisfy f(c).
578
func TrimFunc(s []byte, f func(r rune) bool) []byte {
579 580 581
	return TrimRightFunc(TrimLeftFunc(s, f), f)
}

582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599
// TrimPrefix returns s without the provided leading prefix string.
// If s doesn't start with prefix, s is returned unchanged.
func TrimPrefix(s, prefix []byte) []byte {
	if HasPrefix(s, prefix) {
		return s[len(prefix):]
	}
	return s
}

// TrimSuffix returns s without the provided trailing suffix string.
// If s doesn't end with suffix, s is returned unchanged.
func TrimSuffix(s, suffix []byte) []byte {
	if HasSuffix(s, suffix) {
		return s[:len(s)-len(suffix)]
	}
	return s
}

600
// IndexFunc interprets s as a sequence of UTF-8-encoded code points.
601 602
// It returns the byte index in s of the first Unicode
// code point satisfying f(c), or -1 if none do.
603
func IndexFunc(s []byte, f func(r rune) bool) int {
604 605 606
	return indexFunc(s, f, true)
}

607
// LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
608 609
// It returns the byte index in s of the last Unicode
// code point satisfying f(c), or -1 if none do.
610
func LastIndexFunc(s []byte, f func(r rune) bool) int {
611 612 613 614 615 616
	return lastIndexFunc(s, f, true)
}

// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
617
func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
618 619 620
	start := 0
	for start < len(s) {
		wid := 1
621 622 623
		r := rune(s[start])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[start:])
624
		}
625
		if f(r) == truth {
626 627 628 629 630 631 632 633 634 635
			return start
		}
		start += wid
	}
	return -1
}

// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
636
func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
637
	for i := len(s); i > 0; {
638 639 640 641
		r, size := rune(s[i-1]), 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeLastRune(s[0:i])
		}
642
		i -= size
643
		if f(r) == truth {
644 645 646 647 648 649
			return i
		}
	}
	return -1
}

650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675
// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		if c >= utf8.RuneSelf {
			return as, false
		}
		as[c>>5] |= 1 << uint(c&31)
	}
	return as, true
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
	return (as[c>>5] & (1 << uint(c&31))) != 0
}

676
func makeCutsetFunc(cutset string) func(r rune) bool {
677 678 679 680 681 682 683 684 685 686
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return func(r rune) bool {
			return r == rune(cutset[0])
		}
	}
	if as, isASCII := makeASCIISet(cutset); isASCII {
		return func(r rune) bool {
			return r < utf8.RuneSelf && as.contains(byte(r))
		}
	}
687
	return func(r rune) bool {
688
		for _, c := range cutset {
689
			if c == r {
690 691 692 693 694 695 696 697
				return true
			}
		}
		return false
	}
}

// Trim returns a subslice of s by slicing off all leading and
698
// trailing UTF-8-encoded code points contained in cutset.
699 700 701 702 703
func Trim(s []byte, cutset string) []byte {
	return TrimFunc(s, makeCutsetFunc(cutset))
}

// TrimLeft returns a subslice of s by slicing off all leading
704
// UTF-8-encoded code points contained in cutset.
705 706 707 708 709
func TrimLeft(s []byte, cutset string) []byte {
	return TrimLeftFunc(s, makeCutsetFunc(cutset))
}

// TrimRight returns a subslice of s by slicing off all trailing
710
// UTF-8-encoded code points that are contained in cutset.
711 712 713 714 715
func TrimRight(s []byte, cutset string) []byte {
	return TrimRightFunc(s, makeCutsetFunc(cutset))
}

// TrimSpace returns a subslice of s by slicing off all leading and
716
// trailing white space, as defined by Unicode.
717 718 719 720
func TrimSpace(s []byte) []byte {
	return TrimFunc(s, unicode.IsSpace)
}

721 722
// Runes interprets s as a sequence of UTF-8-encoded code points.
// It returns a slice of runes (Unicode code points) equivalent to s.
723 724
func Runes(s []byte) []rune {
	t := make([]rune, utf8.RuneCount(s))
725 726 727 728 729 730 731 732 733 734 735 736
	i := 0
	for len(s) > 0 {
		r, l := utf8.DecodeRune(s)
		t[i] = r
		i++
		s = s[l:]
	}
	return t
}

// Replace returns a copy of the slice s with the first n
// non-overlapping instances of old replaced by new.
737 738 739
// If old is empty, it matches at the beginning of the slice
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune slice.
740 741
// If n < 0, there is no limit on the number of replacements.
func Replace(s, old, new []byte, n int) []byte {
742 743 744 745 746 747
	m := 0
	if n != 0 {
		// Compute number of replacements.
		m = Count(s, old)
	}
	if m == 0 {
748 749
		// Just return a copy.
		return append([]byte(nil), s...)
750 751
	}
	if n < 0 || m < n {
752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
		n = m
	}

	// Apply replacements to buffer.
	t := make([]byte, len(s)+n*(len(new)-len(old)))
	w := 0
	start := 0
	for i := 0; i < n; i++ {
		j := start
		if len(old) == 0 {
			if i > 0 {
				_, wid := utf8.DecodeRune(s[start:])
				j += wid
			}
		} else {
			j += Index(s[start:], old)
		}
		w += copy(t[w:], s[start:j])
		w += copy(t[w:], new)
		start = j + len(old)
	}
	w += copy(t[w:], s[start:])
	return t[0:w]
}
776 777 778 779 780 781

// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding.
func EqualFold(s, t []byte) bool {
	for len(s) != 0 && len(t) != 0 {
		// Extract first rune from each.
782
		var sr, tr rune
783
		if s[0] < utf8.RuneSelf {
784
			sr, s = rune(s[0]), s[1:]
785 786 787 788 789
		} else {
			r, size := utf8.DecodeRune(s)
			sr, s = r, s[size:]
		}
		if t[0] < utf8.RuneSelf {
790
			tr, t = rune(t[0]), t[1:]
791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807
		} else {
			r, size := utf8.DecodeRune(t)
			tr, t = r, t[size:]
		}

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// Fast check for ASCII.
808 809 810
		if tr < utf8.RuneSelf {
			// ASCII only, sr/tr must be upper/lower case
			if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
811 812 813 814 815
				continue
			}
			return false
		}

816
		// General case. SimpleFold(x) returns the next equivalent rune > x
817 818 819 820 821 822 823 824 825 826 827
		// or wraps around to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r == tr {
			continue
		}
		return false
	}

828
	// One string is empty. Are both?
829 830
	return len(s) == len(t)
}
831

832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= bytealg.MaxLen:
		// Use brute force when s and sep both are small
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.Index(s, sep)
		}
		c := sep[0]
		i := 0
		t := s[:len(s)-n+1]
		fails := 0
		for i < len(t) {
			if t[i] != c {
				// IndexByte is faster than bytealg.Index, so use it as long as
				// we're not getting lots of false positives.
				o := IndexByte(t[i:], c)
				if o < 0 {
					return -1
				}
				i += o
			}
			if Equal(s[i:i+n], sep) {
				return i
			}
			fails++
			i++
			// Switch to bytealg.Index when IndexByte produces too many false positives.
			if fails > bytealg.Cutover(i) {
				r := bytealg.Index(s[i:], sep)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	c := sep[0]
	i := 0
	fails := 0
	t := s[:len(s)-n+1]
	for i < len(t) {
		if t[i] != c {
			o := IndexByte(t[i:], c)
			if o < 0 {
				break
			}
			i += o
		}
		if Equal(s[i:i+n], sep) {
			return i
		}
		i++
		fails++
		if fails >= 4+i>>4 && i < len(t) {
			// Give up on IndexByte, it isn't skipping ahead
			// far enough to be better than Rabin-Karp.
			// Experiments (using IndexPeriodic) suggest
			// the cutover is about 16 byte skips.
			// TODO: if large prefixes of sep are matching
			// we should cutover at even larger average skips,
			// because Equal becomes that much more expensive.
			// This code does not take that effect into account.
			j := indexRabinKarp(s[i:], sep)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}

918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959
func indexRabinKarp(s, sep []byte) int {
	// Rabin-Karp search
	hashsep, pow := hashStr(sep)
	n := len(sep)
	var h uint32
	for i := 0; i < n; i++ {
		h = h*primeRK + uint32(s[i])
	}
	if h == hashsep && Equal(s[:n], sep) {
		return 0
	}
	for i := n; i < len(s); {
		h *= primeRK
		h += uint32(s[i])
		h -= pow * uint32(s[i-n])
		i++
		if h == hashsep && Equal(s[i-n:i], sep) {
			return i - n
		}
	}
	return -1
}

// primeRK is the prime base used in Rabin-Karp algorithm.
const primeRK = 16777619

// hashStr returns the hash and the appropriate multiplicative
// factor for use in Rabin-Karp algorithm.
func hashStr(sep []byte) (uint32, uint32) {
	hash := uint32(0)
	for i := 0; i < len(sep); i++ {
		hash = hash*primeRK + uint32(sep[i])
	}
	var pow, sq uint32 = 1, primeRK
	for i := len(sep); i > 0; i >>= 1 {
		if i&1 != 0 {
			pow *= sq
		}
		sq *= sq
	}
	return hash, pow
}