// Copyright 2011 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax_test

import . "regexp/syntax"
import "testing"

// simplifyTests is the table driving TestSimplify. Each entry pairs an
// input pattern with the text expected from printing its simplified form.
var simplifyTests = []struct {
	// Regexp is the pattern handed to Parse.
	Regexp string
	// Simple is the expected result of Simplify().String() on the parsed pattern.
	Simple string
}{
	// Already-simple constructs
	{`a`, `a`},
	{`ab`, `ab`},
	{`a|b`, `[a-b]`},
	{`ab|cd`, `ab|cd`},
	{`(ab)*`, `(ab)*`},
	{`(ab)+`, `(ab)+`},
	{`(ab)?`, `(ab)?`},
	{`.`, `(?s:.)`},
	{`^`, `^`},
	{`$`, `$`},
	{`[ac]`, `[ac]`},
	{`[^ac]`, `[^ac]`},

	// Posix character classes
	{`[[:alnum:]]`, `[0-9A-Za-z]`},
	{`[[:alpha:]]`, `[A-Za-z]`},
	{`[[:blank:]]`, `[\t ]`},
	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
	{`[[:digit:]]`, `[0-9]`},
	{`[[:graph:]]`, `[!-~]`},
	{`[[:lower:]]`, `[a-z]`},
	{`[[:print:]]`, `[ -~]`},
	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
	{`[[:space:]]`, `[\t-\r ]`},
	{`[[:upper:]]`, `[A-Z]`},
	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},

	// Perl character classes
	{`\d`, `[0-9]`},
	{`\s`, `[\t-\n\f-\r ]`},
	{`\w`, `[0-9A-Z_a-z]`},
	{`\D`, `[^0-9]`},
	{`\S`, `[^\t-\n\f-\r ]`},
	{`\W`, `[^0-9A-Z_a-z]`},
	{`[\d]`, `[0-9]`},
	{`[\s]`, `[\t-\n\f-\r ]`},
	{`[\w]`, `[0-9A-Z_a-z]`},
	{`[\D]`, `[^0-9]`},
	{`[\S]`, `[^\t-\n\f-\r ]`},
	{`[\W]`, `[^0-9A-Z_a-z]`},

	// Posix repetitions
	{`a{1}`, `a`},
	{`a{2}`, `aa`},
	{`a{5}`, `aaaaa`},
	{`a{0,1}`, `a?`},
	// The next six are hard to read because Simplify inserts (?:)
	// parens instead of () parens to avoid creating extra
	// captured subexpressions.  The comments show a version with fewer parens.
	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
	{`a{0,}`, `a*`},
	{`a{1,}`, `a+`},
	{`a{2,}`, `aa+`},
	{`a{5,}`, `aaaaa+`},

	// Test that operators simplify their arguments.
	{`(?:a{1,}){1,}`, `a+`},
	{`(a{1,}b{1,})`, `(a+b+)`},
	{`a{1,}|b{1,}`, `a+|b+`},
	{`(?:a{1,})*`, `(?:a+)*`},
	{`(?:a{1,})+`, `a+`},
	{`(?:a{1,})?`, `(?:a+)?`},
	{``, `(?:)`},
	{`a{0}`, `(?:)`},

	// Character class simplification
	{`[ab]`, `[a-b]`},
	{`[a-za-za-z]`, `[a-z]`},
	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
	{`[ABCDEFGH]`, `[A-H]`},
	{`[AB-CD-EF-GH]`, `[A-H]`},
	{`[W-ZP-XE-R]`, `[E-Z]`},
	{`[a-ee-gg-m]`, `[a-m]`},
	{`[a-ea-ha-m]`, `[a-m]`},
	{`[a-ma-ha-e]`, `[a-m]`},
	{`[a-zA-Z0-9 -~]`, `[ -~]`},

	// Empty character classes
	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},

	// Full character classes
	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},

	// Unicode case folding.
	{`(?i)A`, `(?i:A)`},
	{`(?i)a`, `(?i:A)`},
	{`(?i)[A]`, `(?i:A)`},
	{`(?i)[a]`, `(?i:A)`},
	{`(?i)K`, `(?i:K)`},
	{`(?i)k`, `(?i:K)`},
	{`(?i)\x{212a}`, "(?i:K)"},
	{`(?i)[K]`, "[Kk\u212A]"},
	{`(?i)[k]`, "[Kk\u212A]"},
	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},

	// Empty string as a regular expression.
	// The empty string must be preserved inside parens in order
	// to make submatches work right, so these tests are less
	// interesting than they might otherwise be.  String inserts
	// explicit (?:) in place of non-parenthesized empty strings,
	// to make them easier to spot for other parsers.
	{`(a|b|)`, `([a-b]|(?:))`},
	{`(|)`, `()`},
	{`a()`, `a()`},
	{`(()|())`, `(()|())`},
	{`(a|)`, `(a|(?:))`},
	{`ab()cd()`, `ab()cd()`},
	{`()`, `()`},
	{`()*`, `()*`},
	{`()+`, `()+`},
	{`()?`, `()?`},
	{`(){0}`, `(?:)`},
	{`(){1}`, `()`},
	{`(){1,}`, `()+`},
	{`(){0,2}`, `(?:()()?)?`},
}

// TestSimplify walks simplifyTests, parses each pattern, and checks that
// the printed form of the simplified expression equals the expected text.
// A pattern that fails to parse is reported and skipped, so one bad entry
// does not stop the remaining entries from being checked.
func TestSimplify(t *testing.T) {
	// &^ binds tighter than |, so this is MatchNL | (Perl &^ OneLine).
	flags := MatchNL | Perl&^OneLine
	for i := range simplifyTests {
		tc := simplifyTests[i]
		parsed, err := Parse(tc.Regexp, flags)
		if err != nil {
			t.Errorf("Parse(%#q) = error %v", tc.Regexp, err)
			continue
		}
		if got := parsed.Simplify().String(); got != tc.Simple {
			t.Errorf("Simplify(%#q) = %#q, want %#q", tc.Regexp, got, tc.Simple)
		}
	}
}