prog.go 7.65 KB
Newer Older
1 2 3 4
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

5 6 7 8 9
package syntax

import (
	"bytes"
	"strconv"
10
	"unicode"
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
)

// Compiled program.
// May not belong in this package, but convenient for now.

// A Prog is a compiled regular expression program.
type Prog struct {
	Inst   []Inst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}

// An InstOp is an instruction opcode.
type InstOp uint8

const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
35 36 37
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
38 39
)

40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
var instOpNames = []string{
	"InstAlt",
	"InstAltMatch",
	"InstCapture",
	"InstEmptyWidth",
	"InstMatch",
	"InstFail",
	"InstNop",
	"InstRune",
	"InstRune1",
	"InstRuneAny",
	"InstRuneAnyNotNL",
}

func (i InstOp) String() string {
	if uint(i) >= uint(len(instOpNames)) {
		return ""
	}
	return instOpNames[i]
}

61 62 63 64 65 66 67 68 69 70 71 72
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8

const (
	EmptyBeginLine EmptyOp = 1 << iota
	EmptyEndLine
	EmptyBeginText
	EmptyEndText
	EmptyWordBoundary
	EmptyNoWordBoundary
)

73 74 75 76 77 78
// EmptyOpContext returns the zero-width assertions
// satisfied at the position between the runes r1 and r2.
// Passing r1 == -1 indicates that the position is
// at the beginning of the text.
// Passing r2 == -1 indicates that the position is
// at the end of the text.
79
func EmptyOpContext(r1, r2 rune) EmptyOp {
80 81 82 83 84 85
	var op EmptyOp = EmptyNoWordBoundary
	var boundary byte
	switch {
	case IsWordChar(r1):
		boundary = 1
	case r1 == '\n':
86
		op |= EmptyBeginLine
87 88
	case r1 < 0:
		op |= EmptyBeginText | EmptyBeginLine
89
	}
90 91 92 93
	switch {
	case IsWordChar(r2):
		boundary ^= 1
	case r2 == '\n':
94
		op |= EmptyEndLine
95 96
	case r2 < 0:
		op |= EmptyEndText | EmptyEndLine
97
	}
98 99
	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
100 101 102 103 104 105 106
	}
	return op
}

// IsWordChar reports whether r is consider a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
107
func IsWordChar(r rune) bool {
108 109 110
	return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
}

111 112 113 114 115
// An Inst is a single instruction in a regular expression program.
type Inst struct {
	Op   InstOp
	Out  uint32 // all but InstMatch, InstFail
	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
116
	Rune []rune
117 118 119 120 121 122 123 124 125 126
}

func (p *Prog) String() string {
	var b bytes.Buffer
	dumpProg(&b, p)
	return b.String()
}

// skipNop follows any no-op or capturing instructions
// and returns the resulting pc.
127
func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
128 129 130 131 132
	i := &p.Inst[pc]
	for i.Op == InstNop || i.Op == InstCapture {
		pc = i.Out
		i = &p.Inst[pc]
	}
133
	return i, pc
134 135
}

136 137 138 139 140 141 142 143 144 145
// op returns i.Op but merges all the Rune special cases into InstRune
func (i *Inst) op() InstOp {
	op := i.Op
	switch op {
	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
		op = InstRune
	}
	return op
}

146 147 148 149
// Prefix returns a literal string that all matches for the
// regexp must start with.  Complete is true if the prefix
// is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
150
	i, _ := p.skipNop(uint32(p.Start))
151 152

	// Avoid allocation of buffer if prefix is empty.
153
	if i.op() != InstRune || len(i.Rune) != 1 {
154 155 156 157 158
		return "", i.Op == InstMatch
	}

	// Have prefix; gather characters.
	var buf bytes.Buffer
159
	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
160
		buf.WriteRune(i.Rune[0])
161
		i, _ = p.skipNop(i.Out)
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
	}
	return buf.String(), i.Op == InstMatch
}

// StartCond returns the leading empty-width conditions that must
// be true in any match.  It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
	var flag EmptyOp
	pc := uint32(p.Start)
	i := &p.Inst[pc]
Loop:
	for {
		switch i.Op {
		case InstEmptyWidth:
			flag |= EmptyOp(i.Arg)
		case InstFail:
			return ^EmptyOp(0)
		case InstCapture, InstNop:
			// skip
		default:
			break Loop
		}
		pc = i.Out
		i = &p.Inst[pc]
	}
	return flag
}

190 191
const noMatch = -1

192 193
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
194
func (i *Inst) MatchRune(r rune) bool {
195 196 197 198 199 200 201 202 203
	return i.MatchRunePos(r) != noMatch
}

// MatchRunePos checks whether the instruction matches (and consumes) r.
// If so, MatchRunePos returns the index of the matching rune pair
// (or, when len(i.Rune) == 1, rune singleton).
// If not, MatchRunePos returns -1.
// MatchRunePos should only be called when i.Op == InstRune.
func (i *Inst) MatchRunePos(r rune) int {
204 205 206 207
	rune := i.Rune

	// Special case: single-rune slice is from literal string, not char class.
	if len(rune) == 1 {
208 209
		r0 := rune[0]
		if r == r0 {
210
			return 0
211 212 213 214
		}
		if Flags(i.Arg)&FoldCase != 0 {
			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
				if r == r1 {
215
					return 0
216 217 218
				}
			}
		}
219
		return noMatch
220 221 222 223 224 225
	}

	// Peek at the first few pairs.
	// Should handle ASCII well.
	for j := 0; j < len(rune) && j <= 8; j += 2 {
		if r < rune[j] {
226
			return noMatch
227 228
		}
		if r <= rune[j+1] {
229
			return j / 2
230 231 232 233 234 235 236 237 238 239
		}
	}

	// Otherwise binary search.
	lo := 0
	hi := len(rune) / 2
	for lo < hi {
		m := lo + (hi-lo)/2
		if c := rune[2*m]; c <= r {
			if r <= rune[2*m+1] {
240
				return m
241 242 243 244 245 246
			}
			lo = m + 1
		} else {
			hi = m
		}
	}
247
	return noMatch
248 249 250 251
}

// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
// Since we act on runes, it would be easy to support Unicode here.
252 253 254 255 256
func wordRune(r rune) bool {
	return r == '_' ||
		('A' <= r && r <= 'Z') ||
		('a' <= r && r <= 'z') ||
		('0' <= r && r <= '9')
257 258 259 260 261
}

// MatchEmptyWidth returns true if the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == InstEmptyWidth.
262
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
	switch EmptyOp(i.Arg) {
	case EmptyBeginLine:
		return before == '\n' || before == -1
	case EmptyEndLine:
		return after == '\n' || after == -1
	case EmptyBeginText:
		return before == -1
	case EmptyEndText:
		return after == -1
	case EmptyWordBoundary:
		return wordRune(before) != wordRune(after)
	case EmptyNoWordBoundary:
		return wordRune(before) == wordRune(after)
	}
	panic("unknown empty width arg")
}

func (i *Inst) String() string {
	var b bytes.Buffer
	dumpInst(&b, i)
	return b.String()
}

func bw(b *bytes.Buffer, args ...string) {
	for _, s := range args {
		b.WriteString(s)
	}
}

func dumpProg(b *bytes.Buffer, p *Prog) {
	for j := range p.Inst {
		i := &p.Inst[j]
		pc := strconv.Itoa(j)
		if len(pc) < 3 {
			b.WriteString("   "[len(pc):])
		}
		if j == p.Start {
			pc += "*"
		}
		bw(b, pc, "\t")
		dumpInst(b, i)
		bw(b, "\n")
	}
}

func u32(i uint32) string {
309
	return strconv.FormatUint(uint64(i), 10)
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
}

func dumpInst(b *bytes.Buffer, i *Inst) {
	switch i.Op {
	case InstAlt:
		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstAltMatch:
		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstCapture:
		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
	case InstEmptyWidth:
		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
	case InstMatch:
		bw(b, "match")
	case InstFail:
		bw(b, "fail")
	case InstNop:
		bw(b, "nop -> ", u32(i.Out))
	case InstRune:
		if i.Rune == nil {
			// shouldn't happen
			bw(b, "rune <nil>")
		}
333 334 335 336 337 338 339 340 341 342 343
		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
		if Flags(i.Arg)&FoldCase != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))
	case InstRune1:
		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
	case InstRuneAny:
		bw(b, "any -> ", u32(i.Out))
	case InstRuneAnyNotNL:
		bw(b, "anynotnl -> ", u32(i.Out))
344 345
	}
}