[BACK]Return to shtokenizer.go CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / pkgsrc / pkgtools / pkglint / files

File: [cvs.NetBSD.org] / pkgsrc / pkgtools / pkglint / files / shtokenizer.go (download)

Revision 1.23, Sun Dec 8 00:06:38 2019 UTC (2 months, 2 weeks ago) by rillig
Branch: MAIN
Changes since 1.22: +7 -5 lines

pkgtools/pkglint: update to 19.3.14

Changes since 19.3.13:

When pkglint suggests replacing !empty(VARNAME:Mfixed) with ${VARNAME}
== fixed, the exact suggested expression is now part of the diagnostic.
The check and the autofix have been improved. They now apply only to the
last modifier in the whole chain, everything else was a bug in pkglint.

Pkglint now knows the scope of variables better than before. It knows
the difference between variables from <sys.mk> like MACHINE_ARCH, which
are always in scope, and those from mk/defaults/mk.conf, which only come
into scope later, after bsd.prefs.mk has been included. It warns when
variables are used too early, for example in .if conditions.

The pathnames in ALTERNATIVES files are now checked for absolute
pathnames. This mistake doesn't happen in practice, but the code for
converting the different path types internally made it necessary to add
these checks. At least this prevents typos.

The special check for obsolete licenses has been removed since their
license files have been removed and that is checked as well.

Variables named *_AWK may be appended to.

The variables _PKG_SILENT and _PKG_DEBUG are no longer deprecated, they
are obsolete now. They are not used in main pkgsrc and pkgsrc-wip
anymore.

When a package sets a default value for a user-settable variable (which
is something that should not happen anyway), it should .include
bsd.prefs.mk beforehand, in order to not accidentally overwrite the
user-specified value.

Variable modifiers of the form :from=to are now parsed like in bmake.
They are greedy and eat up any following colons as well. This means that
${VAR:.c=.o:Q} replaces source.c with source.o:Q, instead of quoting it.
Pkglint now warns about such cases.

The handling of relative paths in diagnostics is now consistent. All
paths that are part of a diagnostic are relative to the line that issues
the diagnostic.

Fatal errors are no longer suppressed in --autofix mode.

Plus lots of refactoring, to prevent accidental mixing of incompatible
relative paths.

package pkglint

import "netbsd.org/pkglint/textproc"

// ShTokenizer splits the text of a shell command into ShAtoms and ShTokens.
// All input is read through the lexer of the embedded MkLexer.
type ShTokenizer struct {
	// parser provides the underlying lexer, the parsing of make variable
	// uses (VarUse) and the sink for warnings (Warnf).
	parser *MkLexer
	// inWord is set by shAtomInternal after a text or shell variable atom
	// has been read, and is cleared by shAtomPlain. While it is set,
	// a '#' in plain quoting mode does not start a comment.
	inWord bool
}

// NewShTokenizer creates a tokenizer for the given shell command text.
//
// The diag argument is handed to NewMkLexer unchanged, together with text.
//
// The emitWarnings parameter has no effect on the returned tokenizer.
// It is kept only so that existing callers continue to compile; the
// previous code merely assigned to it and never read the value afterwards.
func NewShTokenizer(diag Autofixer, text string, emitWarnings bool) *ShTokenizer {
	// TODO: Switching to NewMkParser is nontrivial since emitWarnings must equal (line != nil).
	// assert((line != nil) == emitWarnings)
	_ = emitWarnings // intentionally unused, see the doc comment

	return &ShTokenizer{
		parser: NewMkLexer(text, diag),
		inWord: false,
	}
}

// ShAtom reads the next atom of a shell program, interpreting the input
// according to the given quoting mode.
//
// Atoms are the smallest building blocks, such as a variable reference
// (both make and shell), an operator, text, a quote or space.
//
// At the end of the input, nil is returned. If nothing can be parsed at
// the current position, the lexer is rewound to where it started,
// a warning is issued and nil is returned as well.
//
// See ShQuote.Feed
func (p *ShTokenizer) ShAtom(quoting ShQuoting) *ShAtom {
	if p.parser.EOF() {
		return nil
	}

	lex := p.parser.lexer
	start := lex.Mark()

	// Make variable uses are tried first, independent of the quoting mode.
	if use := p.parser.VarUse(); use != nil {
		return &ShAtom{shtVaruse, lex.Since(start), quoting, use}
	}

	// TODO: Most probably there is a more elegant way than the large switch block below.

	var parsed *ShAtom
	switch quoting {
	case shqPlain:
		parsed = p.shAtomPlain()
	case shqDquot:
		parsed = p.shAtomDquot()
	case shqSquot:
		parsed = p.shAtomSquot()
	case shqBackt:
		parsed = p.shAtomBackt()
	case shqSubsh:
		parsed = p.shAtomSubsh()
	case shqDquotBackt:
		parsed = p.shAtomDquotBackt()
	case shqBacktDquot:
		parsed = p.shAtomBacktDquot()
	case shqBacktSquot:
		parsed = p.shAtomBacktSquot()
	case shqSubshDquot:
		parsed = p.shAtomSubshDquot()
	case shqSubshSquot:
		parsed = p.shAtomSubshSquot()
	case shqSubshBackt:
		parsed = p.shAtomSubshBackt()
	case shqDquotBacktDquot:
		parsed = p.shAtomDquotBacktDquot()
	case shqDquotBacktSquot:
		parsed = p.shAtomDquotBacktSquot()
	}
	if parsed != nil {
		return parsed
	}

	// Nothing matched: undo any partial consumption and report the problem.
	lex.Reset(start)
	remaining := lex.Rest()
	if hasPrefix(remaining, "$${") {
		p.parser.Warnf("Unclosed shell variable starting at %q.", shorten(remaining, 20))
		return nil
	}

	p.parser.Warnf("Internal pkglint error in ShTokenizer.ShAtom at %q (quoting=%s).",
		// TODO: shorten(lexer.Rest(), 20)
		remaining, quoting.String())
	return nil
}

// shAtomPlain reads the next atom in the shqPlain quoting mode.
//
// Operators are tried first. After that come horizontal space, the
// opening quotes and backtick (which switch the quoting mode of the
// returned atom), comments and the start of a $$( subshell. Everything
// else is left to shAtomInternal.
func (p *ShTokenizer) shAtomPlain() *ShAtom {
	if operator := p.shOperator(shqPlain); operator != nil {
		return operator
	}

	// Remember whether the preceding atom belonged to a word, since a '#'
	// only starts a comment when it does not continue a word.
	wasInWord := p.inWord
	p.inWord = false

	lex := p.parser.lexer
	start := lex.Mark()

	if lex.NextHspace() != "" {
		return &ShAtom{shtSpace, lex.Since(start), shqPlain, nil}
	}
	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqDquot, nil}
	}
	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqSquot, nil}
	}
	if lex.SkipByte('`') {
		return &ShAtom{shtText, lex.Since(start), shqBackt, nil}
	}
	if lex.PeekByte() == '#' && !wasInWord {
		// The comment extends to the end of the remaining input.
		comment := lex.Rest()
		lex.Skip(len(comment))
		return &ShAtom{shtComment, comment, shqPlain, nil}
	}
	if lex.SkipString("$$(") {
		return &ShAtom{shtSubshell, lex.Since(start), shqSubsh, nil}
	}

	return p.shAtomInternal(shqPlain, false, false)
}

// shAtomDquot reads the next atom in the shqDquot quoting mode.
//
// A double quote yields an atom that switches back to shqPlain,
// a backtick yields an atom that switches to shqDquotBackt.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomDquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqPlain, nil}
	}
	if lex.SkipByte('`') {
		return &ShAtom{shtText, lex.Since(start), shqDquotBackt, nil}
	}
	return p.shAtomInternal(shqDquot, true, false)
}

// shAtomSquot reads the next atom in the shqSquot quoting mode.
//
// A single quote yields an atom that switches back to shqPlain.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomSquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqPlain, nil}
	}
	return p.shAtomInternal(shqSquot, false, true)
}

// shAtomBackt reads the next atom in the shqBackt quoting mode.
//
// Operators are tried first. A backtick yields an atom that switches
// back to shqPlain; the quote characters switch to the corresponding
// nested quoting mode. A comment extends up to, but not including,
// the next backtick.
func (p *ShTokenizer) shAtomBackt() *ShAtom {
	if operator := p.shOperator(shqBackt); operator != nil {
		return operator
	}

	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqBacktDquot, nil}
	}
	if lex.SkipByte('`') {
		return &ShAtom{shtText, lex.Since(start), shqPlain, nil}
	}
	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqBacktSquot, nil}
	}
	if lex.NextHspace() != "" {
		return &ShAtom{shtSpace, lex.Since(start), shqBackt, nil}
	}
	if lex.SkipRegexp(regcomp("^#[^`]*")) {
		return &ShAtom{shtComment, lex.Since(start), shqBackt, nil}
	}

	return p.shAtomInternal(shqBackt, false, false)
}

// shAtomSubsh reads the next atom in the shqSubsh quoting mode.
//
// In pkgsrc, the $(...) subshell syntax is not used, in order to preserve
// compatibility with /bin/sh from Solaris 7.
//
// Unlike in the other modes with operators, the operators are tried
// only after space, quotes, comments and the closing parenthesis.
func (p *ShTokenizer) shAtomSubsh() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.NextHspace() != "" {
		return &ShAtom{shtSpace, lex.Since(start), shqSubsh, nil}
	}
	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqSubshDquot, nil}
	}
	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqSubshSquot, nil}
	}
	if lex.SkipByte('`') {
		return &ShAtom{shtText, lex.Since(start), shqSubshBackt, nil}
	}
	if lex.SkipRegexp(regcomp(`^#[^)]*`)) {
		return &ShAtom{shtComment, lex.Since(start), shqSubsh, nil}
	}
	if lex.SkipByte(')') {
		// A closing parenthesis may mean any of:
		// - the end of a subshell, such as (echo "in a subshell")
		// - the end of a subshell variable expression,
		//   such as var=$$(echo "from a subshell")
		// - the end of a case pattern
		// For the subshell variable expression, shtText would be an
		// alternative atom type since the parenthesis is part of a text
		// node. Then again, pkglint doesn't tokenize shell programs
		// correctly anyway. This needs to be fixed someday.
		return &ShAtom{shtOperator, lex.Since(start), shqPlain, nil}
	}

	if operator := p.shOperator(shqSubsh); operator != nil {
		return operator
	}
	return p.shAtomInternal(shqSubsh, false, false)
}

// shAtomDquotBackt reads the next atom in the shqDquotBackt quoting mode.
//
// Operators are tried first. A backtick yields an atom that switches
// back to shqDquot; the quote characters switch to the corresponding
// nested quoting mode. A comment extends up to, but not including,
// the next backtick.
func (p *ShTokenizer) shAtomDquotBackt() *ShAtom {
	if operator := p.shOperator(shqDquotBackt); operator != nil {
		return operator
	}

	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('`') {
		return &ShAtom{shtText, lex.Since(start), shqDquot, nil}
	}
	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqDquotBacktDquot, nil}
	}
	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqDquotBacktSquot, nil}
	}
	if lex.SkipRegexp(regcomp("^#[^`]*")) {
		return &ShAtom{shtComment, lex.Since(start), shqDquotBackt, nil}
	}
	if lex.NextHspace() != "" {
		return &ShAtom{shtSpace, lex.Since(start), shqDquotBackt, nil}
	}

	return p.shAtomInternal(shqDquotBackt, false, false)
}

// shAtomBacktDquot reads the next atom in the shqBacktDquot quoting mode.
//
// A double quote yields an atom that switches back to shqBackt.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomBacktDquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqBackt, nil}
	}
	return p.shAtomInternal(shqBacktDquot, true, false)
}

// shAtomBacktSquot reads the next atom in the shqBacktSquot quoting mode.
//
// A single quote yields an atom that switches back to shqBackt.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomBacktSquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqBackt, nil}
	}
	return p.shAtomInternal(shqBacktSquot, false, true)
}

// shAtomSubshDquot reads the next atom in the shqSubshDquot quoting mode.
//
// A double quote yields an atom that switches back to shqSubsh.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomSubshDquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqSubsh, nil}
	}
	return p.shAtomInternal(shqSubshDquot, true, false)
}

// shAtomSubshSquot reads the next atom in the shqSubshSquot quoting mode.
//
// A single quote yields an atom that switches back to shqSubsh.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomSubshSquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqSubsh, nil}
	}
	return p.shAtomInternal(shqSubshSquot, false, true)
}

// shAtomSubshBackt reads the next atom in the shqSubshBackt quoting mode.
//
// A backtick yields an operator atom that switches back to shqSubsh.
// Horizontal space becomes a space atom; everything else is left to
// shAtomInternal.
func (p *ShTokenizer) shAtomSubshBackt() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('`') {
		return &ShAtom{shtOperator, lex.Since(start), shqSubsh, nil}
	}
	if lex.SkipHspace() {
		return &ShAtom{shtSpace, lex.Since(start), shqSubshBackt, nil}
	}
	return p.shAtomInternal(shqSubshBackt, false, false)
}

// shAtomDquotBacktDquot reads the next atom in the shqDquotBacktDquot
// quoting mode.
//
// A double quote yields an atom that switches back to shqDquotBackt.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomDquotBacktDquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('"') {
		return &ShAtom{shtText, lex.Since(start), shqDquotBackt, nil}
	}
	return p.shAtomInternal(shqDquotBacktDquot, true, false)
}

// shAtomDquotBacktSquot reads the next atom in the shqDquotBacktSquot
// quoting mode.
//
// A single quote yields an atom that switches back to shqDquotBackt.
// Everything else is left to shAtomInternal.
func (p *ShTokenizer) shAtomDquotBacktSquot() *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if lex.SkipByte('\'') {
		return &ShAtom{shtText, lex.Since(start), shqDquotBackt, nil}
	}
	return p.shAtomInternal(shqDquotBacktSquot, false, true)
}

// shAtomInternal reads the next shtText or shtShVarUse.
//
// A shell variable use at the current position takes precedence.
// Otherwise as much text as possible is consumed; which characters count
// as text depends on the dquot and squot flags, which the callers set
// when reading inside double or single quotes, respectively.
//
// The returned atom carries the quoting mode q. If neither a shell
// variable use nor any text can be read, nil is returned.
// Whenever an atom is returned, p.inWord is set.
//
// Examples:
//  while
//  text$$,text
//  $$!
//  $$$$
//  text
//  ${var:=default}
func (p *ShTokenizer) shAtomInternal(q ShQuoting, dquot, squot bool) *ShAtom {
	if shVarUse := p.shVarUse(q); shVarUse != nil {
		p.inWord = true
		return shVarUse
	}

	lexer := p.parser.lexer
	mark := lexer.Mark()

	// Each iteration consumes one run of text. The order of the cases
	// matters since most of the conditions consume input when they match.
loop:
	for {
		_ = `^[\t "$&'();<>\\|]+` // These are not allowed in shqPlain.

		switch {
		// Characters that are accepted as text in every quoting mode.
		case lexer.SkipRegexp(regcomp(`^[!#%*+,\-./0-9:=?@A-Z\[\]^_a-z{}~]+`)):
			break
		// With dquot set, space and these metacharacters are text as well.
		case dquot && lexer.SkipRegexp(regcomp(`^[\t &'();<>|]+`)):
			break
		// With squot set, backticks, double quotes, backslashes,
		// metacharacters and $$ are all text.
		case squot && lexer.SkipByte('`'):
			break
		case squot && lexer.SkipRegexp(regcomp(`^[\t "&();<>\\|]+`)):
			break
		case squot && lexer.SkipString("$$"):
			break
		// With squot set, none of the backslash and dollar cases below
		// are tried; the text ends here.
		case squot:
			break loop
		// A backslash followed by $$.
		case lexer.SkipString("\\$$"):
			break
		// A backslash followed by any single character other than a dollar.
		case lexer.SkipRegexp(regcomp(`^\\[^$]`)):
			break
		// A $$ followed by a character outside the listed set is
		// consumed as text; only the $$ itself is consumed here.
		case matches(lexer.Rest(), `^\$\$[^!#(*\-0-9?@A-Z_a-z{]`):
			lexer.NextString("$$")
		// A $$ or a single $ that makes up the entire remaining input.
		case lexer.Rest() == "$$":
			lexer.Skip(2)
		case lexer.Rest() == "$":
			lexer.Skip(1)
		default:
			break loop
		}
	}

	if token := lexer.Since(mark); token != "" {
		p.inWord = true
		return &ShAtom{shtText, token, q, nil}
	}
	return nil
}

// shVarUse reads a use of a shell variable, such as $$var or
// $${var:=value}, and returns it as an shtShVarUse atom whose data is the
// variable name. If there is no complete shell variable use at the
// current position, the lexer is left where it was and nil is returned.
//
// See http://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
func (p *ShTokenizer) shVarUse(q ShQuoting) *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	if !lex.SkipString("$$") {
		return nil
	}

	// Directly after the $$, only a single digit is taken as the name.
	if lex.TestByteSet(textproc.Digit) {
		lex.Skip(1)
		whole := lex.Since(start)
		return &ShAtom{shtShVarUse, whole, q, whole[2:]}
	}

	braced := lex.SkipByte('{')

	nameStart := lex.Mark()
	if !lex.SkipRegexp(regcomp(`^(?:[!#*\-?@]|\$\$|[A-Za-z_]\w*|\d+)`)) {
		lex.Reset(start)
		return nil
	}

	// The variable written as $$$$ is reported under the name "$".
	name := lex.Since(nameStart)
	if name == "$$" {
		name = "$"
	}

	if braced {
		// An optional modifier may follow the name; the closing brace
		// is mandatory.
		lex.SkipRegexp(regcomp(`^(?:##?|%%?|:?[+\-=?])[^$\\{}]*`))
		if !lex.SkipByte('}') {
			lex.Reset(start)
			return nil
		}
	}

	return &ShAtom{shtShVarUse, lex.Since(start), q, name}
}

// shOperator reads a shell operator at the current position and returns
// it as an shtOperator atom with the quoting mode q, or nil if there is
// no operator.
//
// The candidates are tried in a fixed order, so that the two-character
// operators are tried before their one-character prefixes. Redirection
// operators, optionally preceded by digits, are tried last.
func (p *ShTokenizer) shOperator(q ShQuoting) *ShAtom {
	lex := p.parser.lexer
	start := lex.Mark()

	isNewline := func(b byte) bool { return b == '\n' }

	found := lex.SkipString("||") ||
		lex.SkipString("&&") ||
		lex.SkipString(";;") ||
		lex.NextBytesFunc(isNewline) != "" ||
		lex.SkipByte(';') ||
		lex.SkipByte('(') ||
		lex.SkipByte(')') ||
		lex.SkipByte('|') ||
		lex.SkipByte('&') ||
		lex.SkipRegexp(regcomp(`^\d*(?:<<-|<<|<&|<>|>>|>&|>\||<|>)`))
	if !found {
		return nil
	}

	return &ShAtom{shtOperator, lex.Since(start), q, nil}
}

// ShAtoms reads atoms until ShAtom returns nil and returns them in the
// order they were read. The first atom is read in shqPlain mode; each
// following atom is read in the quoting mode of its predecessor.
func (p *ShTokenizer) ShAtoms() []*ShAtom {
	var collected []*ShAtom

	quoting := shqPlain
	for next := p.ShAtom(quoting); next != nil; next = p.ShAtom(quoting) {
		collected = append(collected, next)
		quoting = next.Quoting
	}
	return collected
}

// ShToken reads the next token, which consists of one or more atoms.
//
// Leading space atoms are skipped. If the first remaining atom is not
// a word atom and does not lead into shqSubsh mode, it forms a token of
// its own. Otherwise atoms are collected until the input ends or a
// non-word atom in shqPlain mode is reached; that atom is not consumed.
//
// It returns nil if there is no further atom. It also returns nil, with
// the lexer reset to the start of the token, if the collected atoms do
// not end in shqPlain mode.
func (p *ShTokenizer) ShToken() *ShToken {
	var curr *ShAtom
	q := shqPlain
	prevQ := q

	// peek returns the current atom, reading it from the input if
	// necessary. Reading an atom updates q to the quoting mode after the
	// atom and prevQ to the quoting mode before it.
	peek := func() *ShAtom {
		if curr == nil {
			curr = p.ShAtom(q)
			if curr != nil {
				prevQ = q
				q = curr.Quoting
			}
		}
		return curr
	}
	// skip discards the current atom, so that the next peek reads a new one.
	skip := func() {
		curr = nil
	}

	lexer := p.parser.lexer
	initialMark := lexer.Mark()

	// Skip leading space; the token text starts after it.
	for peek() != nil && peek().Type == shtSpace {
		skip()
		initialMark = lexer.Mark()
	}

	if curr == nil {
		return nil
	}

	// A non-word atom forms a token of its own, unless it leads into
	// shqSubsh mode.
	if !curr.Type.IsWord() && q != shqSubsh {
		return NewShToken(curr.MkText, curr)
	}

	var atoms []*ShAtom
	for {
		// In the first iteration, curr is still the atom read above, so
		// mark points behind it and peek does not read anything new.
		mark := lexer.Mark()
		peek()
		if curr == nil || !curr.Type.IsWord() && q == shqPlain && prevQ != shqSubsh {
			// The atom does not belong to this token; un-read it.
			lexer.Reset(mark)
			break
		}
		atoms = append(atoms, curr)
		skip()
	}

	// The token ended while not in shqPlain mode; give up and restore
	// the lexer to the start of the token.
	if q != shqPlain {
		lexer.Reset(initialMark)
		return nil
	}

	return NewShToken(lexer.Since(initialMark), atoms...)
}

// Rest returns the result of Rest from the underlying MkLexer,
// presumably the part of the input that has not been consumed yet.
func (p *ShTokenizer) Rest() string {
	return p.parser.Rest()
}