package pkglint

import "netbsd.org/pkglint/textproc"

type ShTokenizer struct {
	parser *MkParser
}

func NewShTokenizer(line *Line, text string, emitWarnings bool) *ShTokenizer {
	// TODO: Switching to NewMkParser is nontrivial since emitWarnings must equal (line != nil).
	// assert((line != nil) == emitWarnings)
	p := MkParser{line, textproc.NewLexer(text), emitWarnings}
	return &ShTokenizer{&p}
}

// ShAtom parses a basic building block of a shell program.
// Examples for such atoms are: variable reference (both make and shell),
// operator, text, quote, space.
//
// See ShQuote.Feed
func (p *ShTokenizer) ShAtom(quoting ShQuoting) *ShAtom {
	if p.parser.EOF() {
		return nil
	}

	lexer := p.parser.lexer
	mark := lexer.Mark()

	if varuse := p.parser.VarUse(); varuse != nil {
		return &ShAtom{shtVaruse, lexer.Since(mark), quoting, varuse}
	}

	// TODO: Most probably there is a more elegant way than the large switch block below.

	var atom *ShAtom
	switch quoting {
	case shqPlain:
		atom = p.shAtomPlain()
	case shqDquot:
		atom = p.shAtomDquot()
	case shqSquot:
		atom = p.shAtomSquot()
	case shqBackt:
		atom = p.shAtomBackt()
	case shqSubsh:
		atom = p.shAtomSubsh()
	case shqDquotBackt:
		atom = p.shAtomDquotBackt()
	case shqBacktDquot:
		atom = p.shAtomBacktDquot()
	case shqBacktSquot:
		atom = p.shAtomBacktSquot()
	case shqSubshDquot:
		atom = p.shAtomSubshDquot()
	case shqSubshSquot:
		atom = p.shAtomSubshSquot()
	case shqSubshBackt:
		atom = p.shAtomSubshBackt()
	case shqDquotBacktDquot:
		atom = p.shAtomDquotBacktDquot()
	case shqDquotBacktSquot:
		atom = p.shAtomDquotBacktSquot()
	}

	if atom == nil {
		lexer.Reset(mark)
		if hasPrefix(lexer.Rest(), "$${") {
			p.parser.Line.Warnf("Unclosed shell variable starting at %q.", shorten(lexer.Rest(), 20))
		} else {
			p.parser.Line.Warnf("Internal pkglint error in ShTokenizer.ShAtom at %q (quoting=%s).", lexer.Rest(), quoting)
		}
	}
	return atom
}

func (p *ShTokenizer) shAtomPlain() *ShAtom {
	const q = shqPlain
	if op := p.shOperator(q); op != nil {
		return op
	}
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.NextHspace() != "":
		return &ShAtom{shtSpace, lexer.Since(mark), q, nil}
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqDquot, nil}
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqSquot, nil}
	case lexer.SkipByte('`'):
		return &ShAtom{shtText, lexer.Since(mark), shqBackt, nil}
	case lexer.PeekByte() == '#':
		rest := lexer.Rest()
		lexer.Skip(len(rest))
		return &ShAtom{shtComment, rest, q, nil}
	case lexer.SkipString("$$("):
		return &ShAtom{shtSubshell, lexer.Since(mark), shqSubsh, nil}
	}

	return p.shAtomInternal(q, false, false)
}

func (p *ShTokenizer) shAtomDquot() *ShAtom {
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqPlain, nil}
	case lexer.SkipByte('`'):
		return &ShAtom{shtText, lexer.Since(mark), shqDquotBackt, nil}
	}
	return p.shAtomInternal(shqDquot, true, false)
}

func (p *ShTokenizer) shAtomSquot() *ShAtom {
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqPlain, nil}
	}
	return p.shAtomInternal(shqSquot, false, true)
}

func (p *ShTokenizer) shAtomBackt() *ShAtom {
	const q = shqBackt
	if op := p.shOperator(q); op != nil {
		return op
	}
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqBacktDquot, nil}
	case lexer.SkipByte('`'):
		return &ShAtom{shtText, lexer.Since(mark), shqPlain, nil}
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqBacktSquot, nil}
	case lexer.NextHspace() != "":
		return &ShAtom{shtSpace, lexer.Since(mark), q, nil}
	case lexer.SkipRegexp(regcomp("^#[^`]*")):
		return &ShAtom{shtComment, lexer.Since(mark), q, nil}
	}
	return p.shAtomInternal(q, false, false)
}

// In pkgsrc, the $(...) subshell syntax is not used, in order to preserve
// compatibility with /bin/sh from Solaris 7.
func (p *ShTokenizer) shAtomSubsh() *ShAtom {
	const q = shqSubsh
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.NextHspace() != "":
		return &ShAtom{shtSpace, lexer.Since(mark), q, nil}
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqSubshDquot, nil}
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqSubshSquot, nil}
	case lexer.SkipByte('`'):
		return &ShAtom{shtText, lexer.Since(mark), shqSubshBackt, nil}
	case lexer.SkipRegexp(regcomp(`^#[^)]*`)):
		return &ShAtom{shtComment, lexer.Since(mark), q, nil}
	case lexer.SkipByte(')'):
		// The closing parenthesis can have multiple meanings:
		// - end of a subshell, such as (echo "in a subshell")
		// - end of a subshell variable expression, such as var=$$(echo "from a subshell")
		// - end of a case pattern
		// In the "subshell variable expression" case, the atom type
		// could be shtText since it is part of a text node. On the
		// other hand, pkglint doesn't tokenize shell programs correctly
		// anyway. This needs to be fixed someday.
		return &ShAtom{shtOperator, lexer.Since(mark), shqPlain, nil}
	}
	if op := p.shOperator(q); op != nil {
		return op
	}
	return p.shAtomInternal(q, false, false)
}

func (p *ShTokenizer) shAtomDquotBackt() *ShAtom {
	const q = shqDquotBackt
	if op := p.shOperator(q); op != nil {
		return op
	}
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('`'):
		return &ShAtom{shtText, lexer.Since(mark), shqDquot, nil}
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqDquotBacktDquot, nil}
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqDquotBacktSquot, nil}
	case lexer.SkipRegexp(regcomp("^#[^`]*")):
		return &ShAtom{shtComment, lexer.Since(mark), q, nil}
	case lexer.NextHspace() != "":
		return &ShAtom{shtSpace, lexer.Since(mark), q, nil}
	}
	return p.shAtomInternal(q, false, false)
}

func (p *ShTokenizer) shAtomBacktDquot() *ShAtom {
	const q = shqBacktDquot
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqBackt, nil}
	}
	return p.shAtomInternal(q, true, false)
}

func (p *ShTokenizer) shAtomBacktSquot() *ShAtom {
	const q = shqBacktSquot
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqBackt, nil}
	}
	return p.shAtomInternal(q, false, true)
}

func (p *ShTokenizer) shAtomSubshDquot() *ShAtom {
	const q = shqSubshDquot
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqSubsh, nil}
	}
	return p.shAtomInternal(q, true, false)
}

func (p *ShTokenizer) shAtomSubshSquot() *ShAtom {
	const q = shqSubshSquot
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqSubsh, nil}
	}
	return p.shAtomInternal(q, false, true)
}

func (p *ShTokenizer) shAtomSubshBackt() *ShAtom {
	const q = shqSubshBackt
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('`'):
		return &ShAtom{shtOperator, lexer.Since(mark), shqSubsh, nil}
	case lexer.SkipHspace():
		return &ShAtom{shtSpace, lexer.Since(mark), q, nil}
	}
	return p.shAtomInternal(q, false, false)
}

func (p *ShTokenizer) shAtomDquotBacktDquot() *ShAtom {
	const q = shqDquotBacktDquot
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('"'):
		return &ShAtom{shtText, lexer.Since(mark), shqDquotBackt, nil}
	}
	return p.shAtomInternal(q, true, false)
}

func (p *ShTokenizer) shAtomDquotBacktSquot() *ShAtom {
	const q = shqDquotBacktSquot
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipByte('\''):
		return &ShAtom{shtText, lexer.Since(mark), shqDquotBackt, nil}
	}
	return p.shAtomInternal(q, false, true)
}

// shAtomInternal reads the next shtText or shtShVarUse.
//
// Examples:
//  while
//  text$$,text
//  $$!
//  $$$$
//  text
//  ${var:=default}
func (p *ShTokenizer) shAtomInternal(q ShQuoting, dquot, squot bool) *ShAtom {
	if shVarUse := p.shVarUse(q); shVarUse != nil {
		return shVarUse
	}

	lexer := p.parser.lexer
	mark := lexer.Mark()

loop:
	for {
		_ = `^[\t "$&'();<>\\|]+` // These are not allowed in shqPlain.

		switch {
		case lexer.SkipRegexp(regcomp(`^[!#%*+,\-./0-9:=?@A-Z\[\]^_a-z{}~]+`)):
			break
		case dquot && lexer.SkipRegexp(regcomp(`^[\t &'();<>|]+`)):
			break
		case squot && lexer.SkipByte('`'):
			break
		case squot && lexer.SkipRegexp(regcomp(`^[\t "&();<>\\|]+`)):
			break
		case squot && lexer.SkipString("$$"):
			break
		case squot:
			break loop
		case lexer.SkipString("\\$$"):
			break
		case lexer.SkipRegexp(regcomp(`^\\[^$]`)):
			break
		case matches(lexer.Rest(), `^\$\$[^!#(*\-0-9?@A-Z_a-z{]`):
			lexer.NextString("$$")
		case lexer.Rest() == "$$":
			lexer.Skip(2)
		case lexer.Rest() == "$":
			lexer.Skip(1)
		default:
			break loop
		}
	}

	if token := lexer.Since(mark); token != "" {
		return &ShAtom{shtText, token, q, nil}
	}
	return nil
}

// shVarUse parses a use of a shell variable, like $$var or $${var:=value}.
//
// See http://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
func (p *ShTokenizer) shVarUse(q ShQuoting) *ShAtom {
	lexer := p.parser.lexer
	beforeDollar := lexer.Mark()

	if !lexer.SkipString("$$") {
		return nil
	}

	if lexer.TestByteSet(textproc.Digit) {
		lexer.Skip(1)
		text := lexer.Since(beforeDollar)
		return &ShAtom{shtShVarUse, text, q, text[2:]}
	}

	brace := lexer.SkipByte('{')

	varnameStart := lexer.Mark()
	if !lexer.SkipRegexp(regcomp(`^(?:[!#*\-?@]|\$\$|[A-Za-z_]\w*|\d+)`)) {
		lexer.Reset(beforeDollar)
		return nil
	}

	shVarname := lexer.Since(varnameStart)
	if shVarname == "$$" {
		shVarname = "$"
	}

	if brace {
		lexer.SkipRegexp(regcomp(`^(?:##?|%%?|:?[+\-=?])[^$\\{}]*`))
		if !lexer.SkipByte('}') {
			lexer.Reset(beforeDollar)
			return nil
		}
	}

	return &ShAtom{shtShVarUse, lexer.Since(beforeDollar), q, shVarname}
}

func (p *ShTokenizer) shOperator(q ShQuoting) *ShAtom {
	lexer := p.parser.lexer
	mark := lexer.Mark()
	switch {
	case lexer.SkipString("||"),
		lexer.SkipString("&&"),
		lexer.SkipString(";;"),
		lexer.NextBytesFunc(func(b byte) bool { return b == '\n' }) != "",
		lexer.SkipByte(';'),
		lexer.SkipByte('('),
		lexer.SkipByte(')'),
		lexer.SkipByte('|'),
		lexer.SkipByte('&'):
		return &ShAtom{shtOperator, lexer.Since(mark), q, nil}
	case lexer.SkipRegexp(regcomp(`^\d*(?:<<-|<<|<&|<>|>>|>&|>\||<|>)`)):
		return &ShAtom{shtOperator, lexer.Since(mark), q, nil}
	}
	return nil
}

func (p *ShTokenizer) ShAtoms() []*ShAtom {
	var atoms []*ShAtom
	q := shqPlain
	for {
		atom := p.ShAtom(q)
		if atom == nil {
			return atoms
		}
		atoms = append(atoms, atom)
		q = atom.Quoting
	}
}

func (p *ShTokenizer) ShToken() *ShToken {
	var curr *ShAtom
	q := shqPlain
	prevQ := q

	peek := func() *ShAtom {
		if curr == nil {
			curr = p.ShAtom(q)
			if curr != nil {
				prevQ = q
				q = curr.Quoting
			}
		}
		return curr
	}
	skip := func() {
		curr = nil
	}

	lexer := p.parser.lexer
	initialMark := lexer.Mark()

	for peek() != nil && peek().Type == shtSpace {
		skip()
		initialMark = lexer.Mark()
	}

	if curr == nil {
		return nil
	}

	if !curr.Type.IsWord() && q != shqSubsh {
		return NewShToken(curr.MkText, curr)
	}

	var atoms []*ShAtom
	for {
		mark := lexer.Mark()
		peek()
		if curr == nil || !curr.Type.IsWord() && q == shqPlain && prevQ != shqSubsh {
			lexer.Reset(mark)
			break
		}
		atoms = append(atoms, curr)
		skip()
	}

	if q != shqPlain {
		lexer.Reset(initialMark)
		return nil
	}

	return NewShToken(lexer.Since(initialMark), atoms...)
}

func (p *ShTokenizer) Rest() string {
	return p.parser.Rest()
}