# Source code for discodop.cli

"""Command-line interfaces to modules."""
from sys import argv, stdout, stderr, version_info
from sys import exit as sysexit

# Subcommand name -> one-line summary. The keys are the subcommands accepted
# on the command line; the values are shown in the usage message.
COMMANDS = dict([
		('runexp',
			'Run experiment: grammar extraction, parsing & evaluation.'),
		('fragments',
			'Extract recurring fragments from treebanks.'),
		('eval',
			'Evaluate discontinuous parse trees; similar to EVALB.'),
		('treetransforms',
			'Apply tree transformations and convert between formats.'),
		('treedraw',
			'Visualize (discontinuous) trees.'),
		('treesearch',
			'Query treebanks.'),
		('grammar',
			'Read off grammars from treebanks.'),
		('parser',
			'Simple command line parser.'),
		('demos',
			'Show some demonstrations of formalisms encoded in LCFRS.'),
		('gen',
			'Generate sentences from a PLCFRS.'),
	])


def main():
	"""Expose command-line interfaces.

	Dispatches on the command line arguments:

	- ``-v`` / ``--version`` as the only argument prints the package version;
	- a missing or unknown subcommand prints a usage message to stderr;
	- ``<command> -h`` / ``<command> --help`` replaces the current process
	  with ``man discodop-<command>``;
	- otherwise the subcommand is run: a function of that name defined in this
	  module if there is one, else the ``main`` function of the module
	  ``discodop.<command>``."""
	from os import execlp
	from os.path import basename
	thiscmd = basename(argv[0])
	if len(argv) == 2 and argv[1] in ('-v', '--version'):
		from discodop import __version__
		print(__version__)
	elif len(argv) <= 1 or argv[1] not in COMMANDS:
		# The complete usage message goes to stderr, so that it is not split
		# over two streams when stdout is redirected or piped.
		print('Usage: %s <command> [arguments]\n' % thiscmd, file=stderr)
		print('Command is one of:', file=stderr)
		for a, b in COMMANDS.items():
			print('   %s  %s' % (a.ljust(15), b), file=stderr)
		print('for additional instructions issue: %s <command> --help'
			% thiscmd, file=stderr)
	elif len(argv) == 3 and argv[2] in ('-h', '--help'):
		# help on subcommand; execlp does not return on success.
		execlp('man', 'man', 'discodop-%s' % argv[1])
	else:
		cmd = argv[1]
		# use the CLI defined here, or default to the module's main function.
		if cmd in globals():
			globals()[cmd]()
		else:
			getattr(__import__('discodop.%s' % cmd,
					fromlist=['main']), 'main')()


def treedraw():
	"""Usage: discodop treedraw [<treebank>...] [options]

If no treebank is given, input is read from standard input; format is detected.
Pipe the output through 'less -R' to preserve the colors."""
	# NB: the docstring above is printed verbatim as the usage message.
	from getopt import gnu_getopt, GetoptError
	from itertools import islice, chain
	from .treebank import READERS, incrementaltreereader
	from .tree import DrawTree, frontier
	from .util import openread
	try:
		from itertools import izip
	except ImportError:
		izip = zip

	def processtree(tree, sent):
		"""Produce output for a single tree in the requested format.

		:raises ValueError: if the value of --output is not recognized."""
		dt = DrawTree(
				tree, sent, abbr='--abbr' in opts, secedge='--secedge' in opts)
		if output == 'text' or output == 'html':
			return dt.text(unicodelines=True, ansi=ansi, html=html,
					funcsep=funcsep)
		elif output == 'svg':
			return dt.svg(funcsep=funcsep)
		elif output == 'tikznode':
			return dt.tikznode(funcsep=funcsep) + '\n'
		elif output == 'tikzmatrix':
			return dt.tikzmatrix(funcsep=funcsep) + '\n'
		elif output == 'tikzqtree':
			return dt.tikzqtree() + '\n'
		raise ValueError('unrecognized --output format')

	flags = ('test', 'help', 'abbr', 'plain', 'frontier', 'secedge')
	options = ('fmt=', 'encoding=', 'functions=', 'morphology=', 'numtrees=',
			'output=')
	try:
		opts, args = gnu_getopt(argv[2:], 'hn:fm', flags + options)
	except GetoptError as err:
		print('error:', err, file=stderr)
		print(treedraw.__doc__)
		sysexit(2)
	opts = dict(opts)
	if '-h' in opts or '--help' in opts:
		# -h / --help are accepted by getopt above; honor them instead of
		# silently going on to read input.
		print(treedraw.__doc__)
		return
	limit = opts.get('--numtrees', opts.get('-n'))
	limit = int(limit) if limit else None
	output = opts.get('--output', 'text')
	# the short options -f and -m are shorthands for the value 'add'.
	if '-f' in opts:
		opts['--functions'] = 'add'
	if '-m' in opts:
		opts['--morphology'] = 'add'
	funcsep = ('-' if opts.get('--functions')
		in ('add', 'between') else None)
	ansi = output == 'text' and '--plain' not in opts
	html = output == 'html' and '--plain' not in opts
	if output in ('html', 'svg'):
		print(DrawTree.templates[output][0])  # preamble
	elif output in ('tikznode', 'tikzmatrix', 'tikzqtree'):
		print(DrawTree.templates['latex'][0])  # preamble
	if args and opts.get('--fmt', 'export') != 'auto':
		# explicit format: read the given treebanks in parallel, so that the
		# n-th trees of each treebank are shown after each other.
		reader = READERS[opts.get('--fmt', 'export')]
		corpora = []
		for path in args:
			corpus = reader(
					path,
					encoding=opts.get('--encoding', 'utf8'),
					functions=opts.get('--functions'),
					morphology=opts.get('--morphology'))
			corpora.append(iter(corpus.itertrees()))
		for items in islice(izip(*corpora), 0, limit):
			for arg, (sentid, item) in zip(args, items):
				if len(args) > 1:
					print(arg, end=':')
				print(sentid, end='. ')
				if '--frontier' in opts:
					print(frontier(item.tree, item.sent), item.comment)
				else:
					print(item.comment)
					print(processtree(item.tree, item.sent))
	else:  # read from stdin + detect format
		encoding = opts.get('--encoding', 'utf8')
		if not args:
			args = ['-']
		# treat all inputs as a single concatenated stream of lines.
		stream = chain.from_iterable(
				openread(fname, encoding=encoding)
				for fname in args)
		trees = islice(incrementaltreereader(stream,
				morphology=opts.get('--morphology'),
				functions=opts.get('--functions'),
				othertext=True),
				0, limit)
		try:
			n = 1
			for tree, sent, rest in trees:
				if tree is None:
					# not a tree: echo the text and do not count it.
					print(rest)
					continue
				print(n, end='. ')
				if '--frontier' in opts:
					print('%s %s' % (frontier(tree, sent), rest))
				else:
					print(rest or '')
					print(processtree(tree, sent))
				n += 1
		except (IOError, KeyboardInterrupt):
			# deliberately best-effort: stop quietly on I/O errors and Ctrl-C
			# (presumably e.g. a pager closing the pipe), and still emit the
			# postamble below.
			pass
	if output in ('html', 'svg'):
		print(DrawTree.templates[output][1])  # postamble
	elif output in ('tikznode', 'tikzmatrix', 'tikzqtree'):
		print(DrawTree.templates['latex'][1])  # postamble


def runexp(args=None):
	"""Usage: discodop runexp <parameter file> [--rerun]

If a parameter file is given, an experiment is run. See the file sample.prm for
an example parameter file. To repeat an experiment with an existing grammar,
pass the option --rerun. The directory with the name of the parameter file
without extension must exist in the current path; its results will be
overwritten."""
	# NB: the docstring above is printed verbatim as the usage message.
	#
	# :param args: list of command line arguments (without program name and
	# 	subcommand); defaults to ``sys.argv[2:]``. The list is not modified.
	import io
	import os
	from .parser import readparam
	from .runexp import startexp, parsetepacoc
	# work on a copy, so that options can be removed below without mutating
	# the caller's list (or sys.argv).
	args = list(argv[2:] if args is None else args)
	if '--tepacoc' in args:
		args.remove('--tepacoc')
		if args:
			print('error: incorrect arguments', file=stderr)
			print(runexp.__doc__)
			sysexit(2)
		parsetepacoc()
		return
	rerun = '--rerun' in args
	if rerun:
		args.remove('--rerun')
	if len(args) != 1:
		print('error: incorrect arguments %r' % args, file=stderr)
		print(runexp.__doc__)
		sysexit(2)
	params = readparam(args[0])
	# strip only a real file extension; splitting on the last '.' anywhere in
	# the path goes wrong when a directory component contains a dot
	# (e.g., '../exp' or 'v1.2/exp').
	resultdir = os.path.splitext(args[0])[0]
	top = startexp(params, resultdir=resultdir, rerun=rerun)
	if not rerun:  # copy parameter file to result dir
		with io.open(args[0], encoding='utf8') as inp:
			paramlines = inp.readlines()
		# replace an existing "top='...'" first line by the value returned
		# by startexp.
		if paramlines and paramlines[0].startswith("top='"):
			paramlines = paramlines[1:]
		outfile = os.path.join(resultdir, 'params.prm')
		with io.open(outfile, 'w', encoding='utf8') as out:
			out.write("top='%s',\n" % top)
			out.writelines(paramlines)


def treetransforms():
	"""Treebank binarization and conversion.
Usage: discodop treetransforms [input [output]] [options]
where input and output are treebanks; standard in/output is used if not given.
"""
	# NB: the docstring above is printed verbatim as the usage message.
	import io
	from getopt import gnu_getopt, GetoptError
	from itertools import islice
	from . import treebank, treebanktransforms
	from .treetransforms import (canonicalize, binarize,
			unbinarize, optimalbinarize, splitdiscnodes, mergediscnodes,
			raisediscnodes, introducepreterminals, markovthreshold,
			canonicallyorderedtree)
	flags = ('binarize optimalbinarize unbinarize splitdisc mergedisc '
			'raisedisc canonical introducepreterminals renumber sentid '
			'removeempty help markorigin markhead leftunary rightunary '
			'tailmarker direction dot').split()
	# Each long option is listed once; duplicate entries (and an option that
	# has the same name as a flag) only serve to make abbreviated options
	# ambiguous for getopt.
	options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
			'punct= headrules= functions= morphology= lemmas= factor= fmt= '
			'maxlen= enc= transforms= markovthreshold= labelfun= '
			'reversetransforms= filterlabels= ').split()
	try:
		origopts, args = gnu_getopt(argv[2:], 'h:v:H:', flags + options)
		if len(args) > 2:
			raise GetoptError('expected 0, 1, or 2 positional arguments')
	except GetoptError as err:
		print('error:', err, file=stderr)
		print(treetransforms.__doc__)
		sysexit(2)
	opts = dict(origopts)
	if '--help' in opts:
		# --help is accepted by getopt above; honor it instead of silently
		# going on to read input. (-h is the horizontal markovization.)
		print(treetransforms.__doc__)
		return
	# --fmt and --enc are shorthands to set both input and output at once.
	if '--fmt' in opts:
		opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
	if '--enc' in opts:
		opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
	if opts.get('--outputfmt', treebank.WRITERS[0]) not in treebank.WRITERS:
		print('error: unrecognized output format: %r\navailable formats: %s'
				% (opts.get('--outputfmt'), ' '.join(treebank.WRITERS)),
				file=stderr)
		sysexit(2)
	infilename = (args[0] if len(args) >= 1 else '-')
	# io.open accepts a file descriptor as well as a filename.
	outfilename = (args[1] if len(args) == 2 and args[1] != '-'
			else stdout.fileno())

	# open corpus
	corpus = treebank.READERS[opts.get('--inputfmt', 'export')](
			infilename,
			encoding=opts.get('--inputenc', 'utf8'),
			headrules=opts.get('--headrules'),
			ensureroot=opts.get('--ensureroot'),
			removeempty='--removeempty' in opts,
			punct=opts.get('--punct'),
			functions=opts.get('--functions'),
			morphology=opts.get('--morphology'),
			lemmas=opts.get('--lemmas'))
	start, end = opts.get('--slice', ':').split(':')
	start, end = (int(start) if start else None), (int(end) if end else None)
	# FIXME: support negative indices
	trees = corpus.itertrees(start, end)
	if '--maxlen' in opts:
		maxlen = int(opts['--maxlen'])
		trees = ((key, item) for key, item in trees
				if len(item.sent) <= maxlen)
	if '--renumber' in opts:
		trees = (('%8d' % n, item) for n, (_key, item) in enumerate(trees, 1))

	# select transformations; iterate over origopts (not opts) so that the
	# transformations are applied in the order given on the command line.
	actions = []
	for key, value in origopts:  # pylint: disable=unused-variable
		if key == '--introducepreterminals':
			actions.append(lambda tree, sent:
					(introducepreterminals(tree, sent), sent))
		elif key == '--transforms':
			# value is bound as default argument to avoid late binding of
			# the loop variable.
			actions.append(lambda tree, sent, value=value:
					(treebanktransforms.transform(tree, sent,
						treebanktransforms.expandpresets(value.split(','))),
					sent))
		elif key in ('--binarize', '--optimalbinarize'):
			if key == '--binarize':
				# NOTE(review): --labelfun is passed to eval(), i.e., it
				# executes arbitrary code given on the command line; never
				# pass untrusted input to this option.
				actions.append(lambda tree, sent:
						(binarize(
							tree,
							opts.get('--factor', 'right'),
							int(opts.get('-h', 999)),
							int(opts.get('-v', 1)),
							revhorzmarkov=int(opts.get('-H', 0)),
							leftmostunary='--leftunary' in opts,
							rightmostunary='--rightunary' in opts,
							tailmarker='$' if '--tailmarker' in opts else '',
							direction='--direction' in opts,
							headoutward='--headrules' in opts,
							markhead='--markhead' in opts,
							dot='--dot' in opts,
							filterlabels=tuple(opts.get(
								'--filterlabels', '').split()),
							labelfun=eval(  # pylint: disable=eval-used
								opts['--labelfun'])
								if '--labelfun' in opts else None),
						sent))
			elif key == '--optimalbinarize':
				actions.append(lambda tree, sent:
						(optimalbinarize(
							tree, '|',
							'--headrules' in opts,
							int(opts.get('-h', 999)),
							int(opts.get('-v', 1))),
						sent))
		elif key == '--splitdisc':
			actions.append(lambda tree, sent:
					(splitdiscnodes(tree, '--markorigin' in opts), sent))
		elif key == '--canonical':
			actions.append(lambda tree, sent:
					(canonicallyorderedtree(tree, sent), sent))
		elif key == '--mergedisc':
			actions.append(lambda tree, sent: (mergediscnodes(tree), sent))
		elif key == '--raisedisc':
			actions.append(lambda tree, sent: (raisediscnodes(tree), sent))
		elif key == '--unbinarize':
			actions.append(lambda tree, sent: (unbinarize(tree, sent), sent))
		elif key == '--reversetransforms':
			actions.append(lambda tree, sent, value=value:
					(treebanktransforms.reversetransform(tree, sent,
						treebanktransforms.expandpresets(value.split(','))),
					sent))

	# read, transform, & write trees
	if actions:
		def applytransforms(trees):
			"""Apply transforms and yield modified items."""
			for key, item in trees:
				for action in actions:
					item.tree, item.sent = action(item.tree, item.sent)
				yield key, item

		trees = applytransforms(trees)
		# keys of opts include the leading dashes.
		if '--binarize' in opts and '--markovthreshold' in opts:
			# all trees are collected in a list before markovthreshold is
			# called on them.
			trees = list(trees)
			h, v = int(opts.get('-h', 999)), int(opts.get('-v', 1))
			revh = int(opts.get('-H', 0))
			markovthreshold([item.tree for _, item in trees],
					int(opts['--markovthreshold']),
					revh + h - 1,
					v - 1 if v > 1 else 1)

	if opts.get('--outputfmt') in ('mst', 'conll'):
		if not opts.get('--headrules'):
			raise ValueError('need head rules for dependency conversion')
	cnt = 0
	# keys of opts include the leading dashes.
	encoding = opts.get('--outputenc', 'utf8')
	with io.open(outfilename, 'w', encoding=encoding) as outfile:
		# copy trees verbatim when only taking slice or converting encoding
		if (not actions
				and opts.get('--inputfmt') == opts.get('--outputfmt')
				and opts.get('--inputfmt') in (
					'export', 'bracket', 'discbracket')
				and set(opts) <= {'--slice', '--inputenc', '--outputenc',
					'--inputfmt', '--outputfmt'}):
			for block in islice(corpus.blocks().values(), start, end):
				outfile.write(block)
				cnt += 1
		else:
			if opts.get('--outputfmt', 'export') == 'bracket':
				# canonicalize() is called for each tree before it is
				# written; 'and item' keeps the item as the value yielded.
				trees = ((key, canonicalize(item.tree) and item)
						for key, item in trees)
			if opts.get('--outputfmt', 'export') == 'export':
				outfile.write(treebank.EXPORTHEADER)
			fmt = opts.get('--outputfmt', 'export')
			sentid = '--sentid' in opts
			for key, item in trees:
				outfile.write(treebank.writetree(item.tree, item.sent, key,
						fmt, comment=item.comment, sentid=sentid))
				cnt += 1
	print('%s: transformed %d trees' % (args[0] if args else 'stdin', cnt),
			file=stderr)


def grammar():
	"""Read off grammars from treebanks.
Usage: discodop grammar <type> <input> <output> [options]
or: discodop grammar param <parameter-file> <output-directory>
or: discodop grammar info <rules-file>
or: discodop grammar merge (rules|lexicon|fragments) \
<input1> <input2>... <output>"""
	# NB: the docstring above is printed verbatim as the usage message.
	import io
	import os
	import codecs
	import logging
	from gzip import open as gzipopen
	from getopt import gnu_getopt, GetoptError
	from .tree import STRTERMRE
	from .util import openread
	from .treebank import READERS
	from .treetransforms import addfanoutmarkers, canonicalize
	from .grammar import (treebankgrammar, dopreduction, doubledop, dop1,
			compiletsg, writegrammar, grammarinfo, grammarstats,
			splitweight, merge, sumfrags, sumrules, sumlex, stripweight,
			addindices)
	from .parser import readparam
	from .runexp import (loadtraincorpus, getposmodel, dobinarization,
			getgrammars)
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	shortoptions = 'hs:'
	options = ('help', 'gzip', 'packed', 'inputfmt=', 'inputenc=',
			'dopestimator=', 'maxdepth=', 'maxfrontier=', 'numproc=')
	try:
		opts, args = gnu_getopt(argv[2:], shortoptions, options)
		model = args[0]
		if model not in ('info', 'merge'):
			if len(args) != 3:
				raise ValueError('expected 2 arguments: treebank grammar')
			treebankfile = args[1]
			grammarfile = args[2]
	except (GetoptError, IndexError, ValueError) as err:
		print('error: %r' % err, file=stderr)
		print(grammar.__doc__)
		sysexit(2)
	opts = dict(opts)
	if model not in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop', 'dop1',
			'ptsg', 'param', 'info', 'merge'):
		raise ValueError('unrecognized model: %r' % model)
	# keys of opts include the leading dashes.
	if opts.get('--dopestimator', 'rfe') not in ('rfe', 'ewe', 'shortest'):
		raise ValueError('unrecognized estimator: %r'
				% opts['--dopestimator'])

	if model == 'info':
		grammarstats(args[1])
		return
	elif model == 'merge':
		if len(args) < 5:
			raise ValueError('need at least 2 input and 1 output arguments.')
		if args[1] == 'rules':
			merge(args[2:-1], args[-1], sumrules, stripweight)
		elif args[1] == 'lexicon':
			merge(args[2:-1], args[-1], sumlex, lambda x: x.split(None, 1)[0])
		elif args[1] == 'fragments':
			merge(args[2:-1], args[-1], sumfrags, lambda x: x.rsplit('\t', 1)[0])
		else:
			# do not silently do nothing when the kind of file is mistyped.
			raise ValueError('unrecognized merge type: %r' % args[1])
		return
	elif model == 'param':
		if opts:
			raise ValueError('all options should be set in parameter file.')
		prm = readparam(args[1])
		resultdir = args[2]
		if os.path.exists(resultdir):
			raise ValueError('Directory %r already exists.\n' % resultdir)
		os.mkdir(resultdir)
		trees, sents, train_tagged_sents = loadtraincorpus(
				prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
				prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
				prm.transformations, prm.relationalrealizational, resultdir)
		# default to None, so that lexmodel is always bound; including when
		# a tagging method other than 'unknownword' is configured.
		lexmodel = None
		if prm.postagging and prm.postagging.method == 'unknownword':
			sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
	elif model == 'ptsg':  # read fragments
		xfragments = {frag: splitweight(weight) for frag, weight
				in (line.split('\t') for line in openread(treebankfile,
					encoding=opts.get('--inputenc', 'utf8')))}
		if STRTERMRE.search(next(iter(xfragments))) is not None:
			# the weights have already been converted by splitweight above;
			# only the fragments need to be rewritten here.
			xfragments = {addindices(frag): weight for frag, weight
					in xfragments.items()}
	else:  # read treebank
		corpus = READERS[opts.get('--inputfmt', 'export')](
				treebankfile,
				encoding=opts.get('--inputenc', 'utf8'))
		trees = list(corpus.trees().values())
		sents = list(corpus.sents().values())
		if not trees:
			raise ValueError('no trees; is --inputfmt correct?')
		for a in trees:
			canonicalize(a)
			addfanoutmarkers(a)

	# read off grammar
	if model in ('pcfg', 'plcfrs'):
		xgrammar = treebankgrammar(trees, sents)
	elif model == 'dopreduction':
		xgrammar, altweights = dopreduction(trees, sents,
				packedgraph='--packed' in opts)
	elif model == 'doubledop':
		xgrammar, backtransform, altweights, _ = doubledop(trees, sents,
				numproc=int(opts.get('--numproc', 1)))
	elif model == 'dop1':
		xgrammar, backtransform, altweights, _ = dop1(trees, sents,
				maxdepth=int(opts.get('--maxdepth', 3)),
				maxfrontier=int(opts.get('--maxfrontier', 999)))
	elif model == 'ptsg':
		xgrammar, backtransform, altweights = compiletsg(xfragments)
	elif model == 'param':
		getgrammars(dobinarization(trees, sents, prm.binarization,
				prm.relationalrealizational),
				sents, prm.stages, prm.testcorpus.maxwords, resultdir,
				prm.numproc, lexmodel, trees[0].label)
		# store a copy of the parameter file, prefixed with the root label,
		# in the result directory.
		paramfile = os.path.join(resultdir, 'params.prm')
		with openread(args[1]) as inp:
			with io.open(paramfile, 'w', encoding='utf8') as out:
				out.write("top='%s',\n%s" % (trees[0].label, inp.read()))
		return  # grammars have already been written
	if opts.get('--dopestimator', 'rfe') != 'rfe':
		# replace the default weights by those of the requested estimator.
		xgrammar = [(rule, w) for (rule, _), w in
				zip(xgrammar, altweights[opts['--dopestimator']])]

	rulesname = grammarfile + '.rules'
	lexiconname = grammarfile + '.lex'
	myopen = open
	if '--gzip' in opts:
		myopen = gzipopen
		rulesname += '.gz'
		lexiconname += '.gz'
	bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket'
	if model == 'ptsg':
		bitpar = STRTERMRE.search(next(iter(xfragments))) is not None

	rules, lexicon = writegrammar(xgrammar, bitpar=bitpar)
	# write output; files are opened in binary mode and wrapped in a UTF-8
	# writer, which works the same for plain and gzip'ed files.
	with codecs.getwriter('utf8')(myopen(rulesname, 'wb')) as rulesfile:
		rulesfile.write(rules)
	with codecs.getwriter('utf8')(myopen(lexiconname, 'wb')) as lexiconfile:
		lexiconfile.write(lexicon)
	if model in ('doubledop', 'ptsg'):
		backtransformfile = '%s.backtransform%s' % (grammarfile,
			'.gz' if '--gzip' in opts else '')
		with codecs.getwriter('utf8')(myopen(backtransformfile, 'wb')) as bt:
			bt.writelines('%s\n' % a for a in backtransform)
		print('wrote backtransform to', backtransformfile)
	print('wrote grammar to %s and %s.' % (rulesname, lexiconname))
	# start symbol: -s if given; otherwise taken from the first grammar rule
	# (ptsg) or the root label of the first tree.
	start = opts.get('-s', next(iter(xgrammar))[0][0][0]
			if model == 'ptsg' else trees[0].label)
	if version_info[0] == 2:
		start = start.decode('utf8')
	if len(xgrammar) < 10000:  # this is very slow so skip with large grammars
		print(grammarinfo(xgrammar))
	try:
		from .containers import Grammar
		print(Grammar(rulesname, lexiconname, start=start).testgrammar()[1])
	except (ImportError, AssertionError) as err:
		print(err)


# Run the command-line interface when this file is executed as a script.
if __name__ == '__main__':
	main()

# Public names of this module.
__all__ = [
		'treedraw',
		'runexp',
		'treetransforms',
		'grammar',
		'main',
	]