# Source code for discodop.cli

"""Command-line interfaces to modules."""
from __future__ import division, print_function, absolute_import, \
		unicode_literals
from sys import argv, stdout, stderr, version_info
from sys import exit as sysexit

# Maps each subcommand name to the one-line description that main() shows
# in its usage message; main() also uses the keys to validate argv[1].
COMMANDS = dict((
		('runexp',
			'Run experiment: grammar extraction, parsing & evaluation.'),
		('fragments',
			'Extract recurring fragments from treebanks.'),
		('eval',
			'Evaluate discontinuous parse trees; similar to EVALB.'),
		('treetransforms',
			'Apply tree transformations and convert between formats.'),
		('treedraw',
			'Visualize (discontinuous) trees.'),
		('treesearch',
			'Query treebanks.'),
		('grammar',
			'Read off grammars from treebanks.'),
		('parser',
			'Simple command line parser.'),
		('demos',
			'Show some demonstrations of formalisms encoded in LCFRS.'),
		('gen',
			'Generate sentences from a PLCFRS.'),
	))


def main():
	"""Expose command-line interfaces.

	Dispatches on ``argv[1]``:

	- ``-v`` / ``--version`` prints the package version;
	- a missing or unknown command prints a usage message to stderr;
	- ``<command> -h`` / ``--help`` replaces this process with
	  ``man discodop-<command>``;
	- otherwise the command is run: a function of that name defined in
	  this module takes precedence, else ``discodop.<command>.main``."""
	from os import execlp
	from os.path import basename
	thiscmd = basename(argv[0])
	if len(argv) == 2 and argv[1] in ('-v', '--version'):
		from discodop import __version__
		print(__version__)
	elif len(argv) <= 1 or argv[1] not in COMMANDS:
		# The complete usage message goes to stderr, so that redirecting
		# stdout does not split the message in two.
		print('Usage: %s <command> [arguments]\n' % thiscmd, file=stderr)
		print('Command is one of:', file=stderr)
		for name, description in COMMANDS.items():
			print('   %s  %s' % (name.ljust(15), description), file=stderr)
		print('for additional instructions issue: %s <command> --help'
			% thiscmd, file=stderr)
	elif len(argv) == 3 and argv[2] in ('-h', '--help'):
		# help on subcommand
		execlp('man', 'man', 'discodop-%s' % argv[1])
	else:
		cmd = argv[1]
		# use the CLI defined here, or default to the module's main function.
		# The lookup is kept separate from the call: a KeyError raised
		# inside the command itself must propagate instead of being
		# mistaken for "no such function in this module".
		handler = globals().get(cmd)
		if handler is None:
			handler = getattr(__import__('discodop.%s' % cmd,
					fromlist=['main']), 'main')
		handler()


def treedraw():
	"""Usage: discodop treedraw [<treebank>...] [options]

If no treebank is given, input is read from standard input; format is detected.
Pipe the output through 'less -R' to preserve the colors."""
	# NB: the docstring above is printed verbatim as the usage message,
	# so implementation notes are kept in comments.
	#
	# Two modes:
	# - treebank files with an explicit --fmt (default 'export'): the trees
	#   of the first treebank are enumerated, and for each sentence id the
	#   corresponding tree of every given treebank is shown;
	# - otherwise (no files, or --fmt=auto): input is read incrementally
	#   and the format is detected; non-tree text is echoed.
	from getopt import gnu_getopt, GetoptError
	from itertools import islice, chain
	from .treebank import READERS, incrementaltreereader
	from .tree import DrawTree, frontier
	from .util import openread

	def processtree(tree, sent):
		"""Produced output for a single tree."""
		if output == 'frontier':
			return frontier(tree, sent)
		dt = DrawTree(tree, sent, abbr='--abbr' in opts)
		if output == 'text' or output == 'html':
			return dt.text(unicodelines=True, ansi=ansi, html=html,
					funcsep=funcsep)
		elif output == 'svg':
			return dt.svg(funcsep=funcsep)
		elif output == 'tikznode':
			return dt.tikznode(funcsep=funcsep) + '\n'
		elif output == 'tikzmatrix':
			return dt.tikzmatrix(funcsep=funcsep) + '\n'
		elif output == 'tikzqtree':
			return dt.tikzqtree() + '\n'
		raise ValueError('unrecognized --output format')

	flags = ('test', 'help', 'abbr', 'plain', 'frontier')
	options = ('fmt=', 'encoding=', 'functions=', 'morphology=', 'numtrees=',
			'output=')
	try:
		opts, args = gnu_getopt(argv[2:], 'hn:', flags + options)
	except GetoptError as err:
		print('error:', err, file=stderr)
		print(treedraw.__doc__)
		sysexit(2)
	opts = dict(opts)
	limit = opts.get('--numtrees', opts.get('-n'))
	try:
		limit = int(limit) if limit else None
	except ValueError:
		# report a malformed count as a usage error, not a traceback
		print('error: --numtrees expects an integer, got %r' % limit,
				file=stderr)
		print(treedraw.__doc__)
		sysexit(2)
	output = opts.get('--output', 'text')
	funcsep = ('-' if opts.get('--functions')
		in ('add', 'between') else None)
	# colors / markup are only produced for the matching output format
	ansi = output == 'text' and '--plain' not in opts
	html = output == 'html' and '--plain' not in opts
	if output in ('html', 'svg'):
		print(DrawTree.templates[output][0])  # preamble
	elif output in ('tikznode', 'tikzmatrix', 'tikzqtree'):
		print(DrawTree.templates['latex'][0])  # preamble
	if args and opts.get('--fmt', 'export') != 'auto':
		reader = READERS[opts.get('--fmt', 'export')]
		corpora = []
		for path in args:
			corpus = reader(
					path,
					encoding=opts.get('--encoding', 'utf8'),
					functions=opts.get('--functions'),
					morphology=opts.get('--morphology'))
			corpora.append((corpus.trees(), corpus.sents()))
		# The loop below walks the sentence ids of the FIRST treebank, so
		# the reported total is taken from that treebank as well (and not
		# from whichever treebank happened to be read last).
		firsttrees, firstsents = corpora[0]
		numsents = len(firstsents)
		print('Viewing:', ' '.join(args))
		for n, sentid in enumerate(islice(firsttrees, 0, limit), 1):
			print('%d of %s (sentid=%s; len=%d):' % (
					n, numsents, sentid, len(firstsents[sentid])))
			for trees, sents in corpora:
				tree, sent = trees[sentid], sents[sentid]
				print(processtree(tree, sent))
	else:  # read from stdin + detect format
		encoding = opts.get('--encoding', 'utf8')
		if not args:
			args = ['-']
		stream = chain.from_iterable(
				openread(fname, encoding=encoding)
				for fname in args)
		trees = islice(incrementaltreereader(stream,
				morphology=opts.get('--morphology'),
				functions=opts.get('--functions'),
				othertext=True),
				0, limit)
		try:
			n = 1
			for tree, sent, rest in trees:
				if tree is None:  # not a tree; echo the text as is
					print(rest)
					continue
				print('%d. (len=%d):' % (n, len(sent)), end=' ')
				if '--frontier' in opts:
					print('%s %s' % (frontier(tree, sent), rest))
				else:
					print(rest or '')
					print(processtree(tree, sent))
				n += 1
		except (IOError, KeyboardInterrupt):
			# deliberate: stop quietly when the output pipe is closed
			# (e.g., quitting a pager) or the user interrupts.
			pass
	if output in ('html', 'svg'):
		print(DrawTree.templates[output][1])  # postamble
	elif output in ('tikznode', 'tikzmatrix', 'tikzqtree'):
		print(DrawTree.templates['latex'][1])  # postamble


def runexp(args=None):
	"""Usage: discodop runexp <parameter file> [--rerun]

If a parameter file is given, an experiment is run. See the file sample.prm for
an example parameter file. To repeat an experiment with an existing grammar,
pass the option --rerun. The directory with the name of the parameter file
without extension must exist in the current path; its results will be
overwritten."""
	# NB: the docstring above is printed verbatim as the usage message.
	#
	# :param args: list of command line arguments; defaults to ``argv[2:]``.
	#	The list is not modified.
	# Exits with status 2 when no parameter file is given.
	import io
	import os
	if args is None:
		args = argv[2:]
	rerun = '--rerun' in args
	# Filter into a new list instead of calling args.remove(), which would
	# alter the list owned by the caller.
	positional = [arg for arg in args if arg != '--rerun']
	if not args or ('--tepacoc' not in args and not positional):
		# also covers a lone '--rerun', which used to end in an IndexError
		print('error: incorrect number of arguments', file=stderr)
		print(runexp.__doc__)
		sysexit(2)
	# Project modules are imported only after the arguments are validated,
	# so that usage errors are reported without loading them.
	from .parser import readparam
	from .runexp import startexp, parsetepacoc
	if '--tepacoc' in args:
		parsetepacoc()
		return
	paramfile = positional[0]
	params = readparam(paramfile)
	resultdir = paramfile.rsplit('.', 1)[0]
	top = startexp(
			params, resultdir=resultdir, rerun=rerun)
	if not rerun:  # copy parameter file to result dir
		with io.open(paramfile, encoding='utf8') as inp:
			paramlines = inp.readlines()
		# replace an existing "top='...'" first line by the actual value
		if paramlines and paramlines[0].startswith("top='"):
			paramlines = paramlines[1:]
		outfile = os.path.join(resultdir, 'params.prm')
		with io.open(outfile, 'w', encoding='utf8') as out:
			out.write("top='%s',\n" % top)
			out.writelines(paramlines)


def treetransforms():
	"""Treebank binarization and conversion.
Usage: discodop treetransforms [input [output]] [options]
where input and output are treebanks; standard in/output is used if not given.
"""
	# NB: the docstring above is printed verbatim as the usage message,
	# so implementation notes are kept in comments.
	#
	# Steps: parse options, open the input treebank, select trees
	# (--slice, --maxlen, --renumber), collect the requested transformations
	# in command line order, apply them, and write the result in the
	# requested output format. A summary line is written to stderr.
	import io
	from getopt import gnu_getopt, GetoptError
	from itertools import islice
	from . import treebank, treebanktransforms
	from .treetransforms import canonicalize, binarize, \
			unbinarize, optimalbinarize, splitdiscnodes, mergediscnodes, \
			introducepreterminals, markovthreshold
	flags = ('binarize optimalbinarize unbinarize splitdisc mergedisc '
			'introducepreterminals renumber sentid removeempty '
			'help markorigin markhead leftunary rightunary '
			'tailmarker direction').split()
	# Each long option is listed exactly once: a duplicate entry (or an
	# 'x=' variant next to the flag 'x') makes getopt reject abbreviated
	# options as "not a unique prefix".
	options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
			'punct= headrules= functions= morphology= lemmas= factor= fmt= '
			'maxlen= enc= transforms= markovthreshold= labelfun= '
			'reversetransforms= ').split()
	try:
		origopts, args = gnu_getopt(argv[2:], 'h:v:H:', flags + options)
		if len(args) > 2:
			raise GetoptError('expected 0, 1, or 2 positional arguments')
	except GetoptError as err:
		print('error:', err, file=stderr)
		print(treetransforms.__doc__)
		sysexit(2)
	# origopts keeps the command line order (which determines the order of
	# the transformations); opts is for lookups. Keys include the dashes.
	opts = dict(origopts)
	if '--fmt' in opts:
		opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
	if '--enc' in opts:
		opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
	if opts.get('--outputfmt', treebank.WRITERS[0]) not in treebank.WRITERS:
		print('error: unrecognized output format: %r\navailable formats: %s'
				% (opts.get('--outputfmt'), ' '.join(treebank.WRITERS)),
				file=stderr)
		sysexit(2)
	infilename = (args[0] if len(args) >= 1 else '-')
	outfilename = (args[1] if len(args) == 2 and args[1] != '-'
			else stdout.fileno())

	# open corpus
	corpus = treebank.READERS[opts.get('--inputfmt', 'export')](
			infilename,
			encoding=opts.get('--inputenc', 'utf8'),
			headrules=opts.get('--headrules'),
			ensureroot=opts.get('--ensureroot'),
			removeempty='--removeempty' in opts,
			punct=opts.get('--punct'),
			functions=opts.get('--functions'),
			morphology=opts.get('--morphology'),
			lemmas=opts.get('--lemmas'))
	start, end = opts.get('--slice', ':').split(':')
	start, end = (int(start) if start else None), (int(end) if end else None)
	trees = corpus.itertrees(start, end)
	if '--maxlen' in opts:
		maxlen = int(opts['--maxlen'])
		trees = ((key, item) for key, item in trees
				if len(item.sent) <= maxlen)
	if '--renumber' in opts:
		trees = (('%8d' % n, item) for n, (_key, item) in enumerate(trees, 1))

	# select transformations
	actions = []
	for key, value in origopts:  # pylint: disable=unused-variable
		if key == '--introducepreterminals':
			actions.append(lambda tree, sent:
					(introducepreterminals(tree, sent), sent))
		if key == '--transforms':
			# value is bound as a default argument, because the loop
			# variable changes before the lambda is called.
			actions.append(lambda tree, sent, value=value:
					(treebanktransforms.transform(tree, sent,
						treebanktransforms.expandpresets(value.split(','))),
					sent))
		if key in ('--binarize', '--optimalbinarize'):
			if key == '--binarize':
				# NOTE(review): --labelfun is evaluated as Python code with
				# eval(); only safe because it comes from the command line
				# of the invoking user. Never pass untrusted input here.
				actions.append(lambda tree, sent:
						(binarize(
							tree,
							opts.get('--factor', 'right'),
							int(opts.get('-h', 999)),
							int(opts.get('-v', 1)),
							revhorzmarkov=int(opts.get('-H', 0)),
							leftmostunary='--leftunary' in opts,
							rightmostunary='--rightunary' in opts,
							tailmarker='$' if '--tailmarker' in opts else '',
							direction='--direction' in opts,
							headoutward='--headrules' in opts,
							markhead='--markhead' in opts,
							labelfun=eval(  # pylint: disable=eval-used
								opts['--labelfun'])
								if '--labelfun' in opts else None),
						sent))
			elif key == '--optimalbinarize':
				actions.append(lambda tree, sent:
						(optimalbinarize(
							tree, '|',
							'--headrules' in opts,
							int(opts.get('-h', 999)),
							int(opts.get('-v', 1))),
						sent))
		if key == '--splitdisc':
			actions.append(lambda tree, sent:
					(splitdiscnodes(tree, '--markorigin' in opts), sent))
		# the flag is declared as 'mergedisc' above, so that is the key
		# getopt reports; '--mergediscnodes' could never match.
		if key == '--mergedisc':
			actions.append(lambda tree, sent: (mergediscnodes(tree), sent))
		if key == '--unbinarize':
			actions.append(lambda tree, sent: (unbinarize(tree, sent), sent))
		if key == '--reversetransforms':
			actions.append(lambda tree, sent, value=value:
					(treebanktransforms.reversetransform(tree,
						treebanktransforms.expandpresets(value.split(','))),
					sent))

	# read, transform, & write trees
	if actions:
		def applytransforms(trees):
			"""Apply transforms and yield modified items."""
			for key, item in trees:
				for action in actions:
					item.tree, item.sent = action(item.tree, item.sent)
				yield key, item

		trees = applytransforms(trees)
		# option keys carry their leading dashes ('--binarize')
		if '--binarize' in opts and '--markovthreshold' in opts:
			# the threshold is applied to the collection of binarized
			# trees as a whole, so the generator has to be materialized.
			trees = list(trees)
			h, v = int(opts.get('-h', 999)), int(opts.get('-v', 1))
			revh = int(opts.get('-H', 0))
			markovthreshold([item.tree for _, item in trees],
					int(opts['--markovthreshold']),
					revh + h - 1,
					v - 1 if v > 1 else 1)

	if opts.get('--outputfmt') in ('mst', 'conll'):
		if not opts.get('--headrules'):
			raise ValueError('need head rules for dependency conversion')
	cnt = 0
	if opts.get('--outputfmt') == 'dact':
		import alpinocorpus
		outfile = alpinocorpus.CorpusWriter(outfilename)
		# copy blocks verbatim when no tree needs to be modified
		if (not actions and opts.get('--inputfmt') in ('alpino', 'dact')
				and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
				'--renumber'}):
			for n, (key, block) in islice(enumerate(
					corpus.blocks().items(), 1), start, end):
				outfile.write((('%8d' % n) if '--renumber' in opts
						else key).encode('utf8'), block)
				cnt += 1
		else:
			for key, item in trees:
				outfile.write(key.encode('utf8'), treebank.writetree(
						item.tree, item.sent, key, 'alpino',
						comment=item.comment).encode('utf8'))
				cnt += 1
	else:
		# honor --outputenc (and --enc, which sets it above)
		encoding = opts.get('--outputenc', 'utf8')
		with io.open(outfilename, 'w', encoding=encoding) as outfile:
			# copy trees verbatim when only taking slice or converting encoding
			if (not actions and opts.get('--inputfmt') == opts.get(
					'--outputfmt') and set(opts) <= {'--slice', '--inputenc',
					'--outputenc', '--inputfmt', '--outputfmt'}):
				for block in islice(corpus.blocks().values(), start, end):
					outfile.write(block)
					cnt += 1
			else:
				if opts.get('--outputfmt', 'export') == 'bracket':
					trees = ((key, canonicalize(item.tree) and item)
							for key, item in trees)
				if opts.get('--outputfmt', 'export') == 'export':
					outfile.write(treebank.EXPORTHEADER)
				fmt = opts.get('--outputfmt', 'export')
				sentid = '--sentid' in opts
				for key, item in trees:
					outfile.write(treebank.writetree(item.tree, item.sent, key,
							fmt, comment=item.comment, sentid=sentid))
					cnt += 1
	print('%s: transformed %d trees' % (args[0] if args else 'stdin', cnt),
			file=stderr)


def grammar():
	"""Read off grammars from treebanks.
Usage: discodop grammar <type> <input> <output> [options]
or: discodop grammar param <parameter-file> <output-directory>
or: discodop grammar info <rules-file>
or: discodop grammar merge (rules|lexicon|fragments) \
<input1> <input2>... <output>"""
	# NB: the docstring above is printed verbatim as the usage message,
	# so implementation notes are kept in comments.
	#
	# <type> is one of pcfg, plcfrs, dopreduction, doubledop, dop1, ptsg;
	# 'param', 'info' and 'merge' are handled as special cases that return
	# early. Raises ValueError for an unrecognized type or estimator.
	import io
	import os
	import codecs
	import logging
	from gzip import open as gzipopen
	from getopt import gnu_getopt, GetoptError
	from .tree import STRTERMRE
	from .util import openread
	from .treebank import READERS
	from .treetransforms import addfanoutmarkers, canonicalize
	from .grammar import treebankgrammar, dopreduction, doubledop, dop1, \
			compiletsg, writegrammar, grammarinfo, grammarstats, \
			splitweight, merge, sumfrags, sumrules, sumlex, stripweight, \
			addindices
	from .parser import readparam
	from .runexp import loadtraincorpus, getposmodel, dobinarization, \
			getgrammars
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	shortoptions = 'hs:'
	options = ('help', 'gzip', 'packed', 'bitpar', 'inputfmt=', 'inputenc=',
			'dopestimator=', 'maxdepth=', 'maxfrontier=', 'numproc=')
	try:
		opts, args = gnu_getopt(argv[2:], shortoptions, options)
		model = args[0]
		if model not in ('info', 'merge'):
			# exactly two further arguments are expected
			treebankfile, grammarfile = args[1:]  # pylint: disable=unbalanced-tuple-unpacking
	except (GetoptError, IndexError, ValueError) as err:
		print('error: %r' % err, file=stderr)
		print(grammar.__doc__)
		sysexit(2)
	opts = dict(opts)
	if model not in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop', 'dop1',
			'ptsg', 'param', 'info', 'merge'):
		raise ValueError('unrecognized model: %r' % model)
	# option keys carry their leading dashes; without them the lookup
	# always returned the default and the check could never fail.
	estimator = opts.get('--dopestimator', 'rfe')
	if estimator not in ('rfe', 'ewe', 'shortest'):
		raise ValueError('unrecognized estimator: %r' % estimator)
	if estimator != 'rfe' and model in ('pcfg', 'plcfrs'):
		# these models produce no alternative weights to select from
		raise ValueError('--dopestimator is not applicable to model %r'
				% model)

	if model == 'info':
		grammarstats(args[1])
		return
	elif model == 'merge':
		if len(args) < 5:
			raise ValueError('need at least 2 input and 1 output arguments.')
		if args[1] == 'rules':
			merge(args[2:-1], args[-1], sumrules, stripweight)
		elif args[1] == 'lexicon':
			merge(args[2:-1], args[-1], sumlex, lambda x: x.split(None, 1)[0])
		elif args[1] == 'fragments':
			merge(args[2:-1], args[-1], sumfrags, lambda x: x.rsplit('\t', 1)[0])
		return
	elif model == 'param':
		if opts:
			raise ValueError('all options should be set in parameter file.')
		prm = readparam(args[1])
		resultdir = args[2]
		if os.path.exists(resultdir):
			raise ValueError('Directory %r already exists.\n' % resultdir)
		os.mkdir(resultdir)
		trees, sents, train_tagged_sents = loadtraincorpus(
				prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
				prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
				prm.transformations, prm.relationalrealizational)
		simplelexsmooth = False
		# Always bind lexmodel; it is passed to getgrammars() below and was
		# left undefined whenever no unknown word model is configured.
		# NOTE(review): assumes getgrammars() accepts None for "no lexical
		# model" -- confirm against its signature.
		lexmodel = None
		if prm.postagging and prm.postagging.method == 'unknownword':
			sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
			simplelexsmooth = prm.postagging.simplelexsmooth
	elif model == 'ptsg':  # read fragments
		xfragments = {frag: splitweight(weight) for frag, weight
				in (line.split('\t') for line in openread(treebankfile,
					encoding=opts.get('--inputenc', 'utf8')))}
		if STRTERMRE.search(next(iter(xfragments))) is not None:
			xfragments = {addindices(frag): splitweight(weight) for frag, weight
					in xfragments.items()}
	else:  # read treebank
		corpus = READERS[opts.get('--inputfmt', 'export')](
				treebankfile,
				encoding=opts.get('--inputenc', 'utf8'))
		trees = list(corpus.trees().values())
		sents = list(corpus.sents().values())
		if not trees:
			raise ValueError('no trees; is --inputfmt correct?')
		for a in trees:
			canonicalize(a)
			addfanoutmarkers(a)

	# read off grammar
	if model in ('pcfg', 'plcfrs'):
		xgrammar = treebankgrammar(trees, sents)
	elif model == 'dopreduction':
		xgrammar, altweights = dopreduction(trees, sents,
				packedgraph='--packed' in opts)
	elif model == 'doubledop':
		xgrammar, backtransform, altweights, _ = doubledop(trees, sents,
				numproc=int(opts.get('--numproc', 1)),
				binarized='--bitpar' not in opts)
	elif model == 'dop1':
		xgrammar, backtransform, altweights, _ = dop1(trees, sents,
				maxdepth=int(opts.get('--maxdepth', 3)),
				maxfrontier=int(opts.get('--maxfrontier', 999)),
				binarized='--bitpar' not in opts)
	elif model == 'ptsg':
		xgrammar, backtransform, altweights = compiletsg(xfragments,
				binarized='--bitpar' not in opts)
	elif model == 'param':
		getgrammars(dobinarization(trees, sents, prm.binarization,
				prm.relationalrealizational),
				sents, prm.stages, prm.testcorpus.maxwords, resultdir,
				prm.numproc, lexmodel, simplelexsmooth, trees[0].label)
		paramfile = os.path.join(resultdir, 'params.prm')
		with openread(args[1]) as inp:
			with io.open(paramfile, 'w', encoding='utf8') as out:
				out.write("top='%s',\n%s" % (trees[0].label, inp.read()))
		return  # grammars have already been written
	if estimator != 'rfe':
		# replace the default weights by those of the chosen estimator
		xgrammar = [(rule, w) for (rule, _), w in
				zip(xgrammar, altweights[estimator])]

	rulesname = grammarfile + '.rules'
	lexiconname = grammarfile + '.lex'
	myopen = open
	if '--gzip' in opts:
		myopen = gzipopen
		rulesname += '.gz'
		lexiconname += '.gz'
	bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket'
	if model == 'ptsg':
		bitpar = STRTERMRE.search(next(iter(xfragments))) is not None
	if '--bitpar' in opts and not bitpar:
		raise ValueError('parsing with an unbinarized grammar requires '
				'a grammar in bitpar format.')

	rules, lexicon = writegrammar(xgrammar, bitpar=bitpar)
	# write output
	with codecs.getwriter('utf8')(myopen(rulesname, 'wb')) as rulesfile:
		rulesfile.write(rules)
	with codecs.getwriter('utf8')(myopen(lexiconname, 'wb')) as lexiconfile:
		lexiconfile.write(lexicon)
	# NOTE(review): dop1 also returns a backtransform above, but it is not
	# written here -- confirm whether 'dop1' belongs in this list.
	if model in ('doubledop', 'ptsg'):
		backtransformfile = '%s.backtransform%s' % (grammarfile,
			'.gz' if '--gzip' in opts else '')
		with codecs.getwriter('utf8')(myopen(backtransformfile, 'wb')) as bt:
			bt.writelines('%s\n' % a for a in backtransform)
		print('wrote backtransform to', backtransformfile)
	print('wrote grammar to %s and %s.' % (rulesname, lexiconname))
	# start symbol: -s if given; else the left-hand side of the first rule
	# (ptsg) or the root label of the first tree.
	start = opts.get('-s', next(iter(xgrammar))[0][0][0]
			if model == 'ptsg' else trees[0].label)
	if version_info[0] == 2:
		start = start.decode('utf8')
	if len(xgrammar) < 10000:  # this is very slow so skip with large grammars
		print(grammarinfo(xgrammar))
	try:
		from .containers import Grammar
		print(Grammar(rules, lexicon, binarized='--bitpar' not in opts,
				start=start).testgrammar()[1])
	except (ImportError, AssertionError) as err:
		print(err)

# Public names of this module: the subcommands implemented here plus the
# dispatcher.
__all__ = ['treedraw', 'runexp', 'treetransforms', 'grammar', 'main']

# Run the dispatcher when this file is executed as a script.
if __name__ == '__main__':
	main()