"""Command-line interfaces to modules."""
from __future__ import division, print_function, absolute_import, \
unicode_literals
from sys import argv, stdout, stderr, version_info
from sys import exit as sysexit
# Subcommand name -> one-line summary; main() lists these in its usage message
# and uses the keys to decide whether argv[1] is a known command.
COMMANDS = dict([
    ('runexp', 'Run experiment: grammar extraction, parsing & evaluation.'),
    ('fragments', 'Extract recurring fragments from treebanks.'),
    ('eval', 'Evaluate discontinuous parse trees; similar to EVALB.'),
    ('treetransforms',
        'Apply tree transformations and convert between formats.'),
    ('treedraw', 'Visualize (discontinuous) trees.'),
    ('treesearch', 'Query treebanks.'),
    ('grammar', 'Read off grammars from treebanks.'),
    ('parser', 'Simple command line parser.'),
    ('demos', 'Show some demonstrations of formalisms encoded in LCFRS.'),
    ('gen', 'Generate sentences from a PLCFRS.'),
])
def main():
    """Expose command-line interfaces.

    Dispatches on ``argv[1]``:

    - ``-v`` / ``--version`` as the only argument prints the package version;
    - a missing or unknown command prints a usage message listing
      ``COMMANDS`` on standard error;
    - ``<command> -h`` / ``<command> --help`` replaces the process with
      ``man discodop-<command>``;
    - otherwise the command is run through the function of that name
      defined in this module if there is one, else through
      ``discodop.<command>.main``.
    """
    from os import execlp
    from os.path import basename
    thiscmd = basename(argv[0])
    if len(argv) == 2 and argv[1] in ('-v', '--version'):
        from discodop import __version__
        print(__version__)
    elif len(argv) <= 1 or argv[1] not in COMMANDS:
        # the whole usage message goes to stderr, including the command list
        print('Usage: %s <command> [arguments]\n' % thiscmd, file=stderr)
        print('Command is one of:', file=stderr)
        for name, summary in COMMANDS.items():
            print(' %s %s' % (name.ljust(15), summary), file=stderr)
        print('for additional instructions issue: %s <command> --help'
                % thiscmd, file=stderr)
    elif len(argv) == 3 and argv[2] in ('-h', '--help'):
        # help on subcommand
        execlp('man', 'man', 'discodop-%s' % argv[1])
    else:
        cmd = argv[1]
        # use the CLI defined here, or default to the module's main function.
        # Only the lookup is guarded: a KeyError raised while the command
        # itself runs must propagate instead of triggering the fallback.
        try:
            func = globals()[cmd]
        except KeyError:
            func = getattr(__import__('discodop.%s' % cmd,
                    fromlist=['main']), 'main')
        func()
def treedraw():
    """Usage: discodop treedraw [<treebank>...] [options]
If no treebank is given, input is read from standard input; format is detected.
Pipe the output through 'less -R' to preserve the colors."""
    # NB: the docstring above is printed verbatim as the usage message when
    # option parsing fails, which is why its text is kept flush left.
    #
    # Options are read from argv[2:]. Two modes:
    # - treebank files given and --fmt is not 'auto': each file is loaded
    #   with the reader selected by --fmt (default 'export') and, for every
    #   sentence id of the first treebank, the tree with that id is drawn
    #   from each treebank in turn;
    # - otherwise: trees are read incrementally from the given files (or
    #   '-' when none are given) with the format detected by the reader.
    from getopt import gnu_getopt, GetoptError
    from itertools import islice, chain
    from .treebank import READERS, incrementaltreereader
    from .tree import DrawTree, frontier
    from .util import openread

    def processtree(tree, sent):
        """Produce output for a single tree in the requested format."""
        if output == 'frontier':
            return frontier(tree, sent)
        dt = DrawTree(tree, sent, abbr='--abbr' in opts)
        if output == 'text' or output == 'html':
            return dt.text(unicodelines=True, ansi=ansi, html=html,
                    funcsep=funcsep)
        elif output == 'svg':
            return dt.svg(funcsep=funcsep)
        elif output == 'tikznode':
            return dt.tikznode(funcsep=funcsep) + '\n'
        elif output == 'tikzmatrix':
            return dt.tikzmatrix(funcsep=funcsep) + '\n'
        elif output == 'tikzqtree':
            return dt.tikzqtree() + '\n'
        raise ValueError('unrecognized --output format')

    flags = ('test', 'help', 'abbr', 'plain', 'frontier')
    options = ('fmt=', 'encoding=', 'functions=', 'morphology=', 'numtrees=',
            'output=')
    try:
        opts, args = gnu_getopt(argv[2:], 'hn:', flags + options)
    except GetoptError as err:
        print('error:', err, file=stderr)
        print(treedraw.__doc__)
        sysexit(2)
    opts = dict(opts)
    limit = opts.get('--numtrees', opts.get('-n'))
    limit = int(limit) if limit else None
    output = opts.get('--output', 'text')
    funcsep = ('-' if opts.get('--functions')
            in ('add', 'between') else None)
    # colors / markup are only produced when --plain is not given
    ansi = output == 'text' and '--plain' not in opts
    html = output == 'html' and '--plain' not in opts
    if output in ('html', 'svg'):
        print(DrawTree.templates[output][0])  # preamble
    elif output in ('tikznode', 'tikzmatrix', 'tikzqtree'):
        print(DrawTree.templates['latex'][0])  # preamble
    fmt = opts.get('--fmt', 'export')
    if args and fmt != 'auto':
        reader = READERS[fmt]
        corpora = []
        for path in args:
            corpus = reader(
                    path,
                    encoding=opts.get('--encoding', 'utf8'),
                    functions=opts.get('--functions'),
                    morphology=opts.get('--morphology'))
            corpora.append((corpus.trees(), corpus.sents()))
        # The sentence ids iterated below come from the first treebank, so
        # the reported total must be taken from that treebank as well (not
        # from whichever treebank happened to be read last).
        numsents = len(corpora[0][1])
        print('Viewing:', ' '.join(args))
        for n, sentid in enumerate(islice(corpora[0][0], 0, limit), 1):
            print('%d of %s (sentid=%s; len=%d):' % (
                    n, numsents, sentid, len(corpora[0][1][sentid])))
            for trees, sents in corpora:
                tree, sent = trees[sentid], sents[sentid]
                print(processtree(tree, sent))
    else:  # read from stdin + detect format
        encoding = opts.get('--encoding', 'utf8')
        if not args:
            args = ['-']
        stream = chain.from_iterable(
                openread(fname, encoding=encoding)
                for fname in args)
        trees = islice(incrementaltreereader(stream,
                morphology=opts.get('--morphology'),
                functions=opts.get('--functions'),
                othertext=True),
                0, limit)
        try:
            n = 1
            for tree, sent, rest in trees:
                if tree is None:
                    # item without a tree: echo the accompanying text as is
                    print(rest)
                    continue
                print('%d. (len=%d):' % (n, len(sent)), end=' ')
                if '--frontier' in opts:
                    print('%s %s' % (frontier(tree, sent), rest))
                else:
                    print(rest or '')
                    print(processtree(tree, sent))
                n += 1
        except (IOError, KeyboardInterrupt):
            # deliberately silent: stop quietly when output or input is cut
            # off or the user interrupts
            pass
    if output in ('html', 'svg'):
        print(DrawTree.templates[output][1])  # postamble
    elif output in ('tikznode', 'tikzmatrix', 'tikzqtree'):
        print(DrawTree.templates['latex'][1])  # postamble
def runexp(args=None):
    """Usage: discodop runexp <parameter file> [--rerun]
If a parameter file is given, an experiment is run. See the file sample.prm for
an example parameter file. To repeat an experiment with an existing grammar,
pass the option --rerun. The directory with the name of the parameter file
without extension must exist in the current path; its results will be
overwritten."""
    # NB: the docstring above is printed verbatim as the usage message, which
    # is why its text is kept flush left.
    #
    # :param args: list of command line arguments; defaults to argv[2:].
    #     The list is not modified.
    # Exits with status 2 when no parameter file is given.
    import io
    import os
    if args is None:
        args = argv[2:]
    if not args:
        print('error: incorrect number of arguments', file=stderr)
        print(runexp.__doc__)
        sysexit(2)
    if '--tepacoc' in args:
        from .runexp import parsetepacoc
        parsetepacoc()
        return
    rerun = '--rerun' in args
    # filter instead of args.remove(): leaves the caller's list untouched
    paramfiles = [arg for arg in args if arg != '--rerun']
    if not paramfiles:  # e.g. only '--rerun' was given
        print('error: incorrect number of arguments', file=stderr)
        print(runexp.__doc__)
        sysexit(2)
    # project imports are deferred until the arguments have been validated
    from .parser import readparam
    from .runexp import startexp
    paramfile = paramfiles[0]
    params = readparam(paramfile)
    # splitext only strips an extension of the final path component, so
    # paths such as '../exp' or 'dir.v2/exp' are left intact.
    resultdir = os.path.splitext(paramfile)[0]
    top = startexp(
            params, resultdir=resultdir, rerun=rerun)
    if not rerun:  # copy parameter file to result dir
        with io.open(paramfile, encoding='utf8') as inp:
            paramlines = inp.readlines()
        # drop an existing "top='...'" first line; a fresh one is written
        if paramlines and paramlines[0].startswith("top='"):
            paramlines = paramlines[1:]
        outfile = os.path.join(resultdir, 'params.prm')
        with io.open(outfile, 'w', encoding='utf8') as out:
            out.write("top='%s',\n" % top)
            out.writelines(paramlines)
def grammar():
    """Read off grammars from treebanks.
Usage: discodop grammar <type> <input> <output> [options]
or: discodop grammar param <parameter-file> <output-directory>
or: discodop grammar info <rules-file>
or: discodop grammar merge (rules|lexicon|fragments) \
<input1> <input2>... <output>"""
    # NB: the docstring above is printed verbatim as the usage message, which
    # is why its text is kept flush left.
    #
    # Arguments are read from argv[2:]. Exits with status 2 when the
    # arguments cannot be parsed; raises ValueError for an unrecognized
    # model, estimator or merge kind and for inconsistent options.
    import io
    import os
    import codecs
    import logging
    from gzip import open as gzipopen
    from getopt import gnu_getopt, GetoptError
    from .tree import STRTERMRE
    from .util import openread
    from .treebank import READERS
    from .treetransforms import addfanoutmarkers, canonicalize
    from .grammar import treebankgrammar, dopreduction, doubledop, dop1, \
            compiletsg, writegrammar, grammarinfo, grammarstats, \
            splitweight, merge, sumfrags, sumrules, sumlex, stripweight, \
            addindices
    from .parser import readparam
    from .runexp import loadtraincorpus, getposmodel, dobinarization, \
            getgrammars
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    shortoptions = 'hs:'
    options = ('help', 'gzip', 'packed', 'bitpar', 'inputfmt=', 'inputenc=',
            'dopestimator=', 'maxdepth=', 'maxfrontier=', 'numproc=')
    try:
        opts, args = gnu_getopt(argv[2:], shortoptions, options)
        model = args[0]
        if model not in ('info', 'merge'):
            # exactly two more positional arguments are expected here;
            # anything else raises ValueError and is reported as usage error
            treebankfile, grammarfile = args[
                    1:]  # pylint: disable=unbalanced-tuple-unpacking
    except (GetoptError, IndexError, ValueError) as err:
        print('error: %r' % err, file=stderr)
        print(grammar.__doc__)
        sysexit(2)
    opts = dict(opts)
    if model not in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop', 'dop1',
            'ptsg', 'param', 'info', 'merge'):
        raise ValueError('unrecognized model: %r' % model)
    # getopt stores long options with their leading dashes, so the key must
    # be '--dopestimator'; without the dashes this check could never fire.
    if opts.get('--dopestimator', 'rfe') not in ('rfe', 'ewe', 'shortest'):
        raise ValueError('unrecognized estimator: %r'
                % opts['--dopestimator'])
    if model == 'info':
        grammarstats(args[1])
        return
    elif model == 'merge':
        if len(args) < 5:
            raise ValueError('need at least 2 input and 1 output arguments.')
        if args[1] == 'rules':
            merge(args[2:-1], args[-1], sumrules, stripweight)
        elif args[1] == 'lexicon':
            merge(args[2:-1], args[-1], sumlex, lambda x: x.split(None, 1)[0])
        elif args[1] == 'fragments':
            merge(args[2:-1], args[-1], sumfrags,
                    lambda x: x.rsplit('\t', 1)[0])
        else:  # previously an unknown kind silently did nothing
            raise ValueError('unrecognized merge type: %r' % args[1])
        return
    elif model == 'param':
        if opts:
            raise ValueError('all options should be set in parameter file.')
        prm = readparam(args[1])
        resultdir = args[2]
        if os.path.exists(resultdir):
            raise ValueError('Directory %r already exists.\n' % resultdir)
        os.mkdir(resultdir)
        trees, sents, train_tagged_sents = loadtraincorpus(
                prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
                prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
                prm.transformations, prm.relationalrealizational)
        simplelexsmooth = False
        # Bind lexmodel up front so the getgrammars() call below does not hit
        # an unbound local when no unknown-word tagging model is configured.
        # NOTE(review): assumes getgrammars accepts None for "no lexical
        # model" -- confirm against its definition.
        lexmodel = None
        if prm.postagging and prm.postagging.method == 'unknownword':
            sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
            simplelexsmooth = prm.postagging.simplelexsmooth
    elif model == 'ptsg':  # read fragments
        xfragments = {frag: splitweight(weight) for frag, weight
                in (line.split('\t') for line in openread(treebankfile,
                    encoding=opts.get('--inputenc', 'utf8')))}
        if STRTERMRE.search(next(iter(xfragments))) is not None:
            xfragments = {addindices(frag): splitweight(weight)
                    for frag, weight in xfragments.items()}
    else:  # read treebank
        corpus = READERS[opts.get('--inputfmt', 'export')](
                treebankfile,
                encoding=opts.get('--inputenc', 'utf8'))
        trees = list(corpus.trees().values())
        sents = list(corpus.sents().values())
        if not trees:
            raise ValueError('no trees; is --inputfmt correct?')
        for a in trees:
            canonicalize(a)
            addfanoutmarkers(a)
    # read off grammar
    if model in ('pcfg', 'plcfrs'):
        xgrammar = treebankgrammar(trees, sents)
    elif model == 'dopreduction':
        xgrammar, altweights = dopreduction(trees, sents,
                packedgraph='--packed' in opts)
    elif model == 'doubledop':
        xgrammar, backtransform, altweights, _ = doubledop(trees, sents,
                numproc=int(opts.get('--numproc', 1)),
                binarized='--bitpar' not in opts)
    elif model == 'dop1':
        xgrammar, backtransform, altweights, _ = dop1(trees, sents,
                maxdepth=int(opts.get('--maxdepth', 3)),
                maxfrontier=int(opts.get('--maxfrontier', 999)),
                binarized='--bitpar' not in opts)
    elif model == 'ptsg':
        xgrammar, backtransform, altweights = compiletsg(xfragments,
                binarized='--bitpar' not in opts)
    elif model == 'param':
        getgrammars(dobinarization(trees, sents, prm.binarization,
                    prm.relationalrealizational),
                sents, prm.stages, prm.testcorpus.maxwords, resultdir,
                prm.numproc, lexmodel, simplelexsmooth, trees[0].label)
        paramfile = os.path.join(resultdir, 'params.prm')
        with openread(args[1]) as inp:
            with io.open(paramfile, 'w', encoding='utf8') as out:
                out.write("top='%s',\n%s" % (trees[0].label, inp.read()))
        return  # grammars have already been written
    if opts.get('--dopestimator', 'rfe') != 'rfe':
        # replace the default weights by those of the requested estimator
        xgrammar = [(rule, w) for (rule, _), w in
                zip(xgrammar, altweights[opts['--dopestimator']])]
    rulesname = grammarfile + '.rules'
    lexiconname = grammarfile + '.lex'
    myopen = open
    if '--gzip' in opts:
        myopen = gzipopen
        rulesname += '.gz'
        lexiconname += '.gz'
    bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket'
    if model == 'ptsg':
        bitpar = STRTERMRE.search(next(iter(xfragments))) is not None
    if '--bitpar' in opts and not bitpar:
        raise ValueError('parsing with an unbinarized grammar requires '
                'a grammar in bitpar format.')
    rules, lexicon = writegrammar(xgrammar, bitpar=bitpar)
    # write output
    with codecs.getwriter('utf8')(myopen(rulesname, 'wb')) as rulesfile:
        rulesfile.write(rules)
    with codecs.getwriter('utf8')(myopen(lexiconname, 'wb')) as lexiconfile:
        lexiconfile.write(lexicon)
    # NOTE(review): 'dop1' also yields a backtransform above but it is not
    # written out here -- confirm whether that is intended.
    if model in ('doubledop', 'ptsg'):
        backtransformfile = '%s.backtransform%s' % (grammarfile,
                '.gz' if '--gzip' in opts else '')
        with codecs.getwriter('utf8')(myopen(backtransformfile, 'wb')) as bt:
            bt.writelines('%s\n' % a for a in backtransform)
        print('wrote backtransform to', backtransformfile)
    print('wrote grammar to %s and %s.' % (rulesname, lexiconname))
    # start symbol: -s if given; else taken from the first grammar rule for
    # 'ptsg' (no trees available), or from the root label of the first tree.
    start = opts.get('-s', next(iter(xgrammar))[0][0][0]
            if model == 'ptsg' else trees[0].label)
    if version_info[0] == 2:
        start = start.decode('utf8')
    if len(xgrammar) < 10000:  # this is very slow so skip with large grammars
        print(grammarinfo(xgrammar))
    try:
        from .containers import Grammar
        print(Grammar(rules, lexicon, binarized='--bitpar' not in opts,
                start=start).testgrammar()[1])
    except (ImportError, AssertionError) as err:
        print(err)
# Public names of this module.
# NOTE(review): 'treetransforms' is listed here but no function of that name
# is defined in the visible part of this file -- confirm it is defined or
# imported elsewhere, otherwise `from <module> import *` will fail.
__all__ = ['treedraw', 'runexp', 'treetransforms', 'grammar', 'main']

# Run the command dispatcher when this file is executed as a script.
if __name__ == "__main__":
    main()