"""Command-line interfaces to modules."""
from sys import argv, stdout, stderr, version_info
from sys import exit as sysexit
# Subcommand name -> one-line description. The order of the pairs below is
# the order in which the usage message of ``main`` lists the commands.
COMMANDS = dict((
    ('runexp', 'Run experiment: grammar extraction, parsing & evaluation.'),
    ('fragments', 'Extract recurring fragments from treebanks.'),
    ('eval', 'Evaluate discontinuous parse trees; similar to EVALB.'),
    ('treetransforms',
        'Apply tree transformations and convert between formats.'),
    ('treedraw', 'Visualize (discontinuous) trees.'),
    ('treesearch', 'Query treebanks.'),
    ('grammar', 'Read off grammars from treebanks.'),
    ('parser', 'Simple command line parser.'),
    ('demos', 'Show some demonstrations of formalisms encoded in LCFRS.'),
    ('gen', 'Generate sentences from a PLCFRS.'),
))
def main():
    """Expose command-line interfaces.

    Dispatches on the command line in ``argv``:

    - ``-v`` / ``--version`` as the only argument prints the package version;
    - a missing or unknown command prints a usage overview to standard error;
    - ``<command> -h`` / ``<command> --help`` replaces the current process
      with ``man discodop-<command>``;
    - otherwise the command is run, using a function of that name defined in
      this module when there is one, and the ``main`` function of the module
      ``discodop.<command>`` when there is not.
    """
    from os import execlp
    from os.path import basename
    thiscmd = basename(argv[0])
    if len(argv) == 2 and argv[1] in ('-v', '--version'):
        from discodop import __version__
        print(__version__)
    elif len(argv) <= 1 or argv[1] not in COMMANDS:
        print('Usage: %s <command> [arguments]\n' % thiscmd, file=stderr)
        print('Command is one of:', file=stderr)
        for name, description in COMMANDS.items():
            # The whole usage message goes to stderr, so that redirecting
            # stdout does not split the command list from its header/footer.
            print(' %s %s' % (name.ljust(15), description), file=stderr)
        print('for additional instructions issue: %s <command> --help'
              % thiscmd, file=stderr)
    elif len(argv) == 3 and argv[2] in ('-h', '--help'):
        # help on subcommand
        execlp('man', 'man', 'discodop-%s' % argv[1])
    else:
        cmd = argv[1]
        # use the CLI defined here, or default to the module's main function.
        if cmd in globals():
            globals()[cmd]()
        else:
            getattr(__import__('discodop.%s' % cmd,
                               fromlist=['main']), 'main')()
def treedraw():
    """Usage: discodop treedraw [<treebank>...] [options]
If no treebank is given, input is read from standard input; format is detected.
Pipe the output through 'less -R' to preserve the colors."""
    # NB: the docstring above doubles as the usage message that is printed
    # on request or on option errors; implementation notes go in comments.
    #
    # Options are read from argv[2:]. With treebank arguments and a --fmt
    # other than 'auto', the treebanks are read with the matching reader and
    # drawn in lockstep (one tree of each file per step); otherwise the
    # input (files or standard input) is scanned incrementally for trees.
    from getopt import gnu_getopt, GetoptError
    from itertools import islice, chain
    from .treebank import READERS, incrementaltreereader
    from .tree import DrawTree, frontier
    from .util import openread
    try:
        from itertools import izip
    except ImportError:
        izip = zip

    latexformats = ('tikznode', 'tikzmatrix', 'tikzqtree')

    def processtree(tree, sent):
        """Produce the output for a single tree in the selected format."""
        dt = DrawTree(
            tree, sent, abbr='--abbr' in opts, secedge='--secedge' in opts)
        if output == 'text' or output == 'html':
            return dt.text(unicodelines=True, ansi=ansi, html=html,
                           funcsep=funcsep)
        elif output == 'svg':
            return dt.svg(funcsep=funcsep)
        elif output == 'tikznode':
            return dt.tikznode(funcsep=funcsep) + '\n'
        elif output == 'tikzmatrix':
            return dt.tikzmatrix(funcsep=funcsep) + '\n'
        elif output == 'tikzqtree':
            return dt.tikzqtree() + '\n'
        raise ValueError('unrecognized --output format')

    flags = ('test', 'help', 'abbr', 'plain', 'frontier', 'secedge')
    options = ('fmt=', 'encoding=', 'functions=', 'morphology=', 'numtrees=',
               'output=')
    try:
        opts, args = gnu_getopt(argv[2:], 'hn:fm', flags + options)
    except GetoptError as err:
        print('error:', err, file=stderr)
        print(treedraw.__doc__)
        sysexit(2)
    opts = dict(opts)
    if '-h' in opts or '--help' in opts:
        # These flags are accepted by the option parser, so honor them
        # instead of silently ignoring the request.
        print(treedraw.__doc__)
        return
    limit = opts.get('--numtrees', opts.get('-n'))
    limit = int(limit) if limit else None
    output = opts.get('--output', 'text')
    if output not in ('text', 'html', 'svg') + latexformats:
        # Fail before any input is read or output is written, rather than
        # at the first tree (or not at all when the input has no trees).
        raise ValueError('unrecognized --output format')
    if '-f' in opts:
        opts['--functions'] = 'add'
    if '-m' in opts:
        opts['--morphology'] = 'add'
    funcsep = ('-' if opts.get('--functions')
               in ('add', 'between') else None)
    ansi = output == 'text' and '--plain' not in opts
    html = output == 'html' and '--plain' not in opts
    if output in ('html', 'svg'):
        print(DrawTree.templates[output][0])  # preamble
    elif output in latexformats:
        print(DrawTree.templates['latex'][0])  # preamble
    if args and opts.get('--fmt', 'export') != 'auto':
        reader = READERS[opts.get('--fmt', 'export')]
        corpora = []
        for path in args:
            corpus = reader(
                path,
                encoding=opts.get('--encoding', 'utf8'),
                functions=opts.get('--functions'),
                morphology=opts.get('--morphology'))
            corpora.append(iter(corpus.itertrees()))
        # Walk all treebanks in parallel; stops at the shortest one.
        for items in islice(izip(*corpora), 0, limit):
            for arg, (sentid, item) in zip(args, items):
                if len(args) > 1:
                    print(arg, end=':')
                print(sentid, end='. ')
                if '--frontier' in opts:
                    print(frontier(item.tree, item.sent), item.comment)
                else:
                    print(item.comment)
                    print(processtree(item.tree, item.sent))
    else:  # read from stdin + detect format
        encoding = opts.get('--encoding', 'utf8')
        if not args:
            args = ['-']
        stream = chain.from_iterable(
            openread(fname, encoding=encoding)
            for fname in args)
        trees = islice(incrementaltreereader(stream,
                                             morphology=opts.get('--morphology'),
                                             functions=opts.get('--functions'),
                                             othertext=True),
                       0, limit)
        try:
            n = 1
            for tree, sent, rest in trees:
                if tree is None:
                    # an item without a tree; echo its text unchanged.
                    print(rest)
                    continue
                print(n, end='. ')
                if '--frontier' in opts:
                    print('%s %s' % (frontier(tree, sent), rest))
                else:
                    print(rest or '')
                    print(processtree(tree, sent))
                n += 1
        except (IOError, KeyboardInterrupt):
            # Deliberately quiet: e.g. the reader of a pipe went away or the
            # user interrupted; still emit the postamble below.
            pass
    if output in ('html', 'svg'):
        print(DrawTree.templates[output][1])  # postamble
    elif output in latexformats:
        print(DrawTree.templates['latex'][1])  # postamble
def runexp(args=None):
    """Usage: discodop runexp <parameter file> [--rerun]
If a parameter file is given, an experiment is run. See the file sample.prm for
an example parameter file. To repeat an experiment with an existing grammar,
pass the option --rerun. The directory with the name of the parameter file
without extension must exist in the current path; its results will be
overwritten."""
    # NB: the docstring above is printed verbatim as the usage message.
    #
    # args: optional sequence of command line arguments; when None, the
    # arguments following the subcommand (argv[2:]) are used. Exits with
    # status 2 after printing the usage message on incorrect arguments.
    import io
    import os
    from .parser import readparam
    from .runexp import startexp, parsetepacoc
    # Always work on a private copy: options are removed from the list
    # below, which must not modify a list owned by the caller.
    args = argv[2:] if args is None else list(args)
    if '--tepacoc' in args:
        args.remove('--tepacoc')
        if args:
            print('error: incorrect arguments', file=stderr)
            print(runexp.__doc__)
            sysexit(2)
        parsetepacoc()
        return
    rerun = '--rerun' in args
    if rerun:
        args.remove('--rerun')
    if len(args) != 1:
        print('error: incorrect arguments %r' % args, file=stderr)
        print(runexp.__doc__)
        sysexit(2)
    paramfile = args[0]
    params = readparam(paramfile)
    # Strip only a real file extension; splitting on the last '.' anywhere
    # in the path gives a wrong directory for e.g. './sample' or 'v1.2/exp'.
    resultdir = os.path.splitext(paramfile)[0]
    top = startexp(params, resultdir=resultdir, rerun=rerun)
    if not rerun:  # copy parameter file to result dir
        with io.open(paramfile, encoding='utf8') as inp:
            paramlines = inp.readlines()
        # Replace an existing leading "top=" line with the value returned
        # by startexp; guard against an empty file.
        if paramlines and paramlines[0].startswith("top='"):
            paramlines = paramlines[1:]
        outfile = os.path.join(resultdir, 'params.prm')
        with io.open(outfile, 'w', encoding='utf8') as out:
            out.write("top='%s',\n" % top)
            out.writelines(paramlines)
def treetransforms():
    """Treebank binarization and conversion.
Usage: discodop treetransforms [input [output]] [options]
where input and output are treebanks; standard in/output is used if not given.
"""
    # NB: the docstring above is printed verbatim as the usage message.
    #
    # Steps: parse options from argv[2:], open the input treebank, select
    # the requested transformations in command line order, apply them, and
    # write the result in the requested output format. A summary line with
    # the number of trees written is printed to standard error.
    import io
    from getopt import gnu_getopt, GetoptError
    from itertools import islice
    from . import treebank, treebanktransforms
    from .treetransforms import (canonicalize, binarize,
                                 unbinarize, optimalbinarize, splitdiscnodes,
                                 mergediscnodes, raisediscnodes,
                                 introducepreterminals, markovthreshold,
                                 canonicallyorderedtree)
    flags = ('binarize optimalbinarize unbinarize splitdisc mergedisc '
             'raisedisc canonical introducepreterminals renumber sentid '
             'removeempty help markorigin markhead leftunary rightunary '
             'tailmarker direction dot').split()
    options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
               'punct= headrules= functions= morphology= lemmas= factor= fmt= '
               'markorigin= maxlen= enc= transforms= markovthreshold= labelfun= '
               'transforms= reversetransforms= filterlabels= ').split()
    try:
        origopts, args = gnu_getopt(argv[2:], 'h:v:H:', flags + options)
        if len(args) > 2:
            raise GetoptError('expected 0, 1, or 2 positional arguments')
    except GetoptError as err:
        print('error:', err, file=stderr)
        print(treetransforms.__doc__)
        sysexit(2)
    # origopts keeps the order of the options (used to order the
    # transformations); opts gives convenient lookup by option name.
    opts = dict(origopts)
    if '--fmt' in opts:
        opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
    if '--enc' in opts:
        opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
    if opts.get('--outputfmt', treebank.WRITERS[0]) not in treebank.WRITERS:
        print('error: unrecognized output format: %r\navailable formats: %s'
              % (opts.get('--outputfmt'), ' '.join(treebank.WRITERS)),
              file=stderr)
        sysexit(2)
    infilename = (args[0] if len(args) >= 1 else '-')
    # Without an output file (or with '-'), write to the file descriptor of
    # standard output so that the requested encoding can be applied.
    outfilename = (args[1] if len(args) == 2 and args[1] != '-'
                   else stdout.fileno())
    # open corpus
    corpus = treebank.READERS[opts.get('--inputfmt', 'export')](
        infilename,
        encoding=opts.get('--inputenc', 'utf8'),
        headrules=opts.get('--headrules'),
        ensureroot=opts.get('--ensureroot'),
        removeempty='--removeempty' in opts,
        punct=opts.get('--punct'),
        functions=opts.get('--functions'),
        morphology=opts.get('--morphology'),
        lemmas=opts.get('--lemmas'))
    start, end = opts.get('--slice', ':').split(':')
    start, end = (int(start) if start else None), (int(end) if end else None)
    # FIXME: support negative indices
    trees = corpus.itertrees(start, end)
    if '--maxlen' in opts:
        maxlen = int(opts['--maxlen'])
        trees = ((key, item) for key, item in trees
                 if len(item.sent) <= maxlen)
    if '--renumber' in opts:
        trees = (('%8d' % n, item) for n, (_key, item) in enumerate(trees, 1))
    # select transformations
    # Each action maps (tree, sent) to a new (tree, sent) pair. Option
    # values that differ per occurrence are bound as lambda defaults.
    actions = []
    for key, value in origopts:  # pylint: disable=unused-variable
        if key == '--introducepreterminals':
            actions.append(lambda tree, sent:
                           (introducepreterminals(tree, sent), sent))
        elif key == '--transforms':
            actions.append(lambda tree, sent, value=value:
                           (treebanktransforms.transform(tree, sent,
                            treebanktransforms.expandpresets(
                                value.split(','))),
                            sent))
        elif key == '--binarize':
            # NOTE(security): the value of --labelfun is passed to eval();
            # only run this with command lines from a trusted source.
            actions.append(lambda tree, sent:
                           (binarize(
                               tree,
                               opts.get('--factor', 'right'),
                               int(opts.get('-h', 999)),
                               int(opts.get('-v', 1)),
                               revhorzmarkov=int(opts.get('-H', 0)),
                               leftmostunary='--leftunary' in opts,
                               rightmostunary='--rightunary' in opts,
                               tailmarker='$' if '--tailmarker' in opts
                               else '',
                               direction='--direction' in opts,
                               headoutward='--headrules' in opts,
                               markhead='--markhead' in opts,
                               dot='--dot' in opts,
                               filterlabels=tuple(opts.get(
                                   '--filterlabels', '').split()),
                               labelfun=eval(  # pylint: disable=eval-used
                                   opts['--labelfun'])
                               if '--labelfun' in opts else None),
                            sent))
        elif key == '--optimalbinarize':
            actions.append(lambda tree, sent:
                           (optimalbinarize(
                               tree, '|',
                               '--headrules' in opts,
                               int(opts.get('-h', 999)),
                               int(opts.get('-v', 1))),
                            sent))
        elif key == '--splitdisc':
            actions.append(lambda tree, sent:
                           (splitdiscnodes(tree, '--markorigin' in opts),
                            sent))
        elif key == '--canonical':
            actions.append(lambda tree, sent:
                           (canonicallyorderedtree(tree, sent), sent))
        elif key == '--mergedisc':
            actions.append(lambda tree, sent: (mergediscnodes(tree), sent))
        elif key == '--raisedisc':
            actions.append(lambda tree, sent: (raisediscnodes(tree), sent))
        elif key == '--unbinarize':
            actions.append(lambda tree, sent: (unbinarize(tree, sent), sent))
        elif key == '--reversetransforms':
            actions.append(lambda tree, sent, value=value:
                           (treebanktransforms.reversetransform(tree, sent,
                            treebanktransforms.expandpresets(
                                value.split(','))),
                            sent))
    # read, transform, & write trees
    if actions:
        def applytransforms(trees):
            """Apply transforms and yield modified items."""
            for key, item in trees:
                for action in actions:
                    item.tree, item.sent = action(item.tree, item.sent)
                yield key, item

        trees = applytransforms(trees)
    # Option keys carry their leading dashes; testing for 'binarize'
    # without them could never succeed and disabled --markovthreshold.
    if '--binarize' in opts and '--markovthreshold' in opts:
        # needs all binarized trees at once, so materialize the generator.
        trees = list(trees)
        h, v = int(opts.get('-h', 999)), int(opts.get('-v', 1))
        revh = int(opts.get('-H', 0))
        markovthreshold([item.tree for _, item in trees],
                        int(opts['--markovthreshold']),
                        revh + h - 1,
                        v - 1 if v > 1 else 1)
    if opts.get('--outputfmt') in ('mst', 'conll'):
        if not opts.get('--headrules'):
            raise ValueError('need head rules for dependency conversion')
    cnt = 0
    # Look up the option under its real key ('--outputenc'); this also
    # covers --enc, which is copied to --inputenc/--outputenc above.
    encoding = opts.get('--outputenc', 'utf8')
    with io.open(outfilename, 'w', encoding=encoding) as outfile:
        # copy trees verbatim when only taking slice or converting encoding
        if (not actions
                and opts.get('--inputfmt') == opts.get('--outputfmt')
                and opts.get('--inputfmt') in (
                    'export', 'bracket', 'discbracket')
                and set(opts) <= {'--slice', '--inputenc', '--outputenc',
                                  '--inputfmt', '--outputfmt'}):
            for block in islice(corpus.blocks().values(), start, end):
                outfile.write(block)
                cnt += 1
        else:
            if opts.get('--outputfmt', 'export') == 'bracket':
                # canonicalize() is called for its effect on the tree; the
                # 'and' then selects the item itself as the tuple member.
                trees = ((key, canonicalize(item.tree) and item)
                         for key, item in trees)
            if opts.get('--outputfmt', 'export') == 'export':
                outfile.write(treebank.EXPORTHEADER)
            fmt = opts.get('--outputfmt', 'export')
            sentid = '--sentid' in opts
            for key, item in trees:
                outfile.write(treebank.writetree(item.tree, item.sent, key,
                                                 fmt, comment=item.comment,
                                                 sentid=sentid))
                cnt += 1
    print('%s: transformed %d trees' % (args[0] if args else 'stdin', cnt),
          file=stderr)
def grammar():
    """Read off grammars from treebanks.
Usage: discodop grammar <type> <input> <output> [options]
or: discodop grammar param <parameter-file> <output-directory>
or: discodop grammar info <rules-file>
or: discodop grammar merge (rules|lexicon|fragments) \
<input1> <input2>... <output>"""
    # NB: the docstring above is printed verbatim as the usage message.
    #
    # The first positional argument selects the mode: 'info' and 'merge'
    # operate on existing grammar files, 'param' builds all grammars of a
    # parameter file, and the remaining values name the kind of grammar to
    # read off from a treebank (or, for 'ptsg', from a file of fragments).
    # Raises ValueError for unrecognized models/estimators and for invalid
    # argument combinations detected after option parsing.
    import io
    import os
    import codecs
    import logging
    from gzip import open as gzipopen
    from getopt import gnu_getopt, GetoptError
    from .tree import STRTERMRE
    from .util import openread
    from .treebank import READERS
    from .treetransforms import addfanoutmarkers, canonicalize
    from .grammar import (treebankgrammar, dopreduction, doubledop, dop1,
                          compiletsg, writegrammar, grammarinfo, grammarstats,
                          splitweight, merge, sumfrags, sumrules, sumlex,
                          stripweight, addindices)
    from .parser import readparam
    from .runexp import (loadtraincorpus, getposmodel, dobinarization,
                         getgrammars)
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    shortoptions = 'hs:'
    options = ('help', 'gzip', 'packed', 'inputfmt=', 'inputenc=',
               'dopestimator=', 'maxdepth=', 'maxfrontier=', 'numproc=')
    try:
        opts, args = gnu_getopt(argv[2:], shortoptions, options)
        model = args[0]
        if model not in ('info', 'merge'):
            if len(args) != 3:
                raise ValueError('expected 2 arguments: treebank grammar')
            treebankfile = args[1]
            grammarfile = args[2]
    except (GetoptError, IndexError, ValueError) as err:
        print('error: %r' % err, file=stderr)
        print(grammar.__doc__)
        sysexit(2)
    opts = dict(opts)
    if model not in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop', 'dop1',
                     'ptsg', 'param', 'info', 'merge'):
        raise ValueError('unrecognized model: %r' % model)
    # Option keys carry their leading dashes; without them this check could
    # never reject a value (and its error message would raise KeyError).
    if opts.get('--dopestimator', 'rfe') not in ('rfe', 'ewe', 'shortest'):
        raise ValueError(
            'unrecognized estimator: %r' % opts['--dopestimator'])
    if model == 'info':
        grammarstats(args[1])
        return
    elif model == 'merge':
        if len(args) < 5:
            raise ValueError('need at least 2 input and 1 output arguments.')
        if args[1] == 'rules':
            merge(args[2:-1], args[-1], sumrules, stripweight)
        elif args[1] == 'lexicon':
            merge(args[2:-1], args[-1], sumlex,
                  lambda x: x.split(None, 1)[0])
        elif args[1] == 'fragments':
            merge(args[2:-1], args[-1], sumfrags,
                  lambda x: x.rsplit('\t', 1)[0])
        return
    elif model == 'param':
        if opts:
            raise ValueError('all options should be set in parameter file.')
        prm = readparam(args[1])
        resultdir = args[2]
        if os.path.exists(resultdir):
            raise ValueError('Directory %r already exists.\n' % resultdir)
        os.mkdir(resultdir)
        trees, sents, train_tagged_sents = loadtraincorpus(
            prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
            prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
            prm.transformations, prm.relationalrealizational, resultdir)
        # Default to no lexical model, so that the name is also bound when
        # POS tagging is configured with a method other than 'unknownword'.
        lexmodel = None
        if prm.postagging and prm.postagging.method == 'unknownword':
            sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
    elif model == 'ptsg':  # read fragments
        xfragments = {frag: splitweight(weight) for frag, weight
                      in (line.split('\t') for line in openread(
                          treebankfile,
                          encoding=opts.get('--inputenc', 'utf8')))}
        if STRTERMRE.search(next(iter(xfragments))) is not None:
            xfragments = {addindices(frag): splitweight(weight)
                          for frag, weight in xfragments.items()}
    else:  # read treebank
        corpus = READERS[opts.get('--inputfmt', 'export')](
            treebankfile,
            encoding=opts.get('--inputenc', 'utf8'))
        trees = list(corpus.trees().values())
        sents = list(corpus.sents().values())
        if not trees:
            raise ValueError('no trees; is --inputfmt correct?')
        for a in trees:
            canonicalize(a)
            addfanoutmarkers(a)
    # read off grammar
    if model in ('pcfg', 'plcfrs'):
        xgrammar = treebankgrammar(trees, sents)
    elif model == 'dopreduction':
        xgrammar, altweights = dopreduction(trees, sents,
                                            packedgraph='--packed' in opts)
    elif model == 'doubledop':
        xgrammar, backtransform, altweights, _ = doubledop(
            trees, sents,
            numproc=int(opts.get('--numproc', 1)))
    elif model == 'dop1':
        xgrammar, backtransform, altweights, _ = dop1(
            trees, sents,
            maxdepth=int(opts.get('--maxdepth', 3)),
            maxfrontier=int(opts.get('--maxfrontier', 999)))
    elif model == 'ptsg':
        xgrammar, backtransform, altweights = compiletsg(xfragments)
    elif model == 'param':
        getgrammars(dobinarization(trees, sents, prm.binarization,
                                   prm.relationalrealizational),
                    sents, prm.stages, prm.testcorpus.maxwords, resultdir,
                    prm.numproc, lexmodel, trees[0].label)
        paramfile = os.path.join(resultdir, 'params.prm')
        with openread(args[1]) as inp:
            with io.open(paramfile, 'w', encoding='utf8') as out:
                out.write("top='%s',\n%s" % (trees[0].label, inp.read()))
        return  # grammars have already been written
    if opts.get('--dopestimator', 'rfe') != 'rfe':
        # swap in the weights of the requested alternative estimator.
        xgrammar = [(rule, w) for (rule, _), w in
                    zip(xgrammar, altweights[opts['--dopestimator']])]
    rulesname = grammarfile + '.rules'
    lexiconname = grammarfile + '.lex'
    myopen = open
    if '--gzip' in opts:
        myopen = gzipopen
        rulesname += '.gz'
        lexiconname += '.gz'
    bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket'
    if model == 'ptsg':
        bitpar = STRTERMRE.search(next(iter(xfragments))) is not None
    rules, lexicon = writegrammar(xgrammar, bitpar=bitpar)
    # write output
    with codecs.getwriter('utf8')(myopen(rulesname, 'wb')) as rulesfile:
        rulesfile.write(rules)
    with codecs.getwriter('utf8')(myopen(lexiconname, 'wb')) as lexiconfile:
        lexiconfile.write(lexicon)
    # NOTE(review): 'dop1' also returns a backtransform above, but it is
    # not written here -- confirm whether that is intended.
    if model in ('doubledop', 'ptsg'):
        backtransformfile = '%s.backtransform%s' % (
            grammarfile, '.gz' if '--gzip' in opts else '')
        with codecs.getwriter('utf8')(
                myopen(backtransformfile, 'wb')) as bt:
            bt.writelines('%s\n' % a for a in backtransform)
        print('wrote backtransform to', backtransformfile)
    print('wrote grammar to %s and %s.' % (rulesname, lexiconname))
    start = opts.get('-s', next(iter(xgrammar))[0][0][0]
                     if model == 'ptsg' else trees[0].label)
    if version_info[0] == 2:
        start = start.decode('utf8')
    if len(xgrammar) < 10000:  # this is very slow so skip with large grammars
        print(grammarinfo(xgrammar))
    try:
        from .containers import Grammar
        print(Grammar(rulesname, lexiconname, start=start).testgrammar()[1])
    except (ImportError, AssertionError) as err:
        print(err)
# Run the command dispatcher when this file is executed as a script.
if __name__ == "__main__":
    main()

# Public names of this module.
__all__ = ['treedraw', 'runexp', 'treetransforms', 'grammar', 'main']