"""Run an experiment given a parameter file.
Does grammar extraction, parsing, and evaluation."""
import io
import os
import re
import csv
import sys
import json
import gzip
import codecs
import logging
import multiprocessing
from math import log
from time import process_time
from collections import defaultdict, Counter, OrderedDict
import pickle
from itertools import zip_longest # pylint: disable=E0611
import numpy as np
from . import eval as evalmod
from . import (__version__, treebank, treebanktransforms, treetransforms,
grammar, lexicon, parser, estimates)
from .treetransforms import binarizetree
from .util import workerfunc
from .containers import Grammar
# Parameter object read by worker(); assigned through initworker().
INTERNALPARAMS = None
def initworker(params):
    """Store ``params`` in the module-level ``INTERNALPARAMS`` variable.

    :param params: the parameter object that ``worker`` will read."""
    # The object is kept in a global so that a forked process receives it
    # through inheritance from its parent, instead of through serialization.
    global INTERNALPARAMS
    INTERNALPARAMS = params
def startexp(
        prm,  # A DictObj with the structure of parser.DEFAULTS
        resultdir='results',
        rerun=False):
    """Execute an experiment.

    Loads the training and test corpora, extracts the grammars (or, with
    ``rerun``, reads the ones stored in ``resultdir``), parses the test set
    and logs an evaluation summary for every stage.

    :param prm: parameter object with the structure of ``parser.DEFAULTS``.
    :param resultdir: directory for the log, grammars and results. It is
        created here, unless ``rerun`` is given, in which case it must
        already exist.
    :param rerun: if True, parse with the grammar(s) of a previous
        experiment found in ``resultdir`` instead of extracting new ones.
    :returns: the root label shared by the training and test trees.
    :raises ValueError: if ``resultdir`` exists (or, with ``rerun``, does
        not exist), the verbosity is out of range, the test selection is
        empty, the root label is not unique, or ``rerun`` is combined with
        settings that need the training corpus."""
    if rerun:
        if not os.path.exists(resultdir):
            raise ValueError('Directory %r does not exist.\n--rerun requires a'
                    ' directory with the grammar(s) of a previous experiment.'
                    % resultdir)
    else:
        if os.path.exists(resultdir):
            raise ValueError('Directory %r exists.\n'
                    'Use --rerun to parse with existing grammar '
                    'and overwrite previous results.' % resultdir)
        os.mkdir(resultdir)

    # Log everything, and send it to stderr, in a format with just the message.
    formatstr = '%(message)s'
    if prm.verbosity == 0:
        logging.basicConfig(level=logging.WARNING, format=formatstr)
    elif prm.verbosity == 1:
        logging.basicConfig(level=logging.INFO, format=formatstr)
    elif prm.verbosity == 2:
        logging.basicConfig(level=logging.DEBUG, format=formatstr)
    elif 3 <= prm.verbosity <= 4:
        logging.basicConfig(level=5, format=formatstr)
    else:
        raise ValueError('verbosity should be >= 0 and <= 4. ')

    # also log to a file
    fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
    fileobj.setLevel(logging.DEBUG)
    fileobj.setFormatter(logging.Formatter(formatstr))
    logging.getLogger('').addHandler(fileobj)
    logging.info('Disco-DOP %s, running on Python %s',
            __version__, sys.version.split()[0])
    logging.info('Parameter file: %r', resultdir + '.prm')

    if not rerun:
        trees, sents, train_tagged_sents = loadtraincorpus(
                prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
                prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
                prm.transformations, prm.relationalrealizational, resultdir)
    elif isinstance(prm.traincorpus.numsents, float):
        raise ValueError('need to specify number of training set sentences, '
                'not fraction, in rerun mode.')

    testsettb = treebank.READERS[prm.corpusfmt](
            prm.testcorpus.path, encoding=prm.testcorpus.encoding,
            headrules=prm.binarization.headrules,
            removeempty=prm.removeempty, morphology=prm.morphology,
            functions=prm.functions, ensureroot=prm.ensureroot)
    if isinstance(prm.testcorpus.numsents, float):
        prm.testcorpus.numsents = int(prm.testcorpus.numsents
                * len(testsettb.blocks()))
    if prm.testcorpus.skiptrain:
        prm.testcorpus.skip += (  # pylint: disable=maybe-no-member
                prm.traincorpus.numsents)  # pylint: disable=maybe-no-member

    test_blocks = OrderedDict()
    test_trees = OrderedDict()
    test_tagged_sents = OrderedDict()
    for n, item in testsettb.itertrees(
            prm.testcorpus.skip,
            prm.testcorpus.skip  # pylint: disable=no-member
            + prm.testcorpus.numsents):
        if 1 <= len(item.sent) <= prm.testcorpus.maxwords:
            test_blocks[n] = item.block
            test_trees[n] = item.tree
            test_tagged_sents[n] = [(word, tag) for word, (_, tag)
                    in zip(item.sent, sorted(item.tree.pos()))]
    logging.info('%d test sentences after length restriction <= %d',
            len(test_trees), prm.testcorpus.maxwords)

    lexmodel = None
    test_tagged_sents_mangled = test_tagged_sents
    if prm.postagging and prm.postagging.method in (
            'treetagger', 'stanford', 'frog'):
        if prm.postagging.method == 'treetagger':
            # these two tags are never given by tree-tagger,
            # so collect words whose tag needs to be overridden
            overridetags = ('PTKANT', 'PIDAT')
        elif prm.postagging.method == 'stanford':
            overridetags = ('PTKANT', )
        elif prm.postagging.method == 'frog':
            overridetags = ()
        taglex = defaultdict(set)
        if overridetags:
            # The override lexicon is collected from the training corpus,
            # which is not loaded in rerun mode; fail with a clear message
            # instead of an UnboundLocalError on train_tagged_sents.
            if rerun:
                raise ValueError('cannot use --rerun with the %r tagger: '
                        'its tag overrides require the training corpus.'
                        % prm.postagging.method)
            for sent in train_tagged_sents:
                for word, tag in sent:
                    taglex[word].add(tag)
        overridetagdict = {tag:
                {word for word, tags in taglex.items() if tags == {tag}}
                for tag in overridetags}
        tagmap = {'$(': '$[', 'PAV': 'PROAV'}
        test_tagged_sents_mangled = lexicon.externaltagging(
                prm.postagging.method, prm.postagging.model, test_tagged_sents,
                overridetagdict, tagmap)
        if prm.postagging.retag and not rerun:
            logging.info('re-tagging training corpus')
            sents_to_tag = OrderedDict(enumerate(train_tagged_sents))
            train_tagged_sents = lexicon.externaltagging(
                    prm.postagging.method, prm.postagging.model,
                    sents_to_tag, overridetagdict, tagmap).values()
            for tree, tagged in zip(trees, train_tagged_sents):
                for node in tree.subtrees(
                        lambda n: len(n) == 1 and isinstance(n[0], int)):
                    node.label = tagged[node[0]][1]
        usetags = True  # give these tags to parser
    elif prm.postagging and prm.postagging.method == 'unknownword':
        if not rerun:
            sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
            with open(resultdir + '/closedclasswords.txt', 'w') as out:
                out.writelines(w + '\n' for w in lexmodel[3])
        usetags = False  # make sure gold POS tags are not given to parser
    else:
        usetags = True  # give gold POS tags to parser

    # 0: test sentences as they should be handed to the parser,
    # 1: gold trees for evaluation purposes
    # 2: gold sents because test sentences may be mangled by unknown word model
    # 3: blocks from treebank file to reproduce the relevant part of the
    #   original treebank verbatim.
    testset = OrderedDict((n, (
            test_tagged_sents_mangled[n],
            test_trees[n],
            test_tagged_sents[n],
            block))
            for n, block in test_blocks.items())
    if not test_tagged_sents:
        raise ValueError('test corpus (selection) should be non-empty.')

    if rerun:
        trees, sents = [], []
    roots = {t.label for t in trees} | {test_trees[n].label for n in testset}
    if len(roots) != 1:
        raise ValueError('expected unique ROOT label: %r' % roots)
    top = roots.pop()
    funcclassifier = None

    if rerun:
        parser.readgrammars(resultdir, prm.stages, prm.postagging,
                prm.transformations, top)
        if prm.predictfunctions:
            import joblib
            funcclassifier = joblib.load(
                    '%s/funcclassifier.pickle' % resultdir)
    else:
        logging.info('read training & test corpus')
        if prm.predictfunctions:
            import joblib
            from . import functiontags
            logging.info('training function tag classifier')
            funcclassifier, msg = functiontags.trainfunctionclassifier(
                    trees, sents, prm.numproc)
            joblib.dump(funcclassifier, '%s/funcclassifier.pickle' % resultdir,
                    compress=3)
            logging.info(msg)
        getgrammars(dobinarization(trees, sents, prm.binarization,
                    prm.relationalrealizational),
                sents, prm.stages, prm.testcorpus.maxwords, resultdir,
                prm.numproc, lexmodel, top)

    evalparam = evalmod.readparam(prm.evalparam)
    evalparam['DEBUG'] = -1
    evalparam['CUTOFF_LEN'] = 40
    deletelabel = evalparam.get('DELETE_LABEL', ())
    deleteword = evalparam.get('DELETE_WORD', ())

    begin = process_time()
    theparser = parser.Parser(prm, funcclassifier=funcclassifier)
    results = doparsing(parser=theparser, testset=testset, resultdir=resultdir,
            usetags=usetags, numproc=prm.numproc, deletelabel=deletelabel,
            deleteword=deleteword, corpusfmt=prm.corpusfmt,
            morphology=prm.morphology, evalparam=evalparam)
    if prm.numproc == 1:
        logging.info(
                'time elapsed during parsing: %gs', process_time() - begin)

    # The same for every stage: the width of the summary depends on whether
    # any test sentence is longer than the cutoff length.
    overcutoff = any(len(a) > evalparam['CUTOFF_LEN']
            for a in test_tagged_sents.values())
    for result in results:
        nsent = len(result.parsetrees)
        header = (' ' + result.name.upper() + ' ').center(
                44 if overcutoff else 35, '=')
        evalsummary = result.evaluator.summary()
        coverage = 'coverage: %s = %6.2f' % (
                ('%d / %d' % (nsent - result.noparse, nsent)).rjust(
                    25 if overcutoff else 14),
                100.0 * (nsent - result.noparse) / nsent)
        logging.info('\n'.join(('', header, evalsummary, coverage)))
    return top
def loadtraincorpus(corpusfmt, traincorpus, binarization, punct, functions,
        morphology, removeempty, ensureroot, transformations,
        relationalrealizational, resultdir):
    """Load the training corpus.

    Reads the treebank with the reader for ``corpusfmt``, selects the
    requested range of sentences within the length limit, and applies the
    optional treebank transformations.

    :returns: a tuple ``(trees, sents, train_tagged_sents)``, where the last
        element has a list of ``(word, tag)`` pairs for each sentence.
    :raises ValueError: if no training sentences are selected."""
    reader = treebank.READERS[corpusfmt]
    train = reader(traincorpus.path,
            encoding=traincorpus.encoding, headrules=binarization.headrules,
            removeempty=removeempty, ensureroot=ensureroot, punct=punct,
            functions=functions, morphology=morphology)
    # a float is a fraction of the corpus; store it as an absolute count.
    if isinstance(traincorpus.numsents, float):
        traincorpus.numsents = int(traincorpus.numsents * len(train.sents()))

    selection = train.itertrees(
            traincorpus.skip, traincorpus.skip + traincorpus.numsents)
    trainset = []
    for _, item in selection:
        if 1 <= len(item.sent) <= traincorpus.maxwords:
            trainset.append(item)
    trees = [item.tree for item in trainset]
    sents = [item.sent for item in trainset]
    logging.info('%d training sentences after length restriction <= %d',
            len(trees), traincorpus.maxwords)
    if not trees:
        raise ValueError('training corpus (selection) should be non-empty.')

    if transformations:
        if 'ftbundocompounds' in transformations:
            treebanktransforms.getftbcompounds(
                    trees, sents, resultdir + '/compounds.txt')
        # only keep sentences that are non-empty after the transformations.
        kept = []
        for tree, sent in zip(trees, sents):
            treebanktransforms.transform(tree, sent, transformations)
            if sent:
                kept.append((tree, sent))
        trees = [tree for tree, _ in kept]
        sents = [sent for _, sent in kept]

    if relationalrealizational:
        trees = [treebanktransforms.rrtransform(
                tree, **relationalrealizational)[0] for tree in trees]

    train_tagged_sents = []
    for tree, sent in zip(trees, sents):
        tagged = []
        for word, (_, tag) in zip(sent, sorted(tree.pos())):
            tagged.append((word, tag))
        train_tagged_sents.append(tagged)
    return trees, sents, train_tagged_sents
def getposmodel(postagging, train_tagged_sents):
    """Apply unknown word model to sentences before extracting grammar.

    Updates ``postagging`` in place with the unknown word function and the
    signatures, lexicon, and closed class words of the model.

    :returns: a tuple ``(sents, lexmodel)`` with the training sentences in
        which rare words are replaced, and the unknown word model."""
    wordfun = lexicon.UNKNOWNWORDFUNC[postagging.model]
    postagging.update(unknownwordfun=wordfun)
    # get smoothed probabilities for lexical productions
    lexmodel, msg = lexicon.getunknownwordmodel(
            train_tagged_sents,
            postagging.unknownwordfun,
            postagging.unknownthreshold,
            postagging.openclassthreshold)
    logging.info(msg)
    # NB: commonwords is the subset of words that are above the frequency
    # threshold. for training purposes we work with the subset, at test time
    # we exploit the full set of known words from the training set.
    sigs, allwords, commonwords, closedclasswords = lexmodel[:4]
    postagging.update(
            sigs=sigs,
            lexicon=allwords,
            closedclasswords=closedclasswords)
    # replace rare train words with signatures
    replaced = lexicon.replaceraretrainwords(
            train_tagged_sents, postagging.unknownwordfun, commonwords)
    return replaced, lexmodel
def dobinarization(trees, sents, binarization, relationalrealizational,
        logmsg=True):
    """Apply binarization to treebank.

    :param trees: the trees to binarize.
    :param sents: the corresponding sentences; only used for logging.
    :param binarization: object with the binarization settings.
    :param relationalrealizational: passed on to ``binarizetree``.
    :param logmsg: whether to log fan-out and timing information.
    :returns: a list of trees with fan-out markers added."""
    # fixme: this n should correspond to sentence id
    tbfanout, n = treetransforms.treebankfanout(trees)
    if logmsg:
        logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
                tbfanout, n, trees[n], ' '.join(sents[n]))

    # binarization
    begin = process_time()
    method = binarization.method
    msg = 'binarization: %s' % method
    if binarization.fanout_marks_before_bin:
        trees = list(map(treetransforms.addfanoutmarkers, trees))
    if method == 'default':
        tail = 'tailmarker' if binarization.tailmarker else ''
        msg += ' %s h=%d v=%d %s' % (
                binarization.factor, binarization.h, binarization.v, tail)
    elif method == 'optimalhead':
        msg += ' h=%d v=%d' % (binarization.h, binarization.v)
    if method is not None:
        binarized = []
        for tree in trees:
            binarized.append(
                    binarizetree(tree, binarization, relationalrealizational))
        trees = binarized
    if binarization.markovthreshold:
        msg1 = treetransforms.markovthreshold(
                trees,
                binarization.markovthreshold,
                binarization.h + binarization.revh - 1,
                max(binarization.v - 1, 1))
        if logmsg:
            logging.info(msg1)
    trees = list(map(treetransforms.addfanoutmarkers, trees))
    if logmsg:
        logging.info(
                '%s; cpu time elapsed: %gs', msg, process_time() - begin)
    return trees
def getgrammars(trees, sents, stages, testmaxwords, resultdir,
        numproc, lexmodel, top):
    """Read off the requested grammars.

    For each stage, a grammar is extracted from ``trees`` and ``sents``
    according to the settings of the stage, written to ``resultdir``, and
    stored on the stage object together with any outside estimates.

    :param trees: the (binarized) training trees.
    :param sents: the corresponding training sentences.
    :param stages: a sequence of stage objects; each is updated in place
        with ``mapping``, ``grammar`` and ``outside`` attributes.
    :param testmaxwords: maximum sentence length, passed to the functions
        that compute outside estimates.
    :param resultdir: directory in which the grammar files are written.
    :param numproc: number of processes, passed to fragment extraction.
    :param lexmodel: an unknown word model, or a false value for none.
    :param top: the start symbol of the grammars.
    :raises ValueError: for a PCFG stage on a treebank with discontinuities
        that is not split, an unrecognized DOP model, or invalid settings
        for the estimates."""
    # the sentence index gets its own name, because n is the stage index
    # in the loop below.
    tbfanout, fanoutn = treetransforms.treebankfanout(trees)
    logging.info('binarized treebank fan-out: %d #%d', tbfanout, fanoutn)
    mappings = [None for _ in stages]
    for n, stage in enumerate(stages):
        traintrees = trees
        stage.mapping = None
        prevn = 0
        if n and stage.prune:
            prevn = [a.name for a in stages].index(stage.prune)
        if stage.split:
            traintrees = [treetransforms.binarize(
                    treetransforms.splitdiscnodes(
                        tree.copy(True),
                        stage.markorigin),
                    childchar=':', dot=True, ids=grammar.UniqueIDs())
                    for tree in traintrees]
            logging.info('splitted discontinuous nodes')
        if stage.collapse:
            traintrees, mappings[n] = treebanktransforms.collapselabels(
                    [tree.copy(True) for tree in traintrees],
                    tbmapping=treebanktransforms.MAPPINGS[
                        stage.collapse[0]][stage.collapse[1]])
            logging.info('collapsed phrase labels for multilevel '
                    'coarse-to-fine parsing to %s level %d',
                    *stage.collapse)
        if n and mappings[prevn] is not None:
            # Given original labels A, convert CTF mapping1 A => C,
            # and mapping2 A => B to a mapping B => C.
            mapping1, mapping2 = mappings[prevn], mappings[n]
            if mappings[n] is None:
                stage.mapping = {a: mapping1[a] for a in mapping1}
            else:
                stage.mapping = {mapping2[a]: mapping1[a] for a in mapping2}

        if stage.mode.startswith('pcfg'):
            if tbfanout != 1 and not stage.split:
                raise ValueError('Cannot extract PCFG from treebank '
                        'with discontinuities.')
        backtransform = extrarules = None
        if lexmodel:
            extrarules = lexicon.simplesmoothlexicon(lexmodel)

        if stage.mode == 'mc-rerank':
            from . import _fragments
            gram = parser.DictObj(_fragments.getctrees(zip(trees, sents)))
            tree = gram.trees1.extract(0, gram.vocab)
            gram.start = tree[:tree.index(' ')].lstrip('(')
            with gzip.open('%s/%s.train.pickle.gz' % (resultdir, stage.name),
                    'wb', compresslevel=1) as out:
                out.write(pickle.dumps(gram, protocol=-1))
        elif stage.dop:
            # xgrammar stays None for a model that returns the rules and
            # lexicon directly (the 'ostag' branch).
            rules = lex = xgrammar = None
            if stage.dop in ('doubledop', 'dop1'):
                if stage.dop == 'doubledop':
                    (xgrammar, backtransform,
                            altweights, fragments) = grammar.doubledop(
                            traintrees, sents,
                            numproc=numproc, maxdepth=stage.maxdepth,
                            maxfrontier=stage.maxfrontier,
                            extrarules=extrarules)
                elif stage.dop == 'dop1':
                    (xgrammar, backtransform,
                            altweights, fragments) = grammar.dop1(
                            traintrees, sents, maxdepth=stage.maxdepth,
                            maxfrontier=stage.maxfrontier,
                            extrarules=extrarules)
                # dump fragments
                with codecs.getwriter('utf8')(gzip.open(
                        '%s/%s.fragments.gz' % (resultdir, stage.name), 'wb',
                        compresslevel=1)) as out:
                    out.writelines('%s\t%d\n' % (a, len(b))
                            for a, b in fragments)
            elif stage.dop == 'reduction':
                xgrammar, altweights = grammar.dopreduction(
                        traintrees, sents, packedgraph=stage.packedgraph,
                        extrarules=extrarules)
            elif stage.dop == 'ostag':
                rules, lex, inittrees, auxtrees = grammar.doubleostagfromtsg(
                        traintrees, sents, numproc=numproc,
                        packedgraph=stage.packedgraph,
                        extrarules=extrarules)
                altweights = {}
                with codecs.getwriter('utf8')(gzip.open(
                        '%s/%s.init.gz' % (resultdir, stage.name),
                        'wb', compresslevel=1)) as out:
                    out.writelines('%s\t%s\n' % a for a in inittrees.items())
                with codecs.getwriter('utf8')(gzip.open(
                        '%s/%s.aux.gz' % (resultdir, stage.name),
                        'wb', compresslevel=1)) as out:
                    out.writelines('%s\t%s\n' % a for a in auxtrees.items())
            else:
                raise ValueError('unrecognized DOP model: %r' % stage.dop)
            nodes = sum(len(list(a.subtrees())) for a in traintrees)
            # Grammar statistics can only be reported when a grammar object
            # was produced; referring to xgrammar unconditionally would fail
            # (or report a previous stage) for the 'ostag' model.
            msg = None
            if xgrammar is not None:
                msg = grammar.grammarinfo(xgrammar)
            if rules is None:
                rules, lex = grammar.writegrammar(xgrammar)
            rulesfile = '%s/%s.rules.gz' % (resultdir, stage.name)
            lexiconfile = '%s/%s.lex.gz' % (resultdir, stage.name)
            with codecs.getwriter('utf8')(gzip.open(rulesfile, 'wb',
                    compresslevel=1)) as out:
                out.write(rules)
            with codecs.getwriter('utf8')(gzip.open(lexiconfile, 'wb',
                    compresslevel=1)) as out:
                out.write(lex)
            # write prob models
            np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
                    **altweights)
            gram = Grammar(rulesfile, lexiconfile, start=top,
                    altweights='%s/%s.probs.npz' % (resultdir, stage.name),
                    backtransform=backtransform)
            logging.info('DOP model based on %d sentences, %d nodes, '
                    '%d nonterminals', len(traintrees), nodes,
                    gram.nonterminals)
            if msg is not None:
                logging.info(msg)
            if stage.estimator != 'rfe':
                gram.switch('%s' % stage.estimator)
            logging.info(gram.testgrammar()[1])
            if stage.dop in ('doubledop', 'dop1'):
                # backtransform keys are line numbers to rules file;
                # to see them together do:
                # $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
                with codecs.getwriter('utf8')(gzip.open(
                        '%s/%s.backtransform.gz' % (resultdir, stage.name),
                        'wb', compresslevel=1)) as out:
                    out.writelines('%s\n' % a for a in backtransform)
                # recoverfragments() relies on this mapping to identify
                # binarization nodes. treeparsing() relies on this as well.
                msg = gram.getmapping(
                        None, neverblockre=re.compile('.+}<'))
                if n and stage.prune:
                    msg = gram.getmapping(stages[prevn].grammar,
                            striplabelre=None if stages[prevn].dop
                            else re.compile('@.+$'),
                            neverblockre=re.compile('.+}<'),
                            splitprune=not stage.split and stages[prevn].split,
                            markorigin=stages[prevn].markorigin,
                            mapping=stage.mapping)
                logging.info(msg)
            else:  # dop reduction
                if n and stage.prune:  # dop reduction
                    msg = gram.getmapping(stages[prevn].grammar,
                            striplabelre=None if stages[prevn].dop
                            and stages[prevn].dop not in ('doubledop', 'dop1')
                            else re.compile(r'@[-0-9]+(?:\$\[.*\])?$'),
                            neverblockre=re.compile(stage.neverblockre)
                            if stage.neverblockre else None,
                            splitprune=not stage.split and stages[prevn].split,
                            markorigin=stages[prevn].markorigin,
                            mapping=stage.mapping)
                    if stage.mode == 'dop-rerank':
                        gram.getrulemapping(stages[prevn].grammar,
                                re.compile(r'@[-0-9]+\b'))
                    logging.info(msg)
            if stage.objective == 'sl-dop':  # needed for treeparsing()
                _ = gram.getmapping(
                        None, striplabelre=re.compile(r'@[-0-9]+\b'))
                gram.getrulemapping(gram, re.compile(r'@[-0-9]+\b'))
        else:  # not stage.dop
            xgrammar = grammar.treebankgrammar(traintrees, sents,
                    extrarules=extrarules)
            logging.info('induced %s based on %d sentences',
                    ('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
                    len(traintrees))
            if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
                logging.info(grammar.grammarinfo(xgrammar))
            else:
                logging.info(grammar.grammarinfo(xgrammar,
                        dump='%s/pcdist.txt' % resultdir))
            rules, lex = grammar.writegrammar(xgrammar)
            rulesfile = '%s/%s.rules.gz' % (resultdir, stage.name)
            lexiconfile = '%s/%s.lex.gz' % (resultdir, stage.name)
            with codecs.getwriter('utf8')(gzip.open(rulesfile, 'wb',
                    compresslevel=1)) as out:
                out.write(rules)
            with codecs.getwriter('utf8')(gzip.open(lexiconfile, 'wb',
                    compresslevel=1)) as out:
                out.write(lex)
            gram = Grammar(rulesfile, lexiconfile, start=top)
            logging.info(gram.testgrammar()[1])
            if n and stage.prune:
                msg = gram.getmapping(stages[prevn].grammar,
                        striplabelre=None,
                        neverblockre=re.compile(stage.neverblockre)
                        if stage.neverblockre else None,
                        splitprune=not stage.split and stages[prevn].split,
                        markorigin=stages[prevn].markorigin,
                        mapping=stage.mapping)
                logging.info(msg)
        logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
                resultdir, stage.name,
                ',backtransform' if stage.dop in ('doubledop', 'dop1') else '')

        outside = None
        if stage.estimates in ('SX', 'SXlrgaps'):
            if stage.estimates == 'SX' and tbfanout != 1 and not stage.split:
                raise ValueError('SX estimate requires PCFG.')
            elif stage.mode != 'plcfrs':
                raise ValueError('estimates require parser w/agenda.')
            begin = process_time()
            logging.info('computing %s estimates', stage.estimates)
            if stage.estimates == 'SX':
                outside = estimates.getpcfgestimates(
                        gram, testmaxwords, trees[0].label)
            elif stage.estimates == 'SXlrgaps':
                outside = estimates.getestimates(
                        gram, testmaxwords, trees[0].label)
            logging.info('estimates done. cpu time elapsed: %gs',
                    process_time() - begin)
            np.savez_compressed('%s/%s.outside.npz' % (
                    resultdir, stage.name), outside=outside)
            logging.info('saved %s estimates', stage.estimates)
        elif stage.estimates:
            raise ValueError('unrecognized value; specify SX or SXlrgaps.')
        stage.update(grammar=gram, outside=outside)

    if any(stage.mapping is not None for stage in stages):
        with codecs.getwriter('utf8')(gzip.open('%s/mapping.json.gz' % (
                resultdir), 'wb', compresslevel=1)) as mappingfile:
            mappingfile.write(json.dumps([stage.mapping for stage in stages]))
def _recordresults(results, params, nsent, data):
    """Store and log the outcome of a single sentence for every stage.

    :param results: list with one result object for each stage; updated
        in place.
    :param params: the parameter object of ``doparsing``.
    :param nsent: the number of sentences processed so far (1-based).
    :param data: a tuple ``(sentid, sent, sentresults)`` as returned by
        ``worker``."""
    sentid, sent, sentresults = data
    _sent, goldtree, goldsent, _ = params.testset[sentid]
    goldsent = [w for w, _t in goldsent]
    logging.debug('%d/%d (%s). [len=%d] %s\n',
            nsent, len(params.testset), sentid, len(sent),
            ' '.join(goldsent))
    for n, result in enumerate(sentresults):
        # each sentence should be reported exactly once.
        assert (results[n].parsetrees[sentid] is None
                and results[n].elapsedtime[sentid] is None)
        results[n].parsetrees[sentid] = result.parsetree
        results[n].sents[sentid] = sent
        if isinstance(result.prob, tuple):
            try:
                results[n].logprob[sentid] = [log(a) for a in result.prob
                        if isinstance(a, float) and 0 < a <= 1][0]
            except (ValueError, IndexError):
                results[n].logprob[sentid] = 300.0
            results[n].frags[sentid] = ([abs(a) for a in result.prob
                    if isinstance(a, int)] or [None])[0]
        elif isinstance(result.prob, float):
            try:
                results[n].logprob[sentid] = log(result.prob)
            except ValueError:
                results[n].logprob[sentid] = 300.0
        if result.fragments is not None:
            results[n].frags[sentid] = len(result.fragments)
        results[n].numitems[sentid] = result.numitems
        results[n].golditems[sentid] = result.golditems
        results[n].totalgolditems[sentid] = result.totalgolditems
        results[n].elapsedtime[sentid] = result.elapsedtime
        if result.noparse:
            results[n].noparse += 1

        sentmetrics = results[n].evaluator.add(
                sentid, goldtree.copy(True), goldsent,
                result.parsetree.copy(True), sent)
        msg = result.msg
        scores = sentmetrics.scores()
        msg += '\tPOS %(POS)s ' % scores
        if not scores['FUN'].endswith('nan'):
            msg += 'FUN %(FUN)s ' % scores
        if scores['LF'] == '100.00':
            msg += 'LF exact match'
        else:
            msg += 'LF %(LF)s' % scores
            try:
                msg += '\n\t' + sentmetrics.bracketings()
            except Exception as err:  # pylint: disable=broad-except
                msg += 'PROBLEM bracketings:\n%s\n%s' % (
                        result.parsetree, err)
        msg += '\n'
        # only draw the tree of the last stage.
        if n + 1 == len(sentresults):
            try:
                msg += sentmetrics.visualize()
            except Exception as err:  # pylint: disable=broad-except
                msg += 'PROBLEM drawing tree:\n%s\n%s' % (
                        sentmetrics.ctree, err)
        logging.debug(msg)

    # running totals for each stage after this sentence.
    msg = ''
    for n, result in enumerate(sentresults):
        metrics = results[n].evaluator.acc.scores()
        msg += ('%(name)s cov %(cov)5.2f; pos %(tag)s; %(fun1)s'
                'ex %(ex)s; lp %(lp)s; lr %(lr)s; lf %(lf)s\n' % dict(
                name=result.name.ljust(7),
                cov=100 * (1 - results[n].noparse / nsent),
                fun1='' if metrics['fun'].endswith('nan') else
                    ('fun %(fun)s; ' % metrics),
                **metrics))
    logging.debug(msg)


def doparsing(**kwds):
    """Parse a set of sentences using worker processes.

    The keyword arguments override the defaults ``usetags``, ``numproc``,
    ``tailmarker``, ``category``, ``deletelabel``, ``deleteword`` and
    ``corpusfmt``; ``parser``, ``testset``, ``resultdir`` and ``evalparam``
    are read as well. With ``numproc == 1`` sentences are parsed in this
    process, otherwise in a pool of ``numproc`` processes.

    :returns: a list with a result object for each stage of the parser;
        the results are also written to files with ``writeresults``."""
    params = parser.DictObj(usetags=True, numproc=None, tailmarker='',
            category=None, deletelabel=(), deleteword=(), corpusfmt='export')
    params.update(kwds)
    results = [parser.DictObj(name=stage.name)
            for stage in params.parser.stages]
    for result in results:
        result.update(
                parsetrees=dict.fromkeys(params.testset),
                sents=dict.fromkeys(params.testset),
                logprob=dict.fromkeys(params.testset, float('nan')),
                frags=dict.fromkeys(params.testset, 0),
                numitems=dict.fromkeys(params.testset, 0),
                golditems=dict.fromkeys(params.testset, 0),
                totalgolditems=dict.fromkeys(params.testset, 0),
                elapsedtime=dict.fromkeys(params.testset),
                evaluator=evalmod.Evaluator(params.evalparam), noparse=0)
    pool = None
    if params.numproc == 1:
        initworker(params)
        dowork = (worker(a) for a in params.testset.items())
    else:
        pool = multiprocessing.Pool(processes=params.numproc,
                initializer=initworker, initargs=(params,))
        dowork = pool.imap_unordered(
                mpworker, params.testset.items())
    logging.info('going to parse %d sentences.', len(params.testset))
    try:
        # main parse loop over each sentence in test corpus
        for nsent, data in enumerate(dowork, 1):
            _recordresults(results, params, nsent, data)
    finally:
        # stop the worker processes, also when an exception is raised.
        if pool is not None:
            pool.terminate()
            pool.join()
    writeresults(results, params)
    return results
@workerfunc
def mpworker(args):
    """Multiprocessing wrapper of ``worker``.

    :param args: passed on to ``worker`` unchanged.
    :returns: the return value of ``worker``."""
    return worker(args)
def worker(args):
    """Parse a sentence using the global Parser object.

    :param args: a tuple ``(nsent, (tagged_sent, goldtree, _, _))``, where
        ``tagged_sent`` is a sequence of ``(word, tag)`` pairs.
    :returns: a tuple ``(nsent, sent, results)`` with the sentence id, the
        list of words, and the list of results produced by the parser."""
    nsent, testitem = args
    tagged_sent, goldtree, _, _ = testitem
    prm = INTERNALPARAMS
    sent = []
    for word, _ in tagged_sent:
        sent.append(word)
    if prm.usetags:
        tags = [tag for _, tag in tagged_sent]
    else:
        tags = None
    # the gold tree is only used to determine quality of pruning
    results = list(prm.parser.parse(sent, tags=tags, goldtree=goldtree))
    return (nsent, sent, results)
def writeresults(results, params):
    """Write parsing results to files in same format as the original corpus.

    The export format is used when writing the corpus format is not
    supported. Besides the gold and parsed trees, a file ``stats.tsv`` with
    statistics for each sentence and stage is written to
    ``params.resultdir``.

    :param results: list with a result object for each stage.
    :param params: parameter object; ``category``, ``corpusfmt``,
        ``resultdir``, ``testset`` and ``morphology`` are read."""
    ext = {'export': 'export', 'bracket': 'mrg',
            'discbracket': 'dbr', 'alpino': 'xml'}
    if params.category:
        category = params.category + '.'
    else:
        category = ''

    if params.corpusfmt in ('export', 'bracket', 'discbracket'):
        corpusfmt = params.corpusfmt
        with io.open('%s/%sgold.%s' % (params.resultdir, category,
                ext[corpusfmt]), 'w', encoding='utf8') as out:
            # the blocks of the original treebank are copied verbatim.
            for _, _, _, block in params.testset.values():
                out.write(block)
    else:
        # convert gold corpus because writing this format is unsupported
        corpusfmt = 'export'
        with io.open('%s/%sgold.%s' % (params.resultdir, category,
                ext[corpusfmt]), 'w', encoding='utf8') as out:
            for n, (_, goldtree, goldsent, _) in params.testset.items():
                words = [w for w, _ in goldsent]
                out.write(treebank.writetree(
                        goldtree, words, n,
                        corpusfmt, morphology=params.morphology))

    for res in results:
        with io.open('%s/%s%s.%s' % (params.resultdir, category, res.name,
                ext[corpusfmt]), 'w', encoding='utf8') as out:
            for n in params.testset:
                out.write(treebank.writetree(
                        res.parsetrees[n], res.sents[n], n, corpusfmt,
                        morphology=params.morphology))

    with open('%s/stats.tsv' % params.resultdir, 'w',
            encoding='utf8', newline='') as out:
        fields = ['sentid', 'len', 'stage', 'elapsedtime', 'logprob', 'frags',
                'numitems', 'golditems', 'totalgolditems']
        writer = csv.writer(out, dialect='excel-tab')
        writer.writerow(fields)
        for n in params.testset:
            for res in results:
                row = [n, len(params.testset[n][2]), res.name]
                for field in fields[3:]:
                    row.append(getattr(res, field)[n])
                writer.writerow(row)

    if len(results) > 1:
        names = '{%s}' % ','.join(res.name for res in results)
    else:
        names = results[0].name
    logging.info('wrote results to %s/%s%s.%s', params.resultdir, category,
            names, ext[corpusfmt])
def oldeval(results, goldbrackets):
    """Simple evaluation.

    Logs labeled precision, recall, F-measure, coverage and exact match
    for each result; nothing is logged when there are no parse trees.

    :param results: sequence of result objects with the attributes
        ``name``, ``parsetrees``, ``brackets``, ``noparse`` and ``exact``.
    :param goldbrackets: the gold brackets to compare against."""
    nsent = len(results[0].parsetrees)
    if not nsent:
        return
    for result in results:
        parsed = nsent - result.noparse
        logging.info('%s lp %5.2f lr %5.2f lf %5.2f\n'
                'coverage %d / %d = %5.2f %% '
                'exact match %d / %d = %5.2f %%\n',
                result.name,
                100 * evalmod.precision(goldbrackets, result.brackets),
                100 * evalmod.recall(goldbrackets, result.brackets),
                100 * evalmod.f_measure(goldbrackets, result.brackets),
                parsed, nsent,
                100 * parsed / nsent,
                result.exact, nsent, 100 * result.exact / nsent)
def readtepacoc(path='../tepacoc.txt'):
    """Read the tepacoc test set.

    Each line of interest has three tab-separated fields. A line with a
    non-empty first field is a sentence: the first field is its 1-based id
    and the third its space-separated tokens. A line with an empty first
    field starts a new category, named by the third field. Reading stops at
    a line without three fields that starts with ``TuBa``.

    :param path: the UTF-8 encoded file to read.
    :returns: a tuple ``(tepacocids, tepacocsents)``; a set of zero-based
        sentence ids, and a dictionary mapping each category to a list of
        ``(sentid, tokens)`` tuples."""
    tepacocids = set()
    tepacocsents = defaultdict(list)
    cat = 'undefined'
    # the context manager ensures the file is closed after reading.
    with io.open(path, encoding='utf8') as tepacoc:
        lines = tepacoc.read().splitlines()
    for line in lines:
        fields = line.split('\t')  # = [id, '', sent]
        if line.strip() and len(fields) == 3:
            if fields[0].strip():
                # subtract one because our ids are zero-based, tepacoc 1-based
                sentid = int(fields[0]) - 1
                tepacocids.add(sentid)
                tepacocsents[cat].append((sentid, fields[2].split()))
            else:  # new category
                cat = fields[2]
                # all categories starting with CUC are merged into one.
                if cat.startswith('CUC'):
                    cat = 'CUC'
        elif fields[0] == 'TuBa':
            break
    return tepacocids, tepacocsents
def parsetepacoc(
        stages=(dict(mode='pcfg', split=True, markorigin=True),
            dict(mode='plcfrs', prune=True, k=10000),
            dict(mode='plcfrs', prune=True, k=5000, dop='doubledop',
                estimator='rfe', objective='mpp')),
        trainmaxwords=999, trainnumsents=25005, testmaxwords=999,
        binarization=parser.DictObj(
            method='default', h=1, v=1, factor='right', tailmarker='',
            headrules='negra.headrules',
            leftmostunary=True, rightmostunary=True,
            markhead=False, fanout_marks_before_bin=False),
        transformations=None, usetagger='stanford', resultdir='tepacoc',
        numproc=1):
    """Parse the tepacoc test set.

    :param stages: a sequence of dictionaries with options for each stage;
        unspecified options are taken from ``parser.DEFAULTSTAGE``.
    :param trainmaxwords: length limit for training sentences.
    :param trainnumsents: number of sentences used for training.
    :param testmaxwords: length limit for test sentences.
    :param binarization: object with binarization settings.
    :param transformations: treebank transformations to apply, if any.
    :param usetagger: name of an external tagger, or a false value to
        skip external tagging.
    :param resultdir: directory to create for the log and results.
    :param numproc: number of processes to use.
    :raises ValueError: if a stage has an unrecognized option."""
    for stage in stages:
        for key in stage:
            if key not in parser.DEFAULTSTAGE:
                raise ValueError('unrecognized option: %r' % key)
    stages = [parser.DictObj({k: stage.get(k, v) for k, v
            in parser.DEFAULTSTAGE.items()}) for stage in stages]
    os.mkdir(resultdir)
    # Log everything, and send it to stderr, in a format with just the message.
    formatstr = '%(message)s'
    logging.basicConfig(level=logging.DEBUG, format=formatstr)
    # log up to INFO to a results log file
    fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
    fileobj.setLevel(logging.INFO)
    fileobj.setFormatter(logging.Formatter(formatstr))
    logging.getLogger('').addHandler(fileobj)
    tepacocids, tepacocsents = readtepacoc()
    try:
        # NB: unpickling executes arbitrary code; this file is a local
        # cache written below and should not come from an untrusted source.
        with gzip.open('tiger.pickle.gz', 'rb') as inp:
            (corpus_sents, corpus_taggedsents,
                    corpus_trees, corpus_blocks) = pickle.load(inp)
    except IOError:  # file not found
        corpus = treebank.READERS['export'](
                '../tiger/corpus/tiger_release_aug07.export',
                headrules=binarization.headrules,
                headfinal=True, headreverse=False, punct='move',
                encoding='iso-8859-1')
        corpus_sents = list(corpus.sents().values())
        corpus_taggedsents = list(corpus.tagged_sents().values())
        corpus_trees = list(corpus.trees().values())
        if transformations:
            for tree, sent in zip(corpus_trees, corpus_sents):
                treebanktransforms.transform(tree, sent, transformations)
        corpus_blocks = list(corpus.blocks().values())
        with gzip.open('tiger.pickle.gz', 'wb') as out:
            pickle.dump((corpus_sents, corpus_taggedsents, corpus_trees,
                    corpus_blocks), out, protocol=-1)

    # test sets (one for each category)
    testsets = {}
    allsents = []
    for cat, catsents in tepacocsents.items():
        testset = sents, trees, goldsents, blocks = [], [], [], []
        for n, sent in catsents:
            if sent != corpus_sents[n]:
                logging.error(
                        'mismatch. sent %d:\n%r\n%r\n'
                        'not in corpus %r\nnot in tepacoc %r',
                        n + 1, sent, corpus_sents[n],
                        [a for a, b in zip_longest(sent, corpus_sents[n])
                            if a and a != b],
                        [b for a, b in zip_longest(sent, corpus_sents[n])
                            if b and a != b])
            elif len(corpus_sents[n]) <= testmaxwords:
                sents.append(corpus_taggedsents[n])
                trees.append(corpus_trees[n])
                goldsents.append(corpus_taggedsents[n])
                blocks.append(corpus_blocks[n])
        allsents.extend(sents)
        logging.info('category: %s, %d of %d sentences',
                cat, len(testset[0]), len(catsents))
        testsets[cat] = testset
    # zip() returns an iterator on Python 3; materialize it because the
    # result is indexed below.
    testsets['baseline'] = tuple(zip(*[sent for n, sent in
            enumerate(zip(corpus_taggedsents, corpus_trees,
                    corpus_taggedsents, corpus_blocks))
            if len(sent[1]) <= trainmaxwords
            and n not in tepacocids][trainnumsents:trainnumsents + 2000]))
    allsents.extend(testsets['baseline'][0])

    if usetagger:
        overridetags = ('PTKANT', 'VAIMP')
        taglex = defaultdict(set)
        for sent in corpus_taggedsents[:trainnumsents]:
            for word, tag in sent:
                taglex[word].add(tag)
        overridetagdict = {tag:
                {word for word, tags in taglex.items()
                if tags == {tag}} for tag in overridetags}
        tagmap = {'$(': '$[', 'PAV': 'PROAV', 'PIDAT': 'PIAT'}
        # the sentences in the list allsents are modified in-place so that
        # the relevant copy in testsets[cat][0] is updated as well.
        lexicon.externaltagging(
                usetagger, '', allsents, overridetagdict, tagmap)

    # training set
    trees, sents, blocks = zip(*[sent for n, sent in
            enumerate(zip(corpus_trees, corpus_sents,
                    corpus_blocks)) if len(sent[1]) <= trainmaxwords
            and n not in tepacocids][:trainnumsents])
    getgrammars(dobinarization(trees, sents, binarization, False),
            sents, stages, testmaxwords, resultdir,
            numproc, None, trees[0].label)
    del corpus_sents, corpus_taggedsents, corpus_trees, corpus_blocks
    results = {}
    cnt = 0
    params = parser.DictObj(parser.DEFAULTS)
    params.update(stages=stages, binarization=binarization,
            transformations=transformations)
    theparser = parser.Parser(params)
    # NOTE(review): each test set here is a tuple of four lists, while
    # doparsing() calls testset.items() and reads an ``evalparam`` that is
    # not passed here -- confirm against the current doparsing() interface.
    for cat, testset in sorted(testsets.items()):
        if cat == 'baseline':
            continue
        logging.info('category: %s', cat)
        begin = process_time()
        results[cat] = doparsing(parser=theparser, testset=testset,
                resultdir=resultdir, usetags=True, numproc=numproc,
                category=cat)
        cnt += len(testset[0])
        logging.info('time elapsed during parsing: %g', process_time() - begin)
    goldbrackets = Counter()
    totresults = [parser.DictObj(name=stage.name) for stage in stages]
    for result in totresults:
        result.elapsedtime = [None] * cnt
        result.parsetrees = [None] * cnt
        result.brackets = Counter()
        result.exact = result.noparse = 0
    goldblocks = []
    goldsents = []
    # FIXME
    # NOTE(review): doparsing() in this module returns a list with one
    # result object per stage, which does not match the positional fields
    # res[0], res[2], res[3], res[4] read below -- presumably this was
    # written against an older return value.
    for cat, res in results.items():
        logging.info('category: %s', cat)
        goldbrackets |= res[2]
        goldblocks.extend(res[3])
        goldsents.extend(res[4])
        for result, totresult in zip(res[0], totresults):
            totresult.exact += result.exact
            totresult.noparse += result.noparse
            totresult.brackets |= result.brackets
            totresult.elapsedtime.extend(result.elapsedtime)
        oldeval(*res)
    logging.info('TOTAL')
    oldeval(totresults, goldbrackets)
    # write TOTAL results file with all tepacoc sentences (not the baseline)
    for stage in stages:
        with io.open('TOTAL.%s.export' % stage.name,
                'w', encoding='utf8') as tmp:
            for cat in list(results) + ['gold']:
                # close each input file after it has been copied.
                with io.open('%s.%s.export' % (cat, stage.name),
                        encoding='utf8') as inp:
                    tmp.write(inp.read())
    # do baseline separately because it shouldn't count towards the total score
    cat = 'baseline'
    logging.info('category: %s', cat)
    oldeval(*doparsing(parser=theparser, testset=testsets[cat],
            resultdir=resultdir, usetags=True, numproc=numproc, category=cat))
# The names exported by ``from <this module> import *``.
__all__ = ['initworker', 'startexp', 'loadtraincorpus', 'getposmodel',
'dobinarization', 'getgrammars', 'doparsing', 'worker', 'writeresults',
'oldeval', 'readtepacoc', 'parsetepacoc']