Source code for discodop.runexp

"""Run an experiment given a parameter file.

Does grammar extraction, parsing, and evaluation."""
import io
import os
import re
import csv
import sys
import json
import gzip
import codecs
import logging
import multiprocessing
from math import log
from time import process_time
from collections import defaultdict, Counter, OrderedDict
import pickle
from itertools import zip_longest  # pylint: disable=E0611
import numpy as np
from . import eval as evalmod
from . import (__version__, treebank, treebanktransforms, treetransforms,
		grammar, lexicon, parser, estimates)
from .treetransforms import binarizetree
from .util import workerfunc
from .containers import Grammar

INTERNALPARAMS = None


[docs]def initworker(params): """Set global parameter object.""" global INTERNALPARAMS # this variable is global because we want to pass it to the fork through # inheritance from its parent, instead of through serialization. INTERNALPARAMS = params
[docs]def startexp( prm, # A DictObj with the structure of parser.DEFAULTS resultdir='results', rerun=False): """Execute an experiment.""" if rerun: if not os.path.exists(resultdir): raise ValueError('Directory %r does not exist.\n--rerun requires a' ' directory with the grammar(s) of a previous experiment.' % resultdir) else: if os.path.exists(resultdir): raise ValueError('Directory %r exists.\n' 'Use --rerun to parse with existing grammar ' 'and overwrite previous results.' % resultdir) os.mkdir(resultdir) # Log everything, and send it to stderr, in a format with just the message. formatstr = '%(message)s' if prm.verbosity == 0: logging.basicConfig(level=logging.WARNING, format=formatstr) elif prm.verbosity == 1: logging.basicConfig(level=logging.INFO, format=formatstr) elif prm.verbosity == 2: logging.basicConfig(level=logging.DEBUG, format=formatstr) elif 3 <= prm.verbosity <= 4: logging.basicConfig(level=5, format=formatstr) else: raise ValueError('verbosity should be >= 0 and <= 4. ') # also log to a file fileobj = logging.FileHandler(filename='%s/output.log' % resultdir) fileobj.setLevel(logging.DEBUG) fileobj.setFormatter(logging.Formatter(formatstr)) logging.getLogger('').addHandler(fileobj) logging.info('Disco-DOP %s, running on Python %s', __version__, sys.version.split()[0]) logging.info('Parameter file: %r', resultdir + '.prm') if not rerun: trees, sents, train_tagged_sents = loadtraincorpus( prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct, prm.functions, prm.morphology, prm.removeempty, prm.ensureroot, prm.transformations, prm.relationalrealizational, resultdir) elif isinstance(prm.traincorpus.numsents, float): raise ValueError('need to specify number of training set sentences, ' 'not fraction, in rerun mode.') testsettb = treebank.READERS[prm.corpusfmt]( prm.testcorpus.path, encoding=prm.testcorpus.encoding, headrules=prm.binarization.headrules, removeempty=prm.removeempty, morphology=prm.morphology, functions=prm.functions, ensureroot=prm.ensureroot) if isinstance(prm.testcorpus.numsents, float): prm.testcorpus.numsents = int(prm.testcorpus.numsents * len(testsettb.blocks())) if prm.testcorpus.skiptrain: prm.testcorpus.skip += ( # pylint: disable=maybe-no-member prm.traincorpus.numsents) # pylint: disable=maybe-no-member test_blocks = OrderedDict() test_trees = OrderedDict() test_tagged_sents = OrderedDict() for n, item in testsettb.itertrees( prm.testcorpus.skip, prm.testcorpus.skip # pylint: disable=no-member + prm.testcorpus.numsents): if 1 <= len(item.sent) <= prm.testcorpus.maxwords: test_blocks[n] = item.block test_trees[n] = item.tree test_tagged_sents[n] = [(word, tag) for word, (_, tag) in zip(item.sent, sorted(item.tree.pos()))] logging.info('%d test sentences after length restriction <= %d', len(test_trees), prm.testcorpus.maxwords) lexmodel = None test_tagged_sents_mangled = test_tagged_sents if prm.postagging and prm.postagging.method in ( 'treetagger', 'stanford', 'frog'): if prm.postagging.method == 'treetagger': # these two tags are never given by tree-tagger, # so collect words whose tag needs to be overriden overridetags = ('PTKANT', 'PIDAT') elif prm.postagging.method == 'stanford': overridetags = ('PTKANT', ) elif prm.postagging.method == 'frog': overridetags = () taglex = defaultdict(set) for sent in train_tagged_sents: for word, tag in sent: taglex[word].add(tag) overridetagdict = {tag: {word for word, tags in taglex.items() if tags == {tag}} for tag in overridetags} tagmap = {'$(': '$[', 'PAV': 'PROAV'} test_tagged_sents_mangled = lexicon.externaltagging( prm.postagging.method, prm.postagging.model, test_tagged_sents, overridetagdict, tagmap) if prm.postagging.retag and not rerun: logging.info('re-tagging training corpus') sents_to_tag = OrderedDict(enumerate(train_tagged_sents)) train_tagged_sents = lexicon.externaltagging(prm.postagging.method, prm.postagging.model, sents_to_tag, overridetagdict, tagmap).values() for tree, tagged in zip(trees, train_tagged_sents): for node in tree.subtrees( lambda n: len(n) == 1 and isinstance(n[0], int)): node.label = tagged[node[0]][1] usetags = True # give these tags to parser elif prm.postagging and prm.postagging.method == 'unknownword': if not rerun: sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents) with open(resultdir + '/closedclasswords.txt', 'w') as out: out.writelines(w + '\n' for w in lexmodel[3]) usetags = False # make sure gold POS tags are not given to parser else: usetags = True # give gold POS tags to parser # 0: test sentences as they should be handed to the parser, # 1: gold trees for evaluation purposes # 2: gold sents because test sentences may be mangled by unknown word model # 3: blocks from treebank file to reproduce the relevant part of the # original treebank verbatim. testset = OrderedDict((n, ( test_tagged_sents_mangled[n], test_trees[n], test_tagged_sents[n], block)) for n, block in test_blocks.items()) if not test_tagged_sents: raise ValueError('test corpus (selection) should be non-empty.') if rerun: trees, sents = [], [] roots = {t.label for t in trees} | {test_trees[n].label for n in testset} if len(roots) != 1: raise ValueError('expected unique ROOT label: %r' % roots) top = roots.pop() funcclassifier = None if rerun: parser.readgrammars(resultdir, prm.stages, prm.postagging, prm.transformations, top) if prm.predictfunctions: import joblib funcclassifier = joblib.load('%s/funcclassifier.pickle' % resultdir) else: logging.info('read training & test corpus') if prm.predictfunctions: import joblib from . import functiontags logging.info('training function tag classifier') funcclassifier, msg = functiontags.trainfunctionclassifier( trees, sents, prm.numproc) joblib.dump(funcclassifier, '%s/funcclassifier.pickle' % resultdir, compress=3) logging.info(msg) getgrammars(dobinarization(trees, sents, prm.binarization, prm.relationalrealizational), sents, prm.stages, prm.testcorpus.maxwords, resultdir, prm.numproc, lexmodel, top) evalparam = evalmod.readparam(prm.evalparam) evalparam['DEBUG'] = -1 evalparam['CUTOFF_LEN'] = 40 deletelabel = evalparam.get('DELETE_LABEL', ()) deleteword = evalparam.get('DELETE_WORD', ()) begin = process_time() theparser = parser.Parser(prm, funcclassifier=funcclassifier) results = doparsing(parser=theparser, testset=testset, resultdir=resultdir, usetags=usetags, numproc=prm.numproc, deletelabel=deletelabel, deleteword=deleteword, corpusfmt=prm.corpusfmt, morphology=prm.morphology, evalparam=evalparam) if prm.numproc == 1: logging.info( 'time elapsed during parsing: %gs', process_time() - begin) for result in results: nsent = len(result.parsetrees) overcutoff = any(len(a) > evalparam['CUTOFF_LEN'] for a in test_tagged_sents.values()) header = (' ' + result.name.upper() + ' ').center( 44 if overcutoff else 35, '=') evalsummary = result.evaluator.summary() coverage = 'coverage: %s = %6.2f' % ( ('%d / %d' % (nsent - result.noparse, nsent)).rjust( 25 if overcutoff else 14), 100.0 * (nsent - result.noparse) / nsent) logging.info('\n'.join(('', header, evalsummary, coverage))) return top
[docs]def loadtraincorpus(corpusfmt, traincorpus, binarization, punct, functions, morphology, removeempty, ensureroot, transformations, relationalrealizational, resultdir): """Load the training corpus.""" train = treebank.READERS[corpusfmt](traincorpus.path, encoding=traincorpus.encoding, headrules=binarization.headrules, removeempty=removeempty, ensureroot=ensureroot, punct=punct, functions=functions, morphology=morphology) if isinstance(traincorpus.numsents, float): traincorpus.numsents = int(traincorpus.numsents * len(train.sents())) trainset = [item for _, item in train.itertrees( traincorpus.skip, traincorpus.skip + traincorpus.numsents) if 1 <= len(item.sent) <= traincorpus.maxwords] trees = [item.tree for item in trainset] sents = [item.sent for item in trainset] logging.info('%d training sentences after length restriction <= %d', len(trees), traincorpus.maxwords) if not trees: raise ValueError('training corpus (selection) should be non-empty.') if transformations: if 'ftbundocompounds' in transformations: treebanktransforms.getftbcompounds( trees, sents, resultdir + '/compounds.txt') newtrees, newsents = [], [] for tree, sent in zip(trees, sents): treebanktransforms.transform(tree, sent, transformations) if sent: newtrees.append(tree) newsents.append(sent) trees, sents = newtrees, newsents if relationalrealizational: trees = [treebanktransforms.rrtransform( tree, **relationalrealizational)[0] for tree in trees] train_tagged_sents = [[(word, tag) for word, (_, tag) in zip(sent, sorted(tree.pos()))] for tree, sent in zip(trees, sents)] return trees, sents, train_tagged_sents
[docs]def getposmodel(postagging, train_tagged_sents): """Apply unknown word model to sentences before extracting grammar.""" postagging.update(unknownwordfun=lexicon.UNKNOWNWORDFUNC[postagging.model]) # get smoothed probalities for lexical productions lexmodel, msg = lexicon.getunknownwordmodel( train_tagged_sents, postagging.unknownwordfun, postagging.unknownthreshold, postagging.openclassthreshold) logging.info(msg) # NB: commonwords is the subset of words that are above the frequency # threshold. for training purposes we work with the subset, at test time we # exploit the full set of known words from the training set. sigs, allwords, commonwords, closedclasswords = lexmodel[:4] postagging.update(sigs=sigs, lexicon=allwords, closedclasswords=closedclasswords) # replace rare train words with signatures sents = lexicon.replaceraretrainwords(train_tagged_sents, postagging.unknownwordfun, commonwords) return sents, lexmodel
[docs]def dobinarization(trees, sents, binarization, relationalrealizational, logmsg=True): """Apply binarization to treebank.""" # fixme: this n should correspond to sentence id tbfanout, n = treetransforms.treebankfanout(trees) if logmsg: logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s', tbfanout, n, trees[n], ' '.join(sents[n])) # binarization begin = process_time() msg = 'binarization: %s' % binarization.method if binarization.fanout_marks_before_bin: trees = [treetransforms.addfanoutmarkers(t) for t in trees] if binarization.method == 'default': msg += ' %s h=%d v=%d %s' % ( binarization.factor, binarization.h, binarization.v, 'tailmarker' if binarization.tailmarker else '') elif binarization.method == 'optimalhead': msg += ' h=%d v=%d' % ( binarization.h, binarization.v) if binarization.method is not None: trees = [binarizetree(t, binarization, relationalrealizational) for t in trees] if binarization.markovthreshold: msg1 = treetransforms.markovthreshold(trees, binarization.markovthreshold, binarization.h + binarization.revh - 1, max(binarization.v - 1, 1)) if logmsg: logging.info(msg1) trees = [treetransforms.addfanoutmarkers(t) for t in trees] if logmsg: logging.info( '%s; cpu time elapsed: %gs', msg, process_time() - begin) return trees
[docs]def getgrammars(trees, sents, stages, testmaxwords, resultdir, numproc, lexmodel, top): """Read off the requested grammars.""" tbfanout, n = treetransforms.treebankfanout(trees) logging.info('binarized treebank fan-out: %d #%d', tbfanout, n) mappings = [None for _ in stages] for n, stage in enumerate(stages): traintrees = trees stage.mapping = None prevn = 0 if n and stage.prune: prevn = [a.name for a in stages].index(stage.prune) if stage.split: traintrees = [treetransforms.binarize( treetransforms.splitdiscnodes( tree.copy(True), stage.markorigin), childchar=':', dot=True, ids=grammar.UniqueIDs()) for tree in traintrees] logging.info('splitted discontinuous nodes') if stage.collapse: traintrees, mappings[n] = treebanktransforms.collapselabels( [tree.copy(True) for tree in traintrees], tbmapping=treebanktransforms.MAPPINGS[ stage.collapse[0]][stage.collapse[1]]) logging.info('collapsed phrase labels for multilevel ' 'coarse-to-fine parsing to %s level %d', *stage.collapse) if n and mappings[prevn] is not None: # Given original labels A, convert CTF mapping1 A => C, # and mapping2 A => B to a mapping B => C. mapping1, mapping2 = mappings[prevn], mappings[n] if mappings[n] is None: stage.mapping = {a: mapping1[a] for a in mapping1} else: stage.mapping = {mapping2[a]: mapping1[a] for a in mapping2} if stage.mode.startswith('pcfg'): if tbfanout != 1 and not stage.split: raise ValueError('Cannot extract PCFG from treebank ' 'with discontinuities.') backtransform = extrarules = None if lexmodel: extrarules = lexicon.simplesmoothlexicon(lexmodel) if stage.mode == 'mc-rerank': from . import _fragments gram = parser.DictObj(_fragments.getctrees(zip(trees, sents))) tree = gram.trees1.extract(0, gram.vocab) gram.start = tree[:tree.index(' ')].lstrip('(') with gzip.open('%s/%s.train.pickle.gz' % (resultdir, stage.name), 'wb', compresslevel=1) as out: out.write(pickle.dumps(gram, protocol=-1)) elif stage.dop: rules = lex = None if stage.dop in ('doubledop', 'dop1'): if stage.dop == 'doubledop': (xgrammar, backtransform, altweights, fragments) = grammar.doubledop( traintrees, sents, numproc=numproc, maxdepth=stage.maxdepth, maxfrontier=stage.maxfrontier, extrarules=extrarules) elif stage.dop == 'dop1': (xgrammar, backtransform, altweights, fragments) = grammar.dop1( traintrees, sents, maxdepth=stage.maxdepth, maxfrontier=stage.maxfrontier, extrarules=extrarules) # dump fragments with codecs.getwriter('utf8')(gzip.open( '%s/%s.fragments.gz' % (resultdir, stage.name), 'wb', compresslevel=1)) as out: out.writelines('%s\t%d\n' % (a, len(b)) for a, b in fragments) elif stage.dop == 'reduction': xgrammar, altweights = grammar.dopreduction( traintrees, sents, packedgraph=stage.packedgraph, extrarules=extrarules) elif stage.dop == 'ostag': rules, lex, inittrees, auxtrees = grammar.doubleostagfromtsg( traintrees, sents, numproc=numproc, packedgraph=stage.packedgraph, extrarules=extrarules) altweights = {} with codecs.getwriter('utf8')(gzip.open( '%s/%s.init.gz' % (resultdir, stage.name), 'wb', compresslevel=1)) as out: out.writelines('%s\t%s\n' % a for a in inittrees.items()) with codecs.getwriter('utf8')(gzip.open( '%s/%s.aux.gz' % (resultdir, stage.name), 'wb', compresslevel=1)) as out: out.writelines('%s\t%s\n' % a for a in auxtrees.items()) else: raise ValueError('unrecognized DOP model: %r' % stage.dop) nodes = sum(len(list(a.subtrees())) for a in traintrees) msg = grammar.grammarinfo(xgrammar) if rules is None: rules, lex = grammar.writegrammar(xgrammar) rulesfile = '%s/%s.rules.gz' % (resultdir, stage.name) lexiconfile = '%s/%s.lex.gz' % (resultdir, stage.name) with codecs.getwriter('utf8')(gzip.open(rulesfile, 'wb', compresslevel=1)) as out: out.write(rules) with codecs.getwriter('utf8')(gzip.open(lexiconfile, 'wb', compresslevel=1)) as out: out.write(lex) # write prob models np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name), **altweights) gram = Grammar(rulesfile, lexiconfile, start=top, altweights='%s/%s.probs.npz' % (resultdir, stage.name), backtransform=backtransform) logging.info('DOP model based on %d sentences, %d nodes, ' '%d nonterminals', len(traintrees), nodes, gram.nonterminals) logging.info(msg) if stage.estimator != 'rfe': gram.switch('%s' % stage.estimator) logging.info(gram.testgrammar()[1]) if stage.dop in ('doubledop', 'dop1'): # backtransform keys are line numbers to rules file; # to see them together do: # $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz) with codecs.getwriter('utf8')(gzip.open( '%s/%s.backtransform.gz' % (resultdir, stage.name), 'wb', compresslevel=1)) as out: out.writelines('%s\n' % a for a in backtransform) # recoverfragments() relies on this mapping to identify # binarization nodes. treeparsing() relies on this as well. msg = gram.getmapping( None, neverblockre=re.compile('.+}<')) if n and stage.prune: msg = gram.getmapping(stages[prevn].grammar, striplabelre=None if stages[prevn].dop else re.compile('@.+$'), neverblockre=re.compile('.+}<'), splitprune=not stage.split and stages[prevn].split, markorigin=stages[prevn].markorigin, mapping=stage.mapping) logging.info(msg) else: # dop reduction if n and stage.prune: # dop reduction msg = gram.getmapping(stages[prevn].grammar, striplabelre=None if stages[prevn].dop and stages[prevn].dop not in ('doubledop', 'dop1') else re.compile(r'@[-0-9]+(?:\$\[.*\])?$'), neverblockre=re.compile(stage.neverblockre) if stage.neverblockre else None, splitprune=not stage.split and stages[prevn].split, markorigin=stages[prevn].markorigin, mapping=stage.mapping) if stage.mode == 'dop-rerank': gram.getrulemapping(stages[prevn].grammar, re.compile(r'@[-0-9]+\b')) logging.info(msg) if stage.objective == 'sl-dop': # needed for treeparsing() _ = gram.getmapping( None, striplabelre=re.compile(r'@[-0-9]+\b')) gram.getrulemapping(gram, re.compile(r'@[-0-9]+\b') ) else: # not stage.dop xgrammar = grammar.treebankgrammar(traintrees, sents, extrarules=extrarules) logging.info('induced %s based on %d sentences', ('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'), len(traintrees)) if stage.split or os.path.exists('%s/pcdist.txt' % resultdir): logging.info(grammar.grammarinfo(xgrammar)) else: logging.info(grammar.grammarinfo(xgrammar, dump='%s/pcdist.txt' % resultdir)) rules, lex = grammar.writegrammar(xgrammar) rulesfile = '%s/%s.rules.gz' % (resultdir, stage.name) lexiconfile = '%s/%s.lex.gz' % (resultdir, stage.name) with codecs.getwriter('utf8')(gzip.open(rulesfile, 'wb', compresslevel=1)) as out: out.write(rules) with codecs.getwriter('utf8')(gzip.open(lexiconfile, 'wb', compresslevel=1)) as out: out.write(lex) gram = Grammar(rulesfile, lexiconfile, start=top) logging.info(gram.testgrammar()[1]) if n and stage.prune: msg = gram.getmapping(stages[prevn].grammar, striplabelre=None, neverblockre=re.compile(stage.neverblockre) if stage.neverblockre else None, splitprune=not stage.split and stages[prevn].split, markorigin=stages[prevn].markorigin, mapping=stage.mapping) logging.info(msg) logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir, stage.name, ',backtransform' if stage.dop in ('doubledop', 'dop1') else '') outside = None if stage.estimates in ('SX', 'SXlrgaps'): if stage.estimates == 'SX' and tbfanout != 1 and not stage.split: raise ValueError('SX estimate requires PCFG.') elif stage.mode != 'plcfrs': raise ValueError('estimates require parser w/agenda.') begin = process_time() logging.info('computing %s estimates', stage.estimates) if stage.estimates == 'SX': outside = estimates.getpcfgestimates( gram, testmaxwords, trees[0].label) elif stage.estimates == 'SXlrgaps': outside = estimates.getestimates( gram, testmaxwords, trees[0].label) logging.info('estimates done. cpu time elapsed: %gs', process_time() - begin) np.savez_compressed('%s/%s.outside.npz' % ( resultdir, stage.name), outside=outside) logging.info('saved %s estimates', stage.estimates) elif stage.estimates: raise ValueError('unrecognized value; specify SX or SXlrgaps.') stage.update(grammar=gram, outside=outside) if any(stage.mapping is not None for stage in stages): with codecs.getwriter('utf8')(gzip.open('%s/mapping.json.gz' % ( resultdir), 'wb', compresslevel=1)) as mappingfile: mappingfile.write(json.dumps([stage.mapping for stage in stages]))
[docs]def doparsing(**kwds): """Parse a set of sentences using worker processes.""" params = parser.DictObj(usetags=True, numproc=None, tailmarker='', category=None, deletelabel=(), deleteword=(), corpusfmt='export') params.update(kwds) results = [parser.DictObj(name=stage.name) for stage in params.parser.stages] for result in results: result.update( parsetrees=dict.fromkeys(params.testset), sents=dict.fromkeys(params.testset), logprob=dict.fromkeys(params.testset, float('nan')), frags=dict.fromkeys(params.testset, 0), numitems=dict.fromkeys(params.testset, 0), golditems=dict.fromkeys(params.testset, 0), totalgolditems=dict.fromkeys(params.testset, 0), elapsedtime=dict.fromkeys(params.testset), evaluator=evalmod.Evaluator(params.evalparam), noparse=0) if params.numproc == 1: initworker(params) dowork = (worker(a) for a in params.testset.items()) else: pool = multiprocessing.Pool(processes=params.numproc, initializer=initworker, initargs=(params,)) dowork = pool.imap_unordered( mpworker, params.testset.items()) logging.info('going to parse %d sentences.', len(params.testset)) # main parse loop over each sentence in test corpus for nsent, data in enumerate(dowork, 1): sentid, sent, sentresults = data _sent, goldtree, goldsent, _ = params.testset[sentid] goldsent = [w for w, _t in goldsent] logging.debug('%d/%d (%s). [len=%d] %s\n', nsent, len(params.testset), sentid, len(sent), ' '.join(goldsent)) for n, result in enumerate(sentresults): assert (results[n].parsetrees[sentid] is None and results[n].elapsedtime[sentid] is None) results[n].parsetrees[sentid] = result.parsetree results[n].sents[sentid] = sent if isinstance(result.prob, tuple): try: results[n].logprob[sentid] = [log(a) for a in result.prob if isinstance(a, float) and 0 < a <= 1][0] except (ValueError, IndexError): results[n].logprob[sentid] = 300.0 results[n].frags[sentid] = ([abs(a) for a in result.prob if isinstance(a, int)] or [None])[0] elif isinstance(result.prob, float): try: results[n].logprob[sentid] = log(result.prob) except ValueError: results[n].logprob[sentid] = 300.0 if result.fragments is not None: results[n].frags[sentid] = len(result.fragments) results[n].numitems[sentid] = result.numitems results[n].golditems[sentid] = result.golditems results[n].totalgolditems[sentid] = result.totalgolditems results[n].elapsedtime[sentid] = result.elapsedtime if result.noparse: results[n].noparse += 1 sentmetrics = results[n].evaluator.add( sentid, goldtree.copy(True), goldsent, result.parsetree.copy(True), sent) msg = result.msg scores = sentmetrics.scores() msg += '\tPOS %(POS)s ' % scores if not scores['FUN'].endswith('nan'): msg += 'FUN %(FUN)s ' % scores if scores['LF'] == '100.00': msg += 'LF exact match' else: msg += 'LF %(LF)s' % scores try: msg += '\n\t' + sentmetrics.bracketings() except Exception as err: # pylint: disable=broad-except msg += 'PROBLEM bracketings:\n%s\n%s' % ( result.parsetree, err) msg += '\n' if n + 1 == len(sentresults): try: msg += sentmetrics.visualize() except Exception as err: # pylint: disable=broad-except msg += 'PROBLEM drawing tree:\n%s\n%s' % ( sentmetrics.ctree, err) logging.debug(msg) msg = '' for n, result in enumerate(sentresults): metrics = results[n].evaluator.acc.scores() msg += ('%(name)s cov %(cov)5.2f; pos %(tag)s; %(fun1)s' 'ex %(ex)s; lp %(lp)s; lr %(lr)s; lf %(lf)s\n' % dict( name=result.name.ljust(7), cov=100 * (1 - results[n].noparse / nsent), fun1='' if metrics['fun'].endswith('nan') else ('fun %(fun)s; ' % metrics), **metrics)) logging.debug(msg) if params.numproc != 1: pool.terminate() pool.join() del dowork, pool writeresults(results, params) return results
@workerfunc def mpworker(args): """Multiprocessing wrapper of ``worker``.""" return worker(args)
[docs]def worker(args): """Parse a sentence using global Parser object, and evaluate incrementally. :returns: a string with diagnostic information, as well as a list of DictObj instances with the results for each stage.""" nsent, (tagged_sent, goldtree, _, _) = args sent = [w for w, _ in tagged_sent] prm = INTERNALPARAMS results = list(prm.parser.parse(sent, tags=[t for _, t in tagged_sent] if prm.usetags else None, goldtree=goldtree)) # only used to determine quality of pruning return (nsent, sent, results)
[docs]def writeresults(results, params): """Write parsing results to files in same format as the original corpus. (Or export if writer not implemented).""" ext = {'export': 'export', 'bracket': 'mrg', 'discbracket': 'dbr', 'alpino': 'xml'} category = (params.category + '.') if params.category else '' if params.corpusfmt not in ('export', 'bracket', 'discbracket'): # convert gold corpus because writing this formats is unsupported corpusfmt = 'export' with io.open('%s/%sgold.%s' % (params.resultdir, category, ext[corpusfmt]), 'w', encoding='utf8') as out: out.writelines(treebank.writetree( goldtree, [w for w, _ in goldsent], n, corpusfmt, morphology=params.morphology) for n, (_, goldtree, goldsent, _) in params.testset.items()) else: corpusfmt = params.corpusfmt with io.open('%s/%sgold.%s' % (params.resultdir, category, ext[corpusfmt]), 'w', encoding='utf8') as out: out.writelines(a for _, _, _, a in params.testset.values()) for res in results: with io.open('%s/%s%s.%s' % (params.resultdir, category, res.name, ext[corpusfmt]), 'w', encoding='utf8') as out: out.writelines(treebank.writetree( res.parsetrees[n], res.sents[n], n, corpusfmt, morphology=params.morphology) for n in params.testset) fileobj = open('%s/stats.tsv' % params.resultdir, 'w', encoding='utf8', newline='') with fileobj as out: fields = ['sentid', 'len', 'stage', 'elapsedtime', 'logprob', 'frags', 'numitems', 'golditems', 'totalgolditems'] writer = csv.writer(out, dialect='excel-tab') writer.writerow(fields) writer.writerows([n, len(params.testset[n][2]), res.name] + [getattr(res, field)[n] for field in fields[3:]] for n in params.testset for res in results) logging.info('wrote results to %s/%s%s.%s', params.resultdir, category, (('{%s}' % ','.join(res.name for res in results)) if len(results) > 1 else results[0].name), ext[corpusfmt])
[docs]def oldeval(results, goldbrackets): """Simple evaluation.""" nsent = len(results[0].parsetrees) if nsent: for result in results: logging.info('%s lp %5.2f lr %5.2f lf %5.2f\n' 'coverage %d / %d = %5.2f %% ' 'exact match %d / %d = %5.2f %%\n', result.name, 100 * evalmod.precision(goldbrackets, result.brackets), 100 * evalmod.recall(goldbrackets, result.brackets), 100 * evalmod.f_measure(goldbrackets, result.brackets), nsent - result.noparse, nsent, 100 * (nsent - result.noparse) / nsent, result.exact, nsent, 100 * result.exact / nsent)
[docs]def readtepacoc(): """Read the tepacoc test set.""" tepacocids = set() tepacocsents = defaultdict(list) cat = 'undefined' tepacoc = io.open('../tepacoc.txt', encoding='utf8') for line in tepacoc.read().splitlines(): fields = line.split('\t') # = [id, '', sent] if line.strip() and len(fields) == 3: if fields[0].strip(): # subtract one because our ids are zero-based, tepacoc 1-based sentid = int(fields[0]) - 1 tepacocids.add(sentid) tepacocsents[cat].append((sentid, fields[2].split())) else: # new category cat = fields[2] if cat.startswith('CUC'): cat = 'CUC' elif fields[0] == 'TuBa': break return tepacocids, tepacocsents
[docs]def parsetepacoc( stages=(dict(mode='pcfg', split=True, markorigin=True), dict(mode='plcfrs', prune=True, k=10000), dict(mode='plcfrs', prune=True, k=5000, dop='doubledop', estimator='rfe', objective='mpp')), trainmaxwords=999, trainnumsents=25005, testmaxwords=999, binarization=parser.DictObj( method='default', h=1, v=1, factor='right', tailmarker='', headrules='negra.headrules', leftmostunary=True, rightmostunary=True, markhead=False, fanout_marks_before_bin=False), transformations=None, usetagger='stanford', resultdir='tepacoc', numproc=1): """Parse the tepacoc test set.""" for stage in stages: for key in stage: if key not in parser.DEFAULTSTAGE: raise ValueError('unrecognized option: %r' % key) stages = [parser.DictObj({k: stage.get(k, v) for k, v in parser.DEFAULTSTAGE.items()}) for stage in stages] os.mkdir(resultdir) # Log everything, and send it to stderr, in a format with just the message. formatstr = '%(message)s' logging.basicConfig(level=logging.DEBUG, format=formatstr) # log up to INFO to a results log file fileobj = logging.FileHandler(filename='%s/output.log' % resultdir) fileobj.setLevel(logging.INFO) fileobj.setFormatter(logging.Formatter(formatstr)) logging.getLogger('').addHandler(fileobj) tepacocids, tepacocsents = readtepacoc() try: (corpus_sents, corpus_taggedsents, corpus_trees, corpus_blocks) = pickle.load( gzip.open('tiger.pickle.gz', 'rb')) except IOError: # file not found corpus = treebank.READERS['export']( '../tiger/corpus/tiger_release_aug07.export', headrules=binarization.headrules, headfinal=True, headreverse=False, punct='move', encoding='iso-8859-1') corpus_sents = list(corpus.sents().values()) corpus_taggedsents = list(corpus.tagged_sents().values()) corpus_trees = list(corpus.trees().values()) if transformations: for tree, sent in zip(corpus_trees, corpus_sents): treebanktransforms.transform(tree, sent, transformations) corpus_blocks = list(corpus.blocks().values()) with gzip.open('tiger.pickle.gz', 'wb') as out: pickle.dump((corpus_sents, corpus_taggedsents, corpus_trees, corpus_blocks), out, protocol=-1) # test sets (one for each category) testsets = {} allsents = [] for cat, catsents in tepacocsents.items(): testset = sents, trees, goldsents, blocks = [], [], [], [] for n, sent in catsents: if sent != corpus_sents[n]: logging.error( 'mismatch. sent %d:\n%r\n%r\n' 'not in corpus %r\nnot in tepacoc %r', n + 1, sent, corpus_sents[n], [a for a, b in zip_longest(sent, corpus_sents[n]) if a and a != b], [b for a, b in zip_longest(sent, corpus_sents[n]) if b and a != b]) elif len(corpus_sents[n]) <= testmaxwords: sents.append(corpus_taggedsents[n]) trees.append(corpus_trees[n]) goldsents.append(corpus_taggedsents[n]) blocks.append(corpus_blocks[n]) allsents.extend(sents) logging.info('category: %s, %d of %d sentences', cat, len(testset[0]), len(catsents)) testsets[cat] = testset testsets['baseline'] = zip(*[sent for n, sent in enumerate(zip(corpus_taggedsents, corpus_trees, corpus_taggedsents, corpus_blocks)) if len(sent[1]) <= trainmaxwords and n not in tepacocids][trainnumsents:trainnumsents + 2000]) allsents.extend(testsets['baseline'][0]) if usetagger: overridetags = ('PTKANT', 'VAIMP') taglex = defaultdict(set) for sent in corpus_taggedsents[:trainnumsents]: for word, tag in sent: taglex[word].add(tag) overridetagdict = {tag: {word for word, tags in taglex.items() if tags == {tag}} for tag in overridetags} tagmap = {'$(': '$[', 'PAV': 'PROAV', 'PIDAT': 'PIAT'} # the sentences in the list allsents are modified in-place so that # the relevant copy in testsets[cat][0] is updated as well. lexicon.externaltagging( usetagger, '', allsents, overridetagdict, tagmap) # training set trees, sents, blocks = zip(*[sent for n, sent in enumerate(zip(corpus_trees, corpus_sents, corpus_blocks)) if len(sent[1]) <= trainmaxwords and n not in tepacocids][:trainnumsents]) getgrammars(dobinarization(trees, sents, binarization, False), sents, stages, testmaxwords, resultdir, numproc, None, trees[0].label) del corpus_sents, corpus_taggedsents, corpus_trees, corpus_blocks results = {} cnt = 0 params = parser.DictObj(parser.DEFAULTS) params.update(stages=stages, binarization=binarization, transformations=transformations) theparser = parser.Parser(params) for cat, testset in sorted(testsets.items()): if cat == 'baseline': continue logging.info('category: %s', cat) begin = process_time() results[cat] = doparsing(parser=theparser, testset=testset, resultdir=resultdir, usetags=True, numproc=numproc, category=cat) cnt += len(testset[0]) logging.info('time elapsed during parsing: %g', process_time() - begin) goldbrackets = Counter() totresults = [parser.DictObj(name=stage.name) for stage in stages] for result in totresults: result.elapsedtime = [None] * cnt result.parsetrees = [None] * cnt result.brackets = Counter() result.exact = result.noparse = 0 goldblocks = [] goldsents = [] # FIXME for cat, res in results.items(): logging.info('category: %s', cat) goldbrackets |= res[2] goldblocks.extend(res[3]) goldsents.extend(res[4]) for result, totresult in zip(res[0], totresults): totresult.exact += result.exact totresult.noparse += result.noparse totresult.brackets |= result.brackets totresult.elapsedtime.extend(result.elapsedtime) oldeval(*res) logging.info('TOTAL') oldeval(totresults, goldbrackets) # write TOTAL results file with all tepacoc sentences (not the baseline) for stage in stages: with io.open('TOTAL.%s.export' % stage.name, 'w', encoding='utf8') as tmp: tmp.writelines(io.open('%s.%s.export' % (cat, stage.name), encoding='utf8').read() for cat in list(results) + ['gold']) # do baseline separately because it shouldn't count towards the total score cat = 'baseline' logging.info('category: %s', cat) oldeval(*doparsing(parser=theparser, testset=testsets[cat], resultdir=resultdir, usetags=True, numproc=numproc, category=cat))
__all__ = ['initworker', 'startexp', 'loadtraincorpus', 'getposmodel', 'dobinarization', 'getgrammars', 'doparsing', 'worker', 'writeresults', 'oldeval', 'readtepacoc', 'parsetepacoc']