Source code for discodop.functiontags

"""Function tags classifier."""
from .tree import Tree, HEAD
from .treebanktransforms import base, functions, FUNC
from .heads import getheadpos


# Integer indices 0..7, one per named column. In this module only FUNC is
# used, as the index into ``node.source`` where the predicted function tag is
# stored; the remaining names presumably index the other columns of the same
# record (TODO confirm against the treebank reader).
# NB: this rebinds the name FUNC that is imported from .treebanktransforms
# at the top of the file.
(WORD, LEMMA, TAG, MORPH, FUNC, PARENT, SECEDGETAG,
 SECEDGEPARENT) = FIELDS = tuple(range(8))


def trainfunctionclassifier(trees, sents, numproc):
    """Train a classifier to predict function tags in trees.

    :param trees: a sequence of trees (iterated more than once and passed to
        ``len()``); every non-root, non-empty node that qualifies (see below)
        becomes one training instance.
    :param sents: the sentences belonging to ``trees``, in the same order.
    :param numproc: number of parallel jobs for the one-vs-rest classifier
        used in the multi-label case; a false value selects ``-1``.
    :returns: a tuple ``(funcclassifier, msg)`` where ``funcclassifier`` is
        the tuple ``(classifier, vectorizer, encoder, posfunc, multi)``
        expected by ``applyfunctionclassifier`` and ``msg`` is a
        human-readable report of the grid search.
    """
    from sklearn import linear_model, multiclass, pipeline
    from sklearn import preprocessing, feature_extraction
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer, jaccard_score

    vectorizer = pipeline.Pipeline([
        ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
        ('scaler', preprocessing.StandardScaler(
            copy=False, with_mean=False))])

    # PTB has no function tags on preterminals, Negra/Tiger/Lassy do.
    posfunc = any(
        functions(node)
        for tree in trees
        for node in tree.subtrees()
        if node and isinstance(node[0], int))

    def eligible(tree, node):
        """Tell whether ``node`` of ``tree`` is a training instance."""
        # Skip the root and empty nodes; preterminals only take part when
        # the treebank carries function tags on them.
        return (tree is not node and node
                and (posfunc or isinstance(node[0], Tree)))

    target = [
        functions(node)
        for tree in trees
        for node in tree.subtrees()
        if eligible(tree, node)]

    # PTB may have multiple tags (or 0) per node.
    # Negra/Tiger/Lassy have exactly 1 tag for every node.
    multi = any(len(a) > 1 for a in target)
    if multi:
        encoder = preprocessing.MultiLabelBinarizer()
    else:
        encoder = preprocessing.LabelEncoder()
        target = [a[0] if a else '--' for a in target]

    # binarize features (output is a sparse array)
    trainfeats = vectorizer.fit_transform(
        functionfeatures(node, sent)
        for tree, sent in zip(trees, sents)
        for node in tree.subtrees()
        if eligible(tree, node))
    trainfuncs = encoder.fit_transform(target)

    classifier = linear_model.SGDClassifier(
        loss='hinge', penalty='elasticnet',
        # Roughly 10 ** 6 tree visits in total, but never fewer than one
        # epoch: max_iter=0 is not a valid setting.
        max_iter=max(1, 10 ** 6 // len(trees)))
    alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
    if multi:
        classifier = multiclass.OneVsRestClassifier(
            classifier, n_jobs=numproc or -1)
        param_grid = dict(estimator__alpha=alphas)
    else:
        param_grid = dict(alpha=alphas)

    # jaccard_score defaults to average='binary', which is rejected for
    # multiclass and multilabel targets; an explicit averaging mode is needed.
    # 'samples' averages the per-node Jaccard index over label sets;
    # 'micro' is used for the single-label case, where it is a monotonic
    # function of accuracy and hence ranks the alphas identically.
    scorer = make_scorer(
        jaccard_score, average='samples' if multi else 'micro')
    classifier = GridSearchCV(
        estimator=classifier, param_grid=param_grid, scoring=scorer)

    # train classifier
    classifier.fit(trainfeats, trainfuncs)
    msg = ('trained classifier; grid search results:\n%s\n'
           'multi=%r, posfunc=%r; best score on training set: %g %%\n'
           'parameters: %r\nfunction tags: %s' % (
               '\n'.join('%s: %s' % a
                         for a in classifier.cv_results_.items()),
               multi, posfunc, 100.0 * classifier.best_score_,
               classifier.best_estimator_,
               ' '.join(str(a) for a in encoder.classes_)))
    return (classifier, vectorizer, encoder, posfunc, multi), msg
def applyfunctionclassifier(funcclassifier, tree, sent):
    """Add predicted function tags to tree using classifier.

    :param funcclassifier: the tuple
        ``(classifier, vectorizer, encoder, posfunc, multi)`` as returned
        (first element) by ``trainfunctionclassifier``.
    :param tree: the tree to annotate; modified in place. Its root never
        receives a tag, and preterminals only do when ``posfunc`` is true.
    :param sent: the sentence belonging to ``tree``.
    :returns: None; the predicted tag is written to ``node.source[FUNC]``.
    """
    classifier, vectorizer, encoder, posfunc, multi = funcclassifier

    # Collect the nodes once so that prediction and label assignment are
    # guaranteed to see the same nodes in the same order.
    nodes = list(tree.subtrees(
        lambda n: n is not tree and n
        and (posfunc or isinstance(n[0], Tree))))
    if not nodes:
        # Nothing to label (e.g., only a root over preterminals); the
        # vectorizer refuses an empty sample sequence, so stop here.
        return

    # get features and use classifier
    funclabels = encoder.inverse_transform(classifier.predict(
        vectorizer.transform(
            functionfeatures(node, sent) for node in nodes)))

    # store labels in tree
    for node, func in zip(nodes, funclabels):
        # Make sure there is a mutable source record to write into.
        if node.source is None:
            node.source = ['--'] * 6
        elif isinstance(node.source, tuple):
            node.source = list(node.source)
        if not func:
            node.source[FUNC] = '--'
        elif multi:
            # Multi-label prediction: func is a collection of tags.
            node.source[FUNC] = '-'.join(func)
        else:
            node.source[FUNC] = func
def functionfeatures(node, sent):
    """Return a dict of features for node to predict its function tag.

    The node must be a ParentedTree, with head information, and must have a
    parent (``node.parent`` is accessed unconditionally).
    The features are based on Blaheta & Charniak (2000),
    Assigning Function Tags to Parsed Text.
    http://aclweb.org/anthology/A00-2031

    :returns: a dict mapping feature names to values. It holds the context
        features built here plus the ``basefeatures`` of the node itself
        (no prefix), of its left sibling (prefix ``p``) and of its right
        sibling (prefix ``n``), for whichever siblings exist.
    """
    parent = node.parent
    position = node.parent_index

    # The head sister is the first child of the parent marked as head.
    headsib = next((sib for sib in parent if sib.type == HEAD), None)
    headsibpos = None if headsib is None else getheadpos(headsib)

    if headsib:
        headsiblabel = headsib.label
    else:
        headsiblabel = ''
    if headsibpos:
        headsibtag, headsibword = headsibpos.label, sent[headsibpos[0]]
    else:
        headsibtag = headsibword = ''
    grandparent = parent.parent
    if headsib is not None:
        offset = position - headsib.parent_index
    else:
        offset = -1

    feats = {
        'hsc': headsiblabel,  # 4. head sister const label
        'hsp': headsibtag,  # 5. head sister head word POS
        'hsf': headsibword,  # 6. head sister head word
        'moc': parent.label,  # 10. parent label
        'grc': grandparent.label if grandparent else '',  # 11. grandparent
        'ohs': offset,  # 12. offset of this node to head sister
    }
    feats.update(basefeatures(node, sent))

    # add similar features for neighbors
    if position > 0:
        feats.update(basefeatures(parent[position - 1], sent, prefix='p'))
    if position + 1 < len(parent):
        feats.update(basefeatures(parent[position + 1], sent, prefix='n'))
    return feats
def basefeatures(node, sent, prefix=''):
    """Return a dict of features describing this particular node.

    :param node: the node to describe.
    :param sent: the sentence; indexed with the first child of a head
        position to look up the head word.
    :param prefix: string prepended to every feature name, so that features
        of several nodes can be merged into one dict.
    :returns: a dict with the keys ``cat``, ``hwp``, ``hwf``, ``ahc``,
        ``ahf`` and ``yis`` (each preceded by ``prefix``); features that are
        not available are the empty string.
    """
    head = getheadpos(node)
    # NB: for PPs the first child is skipped as a stand-in for skipping the
    # preposition; a proper way to identify the preposition is still needed.
    althead = getheadpos(node[1:]) if base(node, 'PP') else None

    if head:
        headtag, headword = head.label, sent[head[0]]
    else:
        headtag = headword = ''
    if althead:
        alttag, altword = althead.label, sent[althead[0]]
    else:
        alttag = altword = ''

    unprefixed = (
        ('cat', node.label),  # 1. syntactic category
        ('hwp', headtag),  # 2. head POS
        ('hwf', headword),  # 3. head word
        ('ahc', alttag),  # 7. alt (for PPs, non-prep. node) head POS
        ('ahf', altword),  # 8. alt head word
        ('yis', len(node.leaves())),  # 9. yield length
    )
    return {prefix + name: value for name, value in unprefixed}


__all__ = ['trainfunctionclassifier', 'applyfunctionclassifier',
           'functionfeatures']