389 lines
16 KiB
C++
389 lines
16 KiB
C++
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
|
|
#ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include "../algs.h"
|
|
#include "../serialize.h"
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
struct constituent
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object represents the linguistic idea of a constituent, that is, a
|
|
group of words that functions as a single unit. In particular, it
|
|
represents a combination of two constituents into a new constituent.
|
|
|
|
Additionally, a constituent object represents a range of words relative to
|
|
some std::vector of words. The range is from [begin, end) (i.e. including
|
|
begin but not including end, so using the normal C++ iterator notation).
|
|
Moreover, a constituent is always composed of two parts, each having a tag.
|
|
Therefore, the left part is composed of the words in the range [begin,k)
|
|
and has tag left_tag while the right part of the constituent contains the
|
|
words in the range [k,end) and has the tag right_tag.
|
|
|
|
The tags are user defined objects of type T. In general, they are used to
|
|
represent syntactic categories such as noun phrase, verb phrase, etc.
|
|
!*/
|
|
|
|
unsigned long begin, end, k;
|
|
T left_tag;
|
|
T right_tag;
|
|
};
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
void serialize(
|
|
const constituent<T>& item,
|
|
std::ostream& out
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
void deserialize(
|
|
constituent<T>& item,
|
|
std::istream& in
|
|
);
|
|
/*!
|
|
provides deserialization support
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
/*!A END_OF_TREE is used to indicate that parse_tree_element::left or
|
|
parse_tree_element::right doesn't point to another subtree.
|
|
!*/
|
|
const unsigned long END_OF_TREE = 0xFFFFFFFF;
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
struct parse_tree_element
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object is used to represent a node in a binary parse tree. An entire
|
|
parse tree is represented by a std::vector of parse_tree_element objects.
|
|
We follow the convention that the first element of this vector is always
|
|
the root of the entire tree.
|
|
|
|
The fields of this object have the following interpretations:
|
|
- c == the constituent spanned by this node in the parse tree.
|
|
Therefore, the node spans the words in the range [c.begin, c.end).
|
|
- tag == the syntactic category of this node in the parse tree.
|
|
- score == the score or log likelihood for this parse tree. In
|
|
general, this is the sum of scores of all the production rules used
|
|
to build the tree rooted at the current node.
|
|
- let PT denote the vector of parse_tree_elements that defines an
|
|
entire parse tree. Then we have:
|
|
- if (left != END_OF_TREE) then
|
|
- PT[left] == the left sub-tree of the current node.
|
|
- PT[left] spans the words [c.begin, c.k)
|
|
- PT[left].tag == c.left_tag
|
|
- else
|
|
- there is no left sub-tree
|
|
|
|
- if (right != END_OF_TREE) then
|
|
- PT[right] == the right sub-tree of the current node.
|
|
- PT[right] spans the words [c.k, c.end)
|
|
- PT[right].tag == c.right_tag
|
|
- else
|
|
- there is no right sub-tree
|
|
!*/
|
|
|
|
constituent<T> c;
|
|
T tag;
|
|
double score;
|
|
|
|
unsigned long left;
|
|
unsigned long right;
|
|
};
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
void serialize (
|
|
const parse_tree_element<T>& item,
|
|
std::ostream& out
|
|
);
|
|
/*!
|
|
provides serialization support
|
|
!*/
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
void deserialize (
|
|
parse_tree_element<T>& item,
|
|
std::istream& in
|
|
);
|
|
/*!
|
|
provides deserialization support
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
void example_production_rule_function (
|
|
const std::vector<T>& words,
|
|
const constituent<T>& c,
|
|
std::vector<std::pair<T,double> >& possible_tags
|
|
)
|
|
/*!
|
|
requires
|
|
- 0 <= c.begin < c.k < c.end <= words.size()
|
|
- possible_tags.size() == 0
|
|
ensures
|
|
- Finds all the syntactic categories that can be used to label c and puts those
|
|
categories, along with their scores, into possible_tags. Or in other words,
|
|
this function determines which production rules can be used to turn the left
|
|
and right sub-constituents in c into a single constituent. The contents of c
|
|
have the following interpretations:
|
|
- The left sub-constituent has syntactic category c.left_tag
|
|
- for all i such that c.begin <= i < c.k:
|
|
- words[i] is part of the left sub-constituent.
|
|
- The right sub-constituent has syntactic category c.right_tag
|
|
- for all i such that c.k <= i < c.end:
|
|
- words[i] is part of the right sub-constituent.
|
|
|
|
- Note that example_production_rule_function() is not a real function. It is
|
|
here just to show you how to define production rule producing functions for
|
|
use with the find_max_parse_cky() routine defined below.
|
|
!*/
|
|
|
|
template <
|
|
typename T,
|
|
typename production_rule_function
|
|
>
|
|
void find_max_parse_cky (
|
|
const std::vector<T>& words,
|
|
const production_rule_function& production_rules,
|
|
std::vector<parse_tree_element<T> >& parse_tree
|
|
);
|
|
/*!
|
|
requires
|
|
- production_rule_function == a function or function object with the same
|
|
interface as example_production_rule_function defined above.
|
|
- It must be possible to store T objects in a std::map.
|
|
ensures
|
|
- Uses the CKY algorithm to find the most probable/highest scoring binary parse
|
|
tree of the given vector of words.
|
|
- if (#parse_tree.size() == 0) then
|
|
- There is no parse tree, using the given production_rules, that can cover
|
|
the given word sequence.
|
|
- else
|
|
- #parse_tree == the highest scoring parse tree that covers all the
|
|
elements of words.
|
|
- #parse_tree[0] == the root node of the parse tree.
|
|
- #parse_tree[0].score == the score of the parse tree. This is the sum of
|
|
the scores of all production rules used to construct the tree.
|
|
- #parse_tree[0].begin == 0
|
|
- #parse_tree[0].end == words.size()
|
|
- This function uses production_rules() to find out what the allowed production
|
|
rules are. That is, production_rules() defines all properties of the grammar
|
|
used by find_max_parse_cky().
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
class parse_tree_to_string_error : public error
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This is the exception thrown by parse_tree_to_string() and
|
|
parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
|
|
!*/
|
|
};
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename U
|
|
>
|
|
std::string parse_tree_to_string (
|
|
const std::vector<parse_tree_element<T> >& tree,
|
|
const std::vector<U>& words,
|
|
const unsigned long root_idx = 0
|
|
);
|
|
/*!
|
|
requires
|
|
- It must be possible to print U objects to an ostream using operator<<
|
|
(typically, U would be something like std::string)
|
|
ensures
|
|
- Interprets tree as a parse tree defined over the given sequence of words.
|
|
- returns a bracketed string that represents the parse tree over the words.
|
|
For example, suppose the following parse tree is input:
|
|
|
|
/\
|
|
/ \
|
|
/\ \
|
|
/ \ \
|
|
the dog ran
|
|
|
|
Then the output would be the string "[[the dog] ran]"
|
|
- Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >=
|
|
tree.size() then the empty string is returned.
|
|
throws
|
|
- parse_tree_to_string_error
|
|
This exception is thrown if an invalid tree is detected. This might happen
|
|
if the tree refers to elements of words that don't exist because words is
|
|
shorted than it is supposed to be.
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename U
|
|
>
|
|
std::string parse_tree_to_string_tagged (
|
|
const std::vector<parse_tree_element<T> >& tree,
|
|
const std::vector<U>& words,
|
|
const unsigned long root_idx = 0
|
|
);
|
|
/*!
|
|
requires
|
|
- It must be possible to print T objects to an ostream using operator<<
|
|
- It must be possible to print U objects to an ostream using operator<<
|
|
(typically, U would be something like std::string)
|
|
ensures
|
|
- This function does the same thing as parse_tree_to_string() except that it
|
|
also includes the parse_tree_element::tag object in the output. Therefore,
|
|
the tag of each bracket will be included as the first token inside the
|
|
bracket. For example, suppose the following parse tree is input (where tags
|
|
are shown at the vertices):
|
|
|
|
S
|
|
/\
|
|
NP \
|
|
/\ \
|
|
/ \ \
|
|
the dog ran
|
|
|
|
Then the output would be the string "[S [NP the dog] ran]"
|
|
- Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >=
|
|
tree.size() then the empty string is returned.
|
|
throws
|
|
- parse_tree_to_string_error
|
|
This exception is thrown if an invalid tree is detected. This might happen
|
|
if the tree refers to elements of words that don't exist because words is
|
|
shorted than it is supposed to be.
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename U
|
|
>
|
|
std::string parse_trees_to_string (
|
|
const std::vector<parse_tree_element<T> >& tree,
|
|
const std::vector<U>& words,
|
|
const T& tag_to_skip
|
|
);
|
|
/*!
|
|
requires
|
|
- It must be possible to print U objects to an ostream using operator<<
|
|
(typically, U would be something like std::string)
|
|
ensures
|
|
- This function behaves just like parse_tree_to_string() except that it will
|
|
not print the brackets (i.e. []) for the top most parts of the tree which
|
|
have tags equal to tag_to_skip. It will however print all the words.
|
|
Therefore, this function only includes brackets on the subtrees which begin
|
|
with a tag other than tag_to_skip.
|
|
throws
|
|
- parse_tree_to_string_error
|
|
This exception is thrown if an invalid tree is detected. This might happen
|
|
if the tree refers to elements of words that don't exist because words is
|
|
shorted than it is supposed to be.
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T,
|
|
typename U
|
|
>
|
|
std::string parse_trees_to_string_tagged (
|
|
const std::vector<parse_tree_element<T> >& tree,
|
|
const std::vector<U>& words,
|
|
const T& tag_to_skip
|
|
);
|
|
/*!
|
|
requires
|
|
- It must be possible to print T objects to an ostream using operator<<
|
|
- It must be possible to print U objects to an ostream using operator<<
|
|
(typically, U would be something like std::string)
|
|
ensures
|
|
- This function behaves just like parse_tree_to_string_tagged() except that it
|
|
will not print the brackets (i.e. []) for the top most parts of the tree
|
|
which have tags equal to tag_to_skip. It will however print all the words.
|
|
Therefore, this function only includes brackets on the subtrees which begin
|
|
with a tag other than tag_to_skip.
|
|
throws
|
|
- parse_tree_to_string_error
|
|
This exception is thrown if an invalid tree is detected. This might happen
|
|
if the tree refers to elements of words that don't exist because words is
|
|
shorted than it is supposed to be.
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
void find_trees_not_rooted_with_tag (
|
|
const std::vector<parse_tree_element<T> >& tree,
|
|
const T& tag,
|
|
std::vector<unsigned long>& tree_roots
|
|
);
|
|
/*!
|
|
requires
|
|
- objects of type T must be comparable using operator==
|
|
ensures
|
|
- Finds all the largest non-overlapping trees in tree that are not rooted with
|
|
the given tag.
|
|
- find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree
|
|
into a bunch of sub-trees and you know that the top level of the tree is all
|
|
composed of the same kind of tag. So if you want to just "slice off" the top
|
|
of the tree where this tag lives then this function is useful for doing that.
|
|
- #tree_roots.size() == the number of sub-trees found.
|
|
- for all valid i:
|
|
- tree[#tree_roots[i]].tag != tag
|
|
- To make the operation of this function clearer, here are a few examples of
|
|
what it will do:
|
|
- if (tree[0].tag != tag) then
|
|
- #tree_roots.size() == 0
|
|
- #tree_roots[0] == 0
|
|
- else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then
|
|
- #tree_roots.size() == 2
|
|
- #tree_roots[0] == tree[0].left
|
|
- #tree_roots[1] == tree[0].right
|
|
!*/
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
|
|
|