2
* (C) 2006, 2007 Andreas Gruenbacher <agruen@suse.de>
3
* Copyright (c) 2003-2008 Novell, Inc. (All rights reserved)
4
* Copyright 2009-2013 Canonical Ltd.
6
* The libapparmor library is licensed under the terms of the GNU
7
* Lesser General Public License, version 2.1. Please see the file
10
* This library is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU Lesser General Public License for more details.
15
* You should have received a copy of the GNU Lesser General Public License
16
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19
* Functions to create/manipulate an expression tree for regular expressions
20
* that have been parsed.
22
* The expression tree can be used directly after the parse creates it, or
23
* it can be factored so that the set of important nodes is smaller.
24
* Having a reduced set of important nodes generally results in a dfa that
25
* is closer to minimum (fewer redundant states are created). It also
26
* results in fewer important nodes in a the state set during subset
27
* construction resulting in less memory used to create a dfa.
29
* Generally it is worth doing expression tree simplification before dfa
30
* construction, if the regular expression tree contains any alternations.
31
* Even if the regular expression doesn't simplification should be fast
32
* enough that it can be used with minimal overhead.
34
#ifndef __LIBAA_RE_EXPR_H
35
#define __LIBAA_RE_EXPR_H
44
#include "apparmor_re.h"
48
typedef unsigned char uchar;
49
typedef set<uchar> Chars;
51
ostream &operator<<(ostream &os, uchar c);
53
/* Compute the union of two sets. */
54
template<class T> set<T> operator+(const set<T> &a, const set<T> &b)
57
c.insert(b.begin(), b.end());
62
* When creating DFAs from regex trees, a DFA state is constructed from
63
* a set of important nodes in the syntax tree. This includes AcceptNodes,
64
* which indicate that when a match ends in a particular state, the
65
* regular expressions that the AcceptNode belongs to match.
69
typedef set<ImportantNode *> NodeSet;
72
* Text-dump a state (for debugging).
74
ostream &operator<<(ostream &os, const NodeSet &state);
77
* Out-edges from a state to another: we store the follow-set of Nodes
78
* for each input character that is not a default match in
79
* cases (i.e., following a CharNode or CharSetNode), and default
80
* matches in otherwise as well as in all matching explicit cases
81
* (i.e., following an AnyCharNode or NotCharSetNode). This avoids
82
* enumerating all the explicit tranitions for default matches.
84
typedef struct Cases {
85
typedef map<uchar, NodeSet *>::iterator iterator;
86
iterator begin() { return cases.begin(); }
87
iterator end() { return cases.end(); }
89
Cases(): otherwise(0) { }
90
map<uchar, NodeSet *> cases;
94
ostream &operator<<(ostream &os, Node &node);
96
/* An abstract node in the syntax tree. */
99
Node(): nullable(false), label(0) { child[0] = child[1] = 0; }
100
Node(Node *left): nullable(false), label(0)
105
Node(Node *left, Node *right): nullable(false), label(0)
119
* See the "Dragon Book" for an explanation of nullable, firstpos,
120
* lastpos, and followpos.
122
virtual void compute_nullable() { }
123
virtual void compute_firstpos() = 0;
124
virtual void compute_lastpos() = 0;
125
virtual void compute_followpos() { }
126
virtual int eq(Node *other) = 0;
127
virtual ostream &dump(ostream &os) = 0;
128
void dump_syntax_tree(ostream &os);
129
virtual void normalize(int dir)
132
child[dir]->normalize(dir);
134
child[!dir]->normalize(dir);
136
/* return false if no work done */
137
virtual int normalize_eps(int dir __attribute__((unused))) { return 0; }
140
NodeSet firstpos, lastpos, followpos;
141
/* child 0 is left, child 1 is right */
144
unsigned int label; /* unique number for debug etc */
146
* We indirectly release Nodes through a virtual function because
147
* accept and Eps Nodes are shared, and must be treated specially.
148
* We could use full reference counting here but the indirect release
149
* is sufficient and has less overhead
151
virtual void release(void) { delete this; }
154
class InnerNode: public Node {
156
InnerNode(): Node() { };
157
InnerNode(Node *left): Node(left) { };
158
InnerNode(Node *left, Node *right): Node(left, right) { };
161
class OneChildNode: public InnerNode {
163
OneChildNode(Node *left): InnerNode(left) { };
166
class TwoChildNode: public InnerNode {
168
TwoChildNode(Node *left, Node *right): InnerNode(left, right) { };
169
virtual int normalize_eps(int dir);
172
class LeafNode: public Node {
174
LeafNode(): Node() { };
175
virtual void normalize(int dir __attribute__((unused))) { return; }
178
/* Match nothing (//). */
179
class EpsNode: public LeafNode {
181
EpsNode(): LeafNode()
188
/* don't delete Eps nodes because there is a single static
189
* instance shared by all trees. Look for epsnode in the code
193
void compute_firstpos() { }
194
void compute_lastpos() { }
197
if (dynamic_cast<EpsNode *>(other))
201
ostream &dump(ostream &os)
208
* Leaf nodes in the syntax tree are important to us: they describe the
209
* characters that the regular expression matches. We also consider
210
* AcceptNodes import: they indicate when a regular expression matches.
212
class ImportantNode: public LeafNode {
214
ImportantNode(): LeafNode() { }
215
void compute_firstpos() { firstpos.insert(this); }
216
void compute_lastpos() { lastpos.insert(this); }
217
virtual void follow(Cases &cases) = 0;
218
virtual int is_accept(void) = 0;
219
virtual int is_postprocess(void) = 0;
222
/* common base class for all the different classes that contain
223
* character information.
225
class CNode: public ImportantNode {
227
CNode(): ImportantNode() { }
228
int is_accept(void) { return false; }
229
int is_postprocess(void) { return false; }
232
/* Match one specific character (/c/). */
233
class CharNode: public CNode {
235
CharNode(uchar c): c(c) { }
236
void follow(Cases &cases)
238
NodeSet **x = &cases.cases[c];
241
*x = new NodeSet(*cases.otherwise);
245
(*x)->insert(followpos.begin(), followpos.end());
249
CharNode *o = dynamic_cast<CharNode *>(other);
255
ostream &dump(ostream &os)
263
/* Match a set of characters (/[abc]/). */
264
class CharSetNode: public CNode {
266
CharSetNode(Chars &chars): chars(chars) { }
267
void follow(Cases &cases)
269
for (Chars::iterator i = chars.begin(); i != chars.end(); i++) {
270
NodeSet **x = &cases.cases[*i];
273
*x = new NodeSet(*cases.otherwise);
277
(*x)->insert(followpos.begin(), followpos.end());
282
CharSetNode *o = dynamic_cast<CharSetNode *>(other);
283
if (!o || chars.size() != o->chars.size())
286
for (Chars::iterator i = chars.begin(), j = o->chars.begin();
287
i != chars.end() && j != o->chars.end(); i++, j++) {
293
ostream &dump(ostream &os)
296
for (Chars::iterator i = chars.begin(); i != chars.end(); i++)
304
/* Match all except one character (/[^abc]/). */
305
class NotCharSetNode: public CNode {
307
NotCharSetNode(Chars &chars): chars(chars) { }
308
void follow(Cases &cases)
310
if (!cases.otherwise)
311
cases.otherwise = new NodeSet;
312
for (Chars::iterator j = chars.begin(); j != chars.end(); j++) {
313
NodeSet **x = &cases.cases[*j];
315
*x = new NodeSet(*cases.otherwise);
317
/* Note: Add to the nonmatching characters after copying away
318
* the old otherwise state for the matching characters.
320
cases.otherwise->insert(followpos.begin(), followpos.end());
321
for (Cases::iterator i = cases.begin(); i != cases.end();
323
if (chars.find(i->first) == chars.end())
324
i->second->insert(followpos.begin(),
330
NotCharSetNode *o = dynamic_cast<NotCharSetNode *>(other);
331
if (!o || chars.size() != o->chars.size())
334
for (Chars::iterator i = chars.begin(), j = o->chars.begin();
335
i != chars.end() && j != o->chars.end(); i++, j++) {
341
ostream &dump(ostream &os)
344
for (Chars::iterator i = chars.begin(); i != chars.end(); i++)
352
/* Match any character (/./). */
353
class AnyCharNode: public CNode {
356
void follow(Cases &cases)
358
if (!cases.otherwise)
359
cases.otherwise = new NodeSet;
360
cases.otherwise->insert(followpos.begin(), followpos.end());
361
for (Cases::iterator i = cases.begin(); i != cases.end();
363
i->second->insert(followpos.begin(), followpos.end());
367
if (dynamic_cast<AnyCharNode *>(other))
371
ostream &dump(ostream &os) { return os << "."; }
374
/* Match a node zero or more times. (This is a unary operator.) */
375
class StarNode: public OneChildNode {
377
StarNode(Node *left): OneChildNode(left) { nullable = true; }
378
void compute_firstpos() { firstpos = child[0]->firstpos; }
379
void compute_lastpos() { lastpos = child[0]->lastpos; }
380
void compute_followpos()
382
NodeSet from = child[0]->lastpos, to = child[0]->firstpos;
383
for (NodeSet::iterator i = from.begin(); i != from.end(); i++) {
384
(*i)->followpos.insert(to.begin(), to.end());
389
if (dynamic_cast<StarNode *>(other))
390
return child[0]->eq(other->child[0]);
393
ostream &dump(ostream &os)
401
/* Match a node one or more times. (This is a unary operator.) */
402
class PlusNode: public OneChildNode {
404
PlusNode(Node *left): OneChildNode(left) {
406
void compute_nullable() { nullable = child[0]->nullable; }
407
void compute_firstpos() { firstpos = child[0]->firstpos; }
408
void compute_lastpos() { lastpos = child[0]->lastpos; }
409
void compute_followpos()
411
NodeSet from = child[0]->lastpos, to = child[0]->firstpos;
412
for (NodeSet::iterator i = from.begin(); i != from.end(); i++) {
413
(*i)->followpos.insert(to.begin(), to.end());
416
int eq(Node *other) {
417
if (dynamic_cast<PlusNode *>(other))
418
return child[0]->eq(other->child[0]);
421
ostream &dump(ostream &os) {
428
/* Match a pair of consecutive nodes. */
429
class CatNode: public TwoChildNode {
431
CatNode(Node *left, Node *right): TwoChildNode(left, right) { }
432
void compute_nullable()
434
nullable = child[0]->nullable && child[1]->nullable;
436
void compute_firstpos()
438
if (child[0]->nullable)
439
firstpos = child[0]->firstpos + child[1]->firstpos;
441
firstpos = child[0]->firstpos;
443
void compute_lastpos()
445
if (child[1]->nullable)
446
lastpos = child[0]->lastpos + child[1]->lastpos;
448
lastpos = child[1]->lastpos;
450
void compute_followpos()
452
NodeSet from = child[0]->lastpos, to = child[1]->firstpos;
453
for (NodeSet::iterator i = from.begin(); i != from.end(); i++) {
454
(*i)->followpos.insert(to.begin(), to.end());
459
if (dynamic_cast<CatNode *>(other)) {
460
if (!child[0]->eq(other->child[0]))
462
return child[1]->eq(other->child[1]);
466
ostream &dump(ostream &os)
472
void normalize(int dir);
475
/* Match one of two alternative nodes. */
476
class AltNode: public TwoChildNode {
478
AltNode(Node *left, Node *right): TwoChildNode(left, right) { }
479
void compute_nullable()
481
nullable = child[0]->nullable || child[1]->nullable;
483
void compute_lastpos()
485
lastpos = child[0]->lastpos + child[1]->lastpos;
487
void compute_firstpos()
489
firstpos = child[0]->firstpos + child[1]->firstpos;
493
if (dynamic_cast<AltNode *>(other)) {
494
if (!child[0]->eq(other->child[0]))
496
return child[1]->eq(other->child[1]);
500
ostream &dump(ostream &os)
509
void normalize(int dir);
512
class SharedNode: public ImportantNode {
517
/* don't delete SharedNodes via release as they are shared, and
518
* will be deleted when the table they are stored in is deleted
522
void follow(Cases &cases __attribute__ ((unused)))
524
/* Nothing to follow. */
527
/* requires shared nodes to be common by pointer */
528
int eq(Node *other) { return (this == other); }
532
* Indicate that a regular expression matches. An AcceptNode itself
533
* doesn't match anything, so it will never generate any transitions.
535
class AcceptNode: public SharedNode {
538
int is_accept(void) { return true; }
539
int is_postprocess(void) { return false; }
542
class MatchFlag: public AcceptNode {
544
MatchFlag(uint32_t flag, uint32_t audit): flag(flag), audit(audit) { }
545
ostream &dump(ostream &os) { return os << "< 0x" << hex << flag << '>'; }
551
class ExactMatchFlag: public MatchFlag {
553
ExactMatchFlag(uint32_t flag, uint32_t audit): MatchFlag(flag, audit) {}
556
class DenyMatchFlag: public MatchFlag {
558
DenyMatchFlag(uint32_t flag, uint32_t quiet): MatchFlag(flag, quiet) {}
561
/* Traverse the syntax tree depth-first in an iterator-like manner. */
562
class depth_first_traversal {
564
void push_left(Node *node) {
567
while (dynamic_cast<InnerNode *>(node)) {
568
pos.push(node->child[0]);
569
node = node->child[0];
573
depth_first_traversal(Node *node) { push_left(node); }
574
Node *operator*() { return pos.top(); }
575
Node *operator->() { return pos.top(); }
576
operator bool() { return !pos.empty(); }
579
Node *last = pos.top();
583
/* no need to dynamic cast, as we just popped a node so
584
* the top node must be an inner node */
585
InnerNode *node = (InnerNode *) (pos.top());
586
if (node->child[1] && node->child[1] != last) {
587
push_left(node->child[1]);
604
extern EpsNode epsnode;
606
int debug_tree(Node *t);
607
Node *simplify_tree(Node *t, dfaflags_t flags);
608
void label_nodes(Node *root);
609
unsigned long hash_NodeSet(NodeSet *ns);
610
void flip_tree(Node *node);
615
* hashedNodes - for efficient set comparison
617
class hashedNodeSet {
622
hashedNodeSet(NodeSet *n): nodes(n)
624
hash = hash_NodeSet(n);
627
bool operator<(hashedNodeSet const &rhs)const
629
if (hash == rhs.hash) {
630
if (nodes->size() == rhs.nodes->size())
631
return *nodes < *(rhs.nodes);
633
return nodes->size() < rhs.nodes->size();
635
return hash < rhs.hash;
641
class hashedNodeVec {
643
typedef ImportantNode ** iterator;
644
iterator begin() { return nodes; }
645
iterator end() { iterator t = nodes ? &nodes[len] : NULL; return t; }
649
ImportantNode **nodes;
651
hashedNodeVec(NodeSet *n)
653
hash = hash_NodeSet(n);
655
nodes = new ImportantNode *[n->size()];
658
for (NodeSet::iterator i = n->begin(); i != n->end(); i++, j++) {
663
hashedNodeVec(NodeSet *n, unsigned long h): hash(h)
666
nodes = new ImportantNode *[n->size()];
667
ImportantNode **j = nodes;
668
for (NodeSet::iterator i = n->begin(); i != n->end(); i++) {
678
unsigned long size()const { return len; }
680
bool operator<(hashedNodeVec const &rhs)const
682
if (hash == rhs.hash) {
683
if (len == rhs.size()) {
684
for (unsigned int i = 0; i < len; i++) {
685
if (nodes[i] != rhs.nodes[i])
686
return nodes[i] < rhs.nodes[i];
690
return len < rhs.size();
692
return hash < rhs.hash;
698
unsigned long dup, sum, max;
700
CacheStats(void): dup(0), sum(0), max(0) { };
702
void clear(void) { dup = sum = max = 0; }
703
virtual unsigned long size(void) const = 0;
706
class NodeCache: public CacheStats {
708
set<hashedNodeSet> cache;
710
NodeCache(void): cache() { };
711
~NodeCache() { clear(); };
713
virtual unsigned long size(void) const { return cache.size(); }
717
for (set<hashedNodeSet>::iterator i = cache.begin();
718
i != cache.end(); i++) {
725
NodeSet *insert(NodeSet *nodes)
729
pair<set<hashedNodeSet>::iterator,bool> uniq;
730
uniq = cache.insert(hashedNodeSet(nodes));
731
if (uniq.second == false) {
735
sum += nodes->size();
736
if (nodes->size() > max)
739
return uniq.first->nodes;
743
struct deref_less_than {
744
bool operator()(hashedNodeVec * const &lhs, hashedNodeVec * const &rhs)const
750
class NodeVecCache: public CacheStats {
752
set<hashedNodeVec *, deref_less_than> cache;
754
NodeVecCache(void): cache() { };
755
~NodeVecCache() { clear(); };
757
virtual unsigned long size(void) const { return cache.size(); }
761
for (set<hashedNodeVec *>::iterator i = cache.begin();
762
i != cache.end(); i++) {
769
hashedNodeVec *insert(NodeSet *nodes)
773
pair<set<hashedNodeVec *>::iterator,bool> uniq;
774
hashedNodeVec *nv = new hashedNodeVec(nodes);
775
uniq = cache.insert(nv);
776
if (uniq.second == false) {
780
sum += nodes->size();
781
if (nodes->size() > max)
785
return (*uniq.first);
789
#endif /* __LIBAA_RE_EXPR */