| top | |
typedef struct Affix Affix,*pAffix,**hAffix;
| ^ | Definition continued at: 47. | | | | | section top | | 15. Affix tree ::
Even in the best of circumstances repeated scanning through
rules is going to be slow; and adding regular expression
parsing and matching is going to be even slower. To speed
this up a bit, fixed prefixes and suffixes of the various
regular expressions are extracted, and these are used to
narrow the number of rules that are checked.
In essence for each rule is discovered a conjuction of prefixes
and suffixes that must match if all the REs in the rule
will match (though of course the REs can still fail on the
entire string).
- begins("abc",input) and begins("xyz",state) and ends("pqr",top(stack,1)) and ...
(The conjunction is always formed in this same order, input, state,
top of stack, next of stack, et cetera, prefix then suffix.)
The suffix of the input is never checked. Transitions involving the whole
input string are discouraged.
This conjunction can be represented by a list of positions and strings
- if ((0,"abc"),(1,"xyz"),(-3,"pqr")) then rule i
where position 0 is the input prefix, 1 is the state prefix,
-1 is the state suffix, 2 and -2 are the top of stack prefix
and suffix, and so forth. This can be further refined by allowing
a position to be repeated, which checks successive characters
of the affix.
- if ((0,"ab"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr"))
then rule i
After considerring one rule in isolation, all the rules can
be considerred, forming a set of conjunctions
- if ((0,"ab"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr"))
then rule i
- if ((0,"ab"),(0,"d"),(0,"pqr")) then rule j
- if ((0,"xy"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr"))
then rule k
A few items of interests are
-
A given configuration can satify many conjunctions. All possibilities must be
discoverred and then tested in rule order.
-
These conjunctions can have common subexpressions.
The common subexpressions are exploited somewhat by building a tree on the conjunctions.
As long as the leading antecedents are the same, two rules are on the same branch.
- if ((0,"ab"))
- if ((0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr")) then rule i
- if ((0,"d"),(0,"pqr")) then rule j
- if ((0,"xy"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr"))
then rule k
This tree is called the affix tree.
- type struct Affix—Tree to match fixed affixes of REs.
| - int pos—Which substring and end of the substrings is being checked.
| - chars substring—The substring in this antecedent.
- Memory. This refers to the RE object string representation.
| - chars substringLength—The substring length.
| - chars lt—Followed if the queried position or string is less than this one.
| - chars eq—Followed if the queried position and string is equal to this one.
| - chars gt—Followed if the queried position or string is greater than this one.
| - chars consequent—Rules implied by the antecedents from here to the root.
|
|
|
struct Affix {
int pos;
chars substring;
int substringLength;
pAffix lt,eq,gt;
Obj consequent;
};
| ^ | | | | | | section top | | - protocol
| - io hAffix branch—Where in the tree; this points to link field to the node, not the node itself.
| - io chars newpos—The new substring position.
| - io chars newsubstring—The new substring.
| - io chars newsubstringLength—The new substring length.
|
|
|
while (newsubstringLength>0) {
pAffix node = *branch;
if (!node) {
node = heap(Affix); zero (1,Affix,node);
node->pos = pos;
node->substringLength = newsubstringLength;
node->substring = newsubstring;
*branch = node;
newsubstringLength = 0;
}else if (pos<node->pos) {
branch = &node->lt;
}else if (pos>node->pos) {
branch = &node->gt;
}else if (pos>=0 && newsubstring[0]<node->substring[0]) {
branch = &node->lt;
}else if (pos>=0 && newsubstring[0]>node->substring[0]) {
branch = &node->gt;
}else if (pos<0 && newsubstring[newsubstringLength-1]<node->substring[node->substringLength-1]) {
branch = &node->lt;
}else if (pos<0 && newsubstring[newsubstringLength-1]>node->substring[node->substringLength-1]) {
branch = &node->gt;
}else if (newsubstringLength==node->substringLength
&& memcmp(newsubstring,node->substring,newsubstringLength)==0
) {
newsubstringLength = 0;
}else if (pos>=0) {
<Split prefix node with common initial substring>
}else if (pos<0) {
<Split suffix node with common final substring>
}
}
| ^ | | | | | | section top | | - protocol
| - io hAffix branch—Where in the tree; this points to link field to the node, not the node itself.
| - io pAffix node—Queried and modified node.
| - io chars newpos—The new substring position.
| - io chars newsubstring—The new substring.
| - io chars newsubstringLength—The new substring length.
|
|
|
if (newsubstringLength>node->substringLength
&& memcmp(newsubstring,node->substring,node->substringLength)==0
) {
newsubstringLength -= node->substringLength;
newsubstring += node->substringLength;
branch = &node->eq;
}else {
int common=0; pAffix splitNode;
while (common<newsubstringLength && newsubstring[common]==node->substring[common]) common ++;
splitNode = heap(Affix); *splitNode = *node;
splitNode->substringLength -= common;
splitNode->substring += common;
node->consequent = 0;
node->substringLength = common;
splitNode->lt = splitNode->gt = 0; node->eq = splitNode;
newsubstringLength -= common;
newsubstring += common;
if (newsubstringLength) branch = &node->eq;
}
| ^ | | | | | | section top | | - protocol
| - io hAffix branch—Where in the tree;\ this points to link field to the node, not the node itself.
| - io pAffix node—Queried and modified node.
| - io chars newpos—The new substring position.
| - io chars newsubstring—The new substring.
| - io chars newsubstringLength—The new substring length.
|
|
|
if (newsubstringLength>node->substringLength
&& memcmp(newsubstring,node->substring-node->substringLength,node->substringLength)==0
) {
newsubstringLength -= node->substringLength;
branch = &node->eq;
}else {
int common=0; pAffix splitNode;
while (common<newsubstringLength
&& newsubstring[newsubstringLength-1-common]==node->substring[node->substringLength-1-common]
)
common ++;
splitNode = heap(Affix); *splitNode = *node;
splitNode->substringLength -= common;
node->consequent = 0;
node->substringLength = common;
node->substring += splitNode->substringLength;
splitNode->lt = splitNode->gt = 0; node->eq = splitNode;
newsubstringLength -= common;
if (newsubstringLength) branch = &node->eq;
}
| ^ | | | | | | section top | | - protocol
| - input Obj RE—Regular expression.
| - output int newsubstringLength—Length of the prefix.
| - output chars newsubstring—The prefix.
| - output bool allFixed—If the entire RE was matched: it has no meta.
|
|
|
{
int reLength; chars re = Tcl_GetStringFromObj(RE,&reLength);
int pdepth = 0,bdepth = 0;
if (reLength>0 && *re=='^') re++,reLength--;
newsubstringLength = 0; newsubstring = re; allFixed = true;
while (allFixed && reLength>0) switch (*re) {
case '*': case '?':
if (newsubstringLength>0) newsubstringLength--;
allFixed = false;
break;
case '{':
if (reLength>1 && !isdigit(re[1])) goto fixedpart;
case '}': case '(': case ')': case '[': case ']':
case '+': case '.': case '|': case '\\':
allFixed = false;
break;
case '$':
if (reLength>1) allFixed = false;
else re++,reLength--;
break;
default: fixedpart:
newsubstringLength++; re++,reLength--;
break;
}
while (newsubstringLength>0 && reLength>0) switch (*re) {
case '('/*)*/: if (bdepth==0) pdepth++; re++,reLength--; break;
case /*(*/')': if (bdepth==0) pdepth--; re++,reLength--; break;
case '['/*]*/: bdepth++; re++,reLength--; break;
case /*[*/']': bdepth--; re++,reLength--; break;
case '\\': re+=2,reLength-=2; break;
case '|': if (pdepth==0 && bdepth==0) newsubstringLength = 0; re++,reLength--; break;
default: re++,reLength--; break;
}
} | ^ | | | | | | section top | | - protocol
| - input Obj RE—Regular expression.
| - output int newsubstringLength—Length of the prefix.
| - output chars newsubstring—The prefix.
|
|
|
{
int reLength; chars re = Tcl_GetStringFromObj(RE,&reLength);
int pdepth = 0,bdepth = 0; bool stillFixed = true;
re += reLength-1;
if (reLength>0 && *re=='$') re--,reLength--;
newsubstringLength = 0; newsubstring = re+1;
while (stillFixed && reLength>0) switch (*re) {
case '\\':
if (newsubstringLength>0) newsubstring--,newsubstringLength--;
case '{': case '}': case '(': case ')': case '[': case ']':
case '*': case '?': case '+': case '.': case '|':
stillFixed = false;
break;
case '^':
if (reLength>1) stillFixed = false;
else re--,reLength--;
break;
default:
newsubstring--,newsubstringLength++; re--,reLength--;
break;
}
while (newsubstringLength>0 && reLength>0)
if (reLength>1 && re[-1]=='\\') re-=2,reLength-=2;
else switch (*re) {
case '('/*)*/: if (bdepth==0) pdepth--; re--,reLength--; break;
case /*(*/')': if (bdepth==0) pdepth++; re--,reLength--; break;
case '['/*]*/: bdepth--; re--,reLength--; break;
case /*[*/']': bdepth++; re--,reLength--; break;
case '|': if (pdepth==0 && bdepth==0) newsubstringLength = 0; re--,reLength--; break;
default: re++,reLength--; break;
}
} | ^ | | | | | | section top | | - protocol
| - input int nRE—Number of regular expressions.
| - io Obj* vRE—Regular expressions.
| - io pAffix affix—Root of the affix true.
| - output pAffix conclusion—The last node of the last antecedent.
|
|
|
| ^ | | | | | | section top | | - protocol
| - input bool anything—If there were any affixes.
| - io pAffix affix—The root which will match empty if not anything.
| - output chars branch—Pointed to the root if necessary.
|
|
|
{
if (!affix) {
affix = heap(Affix); zero(1,Affix,affix);
affix->substring = "";
branch = &affix;
}
if (!anything) {
if (affix->substringLength>0) {
pAffix nonempty = heap(Affix);
*nonempty = *affix;
affix->consequent = 0;
affix->substringLength = 0;
affix->substring = "";
affix->gt = affix->lt = 0; affix->eq = nonempty;
}
branch = &affix;
}
} | ^ | | | | | | section top | | 23 ::
Leaving aside for a moment how rules are parsed into their internal form,
the internal form of the rule is a set of parallel arrays inside a Tcl
nested list.
- {
- rule-comment
- {
- input-re
- state-re
- stack-top-re
- stack-next-top-re
- ...
}
- {
- input-subst
- state-subst
- stack-top-subst
- stack-next-top-subst
- ...
}
- {
- {
- input-re-variable
- input-re-variable
- ...
}
- {
- state-re-variable
- state-re-variable
- ...
}
- {
- top-re-variable
- top-re-variable
- ...
}
- ...
}
}
The affix tree identifies all possible rules that might apply to a configuration.
The possible rules then have to be tested in grammar order until a match.
Once the affix tree node is found that matches all the fixed affixes,
the rule order and contents are added to that node; and because other rules might
have these same affixes, the node holds a list of orders and rules.
During matching, these are merged with other affix nodes and sorted on their order.
The sorting will be done with associative maps, which only sort on ascii strings.
The order is written out as fixed length zero padded string so that its
ascii string sort order is the same as its numerical sort order.
| ^ | | | | section top | | - protocol
| - input Obj comment—Rule comments.
- Memory. Reference count decremented.
| - input Obj regexps—Regular expressions.
- Memory. Reference count decremented.
| - input Obj substs—Rule substitutions.
- Memory. Reference count decremented.
| - input Obj names—Rule names.
- Memory. Reference count decremented.
| - output Obj ruleObj—The packaged rule.
| - input int order—Where the rule is in the grammar.
| - output Obj orderObj—Encoded rule order.
|
|
|
{
Obj ruleItem[4]; int i;
ruleItem[0] = comment;
ruleItem[1] = regexps;
ruleItem[2] = substs;
ruleItem[3] = names;
ruleObj = incr(Tcl_NewListObj(4,ruleItem));
decr(comment); decr(regexps); decr(substs); decr(names);
orderObj = oprintf("%012d",order);
} | ^ | | | | | | section top | | - protocol
| - input Obj rule—The rule.
- Memory. The output use this object and are valid until its reference count is decremented.
| - output Intr intr—Possible error messages.
| - output label exit—Error exit.
| - output Obj comment—Rule comment
| - output int nRE—Number of REs.
| - output Obj* vRE—Vector of REs.
| - output Obj nSU—Number of substitutions.
| - output Obj vSU—Substitutions.
| - output int* nNA—Number of names for each RE.
- Memory. Caller disposes.
| - output Obj** vNA—Names for each RE.
- Memory. Caller disposes.
|
|
|
{
int npieces; Obj *piece;
int nnames; Obj *name; int i;
if (Tcl_ListObjGetElements(0,rule,&npieces,&piece)==TCL_ERROR || npieces<4) {
rprintf(intr,"rule has been damaged: %{y}s",rule); goto exit;
}
comment = piece[0];
if (Tcl_ListObjGetElements(0,piece[1],&nRE,&vRE)==TCL_ERROR) {
rprintf(intr,"rule has been damaged: RE %{y}s",piece[1]); goto exit;
}
if (Tcl_ListObjGetElements(0,piece[2],&nSU,&vSU)==TCL_ERROR) {
rprintf(intr,"rule has been damaged: substitutions %{y}s",piece[2]); goto exit;
}
if (Tcl_ListObjGetElements(0,piece[3],&nnames,&name)==TCL_ERROR || nnames<nRE) {
rprintf(intr,"rule has been damaged: names %{y}s",piece[3]); goto exit;
}
nNA = nheap(nRE,int); vNA = nheap(nRE,Obj*);
for (i=0; i<nRE; i++) {
if (Tcl_ListObjGetElements(0,name[i],&nNA[i],&vNA[i])==TCL_ERROR) {
rprintf(intr,"rule has been damaged: name %d %{y}s",i,name[i]); goto exit;
}
}
} | ^ | | | | | | section top | | - protocol
| - input Obj comment—Rule comments.
| - input int nRE—Number of REs.
| - input Obj* vRE—Vector of REs.
| - input Obj regexps—Regular expressions.
| - input Obj substs—Rule substitutions.
| - input Obj names—Rule names.
| - input int order—Where the rule is in the grammar.
| - io pAffix affix—The affix tree root.
|
|
|
| ^ | | | | | | section top | | - protocol
| - input Obj* G—Items in the grammar.
| - io int r—Current rule index into G.
| - output Obj comment—Rule comments.
| - output int nRE—Number of REs.
| - output Obj* vRE—Vector of REs.
| - output Obj regexps—Regular expressions.
| - output Obj substs—Rule substitutions.
| - output Obj names—Rule names.
|
|
|
{
pStack stack[2]; int ni[2]; int nS,nt; Obj *vS,*vN; int pos; int off; chars what;
comment = incr(G[r++]);
for (off=0,what="match"; off<2; off++,what="edit",r++) {
<Nearly type 3 rule parsing>
}
nRE = ni[0]; vRE = nheap(nRE,Obj); vN = nheap(nRE,Obj);
for (pos=0; pos<nRE; pos++) {
vRE[pos] = incr(top(stack[0],0)); stack[0] = pop(stack[0]);
vN[pos] = incr(Tcl_NewObj());
<Edit RE to extract assigned variables and insert end anchors>
}
regexps = incr(Tcl_NewListObj(nRE,vRE));
for (pos=0; pos<nRE; pos++) decr(vRE[pos]); dispose(vRE);
Tcl_ListObjGetElements(0,regexps,&nt,&vRE);
names = incr(Tcl_NewListObj(nRE,vN));
for (pos=0; pos<nRE; pos++) {decr(vN[pos]);} dispose(vN);
nS = ni[1]; vS = nheap(nS,Obj);
for (pos=0; pos<nS; pos++) {
vS[pos] = incr(top(stack[1],0)); stack[1] = pop(stack[1]);
}
substs = incr(Tcl_NewListObj(nS,vS));
for (pos=0; pos<nS; pos++) {decr(vS[pos]);} dispose(vS);
} | ^ | | | | | | section top | | - protocol
| - input Obj* G—Items in the grammar.
| - input int r—Current rule index into G.
| - input int what—match or edit
| - input int off—0=match, 1=edit.
| - output pStack* stack—Stack of found subitems.
| - output int* ni—Number of found subitems.
| - output intr Intr—Possible error message.
| - output label abandonAffixTree—Error escape.
|
|
|
{
int db=0;
int n; bytes re = (bytes)Tcl_GetStringFromObj(G[r],&n);
Obj t = 0;
ni[off] = 0; stack[off] = 0;
findNextStackItem:
if (isspace(*re)) {re++,n--; goto findNextStackItem;}
switch (n==0 ? -1 : *re) {
case -1: stack[off] = push(push(stack[off],Tcl_NewObj()),Tcl_NewObj()); ni[off] += 2; goto okay;
case '('/*)*/: re++,n--; goto stackItemStart;
case '<': re++,n--; goto stateItemStart;
default: stack[off] = push(stack[off],Tcl_NewObj()); ni[off] += 1; goto inputItem;
}
stackItemStart:
t = incr(Tcl_NewObj());
db = 0;
stackItem:
switch (n==0 ? -1 : *re) {
case -1: rprintf(intr,/*(*/"missing closing ')' for the stack-%s: %{y}s",what,G[r]); goto error;
case '('/*)*/: db++; Tcl_AppendToObj(t,re,1); re++,n--; goto stackItem;
case /*(*/')': if (--db<0) goto stackItemEnd; Tcl_AppendToObj(t,re,1); re++,n--; goto stackItem;
case '\\': if (n>1) {Tcl_AppendToObj(t,re,2); re+=2,n-=2; goto stackItem;}
default: Tcl_AppendToObj(t,re,1); re++,n--; goto stackItem;
}
stackItemEnd:
re++,n--; stack[off] = push(stack[off],t); ni[off] += 1; decr(t); t = 0;
if (n>0 && *re==',') re++,n--;
goto findNextStackItem;
stateItemStart:
t = incr(Tcl_NewObj());
stateItem:
switch (n==0 ? -1 : *re) {
case -1: rprintf(intr,"missing closing '>' for the state-%s: %{y}s",what,G[r]); goto error;
case '>': re++,n--; stack[off] = push(stack[off],t); ni[off] += 1; decr(t); t = 0; goto inputItem;
default: Tcl_AppendToObj(t,re,1); re++,n--; goto stateItem;
}
inputItem:
stack[off] = push(stack[off],Tcl_NewStringObj(re,n)); ni[off] += 1;
goto okay;
error: {
int sp; for (sp=0; sp<=off; sp++) while (stack[sp]) stack[sp] = pop(stack[sp]);
decr(t); goto abandonAffixTree;
}
okay:
decr(t);
} | ^ | | | | | | section top | | - protocol
| - io Obj* vRE—Vector of REs.
| - io Obj* vN—List of extracted names.
| - input int pos—Which RE is being editted
|
|
|
{
int n; bytes s = (bytes)Tcl_GetStringFromObj(vRE[pos],&n);
Obj t = incr(Tcl_NewStringObj("^",1));
Obj names = vN[pos];
int db = 0,dv = 0,nv; chars vs;
editloop:
switch (n==0 ? -1 : *s) {
case -1: goto editend;
case '('/*)*/: Tcl_AppendToObj(t,s,1); s++,n--; goto afterOpen;
case '['/*]*/: db=1; Tcl_AppendToObj(t,s,1); s++,n--; goto bracketted;
case '\\': if (n>1) {Tcl_AppendToObj(t,s,1); s++,n--;}
default: Tcl_AppendToObj(t,s,1); s++,n--; goto editloop;
}
bracketted:
switch (n==0 ? -1 : *s) {
case -1: goto editend;
case '['/*]*/: Tcl_AppendToObj(t,s,1); s++,n--; db++; goto bracketted;
case /*[*/']': Tcl_AppendToObj(t,s,1); s++,n--; if (--db<=0); goto editloop; goto bracketted;
default: Tcl_AppendToObj(t,s,1); s++,n--; goto bracketted;
}
afterOpen:
switch (n==0 ? -1 : *s) {
case -1: goto editend;
case '?': s++,n--; goto afterOpenQuery;
default: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); goto editloop;
}
afterOpenQuery:
switch (n==0 ? -1 : *s) {
case -1: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); Tcl_AppendToObj(t,"?",-1); goto editend;
case ':': Tcl_AppendToObj(t,"?:",-1); s++,n--; goto editloop;
case '=': Tcl_AppendToObj(t,"?=",-1); s++,n--; goto editloop;
case '!': Tcl_AppendToObj(t,"?!",-1); s++,n--; goto editloop;
case '~': s++,n--; goto extractVariableName;
default: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); Tcl_AppendToObj(t,"?",-1); goto editloop;
}
extractVariableName:
dv = 0; vs = s; nv = 0;
variableName:
switch (n==0 ? -1 : *s) {
case -1: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); goto editend;
case '('/*)*/: dv++; nv++; s++,n--; goto variableName;
case /*(*/')': dv--; nv++; s++,n--; goto variableName;
case '=':
if (dv==0) {
Tcl_ListObjAppendElement(0,names,Tcl_NewStringObj(vs,nv)); s++,n--; goto editloop;
}else {
nv++; s++,n--; goto variableName;
}
default: nv++; s++,n--; goto variableName;
}
editend:
if (pos>0) Tcl_AppendToObj(t,"$",-1);
decr(vRE[pos]); vRE[pos] = t;
} | ^ | | | | | | section top | | - protocol
| - input Obj grammar—The grammar.
| - output pAffix affix—The affix tree for the grammar.
| - output Intr intr—Possible error message.
| - output label abandonAffixTree—Error escape.
|
|
|
| ^ | | | | section top | | - proc int createGrammarInternalRepresentation—Compile the grammar; returns TCL_OK or TCL_ERROR.
| - input Obj grammar—The grammar.
| - output Intr intr—Possible error messages.
|
|
|
static int createGrammarInternalRepresentation(Intr intr,Obj grammar) {
pAffix affix = 0;
if (grammar->typePtr!=&GrammarType) {
int nG; Obj *G; int r,order;
if (Tcl_ListObjGetElements(intr,grammar,&nG,&G)!=TCL_OK) {
return TCL_ERROR;
}else if (nG%3!=0) {
rprintf(intr,"grammar must be a list with length multiple of 3");
return TCL_ERROR;
}
for (r=0,order=0; r<nG; order++) {
Obj comment; int nRE; Obj *vRE,regexps,substs,names;
<Parse a rule from the grammar>
<Add a parsed rule to the affix tree>
}
if (grammar->typePtr && grammar->typePtr->freeIntRepProc)
grammar->typePtr->freeIntRepProc(grammar);
grammar->typePtr = &GrammarType;
<Create affix tree reference>
grammar->internalRep.otherValuePtr = affix;
return TCL_OK;
}else
return TCL_OK;
abandonAffixTree:
freeAffixTree(affix);
return TCL_ERROR;
}
| ^ | | | | | | section top | |
static int createGrammarInternalRepresentation(Intr intr,Obj grammar);
| ^ | | |
| |