DNA.
wyrm-dpda
Version.
1.1.9
Namespace.
::wyrm
Command.
::wyrm::dpda
Language.
c
Manpage.
dpda (1WY)
Testbase.
Test Script
Test Report
Import.
Interface.
wyrm-io
wyrm-assoc
wyrmwif
Package.
wyrmwif
wyrm-assoc
Export.
Implementation.
wyrm-dpda.c
Interface.
wyrm-dpda.h
Package.
wyrmdpda.dylib

Deterministic Push-Down Automata with RE Extensions

Sections.
DPDA Command
Configuration
DPDA Execution
Affix Tree
Configuration Parse Cycle
Object Stack
Grammar Tcl Type
Command Options
Debugging
Test Base
Make.
Package.
compile -cc -ld -o [export package] [export implementation] -- -list [
  import interface] [export interface
]
Script.
rule clean :: {} "
  -rm $test/wyrm-dpda.TESTING
"
rule clobber :: {} "
  -rm $include/wyrm-dpda.h
  -rm $so/wyrmdpda[info sharedlibextension]
"
   
top

1 :: Implement a dpda with state transitions defined by regular expression instead of simple strings.

Copyright (C) 2002 SM Ryan

Wyrmwif Tcl extensions. For non-profit uses only, provided this copyright is preserved on all copies, this work may be freely copied, modified, redistributed, compiled, and incorporated in other works. This work is distributed with no warranty of any kind; no author or distributor accepts any responsibility for the consequences of using it, or for whether it serves any particular purpose or works at all, unless he or she says so in writing.

   
   

DPDA Command

   
top
 
section    top

#ifndef WYRM_DPDA_H
#define WYRM_DPDA_H

	//	wyrm-dpda.dna - Copyright (C) 2002 wyrmwif@bigfoot.com.  All rights reserved.


	#include "wyrmwif.h"
	int Wyrmdpda_Init(Intr intr);
	int Wyrmdpda_SafeInit(Intr intr);

#endif

		
 
section    top
 
section    top

static int dpdaCommand(ClientData clientData,Intr intr,int N,Tcl_Obj *const P[]) {
	int rc = TCL_ERROR;
	pAffix affix = 0; int *nNA = 0; Obj rules = 0,rule = 0,**vNA = 0;
	<Command option variable declarations>
	<Initial configuration variable declarations>
	<Configuration variable declarations>
	<Command option parsing>
	if (N==3) {
		initialStack = 0; initialState = P[0]; initialInput = P[1]; grammar = P[2];
	}else if (N==4) {
		initialStack = P[0]; initialState = P[1]; initialInput = P[2]; grammar = P[3];
	}else {
		rprintf(intr,"usage: dpda [options] [initial-stack] initial-state string grammar");
		return TCL_ERROR;
	}
	<Initialise the configuration>
	affix = getAffixTree(intr,grammar);
	if (!affix) goto exit;
	<Dump affix tree and rules>
	<Parse the input until no rule matches the configuration>
	<Construct the return value and free local memory>
	return rc;
}

int Wyrmdpda_Init(Intr intr) {
	Tcl_PkgProvide(intr,"wyrmdpda",VERSION);
	Tcl_PkgRequire(intr,"wyrmwif","1",0);
	Tcl_PkgRequire(intr,"wyrmassoc","1",0);
	<Register grammar object type>
	Tcl_CreateObjCommand(intr,"::wyrm::dpda",dpdaCommand,0,0);
	return Tcl_VarEval(intr,
			"namespace eval ::wyrm {namespace export dpda}\n",
			0);
}

int Wyrmdpda_SafeInit(Intr intr) {
	return Wyrmdpda_Init(intr);
}

		
 
section    top

okay: {
		int d = depth(stack); Obj *s = nheap(d,Obj),r[3]; Obj *p = s+d;
		int ni; chars si = Tcl_GetStringFromObj(input,&ni);
		while (stack) {*--p = incr(top(stack,0)); stack = pop(stack);}
		r[0] = Tcl_NewListObj(d,s);
		r[1] = state;
		r[2] = Tcl_NewStringObj(si+inputoff,ni-inputoff);
		Tcl_SetObjResult(intr,Tcl_NewListObj(3,r));
		while (d-->0) decr(*p++); dispose(s);
		rc = TCL_OK;
	}
exit:
	while (stack) stack = pop(stack);
	decr(state);
	decr(input);
	freeAffixTree(affix);
	dispose(nNA);
	dispose(vNA);
	decr(rules);
	decr(rule);
	return rc;

		
   
   

Configuration

   
top
 
section    top
protocol
Obj initialStack—Initial stack or NULL.
Obj initialState—Initial state.
Obj initialInput—Initial input string.
Obj grammar—The dpda transition rules.
Obj initialStack=0,initialState=0,initialInput=0,grammar=0;
		
 
section    top
protocol
pStack stack—The current stack.
Obj state—The current state.
int inputlength—The current input length.
Obj input—The current input string.
int inputoff—How much of the input that has already been read.
pStack stack = 0;
Obj state = 0;
Obj input = 0; int inputoff = 0;
		
 
section    top
 
section    top
protocol
input Obj initialStack—Initial stack or NULL.
input Obj initialState—Initial state.
input Obj initialInput—Initial input string.
output pStack stack—The current stack.
output Obj state—The current state.
output int inputlength—The current input length.
output Obj input—The current input string.
output int inputoff—zero.
output Intr intr—Error messages.

if (initialStack) {
	int ne; Obj *pe;
	if (Tcl_ListObjGetElements(intr,initialStack,&ne,&pe)!=TCL_OK) return TCL_ERROR;
	for (; ne>0; ne--,pe++) stack = push(stack,*pe);
}
{
	int n; chars s = Tcl_GetStringFromObj(initialState,&n);
	state = incr(*s=='<' && s[n-1]=='>' ? Tcl_NewStringObj(s+1,n-2) : initialState);
}
input = incr(initialInput);
inputoff = 0;
			
		
   
   

DPDA Execution

   
top
 
section    top
13. Command description :: <Command option description>

dpda implements a deterministic push-down automata where state transition rules use regular expressions instead of fixed strings. The command line gives the optional initial-stack (an empty stack by default), the initial-state, the input string, and the grammar rules. On completion, dpda returns a three element list, the first element is a sublist which is the stack at completion (from bottom of stack to top of stack), the final state, and any remaining unparsed input.

dpda cycles through a series of states. With each state a rule from the grammar is chosen based on the top elements of the stack, the current state, and the beginning of the remainder of the input. The rule can transform the stack and input and returns a new state name. This continues until no rule matches, the parser halts and the final configuration is returned.

initial-stack
The initial configuration stack, as a list of stack elements. The first list element is the bottom of the stack. If omitted, the stack is initially empty.
initial-state
The initial configuration state.
input
The input to parse.
grammar
The grammar to parse with.

The grammar is not modifiable while parsing. If the grammar object is modified, the changes are ignored by the parser until the parse is complete.

<Configuration representation><Sequence of operations in the command><Parse a rule from the grammar><Edit RE to extract assigned variables and insert end anchors><Edit the matched configuration>
   
   

Affix Tree

   
top
typedef struct Affix Affix,*pAffix,**hAffix;
		
 
section    top

15. Affix tree :: Even in the best of circumstances repeated scanning through rules is going to be slow; and adding regular expression parsing and matching is going to be even slower. To speed this up a bit, fixed prefixes and suffixes of the various regular expressions are extracted, and these are used to narrow the number of rules that are checked.

In essence for each rule is discovered a conjuction of prefixes and suffixes that must match if all the REs in the rule will match (though of course the REs can still fail on the entire string).

begins("abc",input) and begins("xyz",state) and ends("pqr",top(stack,1)) and ...
(The conjunction is always formed in this same order, input, state, top of stack, next of stack, et cetera, prefix then suffix.)

The suffix of the input is never checked. Transitions involving the whole input string are discouraged.

This conjunction can be represented by a list of positions and strings

if ((0,"abc"),(1,"xyz"),(-3,"pqr")) then rule i
where position 0 is the input prefix, 1 is the state prefix, -1 is the state suffix, 2 and -2 are the top of stack prefix and suffix, and so forth. This can be further refined by allowing a position to be repeated, which checks successive characters of the affix.
if ((0,"ab"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr")) then rule i

After considerring one rule in isolation, all the rules can be considerred, forming a set of conjunctions

if ((0,"ab"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr")) then rule i
if ((0,"ab"),(0,"d"),(0,"pqr")) then rule j
if ((0,"xy"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr")) then rule k
A few items of interests are
A given configuration can satify many conjunctions. All possibilities must be discoverred and then tested in rule order.
These conjunctions can have common subexpressions.

The common subexpressions are exploited somewhat by building a tree on the conjunctions. As long as the leading antecedents are the same, two rules are on the same branch.

if ((0,"ab"))
if ((0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr")) then rule i
if ((0,"d"),(0,"pqr")) then rule j
if ((0,"xy"),(0,"c"),(1,"x"),(1,"y"),(1,"z"),(-3,"pqr")) then rule k
This tree is called the affix tree.

type struct Affix—Tree to match fixed affixes of REs.
int pos—Which substring and end of the substrings is being checked.
chars substring—The substring in this antecedent.
Memory. This refers to the RE object string representation.
chars substringLength—The substring length.
chars lt—Followed if the queried position or string is less than this one.
chars eq—Followed if the queried position and string is equal to this one.
chars gt—Followed if the queried position or string is greater than this one.
chars consequent—Rules implied by the antecedents from here to the root.

struct Affix {
	int pos;
	chars substring;
	int substringLength;
	pAffix lt,eq,gt;
	Obj consequent;
};

		
 
section    top
protocol
io hAffix branch—Where in the tree; this points to link field to the node, not the node itself.
io chars newpos—The new substring position.
io chars newsubstring—The new substring.
io chars newsubstringLength—The new substring length.

while (newsubstringLength>0) {
	pAffix node = *branch;
	if (!node) {
		node = heap(Affix); zero (1,Affix,node);
		node->pos = pos;
		node->substringLength = newsubstringLength;
		node->substring = newsubstring;
		*branch = node;
		newsubstringLength = 0;
	}else if (pos<node->pos) {
		branch = &node->lt;
	}else if (pos>node->pos) {
		branch = &node->gt;
	}else if (pos>=0 && newsubstring[0]<node->substring[0]) {
		branch = &node->lt;
	}else if (pos>=0 && newsubstring[0]>node->substring[0]) {
		branch = &node->gt;
	}else if (pos<0 && newsubstring[newsubstringLength-1]<node->substring[node->substringLength-1]) {
		branch = &node->lt;
	}else if (pos<0 && newsubstring[newsubstringLength-1]>node->substring[node->substringLength-1]) {
		branch = &node->gt;
	}else if (newsubstringLength==node->substringLength
			&& memcmp(newsubstring,node->substring,newsubstringLength)==0
	) {
		newsubstringLength = 0;
	}else if (pos>=0) {
		<Split prefix node with common initial substring>
	}else if (pos<0) {
		<Split suffix node with common final substring>
	}
}

		
 
section    top
protocol
io hAffix branch—Where in the tree; this points to link field to the node, not the node itself.
io pAffix node—Queried and modified node.
io chars newpos—The new substring position.
io chars newsubstring—The new substring.
io chars newsubstringLength—The new substring length.

if (newsubstringLength>node->substringLength
		&& memcmp(newsubstring,node->substring,node->substringLength)==0
) {
	newsubstringLength -= node->substringLength;
	newsubstring += node->substringLength;
	branch = &node->eq;
}else {
	int common=0; pAffix splitNode;
	while (common<newsubstringLength && newsubstring[common]==node->substring[common]) common ++;
	splitNode = heap(Affix); *splitNode = *node;
	splitNode->substringLength -= common;
	splitNode->substring += common;
	node->consequent = 0;
	node->substringLength = common;
	splitNode->lt = splitNode->gt = 0; node->eq = splitNode;
	newsubstringLength -= common;
	newsubstring += common;
	if (newsubstringLength) branch = &node->eq;
}

		
 
section    top
protocol
io hAffix branch—Where in the tree;\ this points to link field to the node, not the node itself.
io pAffix node—Queried and modified node.
io chars newpos—The new substring position.
io chars newsubstring—The new substring.
io chars newsubstringLength—The new substring length.

if (newsubstringLength>node->substringLength
		&& memcmp(newsubstring,node->substring-node->substringLength,node->substringLength)==0
) {
	newsubstringLength -= node->substringLength;
	branch = &node->eq;
}else {
	int common=0; pAffix splitNode;
	while (common<newsubstringLength
			&& newsubstring[newsubstringLength-1-common]==node->substring[node->substringLength-1-common]
	)
		common ++;
	splitNode = heap(Affix); *splitNode = *node;
	splitNode->substringLength -= common;
	node->consequent = 0;
	node->substringLength = common;
	node->substring += splitNode->substringLength;
	splitNode->lt = splitNode->gt = 0; node->eq = splitNode;
	newsubstringLength -= common;
	if (newsubstringLength) branch = &node->eq;
}

		
 
section    top
protocol
input Obj RE—Regular expression.
output int newsubstringLength—Length of the prefix.
output chars newsubstring—The prefix.
output bool allFixed—If the entire RE was matched: it has no meta.
{

int reLength; chars re = Tcl_GetStringFromObj(RE,&reLength);
int pdepth = 0,bdepth = 0;
if (reLength>0 && *re=='^') re++,reLength--;
newsubstringLength = 0; newsubstring = re; allFixed = true;
while (allFixed && reLength>0) switch (*re) {
	case '*': case '?':
		if (newsubstringLength>0) newsubstringLength--;
		allFixed = false;
		break;
	case '{':
		if (reLength>1 && !isdigit(re[1])) goto fixedpart;
	case '}': case '(': case ')': case '[': case ']':
	case '+': case '.': case '|': case '\\':
		allFixed = false;
		break;
	case '$':
		if (reLength>1) allFixed = false;
		else re++,reLength--;
		break;
	default: fixedpart:
		newsubstringLength++; re++,reLength--;
		break;
}
while (newsubstringLength>0 && reLength>0) switch (*re) {
	case '('/*)*/: if (bdepth==0) pdepth++; re++,reLength--; break;
	case /*(*/')': if (bdepth==0) pdepth--; re++,reLength--; break;
	case '['/*]*/: bdepth++; re++,reLength--; break;
	case /*[*/']': bdepth--; re++,reLength--; break;
	case '\\': re+=2,reLength-=2; break;
	case '|': if (pdepth==0 && bdepth==0) newsubstringLength = 0; re++,reLength--; break;
	default: re++,reLength--; break;
}

		}
 
section    top
protocol
input Obj RE—Regular expression.
output int newsubstringLength—Length of the prefix.
output chars newsubstring—The prefix.
{

int reLength; chars re = Tcl_GetStringFromObj(RE,&reLength);
int pdepth = 0,bdepth = 0; bool stillFixed = true;
re += reLength-1;
if (reLength>0 && *re=='$') re--,reLength--;
newsubstringLength = 0; newsubstring = re+1;
while (stillFixed && reLength>0) switch (*re) {
	case '\\':
		if (newsubstringLength>0) newsubstring--,newsubstringLength--;
	case '{': case '}': case '(': case ')': case '[': case ']':
	case '*': case '?': case '+': case '.': case '|':
		stillFixed = false;
		break;
	case '^':
		if (reLength>1) stillFixed = false;
		else re--,reLength--;
		break;
	default:
		newsubstring--,newsubstringLength++; re--,reLength--;
		break;
}
while (newsubstringLength>0 && reLength>0) 
	if (reLength>1 && re[-1]=='\\') re-=2,reLength-=2;
	else switch (*re) {
		case '('/*)*/: if (bdepth==0) pdepth--; re--,reLength--; break;
		case /*(*/')': if (bdepth==0) pdepth++; re--,reLength--; break;
		case '['/*]*/: bdepth--; re--,reLength--; break;
		case /*[*/']': bdepth++; re--,reLength--; break;
		case '|': if (pdepth==0 && bdepth==0) newsubstringLength = 0; re--,reLength--; break;
		default: re++,reLength--; break;
	}

		}
 
section    top
protocol
input int nRE—Number of regular expressions.
io Obj* vRE—Regular expressions.
io pAffix affix—Root of the affix true.
output pAffix conclusion—The last node of the last antecedent.
{

int pos; hAffix branch = &affix; bool anything = false;
for (pos=0; pos<nRE; pos++) {
	int end; Obj RE = vRE[pos];
	for (end=0; end<2; end++) {
		int newsubstringLength; chars newsubstring; bool allFixed;
		if (pos>=0) {
			<Find fixed RE prefix>
		}else if (!allFixed) {
			<Find fixed RE suffix>
		}
		if (newsubstringLength>0) {
			if (anything) {
				branch = &((*branch)->eq);
			}
			anything = true;
			<Insert an antecedent in the affix tree>
		}
		if (!pos) break; pos = -pos;
	}
}
<Add a special empty substring to the root if all affixes were empty>
conclusion = *branch;

		}
 
section    top
protocol
input bool anything—If there were any affixes.
io pAffix affix—The root which will match empty if not anything.
output chars branch—Pointed to the root if necessary.
{

if (!affix) {
	affix = heap(Affix); zero(1,Affix,affix);
	affix->substring = "";
	branch = &affix;
}
if (!anything) {
	if (affix->substringLength>0) {
		pAffix nonempty = heap(Affix);
		*nonempty = *affix;
		affix->consequent = 0;
		affix->substringLength = 0;
		affix->substring = "";
		affix->gt = affix->lt = 0; affix->eq = nonempty;
	}
	branch = &affix;
}

		}
 
section    top

23 :: Leaving aside for a moment how rules are parsed into their internal form, the internal form of the rule is a set of parallel arrays inside a Tcl nested list.

{
rule-comment
{
input-re
state-re
stack-top-re
stack-next-top-re
...
}
{
input-subst
state-subst
stack-top-subst
stack-next-top-subst
...
}
{
{
input-re-variable
input-re-variable
...
}
{
state-re-variable
state-re-variable
...
}
{
top-re-variable
top-re-variable
...
}
...
}
}

The affix tree identifies all possible rules that might apply to a configuration. The possible rules then have to be tested in grammar order until a match. Once the affix tree node is found that matches all the fixed affixes, the rule order and contents are added to that node; and because other rules might have these same affixes, the node holds a list of orders and rules. During matching, these are merged with other affix nodes and sorted on their order.

The sorting will be done with associative maps, which only sort on ascii strings. The order is written out as fixed length zero padded string so that its ascii string sort order is the same as its numerical sort order.

 
section    top
protocol
input Obj comment—Rule comments.
Memory. Reference count decremented.
input Obj regexps—Regular expressions.
Memory. Reference count decremented.
input Obj substs—Rule substitutions.
Memory. Reference count decremented.
input Obj names—Rule names.
Memory. Reference count decremented.
output Obj ruleObj—The packaged rule.
input int order—Where the rule is in the grammar.
output Obj orderObj—Encoded rule order.
{

Obj ruleItem[4]; int i;
ruleItem[0] = comment;
ruleItem[1] = regexps;
ruleItem[2] = substs;
ruleItem[3] = names;
ruleObj = incr(Tcl_NewListObj(4,ruleItem));
decr(comment); decr(regexps); decr(substs); decr(names);
orderObj = oprintf("%012d",order);

		}
 
section    top
protocol
input Obj rule—The rule.
Memory. The output use this object and are valid until its reference count is decremented.
output Intr intr—Possible error messages.
output label exit—Error exit.
output Obj comment—Rule comment
output int nRE—Number of REs.
output Obj* vRE—Vector of REs.
output Obj nSU—Number of substitutions.
output Obj vSU—Substitutions.
output int* nNA—Number of names for each RE.
Memory. Caller disposes.
output Obj** vNA—Names for each RE.
Memory. Caller disposes.
{


int npieces; Obj *piece;
int nnames; Obj *name; int i;
if (Tcl_ListObjGetElements(0,rule,&npieces,&piece)==TCL_ERROR || npieces<4) {
	rprintf(intr,"rule has been damaged: %{y}s",rule); goto exit;
}
comment = piece[0];
if (Tcl_ListObjGetElements(0,piece[1],&nRE,&vRE)==TCL_ERROR) {
	rprintf(intr,"rule has been damaged: RE %{y}s",piece[1]); goto exit;
}
if (Tcl_ListObjGetElements(0,piece[2],&nSU,&vSU)==TCL_ERROR) {
	rprintf(intr,"rule has been damaged: substitutions %{y}s",piece[2]); goto exit;
}
if (Tcl_ListObjGetElements(0,piece[3],&nnames,&name)==TCL_ERROR || nnames<nRE) {
	rprintf(intr,"rule has been damaged: names %{y}s",piece[3]); goto exit;
}
nNA = nheap(nRE,int); vNA = nheap(nRE,Obj*);
for (i=0; i<nRE; i++) {
	if (Tcl_ListObjGetElements(0,name[i],&nNA[i],&vNA[i])==TCL_ERROR) {
		rprintf(intr,"rule has been damaged: name %d %{y}s",i,name[i]); goto exit;
	}
}

		}
 
section    top
protocol
input Obj comment—Rule comments.
input int nRE—Number of REs.
input Obj* vRE—Vector of REs.
input Obj regexps—Regular expressions.
input Obj substs—Rule substitutions.
input Obj names—Rule names.
input int order—Where the rule is in the grammar.
io pAffix affix—The affix tree root.
{

pAffix conclusion = 0;
Obj ruleObj = 0;
Obj orderObj = 0;
<Add a conjunction of rule antecdents to the affix tree>
<Pack the rule pieces before adding to the affix tree>
if (!conclusion->consequent) conclusion->consequent = incr(Tcl_NewObj());
Tcl_ListObjAppendElement(0,conclusion->consequent,orderObj); decr(orderObj);
Tcl_ListObjAppendElement(0,conclusion->consequent,ruleObj); decr(ruleObj);

		}
 
section    top
protocol
input Obj* G—Items in the grammar.
io int r—Current rule index into G.
output Obj comment—Rule comments.
output int nRE—Number of REs.
output Obj* vRE—Vector of REs.
output Obj regexps—Regular expressions.
output Obj substs—Rule substitutions.
output Obj names—Rule names.
{

pStack stack[2]; int ni[2]; int nS,nt; Obj *vS,*vN; int pos; int off; chars what;
comment = incr(G[r++]);
for (off=0,what="match"; off<2; off++,what="edit",r++) {
	<Nearly type 3 rule parsing>
}

nRE = ni[0]; vRE = nheap(nRE,Obj); vN = nheap(nRE,Obj);
for (pos=0; pos<nRE; pos++) {
	vRE[pos] = incr(top(stack[0],0)); stack[0] = pop(stack[0]);
	vN[pos] = incr(Tcl_NewObj());
	<Edit RE to extract assigned variables and insert end anchors>
}
regexps = incr(Tcl_NewListObj(nRE,vRE));
for (pos=0; pos<nRE; pos++) decr(vRE[pos]); dispose(vRE);
Tcl_ListObjGetElements(0,regexps,&nt,&vRE);
names = incr(Tcl_NewListObj(nRE,vN));
for (pos=0; pos<nRE; pos++) {decr(vN[pos]);} dispose(vN);

nS = ni[1]; vS = nheap(nS,Obj);
for (pos=0; pos<nS; pos++) {
	vS[pos] = incr(top(stack[1],0)); stack[1] = pop(stack[1]);
}
substs = incr(Tcl_NewListObj(nS,vS));
for (pos=0; pos<nS; pos++) {decr(vS[pos]);} dispose(vS);

		}
 
section    top
protocol
input Obj* G—Items in the grammar.
input int r—Current rule index into G.
input int what—match or edit
input int off—0=match, 1=edit.
output pStack* stack—Stack of found subitems.
output int* ni—Number of found subitems.
output intr Intr—Possible error message.
output label abandonAffixTree—Error escape.
{

int db=0;
int n; bytes re = (bytes)Tcl_GetStringFromObj(G[r],&n);
Obj t = 0;
ni[off] = 0; stack[off] = 0;
findNextStackItem:
	if (isspace(*re)) {re++,n--; goto findNextStackItem;}
	switch (n==0 ? -1 : *re) {
		case -1: stack[off] = push(push(stack[off],Tcl_NewObj()),Tcl_NewObj()); ni[off] += 2; goto okay;
		case '('/*)*/: re++,n--; goto stackItemStart;
		case '<': re++,n--; goto stateItemStart;
		default: stack[off] = push(stack[off],Tcl_NewObj()); ni[off] += 1; goto inputItem;
	}
stackItemStart:
	t = incr(Tcl_NewObj());
	db = 0;
stackItem:
	switch (n==0 ? -1 : *re) {
		case -1: rprintf(intr,/*(*/"missing closing ')' for the stack-%s: %{y}s",what,G[r]); goto error;
		case '('/*)*/: db++; Tcl_AppendToObj(t,re,1); re++,n--; goto stackItem;
		case /*(*/')': if (--db<0) goto stackItemEnd; Tcl_AppendToObj(t,re,1); re++,n--; goto stackItem;
		case '\\': if (n>1) {Tcl_AppendToObj(t,re,2);  re+=2,n-=2; goto stackItem;}
		default: Tcl_AppendToObj(t,re,1); re++,n--; goto stackItem;
	}
stackItemEnd:
	re++,n--; stack[off] = push(stack[off],t); ni[off] += 1; decr(t); t = 0;
	if (n>0 && *re==',') re++,n--;
	goto findNextStackItem;
stateItemStart:
	t = incr(Tcl_NewObj());
stateItem:
	switch (n==0 ? -1 : *re) {
		case -1: rprintf(intr,"missing closing '>' for the state-%s: %{y}s",what,G[r]); goto error;
		case '>': re++,n--; stack[off] = push(stack[off],t); ni[off] += 1; decr(t); t = 0; goto inputItem;
		default: Tcl_AppendToObj(t,re,1); re++,n--; goto stateItem;
	}
inputItem:
	stack[off] = push(stack[off],Tcl_NewStringObj(re,n)); ni[off] += 1;
	goto okay;
error: {
	int sp; for (sp=0; sp<=off; sp++) while (stack[sp]) stack[sp] = pop(stack[sp]);
	decr(t); goto abandonAffixTree;
}
okay:
	decr(t);
	
		}
 
section    top

29. Edit RE to extract assigned variables and insert end anchors :: The REs are Tcl regular expressions with one extension: (?~variable=RE). If the regular expression RE matches, the substring it matches is assigned to the local variable variable which may be used as $variable in the substitution expressions. This is equivalent to matching a variable list to parentheses in the usual regular expressions commands. Variables are assigned as matches are made; if a rule's substitution refer to a variable assigned in another rule's matches and not its own, the value of the variable is undefined.

The regular expressions in stack and state matches must match the whole string as if specified ^RE$, whether actually specified that way or not. The input match must always match the beginning of the string, as if ^RE; the whole string will only be matched if the rule has an explicit $ ending the regular expression.

protocol
io Obj* vRE—Vector of REs.
io Obj* vN—List of extracted names.
input int pos—Which RE is being editted
{

int n; bytes s = (bytes)Tcl_GetStringFromObj(vRE[pos],&n);
Obj t = incr(Tcl_NewStringObj("^",1));
Obj names = vN[pos];
int db = 0,dv = 0,nv; chars vs;
editloop:
	switch (n==0 ? -1 : *s) {
		case -1: goto editend;
		case '('/*)*/: Tcl_AppendToObj(t,s,1); s++,n--; goto afterOpen;
		case '['/*]*/: db=1; Tcl_AppendToObj(t,s,1); s++,n--; goto bracketted;
		case '\\': if (n>1) {Tcl_AppendToObj(t,s,1); s++,n--;}
		default: Tcl_AppendToObj(t,s,1); s++,n--; goto editloop;
	}
bracketted:
	switch (n==0 ? -1 : *s) {
		case -1: goto editend;
		case '['/*]*/: Tcl_AppendToObj(t,s,1); s++,n--; db++; goto bracketted;
		case /*[*/']': Tcl_AppendToObj(t,s,1); s++,n--; if (--db<=0); goto editloop; goto bracketted;
		default: Tcl_AppendToObj(t,s,1); s++,n--; goto bracketted;
	}
afterOpen:
	switch (n==0 ? -1 : *s) {
		case -1: goto editend;
		case '?': s++,n--; goto afterOpenQuery;
		default: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); goto editloop;
	}
afterOpenQuery:
	switch (n==0 ? -1 : *s) {
		case -1: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); Tcl_AppendToObj(t,"?",-1); goto editend;
		case ':': Tcl_AppendToObj(t,"?:",-1); s++,n--; goto editloop;
		case '=': Tcl_AppendToObj(t,"?=",-1); s++,n--; goto editloop;
		case '!': Tcl_AppendToObj(t,"?!",-1); s++,n--; goto editloop;
		case '~': s++,n--; goto extractVariableName;
		default: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); Tcl_AppendToObj(t,"?",-1); goto editloop;
	}
extractVariableName:
	dv = 0; vs = s; nv = 0;
variableName:
	switch (n==0 ? -1 : *s) {
		case -1: Tcl_ListObjAppendElement(0,names,Tcl_NewObj()); goto editend;
		case '('/*)*/: dv++; nv++; s++,n--; goto variableName;
		case /*(*/')': dv--; nv++; s++,n--; goto variableName;
		case '=':
			if (dv==0) {
				Tcl_ListObjAppendElement(0,names,Tcl_NewStringObj(vs,nv)); s++,n--; goto editloop;
			}else {
				nv++; s++,n--; goto variableName;
			}
		default: nv++; s++,n--; goto variableName;
	}
	
editend:
	if (pos>0) Tcl_AppendToObj(t,"$",-1);
	decr(vRE[pos]); vRE[pos] = t;

		}
 
section    top
protocol
input Obj grammar—The grammar.
output pAffix affix—The affix tree for the grammar.
output Intr intr—Possible error message.
output label abandonAffixTree—Error escape.
{


		}
 
section    top
proc int createGrammarInternalRepresentation—Compile the grammar; returns TCL_OK or TCL_ERROR.
input Obj grammar—The grammar.
output Intr intr—Possible error messages.

static int createGrammarInternalRepresentation(Intr intr,Obj grammar) {
	pAffix affix = 0;
	if (grammar->typePtr!=&GrammarType) {
		int nG; Obj *G; int r,order;
		if (Tcl_ListObjGetElements(intr,grammar,&nG,&G)!=TCL_OK) {
			return TCL_ERROR;
		}else if (nG%3!=0) {
			rprintf(intr,"grammar must be a list with length multiple of 3");
			return TCL_ERROR;
		}
		for (r=0,order=0; r<nG; order++) {
			Obj comment; int nRE; Obj *vRE,regexps,substs,names;
			<Parse a rule from the grammar>
			<Add a parsed rule to the affix tree>
		}
		if (grammar->typePtr && grammar->typePtr->freeIntRepProc)
			grammar->typePtr->freeIntRepProc(grammar);
		grammar->typePtr = &GrammarType;
		<Create affix tree reference>
		grammar->internalRep.otherValuePtr = affix;
		return TCL_OK;
	}else
		return TCL_OK;
abandonAffixTree:
	freeAffixTree(affix);
	return TCL_ERROR;
}

		
 
section    top
static int createGrammarInternalRepresentation(Intr intr,Obj grammar);
		
   
   

Configuration Parse Cycle

   
top
protocol
input pAffix affix—The affix tree.
io Obj input—Configuration input string.
io int inputoff—Configuration input string offset.
io Obj state—Configuration state.
io pStack stack—Configuration stack.
{
bool matched = true;
while (matched) {
	matched = false; rules = incr(Tcl_NewObj());
	affixMatch(affix,0,0,input,inputoff,state,stack,rules);
	<Dump pausible rules>
	while (wyrm_assocEmpty(0,rules)==TCL_BREAK && !matched) {
		int nRE,nSU; Obj comment,*vRE,*vSU; int inputMatchedLength;
		Obj key = wyrm_assocFirst(0,rules);
		rule = wyrm_assocData(0,rules,key);
		wyrm_assocDelete(0,rules,key);
		decr(key);
		<Unpack the rule into match, edit, and name pieces>
		<Match the rule to the configuration>
		if (matched) {
			<Edit the matched configuration>
		}
		decr(rule); rule = 0;
		dispose(nNA); nNA = 0; dispose(vNA); vNA = 0;
	}
	decr(rules); rules = 0;
}

		}
 
section    top

34. Matching the current configuration against the affix tree :: The affix tree is used to discover potential rules that can apply to the current configuration. The lt and gt links are refer to antecedents that are both different positions and different substrings at the same positions. In the former, all different positions have to be checked regardless of this antecedent, but for the latter the tree could have been constructed not to check substrings at the same position which could not possibly match. However that was not done to simplify the tree, and because it is anticpated it would not reduce the checks much more than by a few branches: it is anticpated that a short distance from the root the tree will become sparse. Because of how the tree is thus actually constructed, the lt and gt links are followed unconditionally. Only the eq link depends on the current antecedent.

The offset keeps track of how much has been checked at the current position by nodes above the current one; a substring can be split to exploit commonalities. If the position of the current node changes, it just means start checking a new substring.

If the actual substrings and node substrings match, this antecedent is satisfied and all rules catalogged in the antecedent might apply to this configuration. The consequent is built as an alternating sequence of rule order and rule contents, which are added as an associative map key and data; the map will do the sorting, and the rules can be picked off easily. If this antecedent is satisfied then continued along the eq for more antecedents which are conjoined to it.

The top node of the affix tree actually has a reference count rather than a substring, and this indicated by a negative substring length; such nodes are just skipped and the real tree is at the eq link. Under this may be the only empty substring with length zero; this node is an always true antecedent.

proc void affixMatch—Seach for rules that match configuration affixes.
input pAffix node—Node under consideration.
input int pos—Which position the offset refers to.
input int offset—How much at that position has already been checked.
input Obj input—Configuration input string.
input int inputoff—Configuration input string offset.
input Obj state—Configuration state.
input pStack stack—Configuration stack.
output Obj rules—Rules that can match this configuration.

static void affixMatch(pAffix node,int pos,int offset,Obj input,int inputoff,Obj state,pStack stack,Obj rules) {
	while (node) {
		chars actualsubstring;
		affixMatch(node->lt,pos,offset,input,inputoff,state,stack,rules);
		affixMatch(node->gt,pos,offset,input,inputoff,state,stack,rules);
		if (node->substringLength<0) {node = node->eq; continue;}
		if (node->pos!=pos) offset = 0; pos = node->pos;
		<Get the actual substring at this position and offset>
		if (actualsubstring && memcmp(actualsubstring,node->substring,node->substringLength)==0
		) {
			if (node->consequent) {
				int nr; Obj *pr;
				if (Tcl_ListObjGetElements(0,node->consequent,&nr,&pr)==TCL_ERROR) nr = 0;
				for (; nr>1; nr-=2,pr+=2) {
					wyrm_assocPut(0,rules,pr[0],pr[1],0);
				}
			}
			offset += node->substringLength;
			node = node->eq;
		}else
			node = 0;
	}
}

		
 
section    top
static void affixMatch(pAffix node,int pos,int offset,Obj input,int inputoff,Obj state,pStack stack,Obj rules);
		
 
section    top
protocol
input pAffix node->substringLength—How long the substring is.
input int pos—Which position the offset refers to.
input int offset—How much at that position has already been checked.
input Obj input—Configuration input string.
input int inputoff—Configuration input string offset.
input Obj state—Configuration state.
input pStack stack—Configuration stack.
output chars actualsubstring—Configuration substring at this position and offset, or NULL if no such substring.

if (node->substringLength==0)
	actualsubstring = "";
else switch (pos) {
	case 0: {
		int n; chars s = Tcl_GetStringFromObj(input,&n);
		n -= inputoff; s += inputoff;
		actualsubstring = offset+node->substringLength>n ? 0 : s+offset;
	}	break;
	case 1: {
		int n; chars s = Tcl_GetStringFromObj(state,&n);
		actualsubstring = offset+node->substringLength>n ? 0 : s+offset;
	}	break;
	case -1: {
		int n; chars s = Tcl_GetStringFromObj(state,&n);
		actualsubstring = offset+node->substringLength>n ? 0 : s+n-node->substringLength-offset;
	}	break;
	default: {
		Obj stackentry = top(stack,(pos<0?-pos:pos)-2);
		if (stackentry) {
			int n; chars s = Tcl_GetStringFromObj(stackentry,&n);
			actualsubstring = 
					offset+node->substringLength>n ? 0
				:	pos>=0 ? s+offset
				:	s+n-node->substringLength-offset;
		}else {
			actualsubstring = 0;
		}
	}	break;
}

		
 
section    top
protocol
input Obj input—Configuration input string.
input int inputoff—Configuration input string offset.
input Obj state—Configuration state.
input pStack stack—Configuration stack.
input Obj comment—Rule comment
input int nRE—Number of REs.
input Obj* vRE—Vector of REs.
input int* nNA—Number of names for each RE.
input Obj** vNA—Names for each RE.
output bool matched—If the rule matches.
output int inputMatchedLength—How much of the input was matched.
{

int pos; inputMatchedLength = 0;
<Dump rule before RE matching>
for (pos=0,matched=true; matched && pos<nRE; pos++) {
	Tcl_RegExp re; int i,rs,offset,n; chars s;
	Tcl_RegExpInfo info; Tcl_RegExpIndices matchedAll[2];
	Obj string = pos==0 ? input : pos==1 ? state : top(stack,pos-2);
	if (!string) {matched = false; continue;}
	offset = pos==0 ? inputoff : 0;
	s = Tcl_GetStringFromObj(string,&n);
	<Special case for match all>
	else {
		re = Tcl_GetRegExpFromObj(intr,vRE[pos],cflags);
		rs = re ? Tcl_RegExpExecObj(intr,re,string,offset,nNA[pos]+1,0) : -1;
		if (rs<0) {
			Tcl_AppendResult(intr,": ",Tcl_GetString(vRE[pos]),0);
			goto exit;
		}
		if ((matched=rs>0)) {
			zero(1,Tcl_RegExpInfo,&info);
			Tcl_RegExpGetInfo(re,&info);
		}
	}
	<Dump rule during RE matching>
	if (matched) {
		if (pos==0) inputMatchedLength = info.matches[0].end-info.matches[0].start;
		for (i=0; i<nNA[pos]; i++) {
			if (!streq(Tcl_GetString(vNA[pos][i]),"")) {
				Obj value,assigned;
				value = incr(i>=info.nsubs
					? Tcl_NewObj()
					: Tcl_NewStringObj(
							s+info.matches[i+1].start+offset,
							info.matches[i+1].end-info.matches[i+1].start));
				<Dump variable assignments during RE matching>
				assigned = Tcl_ObjSetVar2(intr,vNA[pos][i],0,value,TCL_LEAVE_ERR_MSG);
				decr(value); if (!assigned) goto exit;
			}
		}
	}
}

		}
 
section    top

if (
		streq(Tcl_GetString(vRE[pos]),".*") || streq(Tcl_GetString(vRE[pos]),"(.*)")
		|| streq(Tcl_GetString(vRE[pos]),"^.*$") || streq(Tcl_GetString(vRE[pos]),"^(.*)$")
) {
	matched = true; info.nsubs = 2; info.matches = matchedAll;
	matchedAll[0].start = matchedAll[1].start = offset;
	matchedAll[0].end = matchedAll[1].end = n-offset;
}

		
 
section    top

39. Edit the matched configuration :: If a rule matches the current configuration, the edit item is substituted to convert to the next configuration. First, all matched portions of the configuration are deleted: the top stack entries that were matched, the state string, and the initial matched portion of the input string. Then the new stack nodes are substituted and pushed in order, the new state string is substituted, and the input edit is substituted and pushed back onto the beginning of the input string.

Each subst-string is individually passed to the subst command to do all nested command, variable reference, and backslash substitution. Variable can be assigned during the matches and then used in substitutions; while the matched elements are deleted, the variables can still hold matched substrings of the now deleted elements.

Whole stack entries are deleted and then pushed. Part of a stack entry can be saved in a variable, and that variable used to create a new stack entry, but those are separate stack elements rather than modification of the same entry. The number of stack entries in the match and edit need not be the same; if the edit has more, the stack size will increase; if the edit has fewer, the stack will shrink.

The entire state is replaced. Variables may be used to copy substrings from the old state to the new state. The initial matched portion of the input is replaced. The rest of the string that was not examined by the RE is left unmodified.

protocol
io Obj input—Configuration input string.
io int inputoff—Configuration input string offset.
io Obj state—Configuration state.
io pStack stack—Configuration stack.
output int inputMatchedLength—How much of the input was matched.
output Obj comment—Rule comment
output int nRE—Number of REs.
output Obj nSU—Number of substitutions.
output Obj vSU—Substitutions.
{
		
int i,newinputlength = 0;
for (i=2; i<nRE; i++) stack = pop(stack);
for (i=nSU-1; i>=2; i--) {
	Obj newentry = substitute(intr,vSU[i]);
	if (!newentry) goto exit;
	stack = push(stack,newentry);
	decr(newentry);
}
decr(state); state = nSU>1 ? substitute(intr,vSU[1]) : incr(Tcl_NewObj());
if (!state) goto exit;
inputoff += inputMatchedLength;
if (nSU>0) {
	Obj newinput = substitute(intr,vSU[0]);
	if (!newinput) goto exit;
	Tcl_GetStringFromObj(newinput,&newinputlength);
	if (newinputlength>0) {
		int n; chars s = Tcl_GetStringFromObj(input,&n);
		newinput = oprintf("%{y-}s%.*s",newinput,n-inputoff,s+inputoff);
		decr(input); input = newinput; inputoff = 0;
	}
}
<Dump new configuration>

		}
 
section    top
proc substitute—Evaluate the 'subst' command on a string; returns the string with substitutions, or NULL if error.
Memory. Reference count incremented; caller decrements when done.
output intr—The string with substitution, or an error message.
input string—The string evaluated.

static Obj substitute(Intr intr,Obj string) {
	Tcl_CmdInfo substCmd; bool gotSubstCmd = false;
	Obj Q[2]; int rc; Obj result = 0;
	gotSubstCmd = Tcl_GetCommandInfo(intr,"subst",&substCmd);
	if (!gotSubstCmd) {
		Tcl_AppendResult(intr,"dpda: could not find 'subst' command in the interpretter",0);
		return 0;
	}
	Q[0] = incr(Tcl_NewStringObj("subst",-1));
	Q[1] = string;
	rc = substCmd.objProc(substCmd.objClientData,intr,2,Q);
	decr(Q[0]);
	if (rc==TCL_OK) {
		result = incr(Tcl_GetObjResult(intr));
		Tcl_ResetResult(intr);
	}
	return result;
}

		
 
section    top
static Obj substitute(Intr intr,Obj string);
		
   
   

Object Stack

   
top
 
section