DNA.
wyrm-huff
Version.
1.0.9
Namespace.
::wyrm
Command.
::wyrm::huffman
Language.
c
Manpage.
huffman (1WY)
Manpage.
wyrm_huffcreate (3WY)
Testbase.
Test Script
Test Report
Import.
Interface.
wyrmwif
Export.
Implementation.
wyrm-huff.c
Interface.
wyrm-huff.h
Object.
wyrm-huff.o

Adaptive Huffman Compression

Sections.
C Interface
Data Structures
Data Structure Allocation
Compression
Expansion
Tcl Interface
Make.
Object.
compile "'-Dsplashgif=\"[pwd]/$doc/../img/splash.gif\"'" -c -o [
  export object
] [
  export implementation
] -- -list [import interface] [export interface]
   
top

1 :: A simple, copyright and patent free compressor/decompressor.

Derived from the copyrighted work:

"Application Of Splay Trees To Data Compression" by Douglas W. Jones, Communications of the ACM, August 1988.

   
   

C Interface

   
top
 
section    top
 
section    top

4. wyrm-huff.h :: The details of the algorithm are given in

"Application Of Splay Trees To Data Compression" by Douglas W. Jones, Communications of the ACM, August 1988

The Huffman code for a byte depends on its position in a splay tree; the more frequent the letter, the higher it is placed, and the shorter its code. The tree is updated after each byte so it is always the best guess to date of the actual frequency.

These routines compress bytes rather than characters. Reading and writing compressed text should be as binary rather than trying to interpret the bytes as a character encoding.

Multiple states number-states will sometimes produce better compression of a repeating phrase. Also the compression and decompression starts with an assumption that all characters are equally likely. The prime is an optional sample of the text to be compressed; specifying it allows the compressor to adapt to the strings. The prime is run through the compressor/decompressor to adapt the splay tree, but the compressed/decompressed characters are discarded.

The same number-states and prime parameters (or their absence) must be the same on both compression and decompression.

This technique is fast and effective. And copyright and patent free.

   
   

Data Structures

   
top

5. Opaque data structures :: The end of the string has a distinguished code, wyrm_huffEOS, which is distinct from all characters. This must be the last code compressed, and it will be the last character returned during expansion. The compressed codes will be packed into regular sized units, such as machine words, but might not fit into an exact number of units, with some extra bits at the end. The existence of a special end character distinguishes the end of the string and any extra bits.

The compressor is called once for byte to be compressed, and the expander once for each byte expanded. The compressor uses a callback routine called as each machine word is filled out and ready for the output stream. The expander uses a callback to acquire machine words from the input stream.

The state of compression or expansion is stored in a WyrmHuffTree data structure. It must be created before calling the compressor or expander, and deleted on completion. Its details are private.


enum WyrmHuffConstants {
	wyrm_huffEOS = 256,
	wyrm_huffLongsize = 32,
	wyrm_huffMaxStates = 64
};

typedef struct WyrmHuffTree WyrmHuffTree,*pWyrmHuffTree;
typedef long WyrmHuffBuffer;

typedef WyrmHuffBuffer (*rWyrmHuffReceive)(ptr,bool *ofl);
typedef void (*rWyrmHuffTransmit)(ptr,WyrmHuffBuffer);


		
 
section    top

enum {
	maxchar = wyrm_huffEOS,
	succmax = maxchar+1,
	twicemax = 2*maxchar+1,
	root = 1
};

typedef short codetype; /* 0..maxchar */
typedef short upindex; /* 1..maxchar */
typedef short downindex; /* 1..twicemax */
typedef short stateindex; /* 0..wyrm_huffMaxStates-1 */

struct WyrmHuffTree {
	stateindex state;
	stateindex nstates;
	ptr context;
	rWyrmHuffReceive receive;
	rWyrmHuffTransmit transmit;
	WyrmHuffBuffer buffer;
	short bp;
	bool ofl;
	downindex left[wyrm_huffMaxStates][maxchar+1];
	downindex right[wyrm_huffMaxStates][maxchar+1];
	upindex up[wyrm_huffMaxStates][twicemax+1];
};

		
 
section    top

assert(sizeof(short)>sizeof(byte));
assert(SHRT_MAX>=twicemax);
assert(SHRT_MAX>=wyrm_huffMaxStates);
assert(sizeof(WyrmHuffBuffer)*CHAR_BIT==wyrm_huffLongsize);

		
 
section    top

static void splay(WyrmHuffTree *C,codetype plain) {
	downindex a,b;
	upindex c,d;
	upindex *up = C->up[C->state];
	downindex *left = C->left[C->state];
	downindex *right = C->right[C->state];

	a = plain+succmax;
	do {
		c = up[a];
		if (c!=root) {
			d = up[c]; b = left[d];
			if (c==b) {
				b = right[d]; right[d] = a;
			}else {
				left[d] = a;
			}
			if (a==left[c]) {
				left[c] = b;
			}else {
				right[c] = b;
			}
			up[a] = d; up[b] = c;
			a = d;
		}else {
			a = c;
		}
	} while (a!=root);
	C->state = (plain) & C->nstates;
}

		
   
   

Data Structure Allocation

   
top
proc wyrm_huffCreate—Create and initialisation the compression state and return it.
input nstates—Number of states in the compressor.
input context—An arbitrary value which is passed to receive and transmit unmodified.
input receive—Expander callback. It is called as the expander needs more input. May be NULL if no expansion.
input transmit—Compressor callback. It is called as the compressor produces more output. May be NULL if no compression.
pWyrmHuffTree wyrm_huffCreate(int nstates,ptr context,rWyrmHuffReceive receive,rWyrmHuffTransmit transmit);
		
 
section    top

pWyrmHuffTree wyrm_huffCreate(int nstates,ptr context,rWyrmHuffReceive receive,rWyrmHuffTransmit transmit) {
	stateindex s;
	downindex i;
	upindex j;
	pWyrmHuffTree C = heap(WyrmHuffTree);
	C->state = 0; C->context = context; C->receive = receive; C->transmit = transmit;
	C->buffer = C->bp = 0; C->ofl = 0;
	for (s=0; s<wyrm_huffMaxStates; s++) {
		for (i=2; i<=twicemax; i++) {C->up[s][i] = i/2;}
		for (j=1; j<=maxchar; j++) {C->left[s][j] = 2*j; C->right[s][j] = 2*j+1;}
	}
	for (s=1; s<wyrm_huffMaxStates && s<nstates; s+=s) ;
	C->nstates = s-1;
	return C;
}

		
 
section    top
#define wyrm_huffDelete—Delete the state.
output C—output: State is deleted.
#define wyrm_huffDelete(C) (Tcl_Free((chars)(C)))
		
 
section    top
proc wyrm_huffPrime—Prime the compression tables.
output C—output: State is updated.
input prime—input: The priming string.
void wyrm_huffPrime(pWyrmHuffTree C,chars);
		
 
section    top

void wyrm_huffPrime(pWyrmHuffTree C,chars prime) {
	while (*prime) splay(C,*prime++);
}

		
   
   

Compression

   
top
proc wyrm_huffCompress—Compress a byte.
io C—The compression state is read and updated.
output C.transmit—Called as necessary as a word of compressed data is completed.
input plain—input: The plain code byte to compress.
void wyrm_huffCompress(pWyrmHuffTree C,short plain);
		
 
section    top

void wyrm_huffCompress(WyrmHuffTree *C,short plain) {
	short sp;
	bool stack[maxchar+1];
	downindex a;
	upindex *up = C->up[C->state];
	downindex *right = C->right[C->state];

	if (plain<0) plain &= ((1<<CHAR_BIT)-1);
	a = plain+succmax;
	sp = 1;
	do {
		stack[sp] = right[up[a]]==a; sp++;
		a = up[a];
	} while (a!=root);
	do {
		sp--;
		if (!C->transmit) abort();
		C->buffer = (C->buffer<<1) | (stack[sp]&1);
		C->bp += 1;
		if (C->bp==wyrm_huffLongsize) {
			(C->transmit)(C->context,htonl(C->buffer));
			C->buffer = C->bp = 0;
		}
	} while (sp>1);
	splay(C,plain);
	if (plain==wyrm_huffEOS) {
		if (!C->transmit) abort();
		if (C->bp>0) {
			C->buffer <<= wyrm_huffLongsize-C->bp;
			(C->transmit)(C->context,htonl(C->buffer));
			C->buffer = C->bp = 0;
		}
	}
}

		
 
section    top
proc wyrm_huffCompressObj—output: Compressed object.
input x—The object to be compressed.
input nstates—Requested number of states; the actual number may differ.
input prime—String used to prime the splay tree.
Obj wyrm_huffCompressObj(Obj x,int nstates,chars prime);
		
 
section    top

typedef struct {
	int m,n; bytes p;
} CompressContext;

static void transmitToObj(ptr o,WyrmHuffBuffer b) {
	CompressContext *context = o; WyrmHuffBuffer B = b;
	if (context->n+sizeof B>=context->m) {
		context->m = 2*(context->n+sizeof B); context->p = reheap(context->m,byte,context->p);
	}
	memcpy(context->p+context->n,&B,sizeof B); context->n += sizeof B;
}

Obj wyrm_huffCompressObj(Obj x,int nstates,chars prime) {
	Obj outp; CompressContext context;
	pWyrmHuffTree C = wyrm_huffCreate(nstates,&context,0,transmitToObj);
	int n; bytes s = Tcl_GetByteArrayFromObj(x,&n);
	context.m = context.n = 0; context.p = 0;
	if (prime) wyrm_huffPrime(C,prime);
	while (n-->0) wyrm_huffCompress(C,*s++);
	wyrm_huffCompress(C,wyrm_huffEOS);
	wyrm_huffDelete(C);
	outp = incr(Tcl_NewByteArrayObj(context.p,context.n));
	dispose(context.p);
	return outp;
}

		
   
   

Expansion

   
top
proc wyrm_huffExpand—Expand a byte and return it.
io C—The compression state is read and updated.
input C.receive—Called as necessary as a word of compressed data is required.
short wyrm_huffExpand(pWyrmHuffTree C);
		
 
section    top

short wyrm_huffExpand(WyrmHuffTree *C) {
	downindex *left = C->left[C->state];
	downindex *right = C->right[C->state];
	downindex a;

	a = root;
	do {
		int receive;
		if (!C->receive) abort();
		if (C->bp==0) {
			C->buffer = ntohl((C->receive)(C->context,&C->ofl));
			C->bp = wyrm_huffLongsize;
			if (C->ofl) return wyrm_huffEOS;
		}
		C->bp -= 1;
		receive = (C->buffer>>C->bp) & 1;
		if (receive==0) {
			a = left[a];
		}else {
			a = right[a];
		}
	} while (a<=maxchar);
	a -= succmax;
	splay(C,a);
	return a;
}

		
 
section    top
proc wyrm_huffExpandObj—output: Expanded object.
input x—The object to be expanded.
input nstates—Requested number of states, which may differ from the actual number.
input prime—String used to prime the splay tree.
Obj wyrm_huffExpandObj(Obj x,int nstates,chars prime);
		
 
section    top

typedef struct {Obj inp; int off;} ExpandContext;

static WyrmHuffBuffer receiveFromObj(ptr e,bool *ofl) {
	ExpandContext *E = e; int n; bytes s = Tcl_GetByteArrayFromObj(E->inp,&n);
	WyrmHuffBuffer B;
	n -= E->off; s += E->off; if (n>sizeof B) n = sizeof B; E->off += n;
	*ofl = n<=0;
	memset(&B,0,sizeof(B)); memcpy(&B,s,n); return B;
}

Obj wyrm_huffExpandObj(Obj x,int nstates,chars prime) {
	ExpandContext E; int m=0,n=0; bytes p=0; Obj obj; short c;
	pWyrmHuffTree C = wyrm_huffCreate(nstates,&E,receiveFromObj,0);
	if (prime) wyrm_huffPrime(C,prime);
	E.inp = x; E.off = 0;
	while ((c=wyrm_huffExpand(C))!=wyrm_huffEOS) {
		if (n+1>=m) {m = 2*(n+1); p = reheap(m,byte,p);}
		p[n++] = c;
	}
	wyrm_huffDelete(C);
	obj = incr(Tcl_NewByteArrayObj(p,n));
	dispose(p);
	return obj;
}

		
   
   

Tcl Interface

   
top
 
section    top
int wyrm_huffCommandInit(Intr intr);
		
 
section    top

static int wyrm_huffCommand(ClientData clientData,Intr intr,int N,Tcl_Obj *const P[]) {
	chars p,prime; Obj R; int nstates = 16;
	Tcl_ResetResult(intr);
	#ifdef TESTING
		if (streq(Tcl_GetStringFromObj(P[1],0),"splash")) {
			Tcl_SetResult(intr,splashgif,TCL_STATIC);
			return TCL_OK;
		}
	#endif
	if (N>=3 && streq(Tcl_GetStringFromObj(P[1],0),"-n")) {
		if (Tcl_GetIntFromObj(intr,P[2],&nstates)!=TCL_OK)
			return TCL_ERROR;
		P+=2; N-=2;
	}
	if (N!=3 && N!=4) {
		usage:
			Tcl_AppendResult(intr,"usage: huffman compress|expand <string> [<prime>]",0);
			return TCL_ERROR;
	}
	prime = N==4 ? Tcl_GetStringFromObj(P[3],0) : 0;
	p = Tcl_GetStringFromObj(P[1],0);
	if (stribegins("c",p))
		R = wyrm_huffCompressObj(P[2],nstates,prime);
	else if (stribegins("e",p))
		R = wyrm_huffExpandObj(P[2],nstates,prime);
	else {
		Tcl_AppendResult(intr,"expected 'compress' or 'expand': ",p,"\n",0); goto usage;
	}
	Tcl_SetObjResult(intr,R); decr(R); return TCL_OK;
}


int wyrm_huffCommandInit(Intr intr) {
	char package[] = "namespace eval ::wyrm {namespace export huffman}\n";
	<Assert types are large enough>
	Tcl_CreateObjCommand(intr,"::wyrm::huffman",wyrm_huffCommand,0,0);
	return Tcl_Eval(intr,package);
}

		
 
section    top
    HFF huffman command.
      HFF000
      Parameters.
        Too few.
          HFF001
        Too many.
          HFF002
        Invalid.
          HFF003
      Encoding and decoding.
        Encoding.
          HFF010
        Decoding.
          HFF011
        HFF012
      Stress testing.
        HFF020
        HFF021
      Priming.
        Matching primers.
          HFF050
        Mismatched primers.
          HFF051
      Number of states.
        Default.
          HFF060
        A range of numbers.
          HFF061