/*******************************************************************************

  name     : hugi2.cpp
  desc     : Encoder supporting several methods (derived from existing classes)
  author   : Christian Schlange  aka  tryx/Xography
  started  :
  last up  : Mo, 06-22-1998
  compiler : wpp386.exe Watcom C++ v10.x-11.x

  comments :

  rev log  : current-method: 5bit packing with token-replace and dropped chars

  interface:

  todo	   : try out simple Huffman coding

*******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <math.h>
#include <string.h>


// Short names for the fixed-width unsigned types used throughout the file.
#define byte  unsigned char
#define word  unsigned short
#define dword unsigned int

// Largest value of a 32-bit signed int.  It is used as the start value when
// searching for a minimum, so it has to be larger than any real count
// (the former value -1 could never be undercut by a non-negative count).
#define maxint 0x7fffffff

/***
 ***

 Code the data with a 5bit-code and replace frequently used words
 by one of the 7 free codes from 25..31. The letters a-z are coded
 by 0..25.

***
***/


/*
 *
 The CodeCom class
 *
 */

#define alloclen 1000

// One entry of the character histogram: a character value together with
// the number of times it was counted.
struct countstruc {
	char c;			// character value this entry stands for
	int  count;		// number of occurrences counted for that character
};

// One node of the Huffman tree.
struct _node {
	int     choosen;	// nonzero once the node has been merged away
	int     code;		// character value carried by the node
	int     count;		// occurrence count (sum of both children after a merge)
	_node * r;			// right child, NULL for a leaf
	_node * l;			// left child, NULL for a leaf
};
typedef struct _node node;

// Text encoder.  The constructor loads the input text into 'orig'; the
// member functions replace tokens / dropped characters inside that buffer,
// gather statistics and write the results to include files.
class CodeCom
{
	private:
		char * orig;						// original and replaced ascii-data
		char * tokens;						// mem for 7*(tokens+tokensize)
		char * coded;						// 5bit coded bitstream
		char * droppedchars; 				// dropped characters
		char * dropped;		        		// sequence of dropped chars
		int    tokenlen; 					// length of the token data
		int    origlen; 					// len of original text
		int    replen; 						// len of text with replacings
		int    replaces; 					// number of tokens (usually 7=max)
		int    droppedcharss;				// number of entries in droppedchars
		int    droppedones;					// number of entries in dropped
		struct countstruc counts[256];		// character count table
		int    characters; 					// number of used characters
		FILE * IN, * OUT;
		//Huffman data
		node * nodes;
		int  * codes;
		int  * clens;
		int    Leaves;

		int    Depth;

		int    code;

		// Copying is disabled (declared, never defined): the object owns
		// raw buffers that the destructor releases, so a member-wise copy
		// would release the same memory twice.
		CodeCom (const CodeCom &);
		CodeCom & operator= (const CodeCom &);

	public:
		CodeCom          (void);
		~CodeCom         ();
		void  Invoke     (void);
		void  Rle        (void);
		void  Visit 	 (node * nn);
		void  GetHuffmanCodes (void);
		void  Huffman    (void);
		int   Replace    (char *str1, char *str2, char char3);
		int   Replace    (char *str1, char *str2, char *str3);
	    int   AddToken   (char *str1, char token);
	    void  AddDropped (char token);
	    void  DoDropping (char token);
		int   GetCounts	 (void);
		float GetEntropy (void);
		void  Save5Bit   (char *name);
	    void  SaveBin    (char *name, void *src, int len);
	    void  SaveInc    (char *name, void *src, int len);
};


// Allocates and clears the work buffers, resets all members and loads
// "text.txt" into 'orig'.  At most alloclen-1 bytes are read so that the
// zero-filled buffer always stays NUL terminated (the rest of the class
// measures the text with strlen).  Terminates the program with exit(1)
// when the input file cannot be opened.
CodeCom::CodeCom (void)
{
	int c;

	// allocate mem (the NULL checks only matter on compilers whose
	// operator new reports failure by returning NULL)
	if (!(orig=new char[alloclen])) exit(1);
	if (!(coded=new char[alloclen])) exit(1);
	if (!(tokens=new char[alloclen])) exit(1);
	if (!(droppedchars=new char[alloclen])) exit(1);
	if (!(dropped=new char[alloclen])) exit(1);

	memset (orig,   0, alloclen);
	memset (coded,  0, alloclen);
	memset (tokens, 0, alloclen);
	memset (droppedchars, 0, alloclen);
	memset (dropped, 0, alloclen);


	origlen       = 0;
	replen        = 0;
	replaces      = 0;
	tokenlen      = 0;
	characters    = 0;
	droppedcharss = 0;
	droppedones   = 0;

	// members that are only set up later get a defined start value
	nodes  = NULL;
	codes  = NULL;
	clens  = NULL;
	Leaves = 0;
	Depth  = 0;
	code   = 0;
	IN     = NULL;
	OUT    = NULL;

	// read in original file
	if (!(IN = fopen ("text.txt", "rb"))) {
		printf ("Cannot open text.txt\n");
		exit(1);
	}
	while  (origlen<alloclen-1 && (c=fgetc(IN))!=EOF) orig[origlen++]=c;
	if     (origlen==alloclen-1 && fgetc(IN)!=EOF)
		printf ("Warning: text.txt truncated to %i bytes\n", origlen);
	printf ("Original Textlen : %i\n", origlen);
	fclose (IN);
}

// Releases the work buffers.  They are allocated with new char[...] in the
// constructor, so the array form of delete has to be used.
CodeCom::~CodeCom ()
{
	delete [] droppedchars;
	delete [] dropped;
	delete [] tokens;
	delete [] coded;
	delete [] orig;
}

// Runs the whole encoding pass: registers the characters to drop, replaces
// the hand-picked tokens inside 'orig', prints statistics, writes the
// include files (token.inc, code5.inc, dropped.inc, replen.inc) and finally
// prints the transformed text and waits for a key press.
void CodeCom::Invoke (void)  /*HERE_XX*/
{

    // calculate the best replaces ( at first the replaces are specified
	// manually here in the source-code )

	printf ("Entropy before replace: %f\n", GetEntropy());

	//strlwr  (orig);					 // convert to lowercase
	// Plain character literals are used here: "\," "\-" and "\." are not
	// standard escape sequences.
	AddDropped ('?');		       		 // dropped ones, token 'j'
	AddDropped (',');
	AddDropped ('-');
	AddDropped ('I');
	AddDropped ('P');
	AddDropped ('T');
	AddDropped ('N');
	AddDropped ('O');
	AddDropped ('B');
	AddDropped ('Y');
	//AddDropped ('q');

	// fix
	AddToken ("y",      'a'+23);
	AddToken ("the",    'a'+24);
	AddToken ("are ",   'a'+25);
	AddToken ("  ",     'a'+26);
	AddToken (" ",      'a'+27);
	AddToken (".",      'a'+28);
	AddToken ("place",  'a'+29);
    AddToken ("Some",   'a'+30);
	AddToken ("\r\n",   'a'+31);	   	// CR LF pair becomes a single token
	AddToken ("in",     'b');
	DoDropping ('j');					// create dropping sequence
	//AddToken ("people", 'q');

	// set new lens
	replen  = strlen(orig);
	origlen = replen;

	//Huffman ();							// Apply Huffman-coding on string
	//Rle     ();
	printf ("replen:       %i\n", replen);
	printf ("replaces      %i\n", replaces);
	printf ("Tokenlen      %i\n", tokenlen);
	printf ("bitpacked     %i\n", replen*5/8);
	printf ("bit+tokens    %i\n", replen*5/8+tokenlen);
	printf ("tokens packed %i\n", (replen+tokenlen)*5/8);
	printf ("Entropy after replace: %f\n", GetEntropy());

	SaveInc  ("token.inc", tokens, tokenlen);
	Save5Bit ("code5.inc");
	SaveInc  ("dropped.inc", dropped, droppedones);
	if (!(OUT = fopen ("replen.inc", "wt"))) exit(1);
	fprintf (OUT, "REPLEN=%i\n", replen);
	fclose (OUT);


	printf ("%s\n", orig);
	getch  ();

}

// Replaces every occurrence of str2 inside the NUL terminated string str1
// by str3, in place, and returns the number of replacements.
//   str1 : buffer to modify; when str3 is longer than str2 the caller must
//          provide enough room for the grown text
//   str2 : text to search for (an empty string replaces nothing)
//   str3 : replacement text
// The search continues behind the inserted replacement, so the function
// terminates even when str3 itself contains str2.
int CodeCom::Replace (char *str1, char *str2, char *str3)
{
	char  *pos;
	int    reps;
	size_t oldlen, newlen;

	oldlen = strlen(str2);
	newlen = strlen(str3);
	reps   = 0;

	if (oldlen==0) return (reps);		// strstr would match at every position

	pos = str1;
	while ((pos=strstr(pos, str2))!=NULL)
	{
		// move the tail (including its terminating NUL) to its new place,
		// measured on the current text instead of the initial length
		memmove (pos+newlen,
				 pos+oldlen,
				 strlen(pos+oldlen)+1);
		memcpy  (pos, str3, newlen);
		pos += newlen;
		reps++;
	}


	return (reps);
}

// Convenience overload: replaces every occurrence of str2 in str1 by the
// single character char3 and returns the number of replacements.
int CodeCom::Replace (char *str1, char *str2, char char3)
{
	char single[2];

	single[0] = char3;
	single[1] = 0;						// make it a one-character string
	return Replace (str1, str2, single);
}


// Replaces every occurrence of str1 in 'orig' by the character 'token'.
// If at least one replacement happened, a record "length byte + text of
// str1" is appended to 'tokens' and the token counter is increased.
// Returns the number of replacements.
int CodeCom::AddToken (char *str1, char token)
{
	int   hits;
	int   textlen;
	char *record;

	hits = Replace (orig, str1, token);
	if (!hits) return (hits);			// unused token: nothing is recorded

	textlen = strlen(str1);
	record  = tokens+tokenlen;
	record[0] = textlen;				// length prefix
	memcpy (record+1, str1, textlen);	// token text, without terminator
	tokenlen += textlen+1;
	replaces++;

	return (hits);
}

// Appends 'token' to the list of characters that DoDropping() handles.
void CodeCom::AddDropped (char token)
{
	droppedchars[droppedcharss] = token;
	droppedcharss++;
}

// First records, in text order, every character of 'orig' that is listed
// in 'droppedchars' (appended to 'dropped'), then replaces all of those
// characters inside 'orig' by 'token' and prints a short summary.
void CodeCom::DoDropping (char token)
{
	int  at, k;
	int  total;
	char cur;
	char needle[8];

	origlen = strlen (orig);

	// pass 1: remember the dropped characters in the order they occur
	for (at=0; at<origlen; at++) {
		cur = orig[at];
		for (k=0; k<droppedcharss; k++) {
			if (droppedchars[k]!=cur) continue;
			dropped[droppedones] = cur;
			droppedones++;
		}
	}

	// pass 2: substitute each listed character by the token
	total = 0;
	for (k=0; k<droppedcharss; k++) {
		sprintf (needle, "%c", droppedchars[k]);
		total += Replace (orig, needle, token);
	}

	printf ("%i droppings found [%i replaces]\n", droppedones, total);
	//getch ();

}

/*
 Use 3bit rle-coding
 */

byte   save[2000];

void CodeCom::Rle (void)
{
	int    pos[200];
	int    poss;
	int    count;
	char   c;
	int    i, opos, cn, sum, done;

	int    saveidx=0;

	printf ("[CodeCom::Rle]\n");

	characters = GetCounts();
	printf ("Different characters : %i\n", characters);


	done = 0;
	for (cn=0; cn<characters; cn++)
	{

		c	  = counts[cn].c;
		count = counts[cn].count;
		poss  = 0;

		for (i=0; i<origlen; i++) {
			if (orig[i]==c) pos[poss++]=i;
		}
		printf ("%i positions of character \'%c\' [%i] found\n", poss, c, c);

		save[saveidx++] = c;
		save[saveidx++] = poss;

		opos = 0;
		sum  = 0;
		for (i=0; i<poss; i++) {
			//printf ("%i   %i\n", pos[i], pos[i]-opos); getch ();
			save[saveidx++] = pos[i]-opos;
			sum += pos[i]-opos;
			opos = pos[i];
		}

		//printf ("Average delta: %i  [%i of %i]\n", sum/poss, done, origlen);
		done += poss;
		getch  ();
	}
	//SaveBin ("delta.bin", save, saveidx);

}


/*
 Huffman for char * orig
 */


// parse huffman-tree and generate codes
// Recursively walks the tree below 'nn'.  For every leaf the depth of the
// leaf is stored as code length in clens[code-'a'] and the leaf counter is
// increased.  A node with exactly one child is reported and ends the
// program.
void CodeCom::Visit ( node * nn )
{
	const int hasLeft  = (nn->l!=NULL);
	const int hasRight = (nn->r!=NULL);

	++Depth;

	if (!hasLeft && !hasRight)
	{
		// leaf: its depth is the length of its code
		++Leaves;
		clens[nn->code-'a'] = Depth-1;
		code = 0;
	}
	else if (hasLeft && hasRight)
	{
		// inner node: descend left first, then right
		Visit ( nn->l );
		Visit ( nn->r );
	}
	else
	{
		printf ("no huffmann node\n"); getch (); exit(0);
	}

	--Depth;
}

// calculate huffman-tree
void CodeCom::GetHuffmanCodes (void)
{
	printf ("[CodeCom::GetHuffmanCodes]\n\n");
	int   i, v;
	int   sum, c1, c2, max;
	int   base;
	node *n1, *n2, nn;

	origlen = strlen(orig);

	for (i=0; i<origlen; i++) {			// check for valid range (0..31)
		v = orig[i]-'a';
		//printf ("%i -> %i [%i]\n", i, v, counts[v].count); getch ();
		if (v<0 || v>=characters) {
			printf ("range error\n");getch();exit (0);
		}
	}

	for (i=0; i<characters; i++) {		// setup initial huffman-tree
		nodes[i].code    = counts[i].c;	// chars are not linear but sorted
		nodes[i].count   = counts[i].count;
		if (counts[i].count==0) {
			printf ("Zero counts not allowed\n"); getch(); exit(1);
		}
		nodes[i].choosen = 0;
		nodes[i].l       = nodes[i].r   = NULL;
	}


	do
	{
		// search 2 nodes with smallest counts
		max = maxint;
		for (i=0; i<characters; i++) {
			//printf ("%i ", nodes[i].count);
			if (!nodes[i].choosen) {
				v = nodes[i].count;
				if (v<max) { max=v; c1=i; }
			}
		}
		//printf ("\n"); getch ();

		max = maxint;
		for (i=0; i<characters; i++) {
			if (!nodes[i].choosen) {
				v = nodes[i].count;
				if (v<max && i!=c1) { max=v; c2=i; }
			}
		}
		//printf ("two smallest : %i %i\n", c1,c2);
		//getch  ();

		// join nodes (angebaut wird immer an c2)
		// die neuen nodes sind n1 links und n2 rechts

		n1 = new node;
		n2 = new node;

		nodes[c2].choosen = 1;

		n1->code    = nodes[c1].code;
		n1->count   = nodes[c1].count;
		n1->choosen = 0;
		n1->l       = nodes[c1].l;
		n1->r       = nodes[c1].r;
		n2->code    = nodes[c2].code;
		n2->count   = nodes[c2].count;
		n2->choosen = 0;
		n2->l       = nodes[c2].l;
		n2->r       = nodes[c2].r;

		nodes[c1].count = nodes[c1].count + nodes[c2].count;
		nodes[c1].l = n1;
		nodes[c1].r = n2;

		// check if everything full
		for (sum=0,i=0; i<characters; i++) sum+=nodes[i].choosen;
		//printf ("choosen : %i\n", sum);
	} while (sum<(characters-1));

	// extract codes
	for (i=0; i<characters; i++) {		// get tree-root
		if (!nodes[i].choosen) base=i;
	}

	memset (codes,  0, characters*sizeof(int));
	memset (clens, -1, characters*sizeof(int));

	code = 0;
	Visit ( &nodes[base] );

	sum = 0;
	for (i=0; i<characters; i++) {
		sum += clens[counts[i].c-'a']*counts[i].count;
	}

	sum = 0;
	for (i=0; i<origlen; i++) {
		sum += clens[orig[i]-'a'];
	}

	printf ("original bits : %i\n", origlen*8);
	printf ("packed bits   : %i\n", sum);
	printf ("Entropy       : %f\n", GetEntropy());

	for (i=0; i<characters; i++)
	{
		printf ("%c [%i] -> %i\n", counts[i].c, counts[i].count, clens[counts[i].c-'a']);
		getch ();
	}

}

// Entry point of the Huffman experiment: refreshes the character counts,
// allocates the node / code / code-length tables (16 spare entries each),
// lets GetHuffmanCodes() do the work and waits for a key press.
void CodeCom::Huffman (void)			/* HERE_XX */
{
	int slots;

	printf ("[CodeCom::Huffman]\n\n");

	Leaves = 0;
	Depth  = 0;

	GetCounts ();						// updates 'characters'

	slots = characters+16;
	nodes = new node[slots];
	codes = new int[slots];
	clens = new int[slots];

	printf ("Characters : %i\n", characters);

	GetHuffmanCodes ();

	getch ();

}

/*
 Get counts of characters and return number of different
 characters
 */

int qsortcmp1 (const void *s1, const void *s2) // small to big
{
	countstruc *c1 = (countstruc *) s1;
	countstruc *c2 = (countstruc *) s2;
	return ( c2->count - c1->count );
}

int CodeCom::GetCounts (void)
{
	int   i, k, chars;

	for (i=0; i<256; i++) {
		counts[i].c = i;
		counts[i].count = 0;
	}

	for (i=0; i<origlen; i++) {
		k = (byte) orig[i];
		counts[k].count++;
	}

    qsort  (counts, 256, sizeof(countstruc), qsortcmp1);
	for    (chars=0, i=0; i<256; i++) if (counts[i].count) chars++;
	characters = chars;
	return (chars);
}

// Returns the entropy of 'orig' in bits per character, i.e. the sum of
// -p*log2(p) over all characters that occur in the text.  Updates 'origlen',
// 'characters' and the count table as a side effect.
float CodeCom::GetEntropy (void)
{
	int    k;
	double prob;
	double bits = 0.0;

	origlen    = strlen (orig);
	characters = GetCounts ();

	for (k=0; k<characters; k++)
	{
		// relative frequency of the k-th character
		prob  = double(counts[k].count) / double(origlen);
		bits += prob*(-log(prob))/log(2.0);
	}

	return (bits);
}

/*
 Save 5bit-coded
 Src: char * orig
 Len: origlen = replen bytes
 */

void  CodeCom::Save5Bit (char * name) /* HERE_XX */
{
	int  i, clen, bitpos;
	byte * bptr;
	word c, w;

	printf ("[CodeCom::Save5Bit]\n");
	origlen = strlen (orig);

	clen    = (origlen*5)/8+1;		  	// Lnge der 5bit-codierten Daten
	memset (coded,0,clen*sizeof(byte));

	bitpos  = 0;

	for (i=0; i<origlen; i++)
	{
		c = orig[i]-'a';
		if (c>31) {
		 	printf ("Domain Error\n"); getch ();
		    c=31;
		}
		bptr = coded+(bitpos>>3);
		w    = *(word*)bptr;       // word, das die 5bits enthlt
		c  <<= (bitpos&7);
		*(word*)bptr|=c;		   // bits schreiben
		bitpos+=5;
	}

	SaveInc (name, coded, clen);
	delete (coded);
}


/*
 Save raw bytes to file / incfile
 */

void CodeCom::SaveBin (char *name, void *src, int len)
{
	int i;
	byte *bptr = (byte *) src;
	OUT = fopen (name, "wb");
	if (!OUT) return;
	for (i=0; i<len; i++) fputc (*bptr++, OUT);

	fclose (OUT);
}

// Writes 'len' bytes starting at 'src' to the text file 'name' as
// assembler data lines of the form "DB 0xxh,0xxh,...", 16 values per line
// (the last line holds the remaining values).  No line is written for an
// empty remainder, so the file never contains a "DB" without operands.
// Returns without doing anything when the file cannot be created.
void CodeCom::SaveInc (char *name, void *src, int len)
{
	int j, cols, left;
	byte *bptr = (byte *) src;
	OUT = fopen (name, "wt");
	if (!OUT) return;

	left = len;
	while (left>0) {
		cols = (left<16) ? left : 16;		// full row or final partial row
		fprintf (OUT, "DB ");
		for (j=0; j<cols; j++) {
			fprintf (OUT,"0%xh", *bptr++);
			if (j < (cols-1)) fprintf (OUT, ",");
		}
		fprintf (OUT, "\n");
		left -= cols;
	}
	fclose (OUT);
}



/***
 ***
 MAIN
***
***/

// Program entry: creates the encoder (which loads the input text), runs
// the encoding pass and waits for a key press.  The command line arguments
// are not used.
int main (int argc, char *argv[])
{

	CodeCom   codecom;
	codecom.Invoke ();

	getch     ( );
	// exit() ends the program without destroying 'codecom'; it is kept so
	// that the shutdown behaviour stays exactly as before
	exit      (0);

	return 0;							// not reached

}

