
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/*
 * One entry of the word table.
 *
 * readwords() fills both fields together: str receives the characters of
 * the word followed by a terminating '\0', and len is set to the number of
 * characters stored.
 */
struct stWord {
    int  len;       /* number of characters held in str */
    char str[256];  /* NUL-terminated text of the entry */
};


/*
 * Streams shared by main(), readwords() and writewords(). main() opens
 * gin on "text.txt" (read), goutwords on "words.txt" (write) and goutdata
 * on "words.dat" (write).
 */
FILE            *gin, *goutwords, *goutdata;

/*
 * Word table. The first four entries are preloaded with punctuation;
 * readwords() builds each word at index gnumwords and increments
 * gnumwords when the word was not already in the table.
 *
 * The byte readwords() writes to goutdata for table entry i is i + 1, so
 * the CODE_* values below line up with the preloaded entries
 * (gwords[0] "?" <-> CODE_QMARK == 1, and so on).
 */
struct stWord   gwords[1024] = {
                                { 1, "?" },
                                { 1, "." },
                                { 1, "," },
                                { 1, "-" }
                               };
/* CODE_EOF is written once by readwords() after all input is consumed. */
#define CODE_EOF    0
#define CODE_QMARK  1
#define CODE_PERIOD 2
#define CODE_COMMA  3
#define CODE_DASH   4
/* Highest preloaded code; also the initial number of table entries. */
#define MAX_CODE    4
/* Number of entries currently in gwords. */
int             gnumwords = MAX_CODE;

/*
 * Stand-alone codes written by readwords() when no word or punctuation
 * code is pending: CODE_3SPACES for three consecutive spaces, CODE_CRLF
 * for a 0x0d byte. readwords() also ORs 0x80 into a word/punctuation code
 * when it is followed by two spaces.
 */
#define CODE_3SPACES 0xfe
#define CODE_CRLF    0x7f

//struct stWord   gwords[1024];
//int             gnumwords = 0;


/* Defined later in this file; declared here so main() can call them. */
void readwords( void );
void writewords( void );

/*
 * Program entry point.
 *
 * Opens "text.txt" for reading and "words.txt" / "words.dat" for writing,
 * runs readwords() followed by writewords(), then closes all three
 * streams.
 *
 * Returns 0 on success. If a file cannot be opened, a message is printed
 * and the process exits with status 1. If closing an output file fails
 * (for example because buffered data could not be flushed), a message is
 * printed and 1 is returned.
 */
int main( void )
{
    int     status = 0;

    if( !( gin = fopen( "text.txt", "rb" ) ) ) {
        printf( "Couldn't open text.txt\n" );
        exit( 1 );
    }

    if( !( goutwords = fopen( "words.txt", "wb" ) ) ) {
        printf( "Couldn't open words.txt\n" );
        exit( 1 );
    }

    if( !( goutdata = fopen( "words.dat", "wb" ) ) ) {
        printf( "Couldn't open words.dat\n" );
        exit( 1 );
    }

    readwords();
    writewords();

    fclose( gin );

    /* fclose() flushes buffered output, so a failure here means lost data. */
    if( fclose( goutwords ) != 0 ) {
        printf( "Couldn't write words.txt\n" );
        status = 1;
    }
    if( fclose( goutdata ) != 0 ) {
        printf( "Couldn't write words.dat\n" );
        status = 1;
    }

    return status;
}


/*
 * Dump the word table to goutwords as plain text: a single leading space,
 * then the text of each of the first gnumwords entries of gwords, every
 * entry followed by one space. No newline is written.
 */
void writewords( void )
{
    int     idx = 0;

    fputc( ' ', goutwords );  /* initial space */

    while( idx < gnumwords ) {
        fputs( gwords[idx].str, goutwords );
        fputc( ' ', goutwords );
        idx++;
    }
}


void readwords( void )
{
    int     c;
    int     cnt = 0;
    int     i;
    int     c2;
    int     c3;
    int     outbyte;

    do {
        c = getc( gin );
        if( isalpha( c ) ) {
            gwords[gnumwords].str[cnt++] = c;
            gwords[gnumwords].str[cnt] = '\0';
            gwords[gnumwords].len = cnt;
        } else if( c != EOF ) {
            outbyte = -1;
            c2 = getc( gin );
            c3 = getc( gin );
            ungetc( c3, gin );
            ungetc( c2, gin );

            if( cnt > 0 ) {
                ungetc( c, gin );
                for( i = 0; i < gnumwords; i++ ) {
                    if( !strcmp( gwords[i].str, gwords[gnumwords].str ) ) {
                        break;
                    }
                }
                if( i >= gnumwords ) {
                    gnumwords++;
                }
                outbyte = i + 1;
//                if( c == ' ' && c2 == ' ' ) {
//                    fprintf( goutdata, "%c", ( i + 1 ) | 0x80 );
//                } else {
//                    fprintf( goutdata, "%c", i + 1 );
//                }
            } else {
                /* if( c == 0x0d ) {
                    outbyte = CODE_CRLF;
                    c = c2;
                    c2 = c3;
    //                fprintf( goutdata, "%c", 1 );
                } else*/ if( c == '?' ) {
                    outbyte = CODE_QMARK;
                    c = c2;
                    c2 = c3;
    //                fprintf( goutdata, "%c", 2 );
                } else if( c == ',' ) {
                    outbyte = CODE_COMMA;
                    c = c2;
                    c2 = c3;
    //                fprintf( goutdata, "%c", 3 );
                } else if( c == '.' ) {
                    outbyte = CODE_PERIOD;
                    c = c2;
                    c2 = c3;
    //                fprintf( goutdata, "%c", 4 );
                } else if( c == '-' ) {
                    outbyte = CODE_DASH;
                    c = c2;
                    c2 = c3;
    //                fprintf( goutdata, "%c", 5 );
                }
            }

            if( outbyte != -1 ) {
                if( c == ' ' && c2 == ' ' ) {
                    fprintf( goutdata, "%c", outbyte | 0x80 );
                } else {
                    fprintf( goutdata, "%c", outbyte );
                }
            } else {
                if( c == ' ' && c2 == ' ' && c3 == ' ' ) {
                    fprintf( goutdata, "%c", CODE_3SPACES );
                } else if( c == 0x0d ) {
                    fprintf( goutdata, "%c", CODE_CRLF );
                }
            }

            cnt = 0;
        }
    } while( c != EOF );

    fprintf( goutdata, "%c", CODE_EOF );
}

