//
//  BWTA.CPP
//
//  Mark Nelson
//  March 8, 1996
//  http://web2.airmail.net/markn
//
// DESCRIPTION
// -----------
//
//  This program performs a Burrows-Wheeler transform on an input
//  file or stream, and sends the result to an output file or stream.
//
//  While this program can be compiled in 16 bit mode, it will suffer
//  greatly by virtue of the fact that it will need to drop its
//  block size tremendously.
//
//  This program takes two arguments: an input file and an output
//  file.  You can leave off one argument and send your output to
//  stdout.  Leave off two arguments and read your input from stdin
//  as well.  You can also specify "-d" as the first argument to get
//  a debug dump as well.  However, the debug listing can be a little
//  overwhelming if you are working on a large file!
//
//  The output consists of a series of blocks that look like this:
//
//  long byte_count |  ...data... | long first | long last
//
//  The byte_count refers to the number of data bytes.  The data
//  itself is the "L" column from the sorted data.  "first" is the
//  index where the first character from the buffer appears in the
//  sorted output.  "last" is where the end-of-buffer special byte
//  appears in the output buffer.  These blocks are repeated until
//  I'm out of data.
//
//  This program accompanies my article "Data Compression with the
//  Burrows-Wheeler Transform."  There is one major deviation from
//  the text of the article in this implementation.  To simplify the
//  sorting, I append a special end-of-buffer character to the end
//  of the input buffer.  The end-of-buffer character isn't found
//  in the buffer, which means I no longer have to wrap around to
//  the start of the buffer when performing comparisons.  Instead,
//  I'm guaranteed that a memcmp() will terminate at or before the
//  last character in the buffer.
//
//  One problem, though.  Since I can handle any kind of binary input,
//  what character is guaranteed to never appear in the buffer?  None,
//  so instead I do a special hack and make sure I never *really*
//  look at that last position when comparing.  Instead, I only compare
//  until one or the other string gets to the end, then award the
//  comparison to whoever hit the end first.
//
//  This special character means the output has N+1 characters.  I just
//  output a '?' when I hit that special end-of-buffer character, but
//  I also have to pass along the information about the end-of-buffer
//  character's position to the decoder, so I append it to the end
//  of each data block.
//
//  The sorting for this routine is done by inserting pointers into
//  the buffer into an STL set.  There are two good things about this.
//  First, I can create a templated comparison function which is then
//  called in-line during the insertion.  Second, since I insert each
//  string one at a time, I can provide some indication to the end user
//  of how far along in the process I am.  With a function like qsort(),
//  it's hard to know how far along you are.
//
//  If you don't have an STL capable compiler, you are going to have to
//  use the less-sexy version of this program, BWT.CPP.
//
// Build Instructions
// ------------------
//
//  Borland C++ 4.5 16 bit    : bcc -w -ml bwta.cpp  //Yes, large model!
//  Borland C++ 4.5 32 bit    : bcc32 -w bwta.cpp
//  Microsoft Visual C++ 4.0  : ???? haven't had a chance to test it
//
//  This code has not been tested under UNIX.  I don't have a g++
//  testbed that supports the STL.
//
// Typical Use
// -----------
//
//  rle < raw-file | bwta | mtf | rle | ari > compressed-file
//

//
// Borland STL hack.  The macro is defined ahead of every #include
// below.
//
// NOTE(review): presumably this makes Borland's own headers skip
// their min()/max() definitions so that they cannot clash with the
// ones supplied by the STL -- confirm against the Borland headers.
//
#define __MINMAX_DEFINED

//
// set.h is an STL file.  If you don't have the STL on your system,
// you can expect an error here.
//
#include <set.h>

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#include <limits.h>

//
// BLOCK_SIZE is the size of the input buffer, and therefore the
// largest number of bytes that is read and transformed as a single
// block.  A build whose int tops out at 32767 gets a block that is
// ten times smaller than the one used everywhere else.
//
#if ( INT_MAX == 32767 )
#define BLOCK_SIZE 20000
#else
#define BLOCK_SIZE 200000
#endif

//
// length has the number of bytes presently read into the buffer,
// buffer contains the data itself.
//
// Both are file-level globals because the comparison function
// object used for sorting takes only the two pointers being
// compared as arguments, and it needs the start of the buffer and
// the current length to work out how many bytes remain after each
// pointer.
//
long length;
unsigned char buffer[ BLOCK_SIZE ];

//
// BoundedCompare is the ordering predicate handed to the sorted
// set.  Each argument points somewhere into buffer (possibly at
// buffer+length, one past the last byte read) and stands for the
// string that runs from that point to the end of the block.
//
// The byte at buffer+length doesn't really exist.  It is treated
// as an imaginary end-of-buffer character that is bigger than any
// character found in the input, so the comparison only ever looks
// at the bytes both strings really have:
//
//   - if those bytes differ, the ordinary memcmp() result decides;
//   - if they are all equal, the string that still has real bytes
//     left sorts first, because the other one has reached the
//     (largest) end-of-buffer character.
//
// Returns non-zero when the string at p1 sorts strictly before the
// string at p2, and zero otherwise (including when p1 == p2).
//
// The return type is spelled out as int: the original relied on
// the pre-standard "implicit int" rule, which ISO C++ compilers
// reject.
//
class BoundedCompare {
    public :
        int operator()( const unsigned char *p1,
                        const unsigned char *p2 ) const
        {
            //
            // Number of real bytes left in the block from each pointer.
            //
            size_t l1 = (size_t) ( ( buffer - p1 ) + length );
            size_t l2 = (size_t) ( ( buffer - p2 ) + length );
            //
            // Only compare as far as the shorter string goes, so the
            // imaginary byte at buffer+length is never read.
            //
            size_t common = ( l1 < l2 ) ? l1 : l2;
            int result = memcmp( p1, p2, common );
            if ( result < 0 )
                return 1;
            if ( result > 0 )
                return 0;
            //
            // Equal over the common part: the longer string is the
            // smaller one, since the shorter one continues with the
            // end-of-buffer character.
            //
            return l1 > l2;
        }
};


//
// Program entry point.
//
// Usage: [-d] [input-file [output-file]]
//
//   -d           dump the sorted strings to stderr for each block
//   input-file   file to read; stdin is used when it is left off
//   output-file  file to write; stdout is used when it is left off
//
// Returns 0 on success, or 1 if the input or output file named on
// the command line cannot be opened.
//
int main( int argc, char *argv[] )
{
    int debug = 0;
    if ( argc > 1 && strcmp( argv[ 1 ], "-d" ) == 0 ) {
        debug = 1;
        argv++;
        argc--;
    }
//
// Redirect stdin and stdout to the named files, if any.  A file
// that cannot be opened is reported and ends the run, rather than
// being ignored and silently producing no output.
//
    fprintf( stderr, "Performing BWT on " );
    if ( argc > 1 ) {
        fprintf( stderr, "%s", argv[ 1 ] );
        if ( freopen( argv[ 1 ], "rb", stdin ) == NULL ) {
            fprintf( stderr,
                     "\nError: unable to open input file %s\n",
                     argv[ 1 ] );
            return 1;
        }
    } else
        fprintf( stderr, "stdin" );
    fprintf( stderr, " to " );
    if ( argc > 2 ) {
        fprintf( stderr, "%s", argv[ 2 ] );
        if ( freopen( argv[ 2 ], "wb", stdout ) == NULL ) {
            fprintf( stderr,
                     "\nError: unable to open output file %s\n",
                     argv[ 2 ] );
            return 1;
        }
    } else
        fprintf( stderr, "stdout" );
    fprintf( stderr, "\n" );
    setmode( fileno( stdin ), O_BINARY );
    setmode( fileno( stdout ), O_BINARY );
//
// This is the start of the giant outer loop.  Each pass
// through the loop transforms up to BLOCK_SIZE characters.
// When an fread() operation finally reads in 0 characters,
// we break out of the loop and are done.
//
    for ( ; ; ) {
//
// Read the next block, report its size, then write the
// number of data bytes that will follow to the output
// stream.  That count is length + 1, because column L
// holds one extra byte standing in for the imaginary
// end-of-buffer character.
//
        length = (long) fread( buffer, 1, BLOCK_SIZE, stdin );
        if ( length == 0 )
            break;
        fprintf( stderr, "Performing BWT on %ld bytes\n", length );
        long l = length + 1;
        fwrite( &l, 1, sizeof( long ), stdout );
//
// Sorting the input strings is simply a matter of inserting
// a pointer to each string into an STL set<> container.
// The ordering comes from operator()() in the BoundedCompare
// class.  Note that N+1 pointers are inserted.  The last one
// points one past the final byte read, which is where the
// imaginary end-of-buffer character lives.  A dot goes to
// stderr every 1024 insertions as a progress indicator.
//
        int i;
        int ticker = 0;
        set< unsigned char *, BoundedCompare > p;
        for ( i = 0 ; i <= length ; i++ ) {
            if ( ( ticker++ % 1024 ) == 0 )
                fprintf( stderr, "." );
            p.insert( buffer + i );
        }
        fprintf( stderr, "\n" );
        set< unsigned char *, BoundedCompare >::iterator ii;
//
// If the debug flag was turned on, print each sorted string
// as "index : prefix: first 30 bytes".  Unprintable bytes
// are shown as <decimal value>.  This is not a very good
// idea on a giant binary file, but it can be real helpful
// when debugging.
//
        if ( debug ) {
            for ( ii = p.begin(), i = 0 ; ii != p.end() ; ++ii, i++ ) {
                unsigned char *s = *ii;
                //
                // The original call had a %d conversion with no
                // matching argument; the row index is what it
                // was meant to print.
                //
                fprintf( stderr, "%d : ", i );
                //
                // The string starting at buffer has no real
                // prefix byte, so '?' is shown in its place.
                //
                unsigned char prefix;
                if ( s == buffer )
                    prefix = '?';
                else
                    prefix = s[ -1 ];
                if ( isprint( prefix ) )
                    fprintf( stderr, "%c", prefix );
                else
                    fprintf( stderr, "<%d>", prefix );
                fprintf( stderr, ": " );
                //
                // Show at most 30 of the bytes that remain in
                // the block after s.
                //
                int stop = (int)( ( buffer - s ) + length );
                if ( stop > 30 )
                    stop = 30;
                for ( int j = 0 ; j < stop ; j++ ) {
                    if ( isprint( *s ) )
                        fprintf( stderr, "%c", *s );
                    else
                        fprintf( stderr, "<%d>", *s );
                    s++;
                }
                fprintf( stderr, "\n" );
            }
        }
//
// Finally, write out column L.  Column L consists of the
// prefix characters of the sorted strings, in order.  The
// string that starts at buffer needs care, since its prefix
// is the imaginary end-of-buffer character: a '?' is written
// as a placeholder and the row is remembered in "last".  The
// row whose string starts at buffer+1 has the first byte of
// the block as its prefix, and is remembered in "first".
// Both pointers are always in the set, because length is at
// least 1 here, so both values are always assigned; they are
// initialised anyway so no path can read an indeterminate
// value.
//
        long first = 0;
        long last = 0;
        for ( i = 0, ii = p.begin() ; ii != p.end() ; i++, ++ii ) {
            if ( *ii == ( buffer + 1 ) )
                first = i;
            if ( *ii == buffer ) {
                last = i;
                fputc( '?', stdout );
            } else
                fputc( (*ii)[ -1 ], stdout );
        }
        p.erase( p.begin(), p.end() );
//
// Report the two positions, then append them to the block
// for transmission to the output stream.
//
        fprintf( stderr,
                 "first = %ld"
                 "  last = %ld\n",
                 first,
                 last );
        fwrite( &first, 1, sizeof( long ), stdout );
        fwrite( &last, 1, sizeof( long ), stdout );
    }
    return 0;
}

