/* $Id: junk_filter.cc,v 1.40 1998/06/14 06:25:36 dps Exp $ */

/* This code filters out almost all the "junk" in word documents and
 * is useful for extracting the text from word documents. I think this
 * program produces better signal to noise that catdoc. */

/* #include "config.h" */
#define HAVE_STRING_H 1
#define HAVE_STRINGS_H 1
#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#include <ctype.h>
#include <stream.h>
#include <iostream.h>
#include <fstream.h>
#include "strip.h"
#include "tune.h"

/* Note: This is a two stage filter that outputs 0 or 1 characters
   per input character and thus can use the same buffer safely. */

/* Read more */
int word_junk_filter::__fill(const char *b, int n, int sp, int offset)
{
    int end, nr;

    if (n>0)
	memmove(buf+offset, b, n);
    offset+=n+sp;
    if (offset>=JUNK_FILT_BUFSIZE)
    {
	cerr<<"junk_filter::__fill: start offset "<<offset<<" >= buffer size"
	    "(fatal)\n";
	exit(1);
    }
    is->read(buf+offset, JUNK_FILT_BUFSIZE-offset);
    nr=is->gcount();
    pos+=nr;
    end=nr+n;
    return end;
}


/* Junk filtering. Buffer size must be >=RESUME_CHARS */
int word_junk_filter::filter_junk(const char *b, int ns)
{
    int i, d, n, top_run;

    if ((n=__fill(b, ns, junk_end_usage, 1))==0)
	return 0;
    if (junk_end_usage>0)
	memcpy(buf+ns+1, junk_end_buf, junk_end_usage);
    if (init==INIT_SKIPPED)
    {
	init=INIT_DONE;		// Finished init
	ns=0;			// Process all the data
	text_size=0;		// Text size is zero
    }

#ifdef DEBUG_FILTER_FILL
    printf("\n---refill ns=%d junk_end_usage=%d\n", ns, junk_end_usage);
    printf("---end=%d mode=%d\n", n, mode);
#endif

    /* Correctness of this loop depends on d<=i at the top of the
       loop. The claims * by inductioon about the fake variable __adj__ suffice
       to prove this. All the relevant commments are prefixed by CS */

    top_run=0;
    /*** CS: __adj__=1 ****/
    for (i=ns+1, d=ns; i<n; i++)	// Ignore already processed data pushed back
    {
	/*** CS: i==d+junk_end_usage+__adj__ and __adj__>=0 here by induction ***/
	/*** CS: if mode==SKIP_JUNK or SKIP_JUNK_WASPRN then __adj__>=1       ***/
#ifdef DEBUG_FILTER
	printf("mode %d, char[%i] (t=%d) %c, asc=%d\n", mode, i, n,
	       buf[i], junk_end_usage);
#endif
	switch(mode)
	{
	case NORMAL:
	    /* Normal mode. OK becasue change in d at most 1 and junk_end_usage=0 */
	    if (buf[i]==0)
	    {
		junk_end_usage=0;
		junk_size=0;
		mode=SKIP_JUNK;
		break;
	    }
	    if (buf[i] & 0x80)
	    {
		/*** CS: by induction junk_end_usage==top_run ***/
		junk_end_buf[junk_end_usage++]=buf[i];	// Save character
		top_run++;				// Increment top run counter
		/*** CS: __adj__++ ***/
		if (top_run==tune.max_top_run)
		{
		    junk_size=top_run;			// Top bit characters are junk
		    junk_end_usage=0;			// Throw away characters
		    mode=SKIP_JUNK;			// End junk skipping mode
		    /*** CS: __adj__ += top_run ***/
		    break;
		}
		break;					// Exit switch
	    }
	    /* Top bit not set */
	    if (top_run)
	    {
		memcpy(buf+d, junk_end_buf, junk_end_usage); // OK by ind hyp.
		d+=junk_end_usage;
		junk_end_usage=0;			// Ind. hyp now true
		top_run=0;				// Reset top_run counter.
	    }
	    /* Add character */
	    buf[d++]=buf[i];
	    text_size++;
	    if (unicode!=__UNI_NO)
		mode=UNICODE_Z;
	    break;
	 
	case UNICODE_Z:
	    /* unicode zero mode. OK because change in d is 0 and junk_end_usage=0 */
	    if (buf[i]==0)
	    {
		mode=NORMAL; // 0 => Back to printable character mode
		/*** CS: __adj__++ ***/
	    }
	    else
	    {
#ifdef UNICODE_MARKERS
		printf("\n*** Mode %d: %d Got %d expected 0\n", mode,
		       UNICODE_Z, buf[i]);
#endif
		junk_end_usage=0;
		junk_size=0;
		mode=SKIP_JUNK;
		/*** CS: __adj__++ ***/
	    }
	    break;

	case SKIP_JUNK_WASPRN:
	    /* Skip junk was printable, added for unicode documents */
	    /* OK because junk_end_usage chages by at most 1 and d stays fixed */
	    if ((++junk_size)==tune.max_junk && text_size>tune.min_text)
	    {
#ifdef DEBUG_SINK_START
		cerr<<"Enter sink mode\n";
#endif /* DEBUG_SINK_START */
		/*** CS: __adj__++ ***/
		mode=SINK_JUNK;	// Sink junk
		break;
	    }
	    if  (tune.options.unicode_aggresive)
	    {
		/* Agresive unicode mode: insist on the MSBs appearing */
		if (unicode!=__UNI_NO)
		{
		    if (buf[i]!=0)
		    {
			/*** CS: __adj__+=junk_end_usage ***/
			junk_end_usage=0;
		    }
		    mode=SKIP_JUNK;
		    /*** CS: __adj__++ ***/
		    break;
		}
	    }
	    else
	    {
		/* Non-aggersive unicode mode: simply skip the MSB */
		if (buf[i]==0 && unicode!=__UNI_NO)
		{
		    /*** CS: __adj__++ ***/
		    break;	// Accept zeros
		}
	    }
	    /* Fall thru */
	    
	case SKIP_JUNK:
	    /* Junk skipping mode */
	    if ((++junk_size)==tune.max_junk && text_size>tune.min_text)
	    {
#ifdef DEBUG_SINK_START
		cerr<<"Enter sink mode\n";
#endif /* DEBUG_SINK_START */
		mode=SINK_JUNK;	// Sink junk
		/*** CS: __adj__++ ***/
		break;
	    }
	    if (isprint(buf[i]) || buf[i]=='\r' || buf[i]=='\n'
		|| buf[i]=='\t')
	    {
#ifdef DEBUG_RESTART
		printf("---Accepted *%c*, usage=%d\n", buf[i], junk_end_usage);
#endif
		junk_end_buf[junk_end_usage++]=buf[i]; // Save character
		if (unicode!=__UNI_NO)
		    mode=SKIP_JUNK_WASPRN; // For unicode documents
	    }
	    else
	    {
		/* Reset junk buffer usage counter */
		/*** CS: __adj__+=junk_end_usage+1 ***/
#ifdef DEBUG_RESTART
		if (junk_end_usage!=0)
		printf("---Rejected %d, usage=%d\n", buf[i], junk_end_usage);
#endif
		junk_end_usage=0;
		break;
	    }
	    if (junk_end_usage<tune.resume_chars)
		break;
	    /* Turn nomral mode back on */
#ifdef DEBUG_RESTART
	    printf("---back normal mode count=%d\n", junk_size);
#endif
	    /* The __adj__ analysis is *vital* here; if __Adj__=0 then d>i occurs */
	    /* however forutnately this is impossible by the claims that __adj__  */
	    /* and indcution proves.                                              */

	    /**** CS: __adj__>=1 here by indcution. Thus i>=d+junk_end_usage+1 ****/
	    mode=(unicode==__UNI_NO) ? NORMAL : UNICODE_Z;
	    buf[d]=0x7f;				// Mark suspect position
	    memcpy(buf+d+1, junk_end_buf, junk_end_usage); // Nothing clobbered by [1]
	    d+=junk_end_usage+1;			// Advance d
	    junk_end_usage=0;				// Clear junk end buffer
	    /*** CS: __adj__-- ***/
	    break;

	case SINK_JUNK:
	    /*** CS: __adj__++ ***/
	    break;

	default:
	    cerr<<"word_junk_filter::filter_junk: Invalid mode "<<mode
		<<" (fatal)\n";
	    exit(1);
	}
    }
    return d;
}

int word_junk_filter::underflow(void)
{
    int s,d,last;
    int end;

    do				// While no characters
    {
	/* Fill the buffer */
	if (init==INIT_NONE)
	{
	    if (is!=NULL)
	    {
		init=INIT_SKIPPED;
		ns=this->skip_to_start();
		end=this->filter_junk(buf, ns);
	    }
	    else
		return EOF;	// If no input stream return EOF
	}
	else
	{
	    end=this->filter_junk(&save, ns);
	    ns=0;
	}
	
	/* Check for EOF */
	if (end==0 && is->eof())
	    return EOF;
	
	ns=0;
	/* Collapse multi-character sequeneces */
	for (s=0, d=0, last=-1; s<end; s++)
	{
	    switch(buf[s])
	    {
	    case '\n':
		if (last=='\r')
		    continue;	// \n \r to \r
		break;
		
		
	    case '\r':
		if (last=='\n')
		    d--;	// \r \n to \r
		break;
		
		
	    case 0x07:
		if (last=='\r')
		    d--;	// \n <TAB SEP> to <TAB SEP>
		break;
		
	    default:
		break;
	    }
	    last=buf[d]=buf[s];
	    d++;
	}
	
	if (end!=1 && (buf[d-1]=='\r' || buf[d-1]=='\n'))
	{
	    d--;
	    ns=1;
	    save=buf[d];
	}
    } while (d==0);
    setg(buf, buf, buf+d);
    return(buf[0] & 0xff);
}


/* Skip to start */
int word_junk_filter::skip_to_start(void)
{
    int zc,ffc,c,n,i,rej,limit=-1;
    char back[2*ST_ASC_VCHARS_UNICODE+ST_REJ_LIMIT+4];
    int bp;
    long pos=0;

    enum { FIND_BEGIN, SKIP_ONE, FIND_ASC, FIND_ASC_WASASC, DONE } mode;
    zc=0; ffc=0; i=0; n=0; bp=0; rej=-1;
    mode=FIND_BEGIN;
    while (mode!=DONE)
    {
	/* If need a refill */
	if (i==n)
	{
	    /* Fill buffer */
	    n=this->__fill(back,bp,0,0);
	    if (n==0)
		return 0;
	    pos+=n-bp;
#ifdef DEBUG_SKIP_FILL
	    printf("---skip--- refill bp=%x, pos=%x\n", bp, pos);
#endif
	    /* Ignore first saved character */
	    if (mode==FIND_ASC)
		mode=SKIP_ONE;
	    i=0;
	    bp=0;
	}

#ifdef DEBUG_SKIP
	printf("char %c mode %d, c=%d, pos=%x, i=%d l=%d, rej=%d\n", buf[i],
		   mode, c, pos++, i, limit, rej);
#endif
	switch(mode)
	{
	case FIND_BEGIN:
	    switch(buf[i])
	    {
	    case 0:
		zc++;
		ffc=0;
		break;

	    case ((char) 0xff):
		ffc++;
		zc=0;
		break;

	    default:
		if (zc>=tune.st_min_zeros ||
		    (ffc>=tune.st_min_ff && tune.options.ff_intro))
		{
		    unicode=__UNI_PROBE;
		    back[bp++]=buf[i];
		    c=0;
		    rej=0;
		    mode=FIND_ASC_WASASC; // Ignore this character deliberately
		}
		else
		{
		    zc=0;
		    ffc=0;
		}
		break;
	    }
	    break;

	case SKIP_ONE:
	    back[bp++]=buf[i];
	    c=0;
	    rej=0;
	    mode=FIND_ASC_WASASC;
	    break;

	case FIND_ASC_WASASC:	// unicode support
	    if (buf[i]==0 && unicode!=__UNI_NO)
	    {
		if (unicode==__UNI_PROBE)
		{
		    limit=tune.unicode_st; // Start characters needed
		    /* FIXME: Anyone got a better method of doing this? */
		    if ((pos+i) % 2==0)
		    {
			unicode=__UNI_YES_BE; // 0 is first (even) bu#yte
		    }
		    else
		    {
			unicode=__UNI_YES_LE; // 0 is second (odd) byte
		    }
		    back[bp++]=buf[i];
		    mode=FIND_ASC;	      // Inist next character not zero
#ifdef DEBUG_SKIP_UNI
		    cerr<<"Characters are unicode "
			<<((unicode==__UNI_YES_LE) ? "litle" : "big")
			<<" endian\n";
#endif
		}
		else
		    back[bp++]=buf[i];
		c++;
		break;
	    }
	    else if (unicode==__UNI_PROBE)
	    {
#ifdef DEBUG_SKIP_UNI
		if (unicode==__UNI_PROBE)
		    cerr<<"Characters are not unicode\n";
#endif
		limit=tune.non_unicode_st; // Start characters needed
		unicode=__UNI_NO;	   // Non unicode
		mode=FIND_ASC;		   // Finding printable mode
	    }
	    /* fall thru */

	case FIND_ASC:
	    if (isprint(buf[i]) || buf[i]=='\r' || buf[i]=='\n'
		|| buf[i]=='\t')	      // If is printable
	    {
		back[bp++]=buf[i];	      // Store character
		c++;			      // Increase count
		if (unicode!=__UNI_NO)
		    mode=FIND_ASC_WASASC;     // Unicode hack...
	    }
	    else
	    {
		back[bp++]=buf[i];	      // Store caharcetr
		if (buf[i]!=0)
		    rej++;		      // Adjust reject count
		if (buf[i]==0 || rej>tune.st_rej_limit)	// If 0 or reject limit
		{
		    bp=0;		      // Reset buffer pointer
		    unicode=__UNI_PROBE;      // Set unicode status to unknonw
		    ffc=(buf[i]== (char) 0xff) ? 1 : 0;	// reset ff count
		    zc=(buf[i]==0) ? 1 : 0;   // reset zeros count
		    mode=FIND_BEGIN ;	      // Find leading characters mode
		}
	    }
	    if (limit==-1)		      // Bug catcher
	    {
		cerr<<"Bug: limit still -1 at point #1\n";
		exit(2);
	    }
	    if (c==limit)		      // IF found enough
	    {
#ifdef DEBUG_SKIP
		printf("***** Enter exit phase ****\n");
#endif
		mode=DONE;		      // Finished skipping
	    }
	    break;

	case DONE:
	    cerr<<"skip_to_start: loop called in done mode\n";
	    break;

	default:
	    cerr<<"skip_to_start: Mode invalid\n";
	    break;
	}
	i++;
    }
#ifdef DEBUG_SKIP_MOVE
    printf("%c%c%c\n", buf[i-bp], buf[i-bp+1], buf[i-bp+2]);
#endif
    memmove(buf, buf+i-bp, n-i+bp);	      // Move memory
    return(n-i+bp);			      // Return count
}


/* Option handling stuff for tuning this module. Note that only long options
 * are supported. If you lack long options these features are not avialable. */
/* set default tuning */
void word_junk_filter::set_dfl_tuning(void)
{
#ifdef AGGRESIVE_UNICODE
    tune.options.unicode_aggresive=AGGRESIVE_UNICODE; // Aggresive unicode mode
#else
    tune.options.unicode_aggresive=1;		      // Aggresive unicode mode
#endif /* AGGRESIVE_UNICODE */
    tune.resume_chars=RESUME_CHARS;		      // Resume characters
    tune.st_min_zeros=ST_MIN_ZEROS;		      // Minimum zeros
    tune.st_rej_limit=ST_REJ_LIMIT;		      // Reject chars tolerance
    tune.st_min_ff=ST_MIN_FF;			      // Minimum ff's (alternative to 0s)
    tune.options.ff_intro=ALLOW_FF_LEADIN;	      // Enable leading ff's
    tune.non_unicode_st=ST_ASC_VCHARS;		      // ok characters leadin
    tune.unicode_st=ST_ASC_VCHARS_UNICODE;	      // ok unicode characters
    tune.min_text=SINK_MIN_TEXT;		      // text required before sink mode
    tune.max_junk=SINK_JUNK_CNT;		      // junk required for sink mode
    tune.max_top_run=MAX_TOP_RUN;		      // Max run length of top bits set
}

int word_junk_filter::do_reset(void)
{
	pos=0;
	init=INIT_NONE;
	mode=NORMAL;
	junk_end_usage=0;
	ns=0;
	return 1;		// Success
}

