dictzip.c

/* Made up from data.c and other supplementary files of dictd-1.0.11 for the
 * GoldenDict program.
 */

/* data.c -- 
 * Created: Tue Jul 16 12:45:41 1996 by faith@dict.org
 * Revised: Sat Mar 30 10:46:06 2002 by faith@dict.org
 * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 1, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Suite 500, Boston, MA 02110, USA.
 */

#include <stdlib.h>
#include <time.h>
#include "dictzip.h"
#include <limits.h>
#include <stdarg.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "ufile.hh"

/* Size of the scratch buffer dict_read_header uses for CRC reads of text
   files and for the FNAME/COMMENT header strings. */
#define BUFFERSIZE 10240

/* Size of the on-stack buffer in dict_data_read_ that receives one chunk
   as stored in the file; a stored chunk must be strictly smaller than this. */
#define OUT_BUFFER_SIZE 0xffffL

/* Size of each cache slot's buffer, which receives the inflated contents of
   one chunk.  NOTE(review): the "- 12" and "* 0.89" presumably mirror the
   sizing rule of the dictzip compressor (which is where the IN/OUT naming
   would come from) -- confirm against dictzip before changing either. */
#define IN_BUFFER_SIZE ((unsigned long)((double)(OUT_BUFFER_SIZE - 12) * 0.89))

/* For gzip-compatible header, as defined in RFC 1952 */

				/* Magic for GZIP (rfc1952)                */
#define GZ_MAGIC1     0x1f	/* First magic byte                        */
#define GZ_MAGIC2     0x8b	/* Second magic byte                       */

				/* FLaGs (bitmapped), from rfc1952         */
#define GZ_FTEXT      0x01	/* Set for ASCII text                      */
#define GZ_FHCRC      0x02	/* Header CRC16                            */
#define GZ_FEXTRA     0x04	/* Optional field (random access index)    */
#define GZ_FNAME      0x08	/* Original name                           */
#define GZ_COMMENT    0x10	/* Zero-terminated, human-readable comment */
				/* Values for the eXtra FLags byte         */
#define GZ_MAX           2	/* Maximum compression                     */
#define GZ_FAST          4	/* Fastest compression                     */

				/* These are from rfc1952                  */
#define GZ_OS_FAT        0	/* FAT filesystem (MS-DOS, OS/2, NT/Win32) */
#define GZ_OS_AMIGA      1	/* Amiga                                   */
#define GZ_OS_VMS        2	/* VMS (or OpenVMS)                        */
#define GZ_OS_UNIX       3      /* Unix                                    */
#define GZ_OS_VMCMS      4      /* VM/CMS                                  */
#define GZ_OS_ATARI      5      /* Atari TOS                               */
#define GZ_OS_HPFS       6      /* HPFS filesystem (OS/2, NT)              */
#define GZ_OS_MAC        7      /* Macintosh                               */
#define GZ_OS_Z          8      /* Z-System                                */
#define GZ_OS_CPM        9      /* CP/M                                    */
#define GZ_OS_TOPS20    10      /* TOPS-20                                 */
#define GZ_OS_NTFS      11      /* NTFS filesystem (NT)                    */
#define GZ_OS_QDOS      12      /* QDOS                                    */
#define GZ_OS_ACORN     13      /* Acorn RISCOS                            */
#define GZ_OS_UNKNOWN  255      /* unknown                                 */

#define GZ_RND_S1       'R'	/* First magic for random access format    */
#define GZ_RND_S2       'A'	/* Second magic for random access format   */

				/* Byte offsets of the header fields.  The
				   reader in this file consumes the header
				   sequentially and uses only GZ_XLEN (to
				   seed headerLength); the other offsets
				   document the layout.                    */
#define GZ_ID1           0	/* GZ_MAGIC1                               */
#define GZ_ID2           1	/* GZ_MAGIC2                               */
#define GZ_CM            2	/* Compression Method (Z_DEFLATED)         */
#define GZ_FLG	         3	/* FLaGs (see above)                       */
#define GZ_MTIME         4	/* Modification TIME                       */
#define GZ_XFL           8	/* eXtra FLags (GZ_MAX or GZ_FAST)         */
#define GZ_OS            9	/* Operating System                        */
#define GZ_XLEN         10	/* eXtra LENgth (16bit)                    */
#define GZ_FEXTRA_START 12	/* Start of extra fields                   */
#define GZ_SI1          12	/* Subfield ID1                            */
#define GZ_SI2          13      /* Subfield ID2                            */
#define GZ_SUBLEN       14	/* Subfield length (16bit)                 */
#define GZ_VERSION      16      /* Version for subfield format             */
#define GZ_CHUNKLEN     18	/* Chunk length (16bit)                    */
#define GZ_CHUNKCNT     20	/* Number of chunks (16bit)                */
#define GZ_RNDDATA      22	/* Random access data (16bit)              */

/* Debug-category bit flags.  In the code visible in this file only
   DBG_UNZIP is referenced, and only as an argument to PRINTF, which is
   defined below to expand to nothing. */
#define DBG_VERBOSE     (0<<30|1<< 0) /* Verbose                            */
#define DBG_ZIP         (0<<30|1<< 1) /* Zip                                */
#define DBG_UNZIP       (0<<30|1<< 2) /* Unzip                              */
#define DBG_SEARCH      (0<<30|1<< 3) /* Search                             */
#define DBG_SCAN        (0<<30|1<< 4) /* Config file scan                   */
#define DBG_PARSE       (0<<30|1<< 5) /* Config file parse                  */
#define DBG_INIT        (0<<30|1<< 6) /* Database initialization            */
#define DBG_PORT        (0<<30|1<< 7) /* Log port number for connections    */
#define DBG_LEV         (0<<30|1<< 8) /* Levenshtein matching               */
#define DBG_AUTH        (0<<30|1<< 9) /* Debug authentication               */
#define DBG_NODETACH    (0<<30|1<<10) /* Don't detach as a background proc. */
#define DBG_NOFORK      (0<<30|1<<11) /* Don't fork (single threaded)       */
#define DBG_ALT         (0<<30|1<<12) /* altcompare()                      */

/* Log-category bit flags.  Not referenced by the code visible in this
   file. */
#define LOG_SERVER      (0<<30|1<< 0) /* Log server diagnostics             */
#define LOG_CONNECT     (0<<30|1<< 1) /* Log connection information         */
#define LOG_STATS       (0<<30|1<< 2) /* Log termination information        */
#define LOG_COMMAND     (0<<30|1<< 3) /* Log commands                       */
#define LOG_FOUND       (0<<30|1<< 4) /* Log words found                    */
#define LOG_NOTFOUND    (0<<30|1<< 5) /* Log words not found                */
#define LOG_CLIENT      (0<<30|1<< 6) /* Log client                         */
#define LOG_HOST        (0<<30|1<< 7) /* Log remote host name               */
#define LOG_TIMESTAMP   (0<<30|1<< 8) /* Log with timestamps                */
#define LOG_MIN         (0<<30|1<< 9) /* Log a few minimal things           */
#define LOG_AUTH        (0<<30|1<<10) /* Log authentication denials         */

/* Log record kinds.  Not referenced by the code visible in this file. */
#define DICT_LOG_TERM    0
#define DICT_LOG_DEFINE  1
#define DICT_LOG_MATCH   2
#define DICT_LOG_NOMATCH 3
#define DICT_LOG_CLIENT  4
#define DICT_LOG_TRACE   5
#define DICT_LOG_COMMAND 6
#define DICT_LOG_AUTH    7
#define DICT_LOG_CONNECT 8

/* Values of dictData.type, assigned by dict_read_header and dispatched on
   by dict_data_read_. */
#define DICT_UNKNOWN    0	/* Not yet classified                       */
#define DICT_TEXT       1	/* No gzip magic: read as plain text        */
#define DICT_GZIP       2	/* gzip without the random-access subfield  */
#define DICT_DZIP       3	/* gzip with the 'R','A' subfield (dictzip) */

#include <ctype.h>
#include <fcntl.h>
#include <assert.h>

#include <sys/stat.h>

/* When non-zero, dict_data_read_ looks for the wanted chunk among the
   already-inflated cache slots before reading it from the file. */
#define USE_CACHE 1

/* Filtering and debug tracing are compiled out: both macros expand to
   nothing, so every call to them (and its arguments) disappears. */
#define dict_data_filter( ... )
#define PRINTF( ... )

/* Allocation wrappers are plain malloc/free here, so xmalloc can return
   NULL and performs no error handling of its own. */
#define xmalloc malloc
#define xfree free

/* Program name used as the prefix of messages printed by the err_*
   routines below. */
static const char * _err_programName = "GoldenDict";

/* Logging hooks are compiled out: both expand to nothing. */
#define log_error( ... )
#define log_error_va( ... )

/* \doc |err_fatal| flushes "stdout", prints a fatal error report on
   "stderr", flushes "stderr" and "stdout", and calls |exit| with status 1.
   |routine| is the name of the routine in which the error took place (may
   be NULL); |format| and the following arguments are printf-style. */

static void err_fatal( const char *routine, const char *format, ... )
{
   va_list args;

   fflush( stdout );

   /* Message prefix: "program (routine): ", "program: " or "routine: ",
      depending on which of the two names is available. */
   if (_err_programName && routine)
      fprintf( stderr, "%s (%s): ", _err_programName, routine );
   else if (_err_programName)
      fprintf( stderr, "%s: ", _err_programName );
   else if (routine)
      fprintf( stderr, "%s: ", routine );

   va_start( args, format );
   vfprintf( stderr, format, args );
   log_error_va( routine, format, args );
   va_end( args );

   fflush( stderr );
   fflush( stdout );
   exit( 1 );
}

/* \doc |err_fatal_errno| behaves like |err_fatal| but additionally reports
   the system error for the value |errno| had on entry: it flushes "stdout",
   prints the fatal error report and the system error on "stderr", flushes
   "stderr" and "stdout", and calls |exit| with status 1.  |routine| is the
   name of the routine in which the error took place. */

static void err_fatal_errno( const char *routine, const char *format, ... )
{
   va_list   args;
   const int savedErrno = errno;   /* taken first, ahead of the stdio calls */

   fflush( stdout );

   /* Message prefix: "program (routine): ", "program: " or "routine: ". */
   if (_err_programName && routine)
      fprintf( stderr, "%s (%s): ", _err_programName, routine );
   else if (_err_programName)
      fprintf( stderr, "%s: ", _err_programName );
   else if (routine)
      fprintf( stderr, "%s: ", routine );

   va_start( args, format );
   vfprintf( stderr, format, args );
   log_error_va( routine, format, args );
   va_end( args );

#if HAVE_STRERROR
   fprintf( stderr, "%s: %s\n", routine, strerror( savedErrno ) );
   log_error( routine, "%s: %s\n", routine, strerror( savedErrno ) );
#else
   /* perror reads errno, so put the saved value back first. */
   errno = savedErrno;
   perror( routine );
   log_error( routine, "%s: errno = %d\n", routine, savedErrno );
#endif

   fflush( stderr );
   fflush( stdout );
   exit( 1 );
}

/* \doc |err_internal| flushes "stdout", prints the fatal error message
   (prefixed with the program and/or routine name and "Internal error"),
   flushes "stderr" and "stdout", and calls |abort| so that a core dump is
   generated.  |routine| may be NULL; |format| and the following arguments
   are printf-style. */

static void err_internal( const char *routine, const char *format, ... )
{
   va_list ap;

   fflush( stdout );
   if (_err_programName) {
      if (routine)
         fprintf( stderr, "%s (%s): Internal error\n     ",
                  _err_programName, routine );
      else
         fprintf( stderr, "%s: Internal error\n     ", _err_programName );
   } else {
      if (routine) fprintf( stderr, "%s: Internal error\n     ", routine );
      else         fprintf( stderr, "Internal error\n     " );
   }

   va_start( ap, format );
   vfprintf( stderr, format, ap );
   /* Hand the argument list to the va_list flavour of the logger, as
      err_fatal and err_fatal_errno do; the variadic log_error would treat
      |ap| as a single ordinary argument.  Both hooks currently expand to
      nothing; if log_error_va ever gets a real body, |ap| must be
      re-initialised (va_copy) because vfprintf has already consumed it. */
   log_error_va( routine, format, ap );
   va_end( ap );

   if (_err_programName)
      fprintf( stderr, "Aborting %s...\n", _err_programName );
   else
      fprintf( stderr, "Aborting...\n" );
   fflush( stderr );
   fflush( stdout );
   abort();
}

/* Fallback for __func__, which the code below passes to the err_* routines
   as the routine name.  The guards only test whether preprocessor MACROS
   named __func__ / __FUNCTION__ exist.
   NOTE(review): in C99 and later __func__ is a predefined identifier, not a
   macro, and compilers commonly treat __FUNCTION__ the same way, so this
   probably always ends up defining __func__ as __FILE__ (messages would
   then show the file name instead of the function name) -- confirm whether
   that is intended for the supported compilers. */
#ifndef __func__
# ifdef __FUNCTION__
#  define __func__  __FUNCTION__
# else
#  define __func__  __FILE__
# endif
#endif

/* Non-zero status codes returned by dict_read_header.  5 is the value the
   routine has always returned for an invalid chunk count. */
enum {
   DICT_HDR_BAD_FORMAT = 5,   /* Truncated or inconsistent gzip/dzip header */
   DICT_HDR_IO_ERROR   = 6,   /* stat, seek or read failure                 */
   DICT_HDR_NO_MEMORY  = 7    /* Chunk or offset table allocation failed    */
};

/* Read an |nbytes|-wide little-endian unsigned value from |str|.  Each byte
   is widened to unsigned long before shifting so that a top byte >= 0x80
   neither overflows an int nor sign-extends.  End of file is not reported
   here; callers check feof()/ferror() after a group of reads. */
static unsigned long dict_read_le( FILE *str, int nbytes )
{
   unsigned long value = 0;
   int           i;

   for (i = 0; i < nbytes; i++)
      value |= ((unsigned long)getc( str ) & 0xffUL) << (8 * i);
   return value;
}

/* Consume one zero-terminated header string (FNAME or COMMENT) from |str|
   into |buffer| and return its length without the terminator.  A string
   that would not fit in |size| bytes is reported through err_fatal, which
   does not return. */
static size_t dict_read_zstring( FILE *str, char *buffer, size_t size,
                                 const char *routine, const char *field,
                                 const char *filename )
{
   char *pt = buffer;
   int  c;

   while ((c = getc( str )) && c != EOF) {
      *pt++ = (char)c;
      if (pt == buffer + size)
         err_fatal( routine,
                    "too long %s field in dzip file \"%s\"\n",
                    field, filename );
   }
   *pt = '\0';
   return (size_t)(pt - buffer);
}

/* Classify |filename| and fill in |header|.
 *
 * A file that does not start with the gzip magic is treated as plain text
 * (DICT_TEXT): length and mtime come from fstat, and the CRC32 of the whole
 * file is computed when |computeCRC| is non-zero.  Otherwise the gzip header
 * is parsed; if it carries the 'R','A' random-access subfield the file is
 * DICT_DZIP and the per-chunk sizes and file offsets are loaded, else it
 * stays DICT_GZIP.  The original file name and comment are skipped, not
 * retained.
 *
 * Returns 0 on success or one of the DICT_HDR_* codes on failure; on
 * failure no table allocated here is left attached to |header|.
 *
 * NOTE: as before, an unopenable file, an over-long FNAME/COMMENT, an
 * unsupported dzip version and a header whose length does not match the
 * file position are reported through the err_* routines, which terminate
 * the process.
 */
static int dict_read_header( const char *filename,
			     dictData *header, int computeCRC )
{
   FILE          *str;
   int           id1, id2, si1, si2;
   char          buffer[BUFFERSIZE];
   int           extraLength;
   int           i;
   int           rc;
   long          pos;
   size_t        count;
   struct stat   sb;
   unsigned long crc   = crc32( 0L, Z_NULL, 0 );
   unsigned long offset;

   if (!(str = gd_fopen( filename, "rb" )))
      err_fatal_errno( __func__,
                       "Cannot open data file \"%s\" for read\n", filename );

   header->filename     = NULL;
   header->headerLength = GZ_XLEN - 1;   /* index of the last header byte */
   header->type         = DICT_UNKNOWN;
   header->chunkCount   = 0;
   header->chunks       = NULL;
   header->offsets      = NULL;

   id1                  = getc( str );
   id2                  = getc( str );

   if (id1 != GZ_MAGIC1 || id2 != GZ_MAGIC2) {
      header->type = DICT_TEXT;
      if (fstat( fileno( str ), &sb ) != 0) {
         rc = DICT_HDR_IO_ERROR;
         goto fail;
      }
      header->compressedLength = header->length = sb.st_size;
      header->origFilename     = NULL;
      header->mtime            = sb.st_mtime;
      if (computeCRC) {
         rewind( str );
         /* Stop on a short read of zero bytes: looping on !feof() alone
            never terminates when the stream is in an error state. */
         while ((count = fread( buffer, 1, BUFFERSIZE, str )) > 0)
            crc = crc32( crc, (const unsigned char *)buffer,
                         (unsigned int)count );
         if (ferror( str )) {
            rc = DICT_HDR_IO_ERROR;
            goto fail;
         }
      }
      header->crc = crc;
      fclose( str );
      return 0;
   }
   header->type = DICT_GZIP;

   header->method       = getc( str );
   header->flags        = getc( str );
   header->mtime        = dict_read_le( str, 4 );
   header->extraFlags   = getc( str );
   header->os           = getc( str );

   if (feof( str ) || ferror( str )) {
      rc = DICT_HDR_BAD_FORMAT;
      goto fail;
   }

   if (header->flags & GZ_FEXTRA) {
      extraLength           = (int)dict_read_le( str, 2 );
      header->headerLength += extraLength + 2;
      si1                   = getc( str );
      si2                   = getc( str );

      if (si1 == GZ_RND_S1 && si2 == GZ_RND_S2) {
         (void)dict_read_le( str, 2 );   /* subfield length: not needed */
         header->version     = dict_read_le( str, 2 );
         header->chunkLength = dict_read_le( str, 2 );
         header->chunkCount  = dict_read_le( str, 2 );

         if (feof( str ) || ferror( str )) {
            header->chunkCount = 0;
            rc = DICT_HDR_BAD_FORMAT;
            goto fail;
         }

         if (header->version != 1)
            err_internal( __func__,
                          "dzip header version %d not supported\n",
                          header->version );

         /* A zero chunk length would later be used as a divisor when
            mapping byte ranges to chunks, so reject it with the count. */
         if (header->chunkCount <= 0 || header->chunkLength <= 0) {
            header->chunkCount = 0;
            rc = DICT_HDR_BAD_FORMAT;
            goto fail;
         }

         header->chunks = xmalloc( sizeof( header->chunks[0] )
                                   * header->chunkCount );
         if (!header->chunks) {
            header->chunkCount = 0;
            rc = DICT_HDR_NO_MEMORY;
            goto fail;
         }
         for (i = 0; i < header->chunkCount; i++)
            header->chunks[i] = dict_read_le( str, 2 );

         if (feof( str ) || ferror( str )) {
            header->chunkCount = 0;
            rc = DICT_HDR_BAD_FORMAT;
            goto fail;
         }
         header->type = DICT_DZIP;
      } else {
         /* Skip a foreign extra field.  headerLength is the index of the
            last header byte, so the next field starts one past it. */
         if (fseek( str, header->headerLength + 1, SEEK_SET ) != 0) {
            rc = DICT_HDR_IO_ERROR;
            goto fail;
         }
      }
   }

   if (header->flags & GZ_FNAME)
      header->headerLength += dict_read_zstring( str, buffer, sizeof( buffer ),
                                                 __func__, "FNAME",
                                                 filename ) + 1;
   header->origFilename = NULL;   /* name is skipped, not kept */

   if (header->flags & GZ_COMMENT)
      header->headerLength += dict_read_zstring( str, buffer, sizeof( buffer ),
                                                 __func__, "COMMENT",
                                                 filename ) + 1;
   header->comment = NULL;        /* comment is skipped, not kept */

   if (header->flags & GZ_FHCRC) {
      getc( str );
      getc( str );
      header->headerLength += 2;
   }

   if (feof( str ) || ferror( str )) {
      rc = DICT_HDR_BAD_FORMAT;
      goto fail;
   }

   pos = ftell( str );
   if (pos != header->headerLength + 1)
      err_internal( __func__,
                    "File position (%ld) != header length + 1 (%d)\n",
                    pos, header->headerLength + 1 );

   /* gzip trailer: CRC32 and uncompressed length, 4 bytes each. */
   if (fseek( str, -8, SEEK_END ) != 0) {
      rc = DICT_HDR_IO_ERROR;
      goto fail;
   }
   header->crc              = dict_read_le( str, 4 );
   header->length           = dict_read_le( str, 4 );
   header->compressedLength = ftell( str );

   if (feof( str ) || ferror( str )) {
      rc = DICT_HDR_BAD_FORMAT;
      goto fail;
   }

				/* Compute offsets */
   if (header->chunkCount > 0) {
      header->offsets = xmalloc( sizeof( header->offsets[0] )
                                 * header->chunkCount );
      if (!header->offsets) {
         rc = DICT_HDR_NO_MEMORY;
         goto fail;
      }
      for (offset = header->headerLength + 1, i = 0;
           i < header->chunkCount;
           i++)
      {
         header->offsets[i] = offset;
         offset += header->chunks[i];
      }
   }

   fclose( str );
   return 0;

fail:
   /* xmalloc is plain malloc in this file, so free() is its counterpart. */
   free( header->chunks );
   header->chunks     = NULL;
   header->chunkCount = 0;
   fclose( str );
   return rc;
}

dictData *dict_data_open( const char *filename, int computeCRC )
{
   dictData    *h = NULL;
   struct stat sb;
   int         j;

   if (!filename)
      return NULL;

   h = xmalloc( sizeof( struct dictData ) );

   memset( h, 0, sizeof( struct dictData ) );
   h->initialized = 0;

   if (dict_read_header( filename, h, computeCRC )) {
     return 0; /*
      err_fatal( __func__,
     "\"%s\" not in text or dzip format\n", filename );*/
   }

   h->fd = gd_fopen( filename, "rb" );

   if ( !h->fd )
   {
     return 0;
      /*err_fatal_errno( __func__,
           "Cannot open data file \"%s\"\n", filename );*/
    }

   fseek( h->fd, 0, SEEK_END );

   h->size = ftell( h->fd );

   for (j = 0; j < DICT_CACHE_SIZE; j++) {
      h->cache[j].chunk    = -1;
      h->cache[j].stamp    = -1;
      h->cache[j].inBuffer = NULL;
      h->cache[j].count    = 0;
   }
   
   return h;
}

/* Release everything owned by |header|: the open stream, the chunk and
   offset tables, the inflation engine (if it was ever initialised) and the
   cache buffers; then wipe and free the structure itself.  A NULL |header|
   is ignored.  A failure to shut down the inflation engine is reported
   through err_internal, which aborts. */
void dict_data_close( dictData *header )
{
   int slot;

   if (header == NULL)
      return;

   if (header->fd != NULL)
      fclose( header->fd );

   /* xfree is free() in this file, which accepts NULL pointers. */
   xfree( header->chunks );
   xfree( header->offsets );

   if (header->initialized && inflateEnd( &header->zStream ) != 0)
      err_internal( __func__,
                    "Cannot shut down inflation engine: %s\n",
                    header->zStream.msg );

   for (slot = 0; slot < DICT_CACHE_SIZE; slot++)
      xfree( header->cache[slot].inBuffer );

   memset( header, 0, sizeof( *header ) );
   xfree( header );
}

/* Read |size| bytes starting at uncompressed offset |start| from |h|.
 *
 * Text files are read directly.  For dzip files the byte range is mapped to
 * the chunks that cover it; each chunk is taken from the handle's cache or
 * read from the file and inflated into the least recently used cache slot,
 * and the wanted part of it is copied out.
 *
 * |preFilter| and |postFilter| are only passed to dict_data_filter, which
 * is compiled out in this file.
 *
 * Returns a newly allocated, NUL-terminated buffer of |size| + 1 bytes that
 * the caller must free, or NULL when memory is exhausted, the range is
 * outside the file's chunks, a chunk holds fewer bytes than requested, or a
 * seek/read fails.
 *
 * NOTE: as before, pure gzip / unknown file types, inflate failures and
 * inconsistent chunk lengths are reported through err_fatal/err_internal,
 * which terminate the process.
 */
char *dict_data_read_ (
   dictData *h, unsigned long start, unsigned long size,
   const char *preFilter, const char *postFilter )
{
   char          *buffer, *pt;
   unsigned long end;
   unsigned long chunkLen;
   int           count;
   char          *inBuffer;
   char          outBuffer[OUT_BUFFER_SIZE];
   int           firstChunk, lastChunk;
   int           firstOffset, lastOffset;
   int           copyFrom, copyTo;
   int           i, j;
   int           found, target, lastStamp;
   static int    stamp = 0;

   end  = start + size;

   /* Reject ranges whose end, or whose size + 1 for the terminator, would
      wrap around. */
   if (size == ULONG_MAX || end < start)
      return NULL;

   buffer = xmalloc( size + 1 );
   if (!buffer)
      return NULL;

   if ( !size )
   {
     *buffer = 0;
     return buffer;
   }

   PRINTF(DBG_UNZIP,
	  ("dict_data_read( %p, %lu, %lu, %s, %s )\n",
	   h, start, size, preFilter, postFilter ));

   assert( h != NULL);
   switch (h->type) {
   case DICT_GZIP:
      err_fatal( __func__,
		 "Cannot seek on pure gzip format files.\n"
		 "Use plain text (for performance)"
		 " or dzip format (for space savings).\n" );
      break;
   case DICT_TEXT:
      if ( fseek( h->fd, start, SEEK_SET ) != 0 ||
           fread( buffer, size, 1, h->fd ) != 1 )
         goto fail;

      buffer[size] = '\0';
      break;
   case DICT_DZIP:
      if (!h->initialized) {
         ++h->initialized;
         h->zStream.zalloc    = NULL;
         h->zStream.zfree     = NULL;
         h->zStream.opaque    = NULL;
         h->zStream.next_in   = 0;
         h->zStream.avail_in  = 0;
         h->zStream.next_out  = NULL;
         h->zStream.avail_out = 0;
         if (inflateInit2( &h->zStream, -15 ) != Z_OK)
            err_internal( __func__,
                          "Cannot initialize inflation engine: %s\n",
                          h->zStream.msg );
      }

      /* chunkLength is used as a divisor; a handle without a usable chunk
         table cannot serve any range. */
      if (h->chunkLength <= 0 || h->chunkCount <= 0)
         goto fail;
      chunkLen = (unsigned long)h->chunkLength;

      /* The last wanted byte is end - 1 (size > 0 here).  Deriving the
         last chunk from it, rather than from end, keeps a range that stops
         exactly on a chunk boundary from touching a chunk past the end of
         the table.  lastOffset is therefore in 1..chunkLength. */
      if ((end - 1) / chunkLen >= (unsigned long)h->chunkCount)
         goto fail;

      firstChunk  = (int)(start / chunkLen);
      firstOffset = (int)(start % chunkLen);
      lastChunk   = (int)((end - 1) / chunkLen);
      lastOffset  = (int)((end - 1) % chunkLen) + 1;
      PRINTF(DBG_UNZIP,
	     ("   start = %lu, end = %lu\n"
	      "firstChunk = %d, firstOffset = %d,"
	      " lastChunk = %d, lastOffset = %d\n",
	      start, end, firstChunk, firstOffset, lastChunk, lastOffset ));
      for (pt = buffer, i = firstChunk; i <= lastChunk; i++) {

				/* Access cache */
         found  = 0;
         target = 0;
         lastStamp = INT_MAX;
         for (j = 0; j < DICT_CACHE_SIZE; j++) {
#if USE_CACHE
            if (h->cache[j].chunk == i) {
               found  = 1;
               target = j;
               break;
            }
#endif
            /* Remember the slot with the oldest stamp as the victim. */
            if (h->cache[j].stamp < lastStamp) {
               lastStamp = h->cache[j].stamp;
               target = j;
            }
         }

         h->cache[target].stamp = ++stamp;
         if (found) {
            count = h->cache[target].count;
            inBuffer = h->cache[target].inBuffer;
         } else {
            /* Mark the slot empty until its new contents are complete, so
               that a failed read cannot leave it advertising chunk i. */
            h->cache[target].chunk = -1;
            h->cache[target].count = 0;

            if (!h->cache[target].inBuffer) {
               h->cache[target].inBuffer = xmalloc( IN_BUFFER_SIZE );
               if (!h->cache[target].inBuffer)
                  goto fail;
            }
            inBuffer = h->cache[target].inBuffer;

            if (h->chunks[i] >= OUT_BUFFER_SIZE ) {
               err_internal( __func__,
                             "h->chunks[%d] = %d >= %ld (OUT_BUFFER_SIZE)\n",
                             i, h->chunks[i], OUT_BUFFER_SIZE );
            }

            if ( fseek( h->fd, h->offsets[ i ], SEEK_SET ) != 0 ||
                 fread( outBuffer, h->chunks[ i ], 1, h->fd ) != 1 )
               goto fail;

            dict_data_filter( outBuffer, &count, OUT_BUFFER_SIZE, preFilter );

            h->zStream.next_in   = (unsigned char *)outBuffer;
            h->zStream.avail_in  = h->chunks[i];
            h->zStream.next_out  = (unsigned char *)inBuffer;
            h->zStream.avail_out = IN_BUFFER_SIZE;
            if (inflate( &h->zStream,  Z_PARTIAL_FLUSH ) != Z_OK)
               err_fatal( __func__, "inflate: %s\n", h->zStream.msg );
            if (h->zStream.avail_in)
               err_internal( __func__,
                             "inflate did not flush (%d pending, %d avail)\n",
                             h->zStream.avail_in, h->zStream.avail_out );

            count = IN_BUFFER_SIZE - h->zStream.avail_out;
            dict_data_filter( inBuffer, &count, IN_BUFFER_SIZE, postFilter );

            h->cache[target].count = count;
            h->cache[target].chunk = i;
         }

         /* Part of this chunk that belongs to the requested range. */
         copyFrom = (i == firstChunk) ? firstOffset : 0;
         copyTo   = (i == lastChunk)  ? lastOffset  : h->chunkLength;

         /* Every chunk except the file's last one must inflate to exactly
            chunkLength bytes. */
         if (i != lastChunk && count != h->chunkLength)
            err_internal( __func__,
                          "Length = %d instead of %d\n",
                          count, h->chunkLength );

         /* Never copy more than was actually inflated. */
         if (copyTo > count)
            goto fail;

         memcpy( pt, inBuffer + copyFrom, copyTo - copyFrom );
         pt += copyTo - copyFrom;
      }
      *pt = '\0';
      break;
   case DICT_UNKNOWN:
   default:
      err_fatal( __func__, "Cannot read unknown file type\n" );
      break;
   }

   return buffer;

fail:
   xfree( buffer );
   return NULL;
}