Logo Search packages:      
Sourcecode: latrine version File versions  Download package

data.c

/* vim: set noet ts=4:
 *
 * Copyright (c) 2002-2004 Martin A. Godisch <martin@godisch.de>.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place, Suite 330, Boston, MA 02111-1307 USA.
 */
#include <data.h>
#include <freedict.h>
#include <latrine.h>
#include <memory.h>
#include <stdio.h>
#include <time.h>

/* Path of the dictionary file that load_wordlist() hands to the dictionary reader. */
char *dictfile = NULL;
/* Path of the gzip-compressed hits file read by load_wordlist() and rewritten by save_wordlist(). */
char *wordfile = NULL;

/* select_word() picks its random index from the first randcount list entries. */
size_t randcount = DEFAULT_RANDCOUNT;
/* Capacity of the word list; 0 makes load_wordlist() grow the list in WORDSTEP steps instead. */
size_t wordlimit = DEFAULT_WORDLIMIT;

/* Number of dictionary entries read by the last load_wordlist() run. */
static size_t dictcount = 0;
/* Number of entries currently stored in wordlist. */
static size_t wordcount = 0;

/* The in-memory word list, owned by this file. */
static struct word *wordlist = NULL;

/* Rate bookkeeping maintained by load_wordlist(); largest_in is also refreshed by select_word(). */
static double smallest_out = 0.0;
static double largest_in = 0.0;

/* Returns the number of entries currently held in the word list. */
size_t get_wordcount(void)
{
      return wordcount;
}
/* Returns the number of dictionary entries counted by the last load_wordlist() run. */
size_t get_dictcount(void)
{
      return dictcount;
}

static int comp_by_access(const void *a, const void *b)
{
      time_t q1 = ((struct word*)a)->timestamp;
      time_t q2 = ((struct word*)b)->timestamp;
      if (q1 < q2)
            return -1;
      else if (q1 > q2)
            return +1;
      return 0;
}

static int comp_by_rate(const void *a, const void *b)
{
      double q1 = ((struct word*)a)->rate;
      double q2 = ((struct word*)b)->rate;
      if (q1 < q2)
            return -1;
      else if (q1 > q2)
            return +1;
      return comp_by_access(a, b);
}

static int comp_by_pos(const void *a, const void *b)
{
      double q1 = ((struct word*)a)->pos;
      double q2 = ((struct word*)b)->pos;
      if (q1 < q2)
            return -1;
      else if (q1 > q2)
            return +1;
      return 0;
}

/* Compute the rate of a word from the low 24 bits of its index, store it in
 * w->rate and return it.
 *
 * The rate is the number of set bits among bits 0..23, plus the length of
 * the unbroken run of set bits that starts at bit 0.
 */
static inline double rate(struct word *w)
{
      double score = 0.0;
      int bit;

      assert(w != NULL);

      /* one point for every set bit */
      for (bit = 0; bit < 24; bit++) {
            if (w->index & (1UL << bit))
                  score += 1.0;
      }

      /* one extra point per bit of the run of set bits starting at bit 0 */
      for (bit = 0; bit < 24 && (w->index & (1UL << bit)); bit++)
            score += 1.0;

      w->rate = score;
      return w->rate;
}

/* Record the outcome of one query for a word.
 *
 * The index is shifted left by one bit and cut to 24 bits; the new lowest
 * bit is set when hit is non-zero. The timestamp is set to the current time
 * and the rate is recomputed.
 */
void update_word(struct word *w, int hit)
{
      assert(w != NULL);

      w->index = (w->index << 1) & 0xffffff;
      if (hit)
            w->index |= 1;
      w->timestamp = time(NULL);
      rate(w);
}

/* Pick the next word to ask for.
 *
 * The list is re-sorted on every call: three calls in a row by rate (which
 * also refreshes largest_in from the last list entry), the fourth call by
 * timestamp, then the cycle restarts. A random entry is drawn from the first
 * randcount entries; as far as randcount allows, an entry whose pos equals
 * one of the last three returned positions is rejected and drawn again.
 *
 * randcount is clamped to wordcount as a side effect. The list must not be
 * empty (asserted).
 *
 * Returns a pointer into the word list.
 */
struct word *select_word(void)
{
      static size_t
            recent[3] = {(size_t)(-1), (size_t)(-1), (size_t)(-1)};
      static int
            round     = 0;
      size_t
            pick      = 0;

      if (randcount > wordcount)
            randcount = wordcount;
      assert(randcount > 0);

      if (round >= 3) {
            qsort(wordlist, wordcount, sizeof(struct word), comp_by_access);
            round = 0;
      } else {
            qsort(wordlist, wordcount, sizeof(struct word), comp_by_rate);
            largest_in = wordlist[wordcount-1].rate;
            round++;
      }

      for (;;) {
            pick = random() % randcount;
            /* each history slot is only enforced while enough candidates remain */
            if (randcount > 1 && wordlist[pick].pos == recent[0])
                  continue;
            if (randcount > 2 && wordlist[pick].pos == recent[1])
                  continue;
            if (randcount > 3 && wordlist[pick].pos == recent[2])
                  continue;
            break;
      }

      /* push the chosen position onto the three-entry history */
      recent[2] = recent[1];
      recent[1] = recent[0];
      recent[0] = wordlist[pick].pos;
      return &wordlist[pick];
}

static inline void free_wordlist(void)
{
      size_t i;
      for (i = 0; i < wordcount; i++) {
            FREE(&wordlist[i].lang[0]);
            FREE(&wordlist[i].lang[1]);
      }
      if (wordlist != NULL)
            free(wordlist);
      wordlist  = NULL;
      wordcount = 0;
}

/* Scan an open hits file for its version header.
 *
 * Lines are read until one matches HEADER. Returns the version found when it
 * lies in the range 2..WORDLIST_VERSION, and 0 when it lies outside that
 * range or when no header line exists at all. The read position of the file
 * is advanced; callers rewind it themselves.
 */
static inline int check_wordlist_version(gzFile hits)
{
      char line[BUFSIZE];
      int  found;

      assert(hits != NULL);
      while (gzgets(hits, line, BUFSIZE) != Z_NULL) {
            if (sscanf(line, HEADER, &found) != 1)
                  continue;
            /* the first header line decides */
            return (found < 2 || found > WORDLIST_VERSION) ? 0 : found;
      }
      /* no version marker found: reported as unsupported, like any version below 2 */
      return 0;
}

/* Load the dictionary (dictfile) together with its hits file (wordfile) into
 * the in-memory word list; an existing word list is overridden.
 *
 * With wordlimit == 0 the list grows in WORDSTEP increments and keeps every
 * entry. Otherwise it holds at most wordlimit entries: once it is full, an
 * incoming entry whose rate is below the highest rate in the list replaces
 * an entry carrying that highest rate.
 *
 * A missing hits file is not an error; a hits file with an unsupported
 * version is reported and ignored. On success the random generator is
 * seeded with the current time and the list is sorted by rate.
 *
 * returns  0: success
 * returns -1: failure (errmsg called, wordlist freed)
 *
 * NOTE(review): when the dictionary cannot be opened the function returns -1
 * right away and keeps the (possibly resized) list; the error is presumably
 * reported by the dictionary opener -- confirm.
 */
int load_wordlist(void)
{
      int (*open_dict)(const char*)          = open_freedict;
      int (*read_dict)(gzFile, struct word*) = read_freedict;
      int (*close_dict)(void)                = close_freedict;
      gzFile hits = NULL;
      double q    = 0.0;
      struct word w;
      size_t cursize, i;
      size_t stale;
      time_t randinit;
      int    ret;

      assert(dictfile != NULL);
      assert(wordfile != NULL);
      if (wordlimit == 0) {
            if (wordlist == NULL) {
                  wordlist = (struct word*)MALLOC((cursize = WORDSTEP) * sizeof(struct word));
                  /* cleared explicitly so that the FREE() calls below never see
                   * indeterminate pointers, whether or not MALLOC clears memory */
                  memset(wordlist, 0, cursize * sizeof(struct word));
            } else
                  cursize  = wordcount;
      } else {
            if (wordlist == NULL) {
                  wordlist = (struct word*)MALLOC(wordlimit * sizeof(struct word));
                  memset(wordlist, 0, wordlimit * sizeof(struct word));
            } else {
                  /* shrinking: release the strings of the entries that are cut off */
                  for (i = wordlimit; i < wordcount; i++) {
                        FREE(&wordlist[i].lang[0]);
                        FREE(&wordlist[i].lang[1]);
                  }
                  if (wordcount > wordlimit)
                        wordcount = wordlimit;
                  wordlist = (struct word*)REALLOC(wordlist, wordlimit * sizeof(struct word));
                  /* memset() takes a byte count, not an element count */
                  if (wordlimit > wordcount)
                        memset(&wordlist[wordcount], 0, (wordlimit - wordcount) * sizeof(struct word));
            }
            cursize = wordlimit;
      }
      /* entries [0, stale) may still own strings from a previous load */
      stale = wordcount;
      if (open_dict(dictfile) == -1)
            return -1;
      /* gzopen() leaves errno untouched when it runs out of memory, so a stale
       * value must not be mistaken for the cause of a failure */
      errno = 0;
      if ((hits = gzopen(wordfile, "rb")) == NULL && errno != ENOENT)
            errmsg(_("cannot open wordlist: %s"), errno == 0 ? zError(Z_MEM_ERROR) : strerror(errno));
      if (hits != NULL) {
            if (check_wordlist_version(hits))
                  gzrewind(hits);
            else {
                  errmsg(_("ignoring wordlist because of incompatible version"));
                  gzclose(hits);
                  hits = NULL;
            }
      }
      for (dictcount = 0, wordcount = 0; (ret = read_dict(hits, &w)) == 1; dictcount++) {
            if (wordcount >= cursize && wordlimit == 0) {
                  /* unlimited mode: grow the list by another WORDSTEP entries */
                  assert(wordcount == cursize);
                  wordlist = (struct word*)REALLOC(wordlist, (cursize += WORDSTEP) * sizeof(struct word));
                  memset(&wordlist[wordcount], 0, (cursize - wordcount) * sizeof(struct word));
            }
            q = rate(&w);
            if (wordcount < cursize) {
                  /* free slot: take the entry and keep track of the highest rate */
                  if (wordcount == 0 || q > largest_in)
                        largest_in = q;
                  FREE(&wordlist[wordcount].lang[0]);
                  FREE(&wordlist[wordcount].lang[1]);
                  wordlist[wordcount++] = w;
                  continue;
            }
            if (q < largest_in) {
                  /* list is full: evict an entry carrying the highest rate */
                  for (i = 0; i < cursize; i++)
                        if (rate(&wordlist[i]) == largest_in)
                              break;
                  assert(i < cursize);
                  FREE(&wordlist[i].lang[0]);
                  FREE(&wordlist[i].lang[1]);
                  wordlist[i]  = w;
                  smallest_out = largest_in;
                  /* the new maximum is the highest rate left in the list, which
                   * is not necessarily the rate of the entry just inserted */
                  largest_in   = q;
                  for (i = 0; i < cursize; i++)
                        if (wordlist[i].rate > largest_in)
                              largest_in = wordlist[i].rate;
                  continue;
            }
            /* entry rejected */
            if (wordcount == cursize || q < smallest_out)
                  smallest_out = q;
            FREE(&w.lang[0]);
            FREE(&w.lang[1]);
      }
      /* NOTE(review): presumably the reader leaves w without owned strings
       * after a stored entry or a failed read -- confirm against the reader */
      FREE(&w.lang[0]);
      FREE(&w.lang[1]);
      /* entries of a previous load that this load did not overwrite still own
       * their strings; release them before the list is shrunk or freed */
      for (i = wordcount; i < stale; i++) {
            FREE(&wordlist[i].lang[0]);
            FREE(&wordlist[i].lang[1]);
      }
      close_dict();
      if (hits != NULL)
            gzclose(hits);
      if (ret == -1 || wordcount == 0)
            free_wordlist();
      else {
            if (wordcount < cursize)
                  wordlist = (struct word*)REALLOC(wordlist, wordcount * sizeof(struct word));
            time(&randinit);
            srandom(randinit);
            qsort(wordlist, wordcount, sizeof(struct word), comp_by_rate);
      }
      if (ret == 0 && wordcount == 0) {
            errmsg(_("invalid or empty dictionary"));
            ret = -1;
      }
      return ret;
}

/* Save the hits of the current word list to the hits file (wordfile).
 *
 * The new content is written to "<wordfile>.new" and renamed over wordfile
 * once it has been closed successfully. Records of an existing, compatible
 * hits file are carried over for every dictionary position that is not in
 * the word list; positions without an old record are written as zeroes, up
 * to dictcount records in total. The word list is left sorted by position.
 * Nothing is written when the word list is empty.
 *
 * returns  0: success
 * returns -1: failure (errmsg called)
 */
int save_wordlist(void)
{
      char buffer[BUFSIZE];
      char *tempfile   = NULL;
      gzFile old       = NULL;
      gzFile new       = NULL;
      unsigned long
            index     = 0,
            timestamp = 0;
      size_t i, n;
      int ret;

      if (wordcount == 0)
            return 0;
      assert(wordfile != NULL);
      /* room for the ".new" suffix plus the terminating NUL */
      tempfile = (char*)MALLOC(strlen(wordfile) + 5);
      sprintf(tempfile, "%s.new", wordfile);
      /* gzopen() leaves errno untouched when it runs out of memory, so clear
       * it to keep a stale value from being read as the cause */
      errno = 0;
      if ((old = gzopen(wordfile, "rb")) == NULL && errno != ENOENT)
            errmsg(_("cannot open old wordfile: %s"), errno == 0 ? zError(Z_MEM_ERROR) : strerror(errno));
      if (old != NULL) {
            if (check_wordlist_version(old))
                  gzrewind(old);
            else {
                  /* incompatible old file: its records are not carried over */
                  gzclose(old);
                  old = NULL;
            }
      }
      errno = 0;
      if ((new = gzopen(tempfile, "wb")) == NULL) {
            errmsg(_("cannot open new wordfile: %s"), errno == 0 ? zError(Z_MEM_ERROR) : strerror(errno));
            if (old != NULL)
                  gzclose(old);
            FREE(&tempfile);
            return -1;
      }
      gzprintf(new, HEADER, WORDLIST_VERSION);
      gzprintf(new, _("# Do not change the first line or this file cannot be read anymore!\n"));
      gzprintf(new, _("# Do not rename this file or it cannot be found anymore!\n"));
      gzprintf(new, _("# Do not edit while LaTrine is running, your changes will be overridden!\n"));
      gzprintf(new, _("# Be careful not to destroy the position-dependent mapping with the dictionary!\n"));
      gzprintf(new, _("# Dictionary: %s\n\n"), dictfile);
      qsort(wordlist, wordcount, sizeof(struct word), comp_by_pos);
      /* merge pass: n counts the records seen in the old file, i walks the
       * position-sorted word list */
      for (i = 0, n = 0; old != NULL && gzgets(old, buffer, BUFSIZE) != Z_NULL;) {
            if (*buffer == '\n' || *buffer == '#' || sscanf(buffer, "%06lx:%08lx", &index, &timestamp) < 2)
                  continue;
            if (i < wordcount && wordlist[i].pos == n) {
                  gzprintf(new, "%06lx:%08lx\n", (unsigned long)wordlist[i].index, (unsigned long)wordlist[i].timestamp);
                  i++;
            } else
                  gzprintf(new, "%06lx:%08lx\n", index, timestamp);
            n++;
      }
      if (old != NULL)
            gzclose(old);
      /* positions beyond the end of the old file */
      for (; n < dictcount; n++)
            if (i < wordcount && wordlist[i].pos == n) {
                  gzprintf(new, "%06lx:%08lx\n", (unsigned long)wordlist[i].index, (unsigned long)wordlist[i].timestamp);
                  i++;
            } else
                  /* %lx consumes unsigned long arguments, a plain int 0 does not match */
                  gzprintf(new, "%06lx:%08lx\n", 0UL, 0UL);
      if ((ret = gzclose(new)) != Z_OK) {
            /* the handle is gone after gzclose(), so the message is derived
             * from the return code instead of gzerror() */
            errmsg(_("cannot close wordfile: %s"), ret == Z_ERRNO ? strerror(errno) : zError(ret));
            FREE(&tempfile);
            return -1;
      }
      if (rename(tempfile, wordfile) != 0) {
            errmsg(_("cannot update wordfile: %s"), strerror(errno));
            FREE(&tempfile);
            return -1;
      }
      FREE(&tempfile);
      return 0;
}

Generated by  Doxygen 1.6.0   Back to index