/* $Id: guppi-data-classify.c,v 1.1 2000/01/17 06:00:23 trow Exp $ */

/*
 * guppi-data-classify.c
 *
 * Copyright (C) 1999, 2000 EMC Capital Management, Inc.
 *
 * Developed by Jon Trowbridge <trow@gnu.org>
 * and Havoc Pennington <hp@pobox.com>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#include <math.h>
#include <stdlib.h>
#include "guppi-boolean-data.h"
#include "guppi-scalar-data.h"
#include "guppi-categorical-data.h"
#include "guppi-string-data.h"
#include "guppi-data-classify.h"

/*
 * Report whether any element of data has a fractional part.
 *
 * Every index from guppi_data_min_index(data) through
 * guppi_data_max_index(data), inclusive, is fetched as text into a local
 * buffer and parsed as a double.
 *
 * Returns TRUE as soon as one parsed value lies more than 1e-8 away from
 * the nearest integer; returns FALSE if no such element is found.
 */
static gboolean
contains_non_integer(const GuppiData* data)
{
  gchar buffer[1024];
  const gint buffer_len = 1024;
  dindex_t i, i0, i1;
  double x, dist;
  const double epsilon = 1e-8;

  i0 = guppi_data_min_index(data);
  i1 = guppi_data_max_index(data);

  for(i=i0; i<=i1; ++i) {
    guppi_data_get(data, i, buffer, buffer_len);

    /* If buffer[] contains some non-numeric garbage, strtod() will just
       return zero, and we'll see it as an integer.  This is why parsing
       the number is preferable to just searching buffer[] for '.'.

       strtod() is used rather than atof() because atof() has undefined
       behavior when the value cannot be represented as a double;
       strtod() returns +/-HUGE_VAL (or zero on underflow) instead.  An
       infinite result makes dist below a NaN, which fails the comparison
       against epsilon and is therefore treated as "integer". */
    x = strtod(buffer, NULL);

    /* Distance to the nearest integer.  We prefer this hack to using
       rint() since rint() isn't POSIX (but floor() and ceil() are). */
    dist = MIN(fabs(x - floor(x)), fabs(x - ceil(x)));
    if (dist > epsilon)
      return TRUE;
  }
  return FALSE;
}

GtkType
guppi_data_classify(const GuppiData* data)
{
  double scalarness;

  gchar buffer[1024];
  const gint buffer_len = 1024;
  gchar* p;
  guint16 hash;
  guint* hashtable;
  dindex_t i, i0, i1;
  const double threshold = 0.15;
  gint distinct_count, max_distinct;
  
  g_return_val_if_fail(data != NULL, (GtkType)0);

  if (guppi_data_size(data) == 0)
    return (GtkType)0;

  /* First, the easy case --- boolean */
  if (guppi_data_conversion_potential(data, GUPPI_TYPE_BOOLEAN_DATA) > 0.95)
    return GUPPI_TYPE_BOOLEAN_DATA;

  scalarness = guppi_data_conversion_potential(data, GUPPI_TYPE_SCALAR_DATA);

  /* If we look a lot like scalar data, we can't immediately conclude
     that we should be of type scalar: categorical data could also
     look scalar, taking the form of a set of small integers.  But if
     there is a non-integer in the batch, we know for sure that
     scalar is a good guess.
  */
  if (scalarness > 0.95 && contains_non_integer(data))
    return GUPPI_TYPE_SCALAR_DATA;

  /* OK, now the fun part.  We guess that data is categorical if the
     number of distinct elements in the data is low relative to the total
     amount of data --- that is to say, if the data has lots and lots of
     repetition in it.

     One way of checking this would be to take strings of each element,
     sort them, and step through counting the number of distinct values.

     We aren't going to do that, though -- I have something more
     entertaining in mind.  For each element, we'll compute a hash value from
     its string.  We'll then keep track of which hash values we've seen
     before with a big-ass bit vector.

     One advantage of this method is that we can terminate early without
     having to test every data element.  Once we've seen more than our
     threshold number of distinct elements, we can give up.

     So this approach is O(N) vs. O(N log N) for a method that requires
     sorting.  This method should be a big winner when N is large and when
     (# of distinct elements) / N close to 1... in this case, we shouldn't
     have to check that many more than (thresh)N elements, when thresh
     is our threshold value.
  */

  i0 = guppi_data_min_index(data);
  i1 = guppi_data_max_index(data);
  hashtable = g_new0(guint32, 2048); /* 2048 * 32 = 65536 = 2^16 */
  distinct_count = 0;
  max_distinct = (gint)(threshold * guppi_data_size(data));

  for (i=i0; i<=i1 && distinct_count <= max_distinct; ++i) {

    guppi_data_get(data, i, buffer, buffer_len);

    /* calculate our hash value */
    hash = 0xbeef;
    p = buffer;
    while (*p) {
      hash = 17*hash + *p;
      ++p;
    }

    if ((hashtable[hash >> 5] & (1<<(hash & 31))) == 0) {
      ++distinct_count;
      hashtable[hash >> 5] |= 1 << (hash & 31);
    }
  
  }

  g_free(hashtable);
  
  if (distinct_count <= max_distinct)
    return GUPPI_TYPE_CATEGORICAL_DATA;


  /* At this point we are safe in concluding that if it looks like scalar
     data, it probably is scalar data */
  if (scalarness > 0.95)
    return GUPPI_TYPE_SCALAR_DATA;
  

  /* When in doubt, and when all else fails, just call it string data
     and be done with it... */
  return GUPPI_TYPE_STRING_DATA;
}


/* $Id: guppi-data-classify.c,v 1.1 2000/01/17 06:00:23 trow Exp $ */
