#!/usr/bin/perl -w 
# -*-cperl-*-
## Filename: ucs-summarize
## Modified: Sun Jan 25 13:18:08 2004 (evert)   
##   Author: Stefan Evert
##  Purpose: compute statistical summaries for variables in UCS data set
$| = 1;
STDERR->autoflush(1);

use UCS;
use UCS::DS::Stream;
use UCS::DS::Memory;

use Getopt::Long;

$Opt_Help = 0;					# --help
$Opt_Verbose = 0;				# --verbose
$Opt_Memory = 0;				# --memory

# parse optional switches
$ok = GetOptions(
		 "verbose|v" => \$Opt_Verbose,
		 "memory|m"  => \$Opt_Memory,
		 "help|h" => \$Opt_Help,
		 );
$UCS::Verbose = 0 unless $Opt_Verbose;

# parse command line
@varspec = ();					# variable specifications
$file = undef;					# input file
if (@ARGV == 1) {
  $file = shift @ARGV;				# short form: summaries for all variables
}
else {						# long form: variables specified explicitly
  while (@ARGV) {
    $arg = shift @ARGV;
    if (uc($arg) eq "FROM") {
      $ok = 0 unless @ARGV and not defined $file;	# avoid repeated spec. of input file
      $file = shift @ARGV;
    }
    else {
      $ok = 0 if defined $file;			# no variable specs. allowed after FROM
      push @varspec, $arg;
    }
  }
  $ok = 0 unless defined $file;	
}

die "Usage:   ucs-summarize [-v] [-m] <variables> FROM data.ds.gz\n[type 'ucsdoc ucs-summarize' for more information]\n"
  unless $ok and (not $Opt_Help);

# open input stream (done in both modes to check variables quickly)
$ds = new UCS::DS::Stream::Read $file;

# expand UCS wildcard patterns
@variables = ();
if (@varspec) {
  foreach $spec (@varspec) {
    @matches = UCS::Match($spec, $ds->vars);
    die "Error:  no match found for variable name/pattern '$spec'.\n"
      unless @matches;
    push @variables, @matches;
  }
}
else {
  @variables = grep { $ds->var($_) ne "STRING" } $ds->vars;
}

# ignore STRING-type variables in list
@variables = grep {
  my $ok = ($ds->var($_) ne "STRING");
  print STDERR "Can't computer summary for STRING variable $_ (ignored)\n"
    unless $ok;
  $ok;
  } @variables;

if ($Opt_Memory) {				# MEMORY MODE
  print STDERR "Loading data set $file ... "
    if $Opt_Verbose;
  $ds = new UCS::DS::Memory $ds;		# load input stream into in-memory representation
  $size = $ds->size;
  print STDERR "$size rows\n"
    if $Opt_Verbose;
  foreach $v (@variables) {
    print "Summary of $v:\n";
    $r = $ds->summary($v);
    print "    min.: ", na($r->{MIN}), "\n";
    print "    max.: ", na($r->{MAX}), "\n";
    print "abs.min.: ", na($r->{ABSMIN}), "\n";
    print "abs.max.: ", na($r->{ABSMAX}), "\n";
    print "    mean: ", na($r->{MEAN}), "\n";
    print "  median: ", na($r->{MEDIAN}), "\n";
    print "    var.: ", na($r->{VAR}), "\n";
    print "    s.d.: ", na($r->{SD}), "\n";
    print "   gran.: ", na($r->{STEP}), "\n";
    print "    NA's: ", $r->{NA}, "\n"
      if $r->{NA} > 0;
    print "\n";
  }
}
else {						# STREAM MODE
  # initialise hashes for data collection
  @zeroes = map {$_ => 0} @variables;
  %min = ();					# minimum value
  %max = ();					# maximum value
  %sum = @zeroes;				# sum of values -> mean
  %ss = @zeroes;				# sum of squares -> variance, s.d.
  %n = @zeroes;					# count number of pair types where attribute is defined
  %na = @zeroes;				# number of undef's (NA) 

  $size = $ds->global("size");
  $size = "??????" unless defined $size;
  while ($ds->read) {
    printf STDERR "Processing row %6d / %s   \r", $ds->row, $size
      if $Opt_Verbose and ($ds->row & 0xff) == 0;
    $data = $ds->data;				# ref to hash of variable values
    foreach $v (@variables) {
      $x = $data->{$v};
      if (defined $x) {
	$y = $min{$v};
	$min{$v} = $x unless defined $y and $y <= $x;
	$y = $max{$v};
	$max{$v} = $x unless defined $y and $y >= $x;
	$sum{$v} += $x;
	$ss{$v} += $x * $x;
	$n{$v}++;
      } else {
	$na{$v}++;
      }
    }
  }
  printf STDERR "Processing complete (%d rows).               \n\n", $ds->row
    if $Opt_Verbose;
  $ds->close;

  # compute descriptive statistics (mean, variance, ...)
  %mean = ();					# mean
  %var = ();					# empirical variance
  %sd = ();					# empirical standard deviation
  foreach $v (@variables) {
    $n = $n{$v};
    if ($n > 0) {
      $mean{$v} = $sum{$v} / $n;
      $var{$v} = ($ss{$v} - $n * $mean{$v} * $mean{$v}) / ($n - 1);
      $var{$v} = 0				# cancellation error for large values with small variance
	if $var{$v} < 0;
      $sd{$v} = sqrt($var{$v});
    } else {
      $mean{$v} = $var{$v} = $sd{$v} = "NaN";
    }
    $min{$v} = "NaN" unless defined $min{$v};
    $max{$v} = "NaN" unless defined $max{$v};
  }

  # print summary for each variable
  foreach $v (@variables) {
    print "Summary of $v:\n";
    print "    min.: ", $min{$v}, "\n";
    print "    max.: ", $max{$v}, "\n";
    print "    mean: ", $mean{$v}, "\n";
    print "    var.: ", $var{$v}, "\n";
    print "    s.d.: ", $sd{$v}, "\n";
    print "NA's:   ", $na{$v}, "\n"
      if $na{$v} > 0;
    print "\n";
  }
}

# that's it

# print undefined values as "NA", everything else normally
sub na {
  if (@_ == 1) {
    my $x = shift;
    return (defined $x) ? $x : "NA";
  }
  else {
    return map {(defined $_) ? $_ : "NA"} @_;
  }
}


__END__

=head1 NAME

ucs-summarize - Compute statistical summaries for variables in UCS data set


=head1 SYNOPSIS

  ucs-summarize [-v] [-m] f f1 f2 FROM data.ds.gz

  ucs-summarize [-v] [-m] am.%.pv FROM data.ds.gz

  ucs-summarize [-v] [-m] data.ds.gz


=head1 DESCRIPTION

This program computes short statistical summaries of numerical variables in a
UCS data set.  The general form of the B<ucs-summarize> command is

  ucs-summarize [-v] [-m] <variables> FROM <input.ds>

where C<< <variables> >> is a whitespace-separated list of variable names or
wildcard expression, and the data set is read from the file specified as C<<
<input.ds> >>.  Wildcard expressions may need to be quoted to avoid
interpretation by the shell.  When the list of variables is omitted (including
the keyword C<FROM>), summaries are generated for all variables in the data
set.  In verbose mode (C<--verbose> or C<-v> option), some progress
information is shown while computing the summary.

So far, the statistical summary includes the B<minimum> (C<min.>),
B<maximum> (C<max.>), B<mean> (C<mean>), B<empirical variance>
(C<var.>), and the B<empirical standard deviation> (C<s.d.>).  In 
addition, the number of missing values (C<NA's>) is reported.

When C<--memory> (or C<-m>) is specified, the data set will be read
into memory first.  In addition to the ordinary statistical summary,
the B<absolute minimum> (C<abs.min.>, the smallest non-zero absolute
value), B<absolute maximum> (C<abs.max.>), and B<granularity>
(C<gran.>, smallest difference between any two unequal values) are
computed in this mode.


=head1 COPYRIGHT

Copyright 2004 Stefan Evert.

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
