#!/usr/bin/perl -w 
# -*-cperl-*-
## Filename: ucs-add
## Modified: Sun Aug 22 16:13:41 2004 (evert)   
##   Author: Stefan Evert
##  Purpose: add variables (association scores) to UCS data set
$| = 1;
STDERR->autoflush(1);

use UCS;
use UCS::AM;
use UCS::Expression;
use UCS::DS::Stream;
use UCS::DS::Memory;

use Getopt::Long;

$Opt_Help = 0;					# --help
$Opt_Memory = 0;				# --memory
$Opt_Extra = "";				# --extra
$Opt_Random = 0;				# --randomize
$Opt_Verbose = 0;				# --verbose

# parse optional switches
$ok = GetOptions(
		 "verbose|v" => \$Opt_Verbose,
		 "memory|m" => \$Opt_Memory,
		 "extra|x=s" => \$Opt_Extra,
		 "randomize|r" => \$Opt_Random,
		 "help|h" => \$Opt_Help,
		 );
$UCS::Verbose = 0 unless $Opt_Verbose;

# parse command line
@varspec = ();					# variable specifications
$in = undef;					# input file
$out = undef;					# output file
while (@ARGV) {
  $arg = shift @ARGV;
  if (uc($arg) eq "TO") {
    $ok = 0 unless @ARGV and not defined $in;	# avoid repeated spec. of input file
    $in = shift @ARGV;
  }
  elsif (uc($arg) eq "INTO") {
    $ok = 0 unless @ARGV and not defined $out;	# avoid repeated spec. of output file
    $out = shift @ARGV;
  }
  else {
    $ok = 0 if defined $in or defined $out;	# no variable specs. allowed after INTO or TO
    push @varspec, $arg;
  }
}
$in = "-" unless defined $in;			# input and output streams default to stdin/stdout
$out = "-" unless defined $out;
$ok = 0 unless @varspec;

if ($Opt_Extra) {
  @loaded = UCS::Load_AM_Package(split /,/, $Opt_Extra);
  die "Error: no match for requested AM package(s) '$Opt_Extra'.\n"
    unless @loaded;
  if ($Opt_Verbose) {
    foreach $mod (@loaded) {
      print STDERR "[loading package $mod]\n";
    }
  }
}

die "Usage:  ucs-add [-v] [-x <package>] [-m] [-r] <variables> TO data.ds.gz INTO new.ds.gz\n"
  . "[type 'ucsdoc ucs-add' for more information]\n"
  unless $ok and (not $Opt_Help);

# open stream for input data set (to check for existing variables)
$IN = new UCS::DS::Stream::Read $in;

# expand UCS wildcard patterns
@variables = ();				# list of variables to add
%EXP = ();					# UCS expressions used to compute these variables
%AM = map { ("am.$_" => $_) } UCS::AM_Keys();
$do_rankings = 0;				# whether any rankings are requested (for --randomize)
foreach $spec (@varspec) {
  if ($spec =~ /^\s*([a-zA-Z0-9.]+)\s*:=\s*(.+)$/) { # user-defined expression
    my ($var, $exp) = ($1, $2);
    die "Error:  illegal variable name '$var'.\n"
      unless UCS::ValidName($var);
    my ($prefix) = UCS::SplitName($var);
    die "Error:  can't modify standard variable '$var'.\n"
      unless defined $prefix;
    my $type = UCS::VarType($var);
    $exp = "my \$result = eval { $exp };\n";
    if ($type eq "BOOL") {
      $exp .= '($result) ? 1 : 0';
    }
    elsif ($type eq "INT") {
      $exp .= 'int($result)';
    }
    elsif ($type eq "DOUBLE") {
      $exp .= '$result + 0.0';
    }
    else { # "STRING"
      $exp .= '$result . ""';
    }
    my $code = new UCS::Expression $exp;
    die "Error:  syntax error in user-defined expression.\n"
      unless defined $code;
    $EXP{$var} = $code;
    @matches = $var;
  }
  elsif ($spec =~ /^r\.(.+)$/) {		# pattern for ranking variable: match against defined association scores
    $key = $1;
    @am_matches = UCS::Match("am.$key", $IN->vars, @variables);
    die "Error:  can't compute ranking r.$key -- no match for variable name/pattern am.$key\n"
      unless @am_matches > 0;
    @matches = ();
    foreach $am (@am_matches) {
      $r = $am;
      $r =~ s/^am/r/;
      $EXP{$r} = [$r => $am];			# special entry in %EXP hash for rankings
      push @matches, $r;
    }
    print STDERR "[activating memory mode to add rankings]\n"
      unless $Opt_Memory;
    $Opt_Memory = 1;				# implicitly activates memory mode
    $do_rankings = 1;
  }
  else {
    @matches = UCS::Match($spec, sort keys %AM); # pattern: match against built-in AMs first
    if (@matches) {
      foreach $var (@matches) {			# retrieve UCS expressions to compute association scores
	$EXP{$var} = UCS::AM_Expression($AM{$var});
      }
    }
    else {
      @matches = UCS::Match($spec, @UCS::DerivedVars); # try derived variables next
      if (@matches) {
	foreach $var (@matches) {			# compile UCS expressions to compute derived variables
	  $EXP{$var} = new UCS::Expression "\%$var\%";
	}
      }
    }
  }
  die "Error:  no match found for variable name/pattern '$spec'.\n"
    unless @matches;
  push @variables, @matches;
}
@variables = UCS::Unique(@variables);		# remove duplicates from list of variables
@redo = grep {$IN->var($_)} @variables;		# existing variables will be overwritten with recomputed values
print STDERR "[existing variables ".join(", ", @redo)." will be overwritten]\n"
  if @redo and $Opt_Verbose;

print STDERR "[--randomize option will be ignored because no rankings were requested]\n"
  if $Opt_Random and not $do_rankings;

# check that input and output files are different (unless running in memory mode, or $in=$out="-")
die "Error:  output file ($out) must not be identical to input file.\n"
  if $in eq $out and $in ne "-" and not $Opt_Memory;

if ($Opt_Memory) {				# MEMORY MODE
  print STDERR "Loading data set $in ... "
    if $Opt_Verbose;
  $DS = new UCS::DS::Memory $IN;		# load entire data set into memory
  $IN->close;
  if ($Opt_Verbose) {
    print STDERR $DS->size, " rows\n";
    print STDERR "Data set variables: ", join(" ", $DS->vars), "\n";
  }
  if ($do_rankings and $Opt_Random and not $DS->var("am.random")) {
    print STDERR "[temporarily adding am.random for rankings with --random option]\n"
      if $Opt_Verbose;
    $DS->add("am.random");
    $DS->temporary("am.random", 1);
  }
  print STDERR "Adding variables:  "
    if $Opt_Verbose;
  foreach $var (@variables) {
    print STDERR " $var"
      if $Opt_Verbose;
    $exp = $EXP{$var};
    if (ref($exp) eq "ARRAY") {			# add ranking
      ($r, $am) = @$exp;
      if ($Opt_Random and not $am eq "am.random") { # --randomize (but not for am.random)
	$DS->rank($r, $am, "am.random");
	$DS->set_global($r, "ranking is randomized");
      }
      else {					# without randomization
	$DS->rank($r, $am);
      }
    }
    else {					# add variable from UCS expression
      @missing = $DS->missing($exp);
      UCS::Die "Can't add $var because of missing variable(s) @missing required by UCS expression."
	  if @missing > 0;
      $DS->eval($var, $exp);
    }
  }
  if ($Opt_Verbose) {
    print STDERR "\n";
    print STDERR "Writing data set to $out ... ";
  }
  $DS->save($out);
  print STDERR "done\n"
    if $Opt_Verbose;
}
else {						# STREAM MODE
  # open data set streams and configure output data set
  $OUT = new UCS::DS::Stream::Write $out;
  $OUT->copy_comments($IN);
  $OUT->copy_globals($IN);
  if ($Opt_Verbose) {				# print some information in verbose mode
    print STDERR "Variables: ", join(", ", $IN->vars), "\n";
    print STDERR "Adding:    ", join(", ", @variables), "\n";
  }
  $OUT->add_vars($IN->vars);			# copy variables from input
  $OUT->add_vars(@variables);			# append new variables (existing ones are ignored)
  $OUT->open;
  
  # copy variable values for each row by name, inserting computed association scores
  $size = $IN->global("size");
  $size = "??????" unless defined $size;
  while ($IN->read) {
    printf STDERR "Processing row %6d / %s   \r", $IN->row, $size
      if $Opt_Verbose and ($IN->row & 0xff) == 0;
    $data = $IN->data;				# ref to hash of variable values
    foreach $v (@variables) {
      $data->{$v} = $EXP{$v}->eval($data);
    }
    $OUT->data($data);
    $OUT->write;
  }
  $size = $IN->row;
  $IN->close;
  $OUT->close;
  print STDERR "Processing complete ($size rows).               \n"
    if $Opt_Verbose;
}


__END__

=head1 NAME

ucs-add - Add variables (association scores) to UCS data set


=head1 SYNOPSIS

  ucs-add [-v] [-m] am.t.score am.Fisher.pv TO data.ds.gz INTO new.ds.gz 

  ucs-add [-v] [-m] -x HTest am.%.pv TO data.ds.gz INTO new.ds.gz

  ucs-add [-r] r.% TO data.ds.gz INTO new.ds.gz

=head1 DESCRIPTION

This program is used to add variables (B<association scores>, B<rankings>,
B<derived variables>, or arbitrary B<UCS expressions> entered on the command
line) to a UCS data set.  If a variable is already defined in the data set,
its values will be overwritten.

The general form of the B<ucs-add> command is

  ucs-add [--verbose | -v] [--memory | -m] [--extra=<list> | -x <list>] 
          <variables> [ TO <input.ds> ] [ INTO <output.ds> ]

where C<< <variables> >> is a whitespace-separated list of variable
specifications (see the section on L<Variable Specifications|"VARIABLE
SPECIFICATIONS"> below for details).  An additional C<--randomize> option is
only useful when adding rankings:

  ucs-add [--verbose | -v] [--extra=<list> | -x <list>] [--randomize | -r] 
          <variables> [ TO <input.ds> ] [ INTO <output.ds> ]

The data are read from the file C<< <input.ds> >>, and the resulting data set
with the new annotations is written to the file C<< <output.ds> >>.  When they
are not specified, the input and output files default to STDIN and STDOUT,
respectively.

Variable specifications and file names may need to be quoted
individually (when they contain shell metacharacters or whitespace).

Normally, the B<ucs-add> program processes the data set one row at a time,
so that C<< <input.ds> >> and C<< <output.ds> >> must not refer to the
same file.  When C<--memory> (or C<-m>) is specified, the entire data
set is read into memory, annotated, and then written back to the
output file.  In this case, C<< <input.ds> >> and C<< <output.ds> >>
may be identical.  This mode is automatically activated when any
rankings are added to the data set.

In both modes of operation, variables are added in the order in which they are
given on the command-line, so variable specifications (rankings and
user-defined expressions) may refer to any of the previously introduced
variables.

With the C<--verbose> (or C<-v>) option, some debugging and progress
information is displayed while the program is running.  The C<--extra> (or
C<-x>) option loads additional built-in association measures (see the section
on adding L<Associations Scores|"Association Scores"> below for details).


=head1 VARIABLE SPECIFICATIONS

=head2 Association Scores

Variables representing association scores are selected by specifying their
variable names (which start with the prefix C<am.>).  The names may be given
as UCS wildcard patterns (see L<the ucsexp manpage|ucsexp>), which will be
matched against the list of all supported association measures.  Examples of
useful wildcard patterns are C<am.%> (all measures), C<am.%.pv> (all measures
that compute probability values), and C<am.chi.squared.%> (all variants of
Pearson's chi-squared test).

By default, only the basic association measures defined in L<UCS::AM|UCS::AM>
are supported.  Other AM packages (see L<the UCS::AM manpage|UCS::AM> for a
list of add-on packages) can be loaded with the C<--extra> (or C<-x>) option.
The argument is a comma-separated list of package names
(e.g. C<--extra=HTest,Parametric> to load B<UCS::AM::HTest> and
B<UCS::AM::Parametric>), which are case-insensitive and may be abbreviated
to unique prefixes (so C<-x htest,par> works just as well).  Use C<-x ALL> 
to load all available AM packages.

=head2 Rankings

Variables representing association score rankings are selected by
specifying their variable names (which start with the prefix C<r.>).
In order to compute a ranking, say C<r.something>, the corresponding
association scores (C<am.something>) must be annotated in the data
set.  UCS wildcard patterns are matched against all association scores
in the data set (but not against other built-in association measures).
Rankings can also be computed for user-defined measures, provided that
their association scores are annotated.  In order to compute a ranking
for a built-in association measure that is not available in the data
set, both the association score and the ranking variable must be
specified.  The example

  ucs-add -m am.% r.% TO data.ds.gz INTO data.ds.gz

adds associations scores and rankings for the basic built-in association
measures to the data set C<data.ds.gz>.

Ties are not resolved in the rankings, so pair types with identical
association scores share the same rank.  The rank assigned to such a
group of pair types is the lowest free rank (as in the Olympic Games)
rather than the average of all ranks in the group (as is often done
in statistics).  With the C<--random> (or C<-r>) option, ties are 
resolved in a random fashion.  When association scores for the C<random> 
measure are pre-annotated (i.e. the C<am.random> variable is present
in the data set), these are used for the randomization so that the
ranking is reproducible.

=head2 Derived Variables

Any variable names or wildcard patterns that do not match one of the built-in
association measures are matched against the list of derived variables, which
can be computed automatically from the frequency signatures of pair types.
See L<the ucsfile manpage|ucsfile> for a complete list of derived variables.
Examples of useful patterns are C<E*> (expected frequencies), C<lp*>
(logarithmic coordinates), and C<e b m> (I<(e,b,m)>-coordinates).

=head2 User-Defined Expressions

A user-defined variable specification is a UCS expression (see L<the ucsexp
manpage|ucsexp>) of the form

  <var> := <expression>

where C<< <var> >> is the name of a user-defined variable, association score,
or ranking (without surrounding C<%> characters).  This variable is added to
the input data set if necessary and set to the values computed by the UCS
expression C<< <expression> >>.  The example below computes association scores
for a compound measure C<mixed> from the rankings according to two other
measures (which must both be annotated in the data set).

  am.mixed := -max(%r.t.score%, %r.dice%)

Note that it isn't possible to compute the corresponding ranking
C<r.mixed> directly. 


=head1 COPYRIGHT

Copyright 2004 Stefan Evert.

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
