#!/usr/bin/perl -w 
# -*-cperl-*-
## Filename: ucs-sort
## Modified: Sun Jan 25 13:21:24 2004 (evert)   
##   Author: Stefan Evert
##  Purpose: sort UCS data set by one or more variables
# Put STDERR into autoflush mode, so that the progress messages printed in
# verbose mode are written out immediately.
{
  my $previous_handle = select(STDERR);	# $| applies to the currently selected handle
  $| = 1;
  select($previous_handle);		# restore the previously selected output handle
}

use UCS;
use UCS::AM;			# import definition of 'random' AM 
use UCS::DS::Memory;

use Getopt::Long;

# Option flags, filled in by GetOptions() below.
$Opt_Help = 0;					# --help
$Opt_Verbose = 0;				# --verbose
$Opt_Random = 0;				# --randomize

# $ok records whether the command line is well-formed.  It starts out as the
# status returned by GetOptions(), so that an unknown or malformed option
# leads to the usage message below instead of being silently ignored; the
# checks that follow can only lower it to 0, never reset it to true.
my $ok = GetOptions(
		 "help|h" => \$Opt_Help,
		 "verbose|v" => \$Opt_Verbose,
		 "randomize|r" => \$Opt_Random,
		);
$UCS::Verbose = 0 unless $Opt_Verbose;

# parse remaining arguments:  [<input>] BY <key> [<key> ...] [INTO <output>]
# (the keywords BY and INTO are matched case-insensitively)
@keys = ();					# sort keys listed after BY
$in = "-";					# input file; "-" is the default when none is given
$out = "-";					# output file; "-" is the default when there is no INTO clause

# an optional input file name precedes the BY keyword
$in = shift @ARGV
  if @ARGV and uc($ARGV[0]) ne "BY";

if (@ARGV and uc(shift(@ARGV)) eq "BY") {
  # everything up to the INTO keyword (or the end of the command line) is a sort key
  while (@ARGV and uc($ARGV[0]) ne "INTO") {
    push @keys, shift @ARGV;
  }
  # the loop above stops at INTO or when @ARGV is empty, so a remaining
  # argument is always the INTO keyword, which is consumed here
  if (@ARGV and uc(shift(@ARGV)) eq "INTO") {
    if (@ARGV) {
      $out = shift @ARGV;
    }
    else {
      $ok = 0;					# INTO without an output file name
    }
  }
}
else {
  $ok = 0;					# mandatory BY clause is missing
}

# trailing arguments after the output file, or an empty key list, are errors
$ok = 0
  if @ARGV or @keys == 0;

# show the usage message on any command-line error, and when --help is given
die "Usage:  ucs-sort [-v] [-r] [data.ds.gz] BY am.t.score [...] [INTO new.ds.gz]\n"
  . "[type 'ucsdoc ucs-sort' for more information]\n"
  unless $ok and (not $Opt_Help);

# Validate the syntax of all sort keys up front, so that a mistyped key is
# reported before the data set is loaded.
foreach my $sort_key (@keys) {
  # Remove a single order marker (+ or -): a leading marker is tried first,
  # and a trailing one is removed only if there was no leading marker.
  my $var_name = $sort_key;
  unless ($var_name =~ s/^[+-]//) {
    $var_name =~ s/[+-]$//;
  }
  # what remains has to be accepted by UCS::ValidName()
  if (not UCS::ValidName($var_name)) {
    UCS::Die("Error:  invalid sort key $sort_key");
  }
}

# Load the data set into memory.  $in is the input file name taken from the
# command line, or "-" if none was given.
print STDERR "Loading data set $in ... "
  if $Opt_Verbose;
# Direct method call instead of the indirect object form "new CLASS ARGS",
# which Perl may parse differently if a sub named "new" is visible here.
$ds = UCS::DS::Memory->new($in);
my $size = $ds->size;				# reported as the number of rows in verbose mode
print STDERR "$size rows\n"
  if $Opt_Verbose;

# Sort the data set by the requested keys.
if ($Opt_Random) {
  # --randomize: the am.random variable serves as the final tie-breaker
  unless ($ds->var("am.random")) {
    # not present in the data set: add am.random and flag it as temporary
    $ds->add("am.random");
    $ds->temporary("am.random", 1);
  }
  push @keys, "am.random";
}

if ($Opt_Verbose) {
  print STDERR "Sorting by @keys ... ";
}

# build an index named "sorted" from the keys, then activate that index
my $index_name = "sorted";
$ds->sort($index_name, @keys);
$ds->activate_index($index_name);

if ($Opt_Verbose) {
  print STDERR "done\n";
}

# Write the data set to $out (the INTO file name, or "-" without an INTO clause).
if ($Opt_Verbose) {
  print STDERR "Writing data set to $out ... ";
}
$ds->save($out);
if ($Opt_Verbose) {
  print STDERR "done\n";
}



__END__

=head1 NAME

ucs-sort - Sort UCS data set by one or more variables


=head1 SYNOPSIS

  ucs-sort [-v] [-r] [data.ds.gz] BY am.t.score [INTO new.ds.gz]

  ucs-sort [-v] [-r] [data.ds.gz] BY l2+ l1- ... [INTO new.ds.gz]


=head1 DESCRIPTION

This program sorts the rows of a UCS data set by one or more variables.
The general form of the B<ucs-sort> command is

  ucs-sort [--verbose | -v] [--randomize | -r]
           [<input.ds>] BY <variables> [INTO <output.ds>]

where C<< <variables> >> is a whitespace-separated list of variable
names.  A C<+> or C<-> character appended to a variable name selects
ascending or descending order, respectively.  The default order 
depends on the variable type (association scores are sorted in descending
order). 

The data set is read from STDIN by default, or from the file C<< <input.ds> >>
when it is specified.  The sorted data set is printed on STDOUT, and 
can be saved into the file C<< <output.ds> >> with the optional C<INTO>
clause.

When C<--randomize> (or C<-r>) is specified, ties are broken randomly,
using the C<am.random> measure.  If C<am.random> is not annotated in the
data set, it is added as a temporary variable for sorting.  The
C<--verbose> (or C<-v>) option displays some (minimal) progress information.


=head1 EXAMPLES

The B<ucs-sort> utility is often used in command-line pipes to sort data sets
before viewing.  Assuming that a data set file F<candidates.ds.gz> is
annotated with the necessary association scores, ranked candidate lists for
the log-likelihood and t-score measures can be displayed with the following
commands:

  ucs-sort -r candidates.ds.gz BY am.log.likelihood | ucs-print -i
  ucs-sort -r candidates.ds.gz BY am.t.score | ucs-print -i

B<ucs-sort> can also be applied to the output of another UCS tool,
e.g. B<ucs-select>.  The following command selects the 100 highest-ranked pair
types from the data set file F<candidates.ds.gz>, according to the
log-likelihood measure, and displays them in alphabetical order, sorted by
C<l2> first. (Note that the command must be entered as a single line in the
shell.)

  ucs-add -v r.log.likelihood TO candidates.ds.gz
    | ucs-select -v '%' WHERE '%r.log.likelihood% <= 100'
    | ucs-sort BY l2 l1 | ucs-print -i


=head1 COPYRIGHT

Copyright 2004 Stefan Evert.

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
