#!/usr/bin/perl -w 
# -*-cperl-*-
## Filename: ucs-select
## Modified: Mon Feb 20 14:01:36 2006 (severt)   
##   Author: Stefan Evert
##  Purpose: select rows and/or columns from UCS data set
$| = 1;

use UCS;
use UCS::AM;
use UCS::DS::Stream;

use Getopt::Long;

# Command-line switches.  They are package globals because the rest of the
# script consults them directly.
($Opt_Help, $Opt_Verbose, $Opt_Count) = (0, 0, 0);	# --help, --verbose, --count

# Let Getopt::Long remove the optional switches from @ARGV.  $ok holds the
# return value of GetOptions, i.e. it is false if the switches were invalid.
$ok = GetOptions(
  "help|h"    => \$Opt_Help,
  "count|c"   => \$Opt_Count,
  "verbose|v" => \$Opt_Verbose,
);

# Without --verbose, the $UCS::Verbose flag is cleared as well.
$UCS::Verbose = 0
  if not $Opt_Verbose;

# Split the remaining command line into its SQL-like parts:
#   <varspec> ... [ FROM <file> ] [ INTO <file> ] [ WHERE <condition> ]
# Clause keywords are matched case-insensitively and may come in any order.
@varspec = ();					# variable specifications
($in, $out, $where) = (undef, undef, undef);	# input file, output file, WHERE clause

# everything in front of the first clause keyword is a variable specification
while (@ARGV) {
  last if $ARGV[0] =~ /^(FROM|INTO|WHERE)$/i;
  push @varspec, shift @ARGV;
}

# each clause keyword fills exactly one of the scalars above
my %clause_slot = (
  FROM  => \$in,
  INTO  => \$out,
  WHERE => \$where,
);
while (@ARGV) {
  my $slot = $clause_slot{ uc(shift @ARGV) };
  unless ($slot) {				# anything else is a usage error
    $ok = 0;
    last;
  }
  # a clause needs an argument and must not be specified more than once
  $ok = 0
    if not @ARGV or defined $$slot;
  $$slot = shift @ARGV;
}

$in = "-"
  if not defined $in;				# input stream defaults to stdin

if (not $Opt_Count) {
  $out = "-"
    if not defined $out;			# output stream defaults to stdout
  ### no variable specs means "select all columns", as with gawk
  @varspec = ('%')
    if not @varspec;
}
else {
  # --count accepts neither a column selection nor an output file
  if (@varspec) {
    die "Error: no variable specifications allowed with --count switch.\n";
  }
  if (defined $out) {
    die "Error: no output file allowed with --count switch.\n";
  }
}

# Show the usage summary and abort if the command line was malformed
# or if --help was requested.
if ($Opt_Help or not $ok) {
  die join("",
    "Usage:  ucs-select <variables> FROM data.ds.gz [ WHERE <condition> ] [ INTO new.ds.gz ]\n",
    "        ucs-select --count FROM data.ds.gz WHERE <condition>\n",
    "[type 'ucsdoc ucs-select' for more information]\n",
  );
}

# check that input and output files are different
# (the name "-" is exempt; it stands for stdin on input and stdout on output)
if (defined $out and $in ne "-" and $in eq $out) {
  die "Error:  output file ($out) must not be identical to input file.\n";
}

# Open the input stream, then turn the variable specifications into the
# concrete list of variables to be written to the output.
$IN = UCS::DS::Stream::Read->new($in);
@variables = ();
if (not $Opt_Count) {
  # every specification is matched against the variables of the input
  # stream; whatever UCS::Match returns is appended to the selection
  for my $spec (@varspec) {
    my @hits = UCS::Match($spec, $IN->vars);
    if (@hits) {
      push @variables, @hits;
    }
    else {
      print STDERR "Warning:  no matches for variable spec. '$spec' (ignored)\n";
    }
  }
  @variables
    or die "Error: no variables selected for output.\n";
  if ($Opt_Verbose) {
    print STDERR "Selected variables: ", join(", ", @variables), "\n";
  }
}

# Compile the WHERE clause into a UCS expression object ($WHERE)
if (not defined $where) {
  $WHERE = UCS::Expression->new("1");		# in case we make the mistake of evaluating it
}
else {
  $WHERE = UCS::Expression->new($where);
  defined $WHERE
    or die "Syntax error in WHERE condition '$where'.\n";
  # every variable needed by the condition must be present in the input stream
  my @absent = grep { not $IN->var($_) } $WHERE->needed;
  die "Can't evaluate WHERE condition because of missing variables\n" .
    "(@absent) in $in (aborted).\n"
      if @absent;
}

# Create and configure the output stream (not needed with --count)
if (not $Opt_Count) {
  $OUT = UCS::DS::Stream::Write->new($out);
  $OUT->copy_comments($IN);
  # might append comment giving ucs-select command
  $OUT->copy_globals($IN);
  if (defined $where) {
    $OUT->delete_global("size");		# data set size is going to change if there's a WHERE clause
  }
  $OUT->add_vars(@variables);
  # delete header information (global variables) about variables
  # that are not selected for output
  for my $global ($OUT->globals) {
    next unless $IN->var($global);		# not named after an input variable: keep
    next if $OUT->var($global);			# variable is selected for output: keep
    $OUT->delete_global($global);
  }
  $OUT->open;
}

# Main loop: read the input row by row, count the rows that satisfy the
# WHERE condition and (unless --count was given) copy them to the output.
$size = $IN->global("size");
defined $size
  or $size = "??????";				# placeholder shown in the progress display
$count = 0;
while ($IN->read) {
  # progress display is refreshed every 256 rows only
  if ($Opt_Verbose and ($IN->row & 0xff) == 0) {
    printf STDERR "Processing row %6d / %s   \r", $IN->row, $size;
  }
  my $row = $IN->data;				# ref to hash of variable values
  # without a WHERE clause every row is selected
  next if defined $where and not $WHERE->eval($row);
  $count++;
  next if $Opt_Count;
  $OUT->data($row);				# will simply ignore the non-selected variables
  $OUT->write;
}

# from here on $size is the row counter reported by the input stream
$size = $IN->row;
$IN->close;
if (not $Opt_Count) {
  $OUT->close;
}
if ($Opt_Verbose) {
  print STDERR "Processing complete ($size rows).               \n";
}

# with --count, print number of selected lines
if ($Opt_Count) {
  print count_report($count, $size);
}

# Format the two-line summary printed by --count.
#   $selected - number of rows that satisfied the WHERE condition
#   $total    - number of rows read from the input data set
# Returns the report as a single string (both lines newline-terminated).
# For an empty data set ($total == 0) the percentage is reported as 0.00%;
# dividing unconditionally would abort the script with
# "Illegal division by zero".
sub count_report {
  my ($selected, $total) = @_;
  my $percent = $total ? $selected / $total * 100 : 0;
  return "$selected / $total rows selected.\n" .
    sprintf("(= %4.2f%%)\n", $percent);
}



__END__

=head1 NAME

ucs-select - Select rows and/or columns from UCS data set


=head1 SYNOPSIS

  ucs-select --count FROM data.ds.gz WHERE '%O11% < %E11%'

  ucs-select '*' 'am.%.pv' FROM data.ds.gz INTO new.ds.gz

  ucs-select '%' FROM data.ds.gz WHERE 'not defined %b.accept%'


=head1 DESCRIPTION

This program is used to select rows and/or columns from a UCS data set file,
very much like a C<SELECT> statement in SQL.  The general form of the
B<ucs-select> command is

  ucs-select [--verbose | -v] (<variables> | --count)
             [ FROM <input.ds> ] [ WHERE <condition> ] [ INTO <output.ds> ]

C<< <variables> >> is a whitespace-separated list of variable names or
wildcard patterns (see L<the ucsexp manpage|ucsexp>), which are matched
against the columns of the data set file C<< <input.ds> >>.  The list of
variables may be omitted, in which case I<all> columns are selected (just
as with the pattern C<'%'>).  Use C<--count> (or C<-c>) instead of a variable
list to display the number of matching rows only.  Note that
wildcard patterns may need to be quoted individually (because they contain
shell metacharacters). 

C<< <condition> >> is a UCS expression (see L<the ucsexp manpage|ucsexp>) used
to select rows from the data set for which it evaluates to a true value.  When
the C<WHERE> clause is omitted, all rows are selected.  Note that C<< <condition> >> 
must be a single argument and will usually have to be quoted (single quotes are
highly recommended). 

The input data set file C<< <input.ds> >> defaults to STDIN (when omitted).
The resulting table is printed on STDOUT in UCS data set file format (see
L<the ucsfile manpage|ucsfile>), and can be written to a data set file C<<
<output.ds> >> with the optional C<INTO> clause.

With the C<--verbose> (or C<-v>) option, some progress information is 
displayed while the program is running. 


=head1 COPYRIGHT

Copyright 2004 Stefan Evert.

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
