#!/usr/bin/perl -w 
# -*-cperl-*-
## Filename: ucs-join
## Modified: Sat Sep 24 17:14:53 2005 (severt)   
##   Author: Stefan Evert
##  Purpose: join lines or variables from UCS data sets (esp. for annotation databases)
$| = 1;
STDERR->autoflush(1);

use UCS;
use UCS::DS::Stream;
use UCS::DS::Memory;

use Getopt::Long;

$Opt_Help = 0;					# --help
$Opt_Verbose = 0;				# --verbose
$Opt_Add = 0;					# --add
$Opt_Update = 0;				# --update (overwrite existing values)
$Opt_No_Overwrite = 0;				# --no-overwrite (for backward compatibility)
$Opt_MatchOn = "l1,l2";				# --match-on
$Opt_Multiple = 0;				# --multiple

Getopt::Long::Configure("no_ignore_case");
$ok = GetOptions(
		 "help|h" => \$Opt_Help,
		 "verbose|v" => \$Opt_Verbose,
		 "add|a" => \$Opt_Add,
		 "no-overwrite|n" => \$Opt_No_Overwrite, # backward compatibility
		 "update|u" => \$Opt_Update,
		 "match-on|m=s" => \$Opt_MatchOn,
		 "multiple|M" => \$Opt_Multiple,
		 );
$UCS::Verbose = 0 unless $Opt_Verbose;
$Opt_Update = 0 if $Opt_No_Overwrite;  # --no-overwrite overrides --update
@match_vars = split /,/, $Opt_MatchOn; # variables used for matching rows from the two data sets

# parse command line
@varspec = ();					# variables (or wildcard patterns) from DS2 which will be added to DS1
$in1 = shift @ARGV;				# first input file (DS1)
$in2 = undef;					# second input file (DS2)
$out = undef;					# output file
if (@ARGV) {
  $arg = shift @ARGV;
  if (uc($arg) eq "WITH") {			# WITH <varspecs> FROM ...
    while (@ARGV) {
      $arg = shift @ARGV;
      last if $arg =~ /^(FROM|INTO)$/i;
      push @varspec, $arg;
    }
    $ok = 0 unless uc($arg) eq "FROM";
    $arg = shift @ARGV;
  }
  $in2 = $arg;
  if (@ARGV) {
    $arg = shift @ARGV;
    $ok = 0 unless uc($arg) eq "INTO";		# INTO <dataset>
    $out = shift @ARGV;
    $ok = 0 if @ARGV;
  }
}
$ok = 0 unless defined $in1 and defined $in2;	# undef's indicate missing arguments

die "Usage:  ucs-join [options] data1.ds.gz data2.ds.gz INTO new.ds.gz\n" 
  . "        ucs-join [options] data1.ds.gz WITH am.\% FROM data2.ds.gz INTO new.ds.gz\n"
  . "[type 'ucsdoc ucs-join' for more information]\n"
  unless $ok and (not $Opt_Help);

# read second data set (DS2) into cache
print STDERR "Loading $in2 ... "
  if $Opt_Verbose;
$DS2 = new UCS::DS::Memory $in2;
print STDERR "indexing ... "
  if $Opt_Verbose;
@missing = grep {not $DS2->var($_)} @match_vars;
UCS::Die "Missing variable(s) ".join(", ", @missing)." in data set $in2"
  if @missing;
$Index = $DS2->dict(@match_vars);
UCS::Die "Error: rows of $in2 are not uniquely identified by variable(s) ".join(", ", @match_vars)
  unless $Index->unique;
$size2 = $Index->keys;
print STDERR "$size2 entries\n"
  if $Opt_Verbose;

# resolve variable specifications
@variables = ();
if (@varspec) {					# add selected variables from DS2
  %seen = ();					# avoid duplicate entries
  foreach $spec (@varspec) {
    if ($spec =~ /^([^=]+)=([^=]+)$/) {		# variable renaming
      ($new, $old) = ($1, $2);
      UCS::Die "Invalid variable name(s) in renaming $spec."
	  unless UCS::ValidName($new) and UCS::ValidName($old);
      UCS::Die "Variable '$old' does not exist in data set $in2."
	  unless $DS2->var($old);
      UCS::Die "Duplicate specification of variable '$new' (in renaming $spec)"
	  if $seen{$new};
      push @variables, [$new, $old];
      $seen{$new} = 1;
    }						# variable name or pattern
    else {
      @vars = UCS::Match($spec, $DS2->vars);
      print STDERR "Warning: no matches for variable name/pattern '$spec'.\n"
	unless @vars;
      foreach $v (@vars) {
	push @variables, [$v, $v]			# duplicate specifications from patterns are ignored
	  unless $seen{$v};
	$seen{$v} = 1;
      }
    }
  }
}
else {
  @variables = map { [$_, $_] } $DS2->vars;	# default: add all variables from DS2
}

# open first data set (DS1)
$DS1 = new UCS::DS::Stream::Read $in1;
$size1 = $DS1->global("size");
$size1 = "??????" unless defined $size1;
@missing = grep {not $DS2->var($_)} @match_vars;
UCS::Die "Missing variable(s) ".join(", ", @missing)." in data set $in1"
  if @missing;

# open output data set (OUT) if specified
if ($out) {
  $DS = new UCS::DS::Stream::Write $out;
  $DS->copy_comments($DS1);
  $DS->copy_globals($DS1);
  $DS->delete_global("size")
    if $Opt_Add;				# don't know size of output file with --add switch
  $DS->add_vars($DS1->vars);
  $DS->add_vars(map {$_->[0]} @variables);	# declare variables that will be added from DS2
  foreach (@variables) {
    $annot = $DS2->global($_->[1]);		# copy header information (global variables) about added variables (if available)
    $DS->set_global($_->[0], $annot) 
      if $annot;
  }
  $DS->open;
  print STDERR "Writing results to $out ...\n"
    if $Opt_Verbose;
}
else {
  print STDERR "Checking coverage ...\n"
    if $Opt_Verbose;
}

# now process DS1 by row and add variables from DS2
$OneOnly = 0;					# keep stats on coverage
$Both = 0;
$TwoOnly = 0;
$DS2->eval("b.ucs.seen", 0);			# mark pair types that are common to both data sets
# (note: when a number C is passed as second argument, it is converted into a string
#  and then compiled into a UCS expression that evaluates to the constant C)
$DS2->temporary("b.ucs.seen", 1);

$id = 1;					# output is always reindexed with new IDs
while ($DS1->read) {
  printf STDERR "Processing %s  [%6d / %s]   \r", $in1, $DS1->row, $size1
    if $Opt_Verbose and ($DS1->row & 0xff) == 0;
  $data1 = $DS1->data;
  @keys = @{$data1}{@match_vars};
  $rownum = $Index->lookup(@keys);
  if (defined $rownum) {
    UCS::Die "Duplicate entry for key (".join(", ", @keys).") in data set $in1!"
	if $DS2->cell("b.ucs.seen", $rownum) and not $Opt_Multiple;
    $DS2->set_cell("b.ucs.seen", $rownum, 1);
    $Both++;
    if ($out) {
      # add variables from the corresponding entry in DS2 to $data1 (unless we're just checking coverage)
      foreach $pair (@variables) {
	($new, $old) = @$pair;
	$value = $DS2->cell($old, $rownum);
	if (defined $value) {
	  $data1->{$new} = $value
	    if $Opt_Update or not defined $data1->{$new};
	}
      }
    }
  }
  else {
    $OneOnly++;
  }
  # copy data from DS1 (with additions from DS2) to output data set
  if ($out) {
    $DS->data($data1);
    $DS->data("id" => $id++);
    $DS->write;
  }
}
$TwoOnly = $size2 - $Both;
$rows = $DS1->row;
$DS1->close;
printf STDERR "Processing %s  [%6d / %s]     \n", $in1, $rows, $rows
  if $Opt_Verbose;

# add remaining lines from DS2 (with --add switch)
if ($Opt_Add and $out) {
  $DS2->where("remaining", 'not %b.ucs.seen%');
  @lines = @{$DS2->index("remaining")};		# unused line numbers in original order
  $total_size = $rows + @lines;
  foreach $n (@lines) {
    $rows++;
    printf STDERR "Adding %s  [%6d / %s]     \r", $in2, $rows, $total_size
      if $Opt_Verbose and ($rows & 0xff) == 0;
    $DS->data($DS2->row($n));
    $DS->data("id" => $id++);
    $DS->write;
  }
  printf STDERR "Adding %s  [%6d / %s]     \n", $in2, $rows, $total_size
    if $Opt_Verbose;
}

if ($out) {
  $DS->close;
}
else {
  print "Data Set A = $in1\n";
  print "Data Set B = $in2\n";
  printf "Coverage: %4.2f%s\n", $Both / ($OneOnly + $Both) * 100, '%';
  print "A only:   $OneOnly\n";
  print "B only:   $TwoOnly\n";
  print "both:     $Both\n";
}


__END__

=head1 NAME

ucs-join - Join rows and variables from two UCS data sets


=head1 SYNOPSIS

  ucs-join [--match-on var1,var2,...] data1.ds.gz data2.ds.gz

  ucs-join [--add] [--update] [--multiple] [-m var1,var2,...]
           data1.ds.gz data2.ds.gz INTO new.ds.gz

  ucs-join [--add] [--update] [--multiple] [-m var1,var2,...]
           data1.ds.gz WITH am.% FROM data2.ds.gz INTO new.ds.gz


=head1 DESCRIPTION

This program can be invoked in three different ways. The short form

  ucs-join  [-v] [-m <var>,...]  <ds1>  <ds2>

B<compares> two data sets C<< <ds1> >> and C<< <ds2> >>.  In
particular, the number of rows common to both data sets and the
numbers of rows unique to either one of the data sets are reported.
Rows are matched on the B<pair types> they represent, i.e. the
variables C<l1> and C<l2>.  Differences in the C<id> value or any
other annotations are ignored.  The B<coverage> is the proportion of
pair types in C<< <ds1> >> that are also contained in C<< <ds2> >>.

With the C<--verbose> (or C<-v>) switch, some progress information is
displayed while the program is running.  The C<--match-on> (or C<-m>)
flag specifies a comma-separated list of variables to use for matching
rows (instead of C<l1> and C<l2>).  Note that the combination of their
values must be unique for every row within each data set.

The second form

  ucs-join  [-v] [--add] [--update] [--multiple] [-m <var>,...]
            <ds1> <ds2> INTO <ds3>

adds variables and/or rows from the data set C<< <ds2> >> to C<< <ds1> >>.
Rows from the two data sets are matched on the C<l1> and C<l2> variables as
above, unless this has been changed with the C<--match-on> (or C<-m>) flag.
The combination of their values must uniquely identify rows in C<< <ds2> >>,
while duplicate rows in C<< <ds1> >> are allowed in combination with the
C<--multiple> (or C<-M>) option.

For matching rows, all variables from C<< <ds2> >> are added to the
annotations in C<< <ds1> >>.  Variables that are common to both data sets are
overwritten with the values from C<< <ds2> >> only when they are undefined
(C<NA>) in C<< <ds1> >>, or when the C<--update> (or C<-u>) option has been
given.  For backward compatibility, the default setting can be explicitly
selected with C<--no-overwrite> (or C<-n>).  If C<--add> or C<-a> is
specified, rows that appear only in C<< <ds2> >> are added to C<< <ds1> >>
(with all variables that are not defined in C<< <ds2> >> set to C<NA>).  The
resulting data set is written to the file C<< <ds3> >>.

The most general form 

  ucs-join  [-v] [--add] [--update] [--multiple] [-m <var>,...]
            <ds1> WITH <variables> FROM <ds2> INTO <ds3>

adds selected variables from C<< <ds2> >> only.  C<< <variables> >> is
a whitespace-separated list of variables names and wildcard patterns,
which are matched against the variables of C<< <ds2> >>.  Variables
can be renamed with specifiers of the form C<< new.name=old.name >>
(of course, wildcard patterns cannot be used here).  The C<--add>
switch is rarely useful with this form of the B<ucs-join> command.


=head1 ANNOTATION DATABASES

The B<ucs-join> program is often used to add (manual) annotations from an
B<annotation database> file (C<.adb>) to a data set, and to update
annotation databases.  For instance, the UCS distribution includes German 
PP+verb pairs extracted from the I<Frankfurter Rundschau> corpus
(F<fr-pnv.ds.gz>) and an annotation database created by Brigitte Krenn
(F<pnv.adb.gz>).  In order to check the B<coverage> of the annotation 
database (i.e., how many of the pair types are already contained in the
database), type

  ucs-join -v fr-pnv.ds.gz pnv.adb.gz

This will show a coverage of 100%.  Annotations from the database can now be
added to the F<fr-pnv.ds.gz> data set (the C<--update> option is only relevant
if F<fr-pnv.ds.gz> is already annotated with the relevant variables):

  ucs-join -v --update fr-pnv.ds.gz 
           WITH 'b.*' FROM pnv.adb.gz INTO fr-pnv.annot.ds.gz

When an annotation database contains entries that have not been manually
examined so far, these should be annotated with missing values (C<NA>).  The
database can then be updated from a new file (in the same C<.adb> format, say
F<new-pnv.adb>) with the following commands

  mv pnv.adb.gz pnv.adb.BAK.gz
  ucs-join -v pnv.adb.BAK.gz new-pnv.adb INTO pnv.adb.gz

If the file I<new-pnv.adb> contains additional pair types (that haven't
already been entered into the database), you should also specify the C<--add>
flag.

Recall that B<ucs-join> will not overwrite existing annotations by default.
If you want to correct mistakes in the annotation database, you need to
specify the C<--update> option in the command above.  Note that missing values
(C<NA>) will I<never> overwrite existing annotations in the first data set.


=head1 COPYRIGHT

Copyright 2004-2005 Stefan Evert.

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
