#!/usr/bin/perl -w 
# -*-cperl-*-
## Filename: ucs-make-tables
## Modified: Sun Aug 22 00:01:56 2004 (evert)   
##   Author: Stefan Evert
##  Purpose: compute contingency tables from sequence of pair tokens
$| = 1;
STDERR->autoflush(1);

use UCS;
use UCS::File;
use UCS::DS::Stream;

use Getopt::Long;

$Opt_Help = 0;					# --help
$Opt_Verbose = 0;				# --verbose
$Opt_N = 0;					# --sample-size, -N
$Opt_f = 0;					# --threshold, -f
$Opt_Dispersion = 0;				# --dispersion, -d
$Opt_ChunkSize = 0;				# --chunk-size, -c
$Opt_Segments = 0;				# --segments, -S
$Opt_Sort = 0;					# --sort, -s

Getopt::Long::Configure("no_ignore_case");
$ok = GetOptions(
		 "help|h" => \$Opt_Help,
		 "verbose|v" => \$Opt_Verbose,
		 "sample-size|N=i" => \$Opt_N,
		 "threshold|f=i" => \$Opt_f,
		 "dispersion|d" => \$Opt_Dispersion,
		 "chunk-size|c=i" => \$Opt_ChunkSize,
		 "segments|S" => \$Opt_Segments,
		 "sort|s" => \$Opt_Sort,
		 );

$ok = 0 unless @ARGV == 1;
$filename = shift @ARGV;

die 
  "Usage:  ... | ucs-make-tables [-v] [-s] [-N <n>] [-f <n>] [-d [-c <n>]] [-S] data.ds.gz\n" .
  "[type 'ucsdoc ucs-make-tables' for more information]\n"
  unless $ok and (not $Opt_Help);

## hash tables for joint and marginal frequencies, as well as dispersion
%F = ();					# $F{"$l1\t$l2"} = $f
%F1 = ();					# $F1{$l1} = $f1
%F2 = ();					# $F2{$l2} = $f2
%D = ();					# $D{"$l1\t$l2"} = $d

$N = 0;						# number of tokens
$V = 0;						# number of types

## main loop - read sequence of pair tokens and count them
$chunk_size = 0;
$last_chunk = undef;
$chunk_num = 1;
%Chunk = ();
while (<STDIN>) {				# ========== START OF MAIN LOOP
  chomp;
  if ($Opt_Segments) {				# segment-based cooccurrences
    (undef, $chunk) = split /\t/;
    chomp($line = <STDIN>);
    @l1 = unique(split /\t/, $line);
    chomp($line = <STDIN>);
    @l2 = unique(split /\t/, $line);
    $line = <STDIN>;
    die "Format Error: expected input line #$. to be record separator (empty line)\n",
      "--> $line"
	unless (not defined $line) or $line =~ /^\s*$/;
    foreach $l1 (@l1) {
      $F1{$l1}++;
    }
    foreach $l2 (@l2) {
      $F2{$l2}++;
    }
  }
  else {					# relational cooccurrences
    ($l1, $l2, $chunk) = split /\t/;
    @l1 = ($l1);
    @l2 = ($l2);
    $F1{$l1}++;
    $F2{$l2}++;
  }
  $N++;						# stands either for one pair token or for one segment
  $chunk_size++;				# when measuring dispersion across equally-sized chunks

  ## further processing is the same for both types of cooccurrences
  foreach $l1 (@l1) {
    foreach $l2 (@l2) {
      if ($Opt_Dispersion) {			# --dispersion: collect pair types from current chunk in %Chunk
	if ($Opt_ChunkSize) {			# --chunk-size <n>: each chunk contains exactly <n> tokens
	  $end_of_chunk = ($chunk_size > $Opt_ChunkSize);
	}
	else {					# otherwise: determine chunk boundaries from annotated chunk identifiers
	  $last_chunk = $chunk			# avoid chunk boundary at start of input
	    unless defined $last_chunk;
	  die "Error: input is missing chunk identifiers for --dispersion mode.\n",
	    "Please use the --sample-size and --chunk-size options for equally-sized chunks.\n"
	      unless defined $chunk;
	  $end_of_chunk = ($chunk ne $last_chunk);	
	  $last_chunk = $chunk;
	}
	if ($end_of_chunk) {			# increment dispersion counts at end of chunk
	  foreach $pair (keys %Chunk) { $D{$pair}++ };
	  %Chunk = ();
	  $chunk_size = 1;                      # current pair token or segment is first element of the new chunk
	  $chunk_num++;
	}
      }
      $pair = "$l1\t$l2";
      $V++
	unless exists $F{$pair};
      $F{$pair}++;
      $Chunk{$pair} = 1
	if $Opt_Dispersion;
    }
  }

  if ($Opt_Verbose and ($N & 0xff) == 0) {
    printf STDERR "N = %8d,  V = %6d   ", $N, $V;
    printf STDERR "[chunk #%05d]       ", $chunk_num
      if $Opt_Dispersion;
    print STDERR "\r";
  }

  last if $Opt_N > 0 and $N >= $Opt_N;
}						# ========== END OF MAIN LOOP

if ($Opt_Dispersion) {				# last chunk is always non-empty, so update dispersion counts
  foreach $pair (keys %Chunk) { $D{$pair}++ };
  %Chunk = ();
  UCS::Warn("Final chunk contains less than the requested number of pair tokens or segments",
	    "($chunk_size instead of $Opt_ChunkSize). Consider using the --sample-size option.")
      if $Opt_ChunkSize and $chunk_size < $Opt_ChunkSize;
}

if ($Opt_Verbose) {
  printf STDERR "N = %8d,  V = %6d   ", $N, $V;
  printf STDERR "[%d chunks]         ", $chunk_num
    if $Opt_Dispersion;
  print STDERR "\n";
}

## write pair types with frequency signatures to data set file (unsorted)
print STDERR "Saving to $filename ... "
  if $Opt_Verbose;
$ds = new UCS::DS::Stream::Write $filename;
$ds->add_vars(qw<id l1 l2 f f1 f2 N>);
$ds->add_comments("Frequency signatures computed by the ucs-make-tables tool for ".
		  (($Opt_Segments) ? "segment-based positional" : "relational")." cooccurrences.",
		  "Sample size:  N = $N tokens,  V = $V pair types.");
if ($Opt_f > 0) {				# apply frequency threshold as type filter
  while (($pair, $f) = each %F) {
    delete $F{$pair}
      if $f < $Opt_f;
  }
  $V = keys %F;
  $ds->add_comments("A frequency threshold of f >= $Opt_f was applied, leaving V = $V pair types.");
  $ds->set_global("threshold", $Opt_f);
}
if ($Opt_Dispersion) {
  if ($Opt_ChunkSize) {
    $ds->add_comments("Dispersion counts were added for $chunk_num equally-sized chunks, ".
		      "each of which contains $Opt_ChunkSize pair tokens.");
    $ds->set_global("chunk.size", $Opt_ChunkSize);
  }
  else {
    $ds->add_comments("Dispersion counts were added for $chunk_num pre-annotated chunks.");
  }
  $ds->set_global("chunks", $chunk_num);
  $ds->set_global("n.disp", "dispersion of pair type, i.e. number of chunks in which it occurs");
  $ds->add_vars("n.disp");
}
$ds->set_global("size", $V);
$ds->open;

if ($Opt_Sort) {				# --sort: sort pair types alphabetically
  print STDERR "sorting ... "
    if $Opt_Verbose;
  $id = 0;
  foreach $pair (sort keys %F) {
    ($l1, $l2) = split /\t/, $pair;
    $id++;
    $ds->columns($id, $l1, $l2, $F{$pair}, $F1{$l1}, $F2{$l2}, $N, (($Opt_Dispersion) ? $D{$pair} : ()));
    $ds->write;
  }
}
else {						# unsorted: iterate through hash with "each"
  $id = 0;
  while (($pair, $f) = each %F) {
    ($l1, $l2) = split /\t/, $pair;
    $id++;
    $ds->columns($id, $l1, $l2, $f, $F1{$l1}, $F2{$l2}, $N, (($Opt_Dispersion) ? $D{$pair} : ()));
    $ds->write;
  }
}
$ds->close;
print STDERR "done\n"
  if $Opt_Verbose;


##  @unique_strings = unique(@strings);
sub unique {
  my %set = ();
  foreach (@_) { $set{$_} = 1 };
  return keys %set;
}



__END__

=head1 NAME

ucs-make-tables - Compute contingency tables from a sequence of pair tokens


=head1 SYNOPSIS

  ... | ucs-make-tables [-v] [--sort | -s] [--sample-size=<n> | -N <n>] 
                        [--threshold=<n> | -f <n>]  data.ds.gz

  ... | ucs-make-tables [-v] [-s] [-N <n>] [-f <n>] 
                        [--dispersion [--chunk-size=<n>] ]  data.ds.gz

  ... | ucs-make-tables [-v] [-s] [-N <n>] [-f <n>] --segments data.ds.gz


=head1 DESCRIPTION

This utility computes frequency signatures and constructs a UCS data set for a
stream of pair tokens (or segment-based cooccurrence data) read from STDIN.
It is usually applied to the output of a cooccurrence extraction tool in a
command-line pipe.  The input can also be read from a file (with a C<< < >>
redirection), or decompressed on the fly with (C<gzip -cd> or C<bzip2 -cd>).
The resulting data set is written to the file specified as the single
mandatory argument on the command-line.

B<ucs-make-tables> operates in two different modes for B<relational> and
B<positional> (segment-based) cooccurrences.  These two modes are described
separately in the following subsections.  They take the same command-line
options and arguments, as described in the section L<COMMAND LINE|"COMMAND
LINE"> below.  Distance-based positional cooccurrences are not supported, as
they usually require direct access to the source corpus in order to determine
the precise window size.


=head2 Relational Cooccurrences

By default, B<ucs-make-tables> operates in a mode for relational
cooccurrences.  In this mode, the input line format is

  <l1> TAB <l2>

Each such line represents a pair token with labels C<< <l1> >> and C<< <l2> >>
(i.e. a pair token that belongs to the pair type I<(l1,l2)>).  For dispersion
counts (see below), the input lines should preserve the order in which the
corresponding pair tokens appear in the corpus.  When dispersion is measured
with respect to pre-annotated parts (e.g. paragraphs or documents) rather than
equally-sized parts, the input must contain an extra column with unique part
identifiers:

  <l1> TAB <l2> TAB <part_id>

Note that all pair tokens from a given part must form an uninterrupted
sequence in the input, otherwise the dispersion counts will not be correct.

=head2 Segment-based Cooccurrences

The mode for segment-based cooccurrences is activated with the C<--segments>
(or C<-S>) option.  In this mode, each segment is represented by a sequence 
of four lines in the input stream, called a B<record>:

=over 4

=item 1.

C<< <segment_id> [ TAB <part_id> ] >>

=item 2.

The labels of all tokens in the segment that can become I<first> components
of pairs, separated by C<TAB>s.

=item 3.

The labels of all tokens in the segment that can become I<second> components
of pairs, separated by C<TAB>s.

=item 4.

A blank separator line.

=back

Duplicate strings on the second or third line will automatically be ignored.
The C<< <segement_id> >> on the first line is currently ignored.  The optional
C<< <part_id> >> can be used to compute dispersion counts for pre-annotated
parts.  All segments that belong to a given part must appear in consecutive
records, otherwise the dispersion counts will not be correct.

A prototypical example of the segment-based approach are lemmatised noun-verb
cooccurrences within sentences.  In this case, each record in the input stream
corresponds to a sentence.  The first line contains an unimportant sentence
identifier.  The second line contains the lemma forms of all nouns in the
sentence (note that duplicates are automatically removed), and the third line
contains the lemma forms of all verbs in the sentence.  In order to compute
the dispersion of cooccurrences across documents (i.e. I<document frequencies>
in the terminology of information retrieval), unique document identifiers have 
to be added to the first line.


=head1 COMMAND LINE

The general form of the B<ucs-make-tables> command is

  ... | ucs-make-tables [--verbose | -v] [--sort | -s]
                        [--threshold=<t> | -f <t>] 
                        [--sample-size=<n> | -N <n>] 
                        [--dispersion [--chunk-size=<s>]]
                        [--segments]
                        data.ds.gz

With the C<--verbose> (or C<-v>) option, some progress information (including
the number of pair tokens or segments, as well as the number of pair types
encountered so far) is displayed while the program is running.  When C<--sort>
(or C<-s>) is specified, the resulting data set is sorted in ascending
alphabetical order (on C<l1> first, then C<l2>).  Of course, the data set file
can always be re-sorted with the B<ucs-sort> utility.  When a frequency
threshold C<< <t> >> is specified with the C<--threshold> (or C<-f>) option,
only pair types with cooccurrence frequency C<< f >= <t> >> will be saved to
the data set file (but they are still included in the marginal frequency
counts of relational cooccurrences, of course).  This option helps keep the
size of data sets extracted from large corpora manageable.

When C<--sample-size> (or C<-N>) is specified, only the first C<< <n> >> pair
tokens (or segment records) read from STDIN will be used, so that the sample
size C<N> of the resulting data set is equal to C<< <n> >>.  This option is
mainly useful when computing dispersion counts on equally-sized parts (see
below), but it has some other applications as well.

With the C<--dispersion> (or C<-d>) option, dispersion counts are added to the
data set and can then be used to test the random sample assumption with a
B<dispersion test> (see Baayen 2001, Sec. 5.1.1).  In order to do so, the
token stream is divided into equally-sized B<parts>, each one containing the
number C<< <s> >> of pair tokens specified with the C<--chunk-size> (or C<-c>)
option.  For segment-based cooccurrences, each part will contain cooccurrences
from C<< <s> >> segments.  When the total number of pair tokens (or segments)
is not an integer multiple of C<< <s> >>, a warning message will be issued.
In this case, it is recommended to adjust the number of tokens with the
C<--sample-size> option described above.

The dispersion count for each pair type, i.e. the number of parts in which it
occurs, is stored in a variable named C<n.disp> in the resulting data set
file.  In addition, the number of parts and the part size are recorded in
the global variables C<chunks> and C<chunk.size>.  When the part size is not
specified, dispersion counts can be computed for pre-annotated parts, which
must be identified in the input stream (see above).  In this case,
C<chunk.size> is not defined as the individual parts may have different
sizes.  B<NB:> The use of pre-annotated parts is discouraged, since the
mathematics of the dispersion test assume equally-sized parts.


=head1 EXAMPLES

If you have installed the IMS Corpus Workbench (CWB) as well as the CWB/Perl
interface, you can easily extract relational adjective+noun cooccurrences from
part-of-speech tagged CWB corpora.  The B<ucs-adj-n-from-cwb.perl> script
supplied with the UCS system supports several tagsets for German and English
corpora.  It can easily be extended to other tagsets, languages, and types of
cooccurrences (as long as they can be identified with the help of
part-of-speech patterns).

The following example extracts adjective+noun pairs with cooccurrence
frequency C<< f >= 3 >> from the CWB demonstration corpus C<DICKENS> (ca. 3.4
million words), and saves them into the data set file C<dickens.adj-n.ds.gz>.
The shell variable C<$UCS> refers to the F<System/> directory of the UCS
installation (as in the UCS/Perl tutorial).

  $UCS/Perl/tools/ucs-adj-n-from-cwb.perl  penn  DICKENS
       |  ucs-make-tables  --verbose --sort --threshold=3  dickens.adj-n.ds.gz

(Note that the command must be entered as a single line in the shell.)

Extraction from the C<DICKENS> corpus produces approximately 122990 pair tokens.
In order to apply a dispersion test with a chunk size of 1000 tokens each, the
sample size has to be limited to an integer multiple of 1000:

  $UCS/Perl/tools/ucs-adj-n-from-cwb.perl  penn  DICKENS
       |  ucs-make-tables  --verbose --sort --threshold=3 --sample-size=122000
                           --dispersion --chunk-size=1000  dickens.disp.ds.gz

A dispersion test for pair types with C<< f <= 5 >> can then be performed with
the following command, showing a significant amount of underdispersion at all
levels.

  $UCS/Perl/tools/ucs-dispersion-test.perl -v -m 5 dickens.disp.ds.gz

Segment-based data can be obtained from a CWB corpus with the
B<ucs-segment-from-cwb.perl> script.  The following example extracts nouns and
verbs cooccurring within sentences.  A frequency threshold of 5 is applied in
order to keep the amount of data (and hence the memory consumption of the
B<ucs-make-tables> program) manageable.

  $UCS/Perl/tools/ucs-segment-from-cwb.perl -f 5 -t1 "VB.*" -t2 "NN.*" DICKENS s
       | ucs-make-tables --verbose --segments --threshold=5 dickens.n-v.ds.gz

=head1 REFERENCES

Baayen, R. Harald (2001).  I<Word Frequency Distributions.> 
Kluwer, Dordrecht.

IMS Corpus Workbench (CWB): I<http://www.ims.uni-stuttgart.de/projekte/CorpusWorkbench/>


=head1 COPYRIGHT

Copyright 2004 Stefan Evert.

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
