#!/usr/bin/perl -w

use strict;
use Getopt::Long;

# Print the help text and terminate the program.
#
#   $status : optional process exit status. Defaults to 0 (help was asked
#             for); callers pass a non-zero value when the command line
#             was invalid so that shells and pipelines can detect it.
sub usage {
    my ($status) = @_;
    $status = 0 unless defined $status;

    print "\nUsage: randomSampleFromStream.pl [-N size] [file1 file2 ...]\n\n";
    print " Selects a random sample of 'size' elements, without replacement,\n";
    print " from dataset that represents a union of all input steams. If no\n";
    print " input streams are specified, then reads STDIN. This script implements\n";
    print " the standard reservoir sampling algorithm, i.e. it does not preload\n";
    print " the data into memory and performs selection in one pass.\n\n";
    print " -N size : optional (default=1), size of the random sample to select.\n\n";

    exit($status);
}

my @streams;        # names of the input files that have not been opened yet
my $curr_stream;    # handle the next line is read from

# Open $fname for reading and return the handle; dies if that fails.
# The three-argument form of open() treats the name literally, so characters
# such as '>', '|' or surrounding blanks cannot change the open mode.
sub openStream {
    my ($fname) = @_;

    open( my $handle, '<', $fname )
        or die("Can not open input file $fname: $!");
    return $handle;
}

# Return the next line of the combined input, or undef once every stream is
# exhausted. When the current stream runs dry, the remaining files are opened
# one after another until one of them yields a line (empty files are skipped).
sub nextLine {
    my $line = <$curr_stream>;

    # Test definedness, not truth: a last line "0" without a trailing
    # newline is false in Perl but is still data.
    while ( !defined $line && @streams ) {
        close $curr_stream;
        $curr_stream = openStream( shift @streams );
        $line        = <$curr_stream>;
    }
    return $line;
}

# Select a uniform random sample without replacement in a single pass.
#
#   $sampleSize : number of lines wanted
#   $next       : code reference returning the next line, or undef at the
#                 end of the input
#
# Returns a reference to an array holding the selected lines. If the input
# has fewer than $sampleSize lines, all of them are returned in input order.
sub reservoirSample {
    my ( $sampleSize, $next ) = @_;
    my @selectedLines;

    # Nothing to select: do not consume the input at all.
    return \@selectedLines if $sampleSize < 1;

    # Fill the reservoir with the first $sampleSize lines.
    while ( @selectedLines < $sampleSize ) {
        my $line = $next->();
        return \@selectedLines unless defined $line;    # input ran out early
        push @selectedLines, $line;
    }

    my $counter = $sampleSize;    # total number of lines read so far

    while ( defined( my $line = $next->() ) ) {
        $counter++;

        # The newly read line enters the sample with probability
        # sampleSize/counter ...
        if ( rand() <= $sampleSize / $counter ) {

            # ... and evicts a uniformly chosen current member.
            my $index = $sampleSize > 1 ? int( rand($sampleSize) ) : 0;
            $selectedLines[$index] = $line;
        }
    }
    return \@selectedLines;
}

# Parse the command line, set up the input streams and print the sample.
sub main {
    my $sampleSize = 1;
    my $help       = 0;

    # "N=i": the size is a mandatory integer value; a bare -N or a
    # non-numeric value is reported by GetOptions as an error.
    GetOptions(
        "N=i" => \$sampleSize,
        "h"   => \$help
    ) or usage(1);

    usage() if ($help);

    die("Sample size must not be negative (got $sampleSize)\n")
        if ( $sampleSize < 0 );

    if (@ARGV) {
        $curr_stream = openStream( shift @ARGV );
        @streams     = @ARGV;
    }
    else {
        $curr_stream = \*STDIN;    # a real handle, not the string "STDIN"
    }

    print @{ reservoirSample( $sampleSize, \&nextLine ) };
}

# Run only when executed as a program, so the subroutines above can also be
# loaded from another file without starting to read input.
main() unless caller;

1;