99 lines
2.6 KiB
Perl
Executable File
99 lines
2.6 KiB
Perl
Executable File
#!/usr/bin/perl -w
|
|
|
|
use strict;
|
|
use Getopt::Long;
|
|
|
|
# Print the command-line help text and terminate the script.
#
# Arguments:
#   $status - optional exit status (default 0). With a status of 0 the
#             text goes to STDOUT (help was requested); with any other
#             status it goes to STDERR (help is shown because of an error).
#
# Never returns: always calls exit($status).
sub usage {
    my ($status) = @_;
    $status = 0 unless defined $status;

    # Keep error output off STDOUT so it cannot be mistaken for sample data
    # by whatever consumes this script's output.
    my $out = $status ? \*STDERR : \*STDOUT;

    print {$out} <<'END_USAGE';

Usage: randomSampleFromStream.pl [-N size] [-h] [file1 file2 ...]

 Selects a random sample of 'size' elements, without replacement,
 from dataset that represents a union of all input streams. If no
 input streams are specified, then reads STDIN. This script implements
 the standard reservoir sampling algorithm, i.e. it does not preload
 the data into memory and performs selection in one pass.

 -N size : optional (default=1), size of the random sample to select.
 -h      : print this help message and exit.

END_USAGE

    exit($status);
}
|
|
|
|
|
|
|
|
# --- Sampling state ---------------------------------------------------
my $sampleSize = 1;    # number of lines to sample; overridden by -N
my @selectedLines;     # the lines currently chosen; printed at the end

# --- Input state (used by nextLine) -----------------------------------
my $curr_stream;       # the input stream lines are currently read from
my @streams;           # names of the input files not yet opened
|
|
|
|
# Return the next input line, moving transparently from one input stream
# to the next.
#
# Reads from the file-level $curr_stream. When that stream is exhausted and
# @streams still holds file names, the current stream is closed and the next
# file is opened; files that yield no lines at all are skipped.
#
# Returns: the next line, always terminated by "\n", or undef once all
#          input has been consumed.
# Dies:    if one of the queued files cannot be opened.
sub nextLine {
    my $line = <$curr_stream>;

    # Test definedness, not truth: a final line consisting of "0" without a
    # trailing newline is false but is still data.
    # The "STDIN" comparison keeps standard input (stored either as that
    # name or as a handle) from ever being closed and re-opened as a file.
    while ( !defined $line && $curr_stream ne "STDIN" && @streams ) {
        close $curr_stream;

        my $fname = shift @streams;

        # Three-argument open: the file name is never parsed for a mode.
        open( $curr_stream, '<', $fname )
            or die("Can not open input file $fname: $!");

        $line = <$curr_stream>;
    }

    return $line unless defined $line;    # all input consumed

    # The last line of a stream may lack its newline. Add it, so that lines
    # coming from different streams cannot be glued together on output and
    # so that every returned line is a true value.
    $line .= "\n" unless $line =~ /\n\z/;

    return $line;
}
|
|
|
|
# ----------------------------------------------------------------------
# Parse the command line and open the first input stream.
# ----------------------------------------------------------------------
my $help = 0;

# "N=i": -N must be followed by an integer, so a missing or non-numeric
# size is reported by Getopt::Long instead of silently producing an empty
# sample. A non-zero status is handed to usage() for a malformed command
# line; usage() decides how to act on it.
GetOptions(
    "N=i" => \$sampleSize,
    "h"   => \$help,
) or usage(1);

usage() if $help;

die("Sample size (-N) must not be negative, got $sampleSize\n")
    if $sampleSize < 0;

if ( @ARGV == 0 ) {

    # No files given: read standard input. A glob reference is used rather
    # than the handle's name so no symbolic reference is involved.
    $curr_stream = \*STDIN;
}
else {
    my $fname = shift @ARGV;

    # Three-argument open: the file name is never parsed for a mode.
    open( $curr_stream, '<', $fname )
        or die("Can not open input file $fname: $!");

    # The remaining files are opened one at a time as each stream runs dry.
    push @streams, @ARGV;
}
|
|
|
|
|
|
# ----------------------------------------------------------------------
# Reservoir sampling: one pass over the input, holding at most
# $sampleSize lines in memory.
# ----------------------------------------------------------------------
my $line;

# Phase 1: fill the reservoir with the first $sampleSize lines.
# Lines are tested with defined() so that a line such as "0" is never
# mistaken for the end of the input.
while ( @selectedLines < $sampleSize ) {
    $line = nextLine();
    last unless defined $line;
    push @selectedLines, $line;
}

# If the input held fewer lines than requested, the sample is simply
# everything that was read.
$sampleSize = scalar @selectedLines;

my $counter = $sampleSize;    # total number of lines read so far

# Phase 2: only when phase 1 ended with a line in hand can there be more
# input; if it ended on end-of-input (or nothing was requested), $line is
# undefined and no further read is attempted.
if ( defined $line ) {
    while ( defined( $line = nextLine() ) ) {
        $counter++;

        # Draw one slot uniformly from 0 .. $counter-1. It lands inside the
        # reservoir with probability $sampleSize/$counter, and when it does
        # it is uniformly distributed over the reservoir slots, so the new
        # line replaces a uniformly chosen old one.
        my $slot = int( rand($counter) );
        $selectedLines[$slot] = $line if $slot < $sampleSize;
    }
}

print @selectedLines;
|
|
|