A small utility that selects random samples from input stream(s) of unknown length. By default it selects a single line (each line is chosen with probability 1/TOTAL_NUMBER_OF_LINES_READ); with the -N option it randomly selects the specified number of lines. It can read from STDIN or from an arbitrary number of input files (all streams are merged). Examples:
 cat file1 file2 file3 | randomSampleFromStream.pl -N 5

 or
 randomSampleFromStream.pl file1 file2 file3
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1360 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9dfee7a75c
commit
8d06bb21ed
|
|
@ -0,0 +1,98 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
# usage( [$status] )
#
# Prints the command-line help and terminates the program.
#
#   $status - optional exit status (default 0). With a non-zero status the
#             help goes to STDERR, so a failed invocation does not pollute
#             the sampled output on STDOUT; otherwise it goes to STDOUT.
#
# Never returns.
sub usage {
    my ($status) = @_;
    $status = 0 unless defined $status;

    my $out = $status ? \*STDERR : \*STDOUT;

    # Single-quoted heredoc: the text is emitted verbatim, nothing interpolates.
    print {$out} <<'END_USAGE';

Usage: randomSampleFromStream.pl [-N size] [file1 file2 ...]

 Selects a random sample of 'size' elements, without replacement,
 from dataset that represents a union of all input streams. If no
 input streams are specified, then reads STDIN. This script implements
 the standard reservoir sampling algorithm, i.e. it does not preload
 the data into memory and performs selection in one pass.

 -N size : optional (default=1), size of the random sample to select.

END_USAGE

    exit($status);
}
|
||||
|
||||
|
||||
|
||||
# File-level state shared by nextLine() and the main program.
my @selectedLines;      # the reservoir: lines currently selected for output
my $sampleSize = 1;     # requested sample size; overridden by the -N option

my @streams;            # names of input files not opened yet, in reading order
my $curr_stream;        # input handle that nextLine() is currently reading from
|
||||
|
||||
# nextLine()
#
# Returns the next line of the merged input: reads from $curr_stream and,
# once that is exhausted, opens the files queued in @streams one after
# another (empty files are skipped). A file name of "-" means STDIN.
#
# Returns undef when every input has been consumed.
# Dies if a queued file cannot be opened.
sub nextLine {
    my $line = <$curr_stream>;

    # Test definedness, not truth: a final line consisting of "0" with no
    # trailing newline is a real line, not end of input.
    return $line if defined $line;

    # Current stream is exhausted: move on through the queued files.
    while ( @streams ) {
        close $curr_stream;

        my $fname = shift @streams;
        if ( $fname eq '-' ) {
            $curr_stream = \*STDIN;
        }
        else {
            # Three-argument open: the file name can never be interpreted
            # as an open mode or as a command to run.
            open( my $fh, '<', $fname )
                or die("Can not open input file $fname: $!\n");
            $curr_stream = $fh;
        }

        $line = <$curr_stream>;
        return $line if defined $line;
    }

    return $line;    # undef: no more streams left
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main program: parse options, set up the input, run one-pass reservoir
# sampling over all input lines and print the selected sample.
# ---------------------------------------------------------------------------

my $help = 0;

# "N=i": the sample size must be present and must be an integer; anything
# else is rejected by GetOptions instead of silently becoming 0.
# Bad command line: show the usage text (1 = requested failure status).
GetOptions(
    "N=i" => \$sampleSize,
    "h"   => \$help,
) or usage(1);

usage() if ( $help );

die("Sample size (-N) must not be negative, got $sampleSize\n")
    if ( $sampleSize < 0 );

if ( scalar(@ARGV) == 0 ) {
    $curr_stream = \*STDIN;
}
else {
    my $fname = shift @ARGV;
    if ( $fname eq '-' ) {
        $curr_stream = \*STDIN;    # conventional name for standard input
    }
    else {
        # Three-argument open: the file name is never parsed as a mode.
        open( $curr_stream, '<', $fname )
            or die("Can not open input file $fname: $!\n");
    }
    push @streams, @ARGV;          # remaining files are opened lazily by nextLine()
}

# Phase 1: fill the reservoir with the first $sampleSize lines.
my $line;
while ( scalar(@selectedLines) < $sampleSize ) {
    $line = nextLine();
    last unless defined $line;     # input ran out before the reservoir was full
    push @selectedLines, $line;
}

# If fewer lines than requested were available, the sample is all of them.
$sampleSize = scalar(@selectedLines);

# Phase 2: every further line number $counter is kept with probability
# $sampleSize/$counter, and when kept it replaces a uniformly chosen slot.
# Drawing one integer in [0, $counter) achieves both at once.
my $counter = $sampleSize;         # total number of lines read so far

# $line is undef here when the input is already exhausted (or when the
# sample size is 0); in that case do not attempt to read any further.
if ( defined $line ) {
    while ( defined( $line = nextLine() ) ) {
        $counter++;
        my $slot = int( rand($counter) );
        $selectedLines[$slot] = $line if ( $slot < $sampleSize );
    }
}

print @selectedLines;
|
||||
|
||||
Loading…
Reference in New Issue