gatk-3.8/perl/randomSampleFromStream.pl

99 lines
2.6 KiB
Perl
Raw Normal View History

#!/usr/bin/perl -w
use strict;
use Getopt::Long;
# Prints the command-line help text to standard output and then terminates
# the program with exit status 0. This sub never returns to its caller.
sub usage {
    print "\nUsage: randomSampleFromStream.pl [-N size] [file1 file2 ...]\n\n",
          " Selects a random sample of 'size' elements, without replacement,\n",
          " from a dataset that represents a union of all input streams. If no\n",
          " input streams are specified, then reads STDIN. This script implements\n",
          " the standard reservoir sampling algorithm, i.e. it does not preload\n",
          " the data into memory and performs selection in one pass.\n\n",
          " -N size : optional (default=1), size of the random sample to select.\n\n";
    exit(0);
}
# Requested sample size; 1 unless overridden on the command line.
my $sampleSize = 1;

# The reservoir: the lines currently selected, printed once all input is read.
my @selectedLines = ();

# Input bookkeeping shared with nextLine(): the names of the files that have
# not been opened yet, and the handle that is currently being read.
my ( @streams, $curr_stream );
# Returns the next line from the concatenation of all input streams, or undef
# once every stream is exhausted.
#
# Reads from the file-level handle $curr_stream. When that handle reaches
# end-of-file and file names are still queued in @streams, the handle is closed
# and the queued files are opened in order until one of them yields a line
# (empty files are skipped). Dies if a queued file cannot be opened.
sub nextLine {
    my $line = <$curr_stream>;

    # Test for definedness rather than truth: a final line consisting of "0"
    # with no trailing newline is false in boolean context but is still data.
    return $line if ( defined $line );

    # The current stream is exhausted: walk through the remaining files.
    # @streams is only filled when files were named on the command line, so
    # this loop is simply skipped when the input is standard input.
    while ( scalar(@streams) > 0 ) {
        close $curr_stream;
        my $fname = shift @streams;
        # Three-argument open: the file name is taken literally and is never
        # parsed for mode or pipe characters.
        open($curr_stream, '<', $fname) or
            die("Can not open input file $fname: $!");
        $line = <$curr_stream>;
        return $line if ( defined $line );
    }

    return $line;    # undef here: no more input anywhere
}
# ---------------------------------------------------------------------------
# Main program: parse the options, open the first input, fill the reservoir
# with the first $sampleSize lines, pass every remaining line through the
# reservoir-sampling replacement step, and finally print the reservoir.
# ---------------------------------------------------------------------------
my $help = 0;

# "N=i" makes -N require an integer value, so a missing or non-numeric size is
# rejected by GetOptions instead of silently being used as 0.
unless ( GetOptions( "N=i" => \$sampleSize,
                     "h"   => \$help ) ) {
    # Finish with a failure status so that a calling script can detect the
    # bad invocation.
    print STDERR "Run with -h for usage information.\n";
    exit(1);
}
usage() if ( $help );

if ( $sampleSize < 0 ) {
    die("Sample size (-N) must not be negative, got $sampleSize\n");
}

if ( scalar(@ARGV) == 0 ) {
    # No files named: read standard input. A glob reference is a real
    # filehandle value, unlike the handle's name stored as a string.
    $curr_stream = \*STDIN;
} else {
    # Open the first file now; the rest are queued and opened on demand by
    # nextLine().
    my $fname = shift @ARGV;
    open($curr_stream, '<', $fname) or
        die("Can not open input file $fname: $!");
    push @streams, @ARGV;
}

# Phase 1: fill the reservoir with the first $sampleSize lines.
my $line;
for ( my $i = 0 ; $i < $sampleSize ; $i++ ) {
    $line = nextLine();
    # defined(), not truth: an unterminated final line "0" is still a line.
    unless ( defined $line ) {
        # Input ran out before the reservoir was full: shrink the sample size
        # to the number of lines that were actually available.
        $sampleSize = $i;
        last;
    }
    push @selectedLines, $line;
}

# Read ahead only if the input was not already exhausted in phase 1.
$line = nextLine() if ( defined $line );

# Phase 2: each further line enters the reservoir with probability
# sampleSize/counter, replacing a uniformly chosen current member.
my $counter = $sampleSize;    # total number of lines read so far
while ( defined $line ) {
    $counter++;
    my $prob = $sampleSize / $counter;
    if ( rand() <= $prob ) {
        # Line gets selected; with a reservoir of one the only slot is 0.
        my $index = 0;
        $index = int( rand($sampleSize) ) if ( $sampleSize > 1 );
        $selectedLines[$index] = $line;
    }
    $line = nextLine();
}

# The reservoir holds exactly $sampleSize lines at this point; they are
# printed verbatim, with whatever line terminator they were read with.
print @selectedLines;