checking in new folder for perl scripts AND a simple script that takes an input text file and reference dictionary (.fai) and performs stable sort of the input lines according to the contig order specified by the dictionary. Position of the contig filed to sort on in the input lines is specified as --k POS option. Input lines may specify contigs that are not in the dictionary, in this case the additional contigs will be added at the end of the sorted output, after all known contigs. The sorting order between these additional contigs is simply the order in which they first appear in the input

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@831 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-05-27 16:34:55 +00:00
parent 919e995b7f
commit 3098ed091c
1 changed files with 109 additions and 0 deletions

109
perl/sortByRef.pl 100755
View File

@ -0,0 +1,109 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
sub usage {
print "\nUsage:\n";
print "sortByRef.pl [--k POS] INPUT REF_DICT\n\n";
print " Sorts lines of the input file INFILE according\n";
print " to the reference contig order specified by the\n";
print " reference dictionary REF_DICT (.fai file).\n";
print " The sort is stable. If -k option is not specified,\n";
print " it is assumed that the contig name is the first\n";
print " field in each line.\n\n";
print " --k POS : contig name is in the field POS (1-based)\n";
print " of input lines.\n\n";
exit(1);
}
my $pos = 1;
GetOptions( "k:i" => \$pos );
$pos--;
usage() if ( scalar(@ARGV) == 0 );
if ( scalar(@ARGV) != 2 ) {
print "Wrong number of arguments\n";
usage();
}
my $input_file = $ARGV[0];
my $dict_file = $ARGV[1];
open(DICT, "< $dict_file") or die("Can not open $dict_file: $!");
my %ref_order;
my $n = 0;
while ( <DICT> ) {
chomp;
my ($contig, $rest) = split "\t";
die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} );
$ref_order{$contig} = $n;
$n++;
}
close DICT;
#we have loaded contig ordering now
open(INPUT, "< $input_file") or die("Can not open $input_file: $!");
my %temp_outputs;
while ( <INPUT> ) {
my @fields = split '\s';
die("Specified field position exceeds the number of fields:\n$_")
if ( $pos >= scalar(@fields) );
my $contig = $fields[$pos];
chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line
my $order;
if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; }
else {
$order = $n; # input line has contig that was not in the dict;
$n++; # this contig will go at the end of the output,
# after all known contigs
}
my $fhandle;
if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} }
else {
open( $fhandle, " > /tmp/sortByRef.$$.$order.tmp" ) or
die ( "Can not open temporary file $order: $!");
$temp_outputs{$order} = $fhandle;
}
# we got the handle to the temp file that keeps all
# lines with contig $contig
print $fhandle $_; # send current line to its corresponding temp file
}
close INPUT;
foreach my $f ( values %temp_outputs ) { close $f; }
# now collect back into single output stream:
for ( my $i = 0 ; $i < $n ; $i++ ) {
# if we did not have any lines on contig $i, then there's
# no temp file and nothing to do
next if ( ! defined $temp_outputs{$i} ) ;
my $f;
open ( $f, "< /tmp/sortByRef.$$.$i.tmp" );
while ( <$f> ) { print ; }
close $f;
unlink "/tmp/sortByRef.$$.$i.tmp";
}