gatk-3.8/perl/filterSingleSampleCalls.pl

204 lines
6.5 KiB
Perl
Executable File

#!/usr/bin/perl
use Getopt::Long;
use strict;
sub usage {
my $message = shift;
print "\nERROR:\n$message\n\n";
print "Usage:\n\n";
print " filterSingleSampleCalls --calls FILEIN [--max_cons_av_mm N1] [--max_ref_av_mm N2] [--max_cons_nqs_av_mm N3] [--min_cons_nqs_av_qual N4] [--min_ref_nqs_av_qual N5] [--mode MODE]\n\n";
print " FILEIN File to read and apply filter to. If \"-\" then read from stdin.\n";
print " N1 max. average number of mismatches per (consensus) indel-containing read.\n";
print " If the number is greater then N1, indel will be discarded/marked.\n";
print " N2 max. average number of mismatches per reference-matching read.\n";
print " If the number is greater then N2, indel will be discarded/marked.\n";
print " N3 max. average mismatch rate in NQS window around the indel, across all indel-containing read.\n";
print " If the number is greater then N3, indel will be discarded/marked.\n";
print " N4 min. average base quality in all indel supporting reads in the nqs window around the indel.\n";
print " If the average base quality is less than N4, the indel will be discarded/marked.\n";
print " N5 min. average base quality in all reference supporting reads in the nqs window around the indel.\n";
print " If the average base quality is less than N5, the indel will be discarded/marked.\n";
print " MODE If (any prefix of) ANNOTATE, then indel calls not passing any of the filters will be still printed into stdout with\n";
print " additional AUTOFILTER_* tags added, specifying what cutoff(s) were not passed.\n";
print " If (any prefix of) DISCARD, then only indels passing all the filters will be printed into stdout, the rest\n";
print " will be discarded (default).\n\n";
exit(1);
}
my $calls = "";
my $tolerate_old_calls = 1;
my $cons_av_mm_cutoff = 100000;
my $ref_av_mm_cutoff = 100000;
my $cons_av_nqs_mm_cutoff = 100000;
my $ref_nqs_av_qual_cutoff = 0;
my $cons_nqs_av_qual_cutoff = 0;
my $mode_arg = "";
my $mode = 1; # "discard"
GetOptions("calls:s" => \$calls,
"max_cons_av_mm:f" => \$cons_av_mm_cutoff,
"max_ref_av_mm:f" => \$ref_av_mm_cutoff,
"max_cons_nqs_av_mm:f" => \$cons_av_nqs_mm_cutoff,
"min_ref_nqs_av_qual:f" => \$ref_nqs_av_qual_cutoff,
"min_cons_nqs_av_qual:f" => \$cons_nqs_av_qual_cutoff,
"mode=s" => \$mode_arg
)
or usage("Can not parse argument string");
usage ("--calls argument must be specified") if ( $calls eq "" ) ;
usage ("--mode argument must be specified (unique prefix of: ANNOTATE or DISCARD)") if ( $mode_arg eq "");
if ( "ANNOTATE" =~ /^$mode_arg/ ) {
$mode = 0;
} elsif ( "DISCARD" =~ /^$mode_arg/ ) {
$mode=1;
} else {
die("Unrecognized value specified in --mode argument");
}
my $input_stream;
if ( $calls eq "-" ) {
$input_stream = "STDIN";
} else {
open ( $input_stream, "< $calls") or
die("Can not open input file $calls: $!");
}
my $id_counter = 0;
while( <$input_stream> ) {
chomp;
$id_counter++;
my $annotation="";
next if ( $_ =~ /_TOO_DIRTY/ );
# print $_,"\n";
# next;
my $cons_cnt = $1;
my $indel_cnt = $2;
my $cov = $3;
if ( $_ =~ /\sOBS_COUNTS\[C\/A\/R\]:(\d+)\/(\d+)\/(\d+)\s/ ) {
$cons_cnt = $1;
$indel_cnt = $2;
$cov = $3;
} else {
if ( $tolerate_old_calls != 0 ) {
print "$_\n";
next;
} else {
die("NO OBS_COUNTS in\n$_\n");
}
}
die ("NO AV_MM MATCH in\n$_\n") if ( $_ !~ /\sAV_MM\[C\/R\]:([\+-\d\.]+)\/([\+-\d\.]+)\s/ ) ;
my $cons_av_mm = $1;
my $ref_av_mm = $2;
die("NO AV_MAPQ MATCH in\n$_\n") if ( $_ !~ /\sAV_MAPQ\[C\/R\]:([\+-\d\.]+)\/([\+-\d\.]+)\s/ ) ;
my $cons_av_mapq = $1;
my $ref_av_mapq = $2;
die("NO NQS_MM_RATE MATCH in\n$_\n") if ( $_ !~ /\sNQS_MM_RATE\[C\/R\]:([\d\.]+)\/([\d\.]+)\s/ ) ;
my $cons_nqs_mm_rate = $1;
my $ref_nqs_mm_rate = $2;
die( "NO NQS_AV_QUAL MATCH in\n$_\n") if ( $_ !~ /\sNQS_AV_QUAL\[C\/R\]:([\d\.]+)\/([\d\.]+)\s/ ) ;
my $cons_nqs_av_qual = $1;
my $ref_nqs_av_qual = $2;
if ( $cons_av_mm < 0 ) {
print STDERR "WARNING: negative mismatch rate in consensus supporting reads:\n";
print STDERR "$_\n";
next;
}
if ( $ref_av_mm < 0 ) {
print STDERR "WARNING: negative mismatch rate in reference supporting reads:\n";
print STDERR "$_\n";
next;
}
if ( $cons_av_mapq < 0 ) {
print STDERR "WARNING: negative average mapping quality in consensus supporting reads:\n";
print STDERR "$_\n";
next;
}
if ( $ref_av_mapq < 0 ) {
print STDERR "WARNING: negative average mapping quality in reference supporting reads:\n";
print STDERR "$_\n";
next;
}
if ( $cons_av_mm > $cons_av_mm_cutoff ) {
# filter indel out: alignments for indel-containing reads are too messy
if ( $mode == 0 ) { # annotate
$annotation .= "\tAUTOFILTER_CONS_AV_MM_$cons_av_mm_cutoff";
} else {
next; # discard
}
}
if ( $ref_av_mm > $ref_av_mm_cutoff ) {
# filter indel out: alignments for reference-matching reads are too messy
if ( $mode == 0 ) { # annotate
$annotation .= "\tAUTOFILTER_REF_AV_MM_$ref_av_mm_cutoff";
} else {
next; # discard
}
}
if ( $cons_nqs_av_qual < $cons_nqs_av_qual_cutoff ) {
# filter indel out: alignments for indel-containing reads are too messy
if ( $mode == 0 ) { # annotate
$annotation .= "\tAUTOFILTER_CONS_NQS_AV_QUAL_$cons_nqs_av_qual_cutoff";
} else {
next; # discard
}
}
if ( $ref_nqs_av_qual < $ref_nqs_av_qual_cutoff ) {
# filter indel out: alignments for reference-matching reads are too messy
if ( $mode == 0 ) { # annotate
$annotation .= "\tAUTOFILTER_REF_NQS_AV_QUAL_$ref_nqs_av_qual_cutoff";
} else {
next; # discard
}
}
if ( $cons_nqs_mm_rate > $cons_av_nqs_mm_cutoff ) {
# filter indel out: consensus nqs window too messy (probably "strange" indel)
if ( $mode == 0 ) { # annotate
$annotation .= "\tAUTOFILTER_CONS_NQS_MM_$cons_av_nqs_mm_cutoff";
} else {
next; # discard
}
}
print "$_$annotation\n";
}
close $input_stream if ( $calls ne "-" );