diff --git a/perl/filterSingleSampleCalls.pl b/perl/filterSingleSampleCalls.pl new file mode 100755 index 000000000..2cc49f0b4 --- /dev/null +++ b/perl/filterSingleSampleCalls.pl @@ -0,0 +1,203 @@ +#!/usr/bin/perl + +use Getopt::Long; +use strict; + + +sub usage { + + my $message = shift; + + print "\nERROR:\n$message\n\n"; + + print "Usage:\n\n"; + print " filterSingleSampleCalls --calls FILEIN [--max_cons_av_mm N1] [--max_ref_av_mm N2] [--max_cons_nqs_av_mm N3] [--min_cons_nqs_av_qual N4] [--min_ref_nqs_av_qual N5] [--mode MODE]\n\n"; + print " FILEIN File to read and apply filter to. If \"-\" then read from stdin.\n"; + print " N1 max. average number of mismatches per (consensus) indel-containing read.\n"; + print " If the number is greater then N1, indel will be discarded/marked.\n"; + print " N2 max. average number of mismatches per reference-matching read.\n"; + print " If the number is greater then N2, indel will be discarded/marked.\n"; + print " N3 max. average mismatch rate in NQS window around the indel, across all indel-containing read.\n"; + print " If the number is greater then N3, indel will be discarded/marked.\n"; + print " N4 min. average base quality in all indel supporting reads in the nqs window around the indel.\n"; + print " If the average base quality is less than N4, the indel will be discarded/marked.\n"; + print " N5 min. average base quality in all reference supporting reads in the nqs window around the indel.\n"; + print " If the average base quality is less than N5, the indel will be discarded/marked.\n"; + print " MODE If (any prefix of) ANNOTATE, then indel calls not passing any of the filters will be still printed into stdout with\n"; + print " additional AUTOFILTER_* tags added, specifying what cutoff(s) were not passed.\n"; + print " If (any prefix of) DISCARD, then only indels passing all the filters will be printed into stdout, the rest\n"; + print " will be discarded (default).\n\n"; + + exit(1); + +} + +my $calls = ""; +my $tolerate_old_calls = 1; +my $cons_av_mm_cutoff = 100000; +my $ref_av_mm_cutoff = 100000; +my $cons_av_nqs_mm_cutoff = 100000; +my $ref_nqs_av_qual_cutoff = 0; +my $cons_nqs_av_qual_cutoff = 0; +my $mode_arg = ""; +my $mode = 1; # "discard" + +GetOptions("calls:s" => \$calls, + "max_cons_av_mm:f" => \$cons_av_mm_cutoff, + "max_ref_av_mm:f" => \$ref_av_mm_cutoff, + "max_cons_nqs_av_mm:f" => \$cons_av_nqs_mm_cutoff, + "min_ref_nqs_av_qual:f" => \$ref_nqs_av_qual_cutoff, + "min_cons_nqs_av_qual:f" => \$cons_nqs_av_qual_cutoff, + "mode=s" => \$mode_arg + ) + or usage("Can not parse argument string"); + +usage ("--calls argument must be specified") if ( $calls eq "" ) ; +usage ("--mode argument must be specified (unique prefix of: ANNOTATE or DISCARD)") if ( $mode_arg eq ""); + +if ( "ANNOTATE" =~ /^$mode_arg/ ) { + $mode = 0; +} elsif ( "DISCARD" =~ /^$mode_arg/ ) { + $mode=1; +} else { + die("Unrecognized value specified in --mode argument"); +} + +my $input_stream; + +if ( $calls eq "-" ) { + $input_stream = "STDIN"; +} else { + open ( $input_stream, "< $calls") or + die("Can not open input file $calls: $!"); +} + +my $id_counter = 0; + +while( <$input_stream> ) { + + chomp; + + $id_counter++; + my $annotation=""; + + next if ( $_ =~ /_TOO_DIRTY/ ); + +# print $_,"\n"; +# next; + + my $cons_cnt = $1; + my $indel_cnt = $2; + my $cov = $3; + + if ( $_ =~ /\sOBS_COUNTS\[C\/A\/R\]:(\d+)\/(\d+)\/(\d+)\s/ ) { + $cons_cnt = $1; + $indel_cnt = $2; + $cov = $3; + } else { + if ( $tolerate_old_calls != 0 ) { + print "$_\n"; + next; + } else { + die("NO OBS_COUNTS in\n$_\n"); + } + } + + + die ("NO AV_MM MATCH in\n$_\n") if ( $_ !~ /\sAV_MM\[C\/R\]:([\+-\d\.]+)\/([\+-\d\.]+)\s/ ) ; + + my $cons_av_mm = $1; + my $ref_av_mm = $2; + + die("NO AV_MAPQ MATCH in\n$_\n") if ( $_ !~ /\sAV_MAPQ\[C\/R\]:([\+-\d\.]+)\/([\+-\d\.]+)\s/ ) ; + + my $cons_av_mapq = $1; + my $ref_av_mapq = $2; + + die("NO NQS_MM_RATE MATCH in\n$_\n") if ( $_ !~ /\sNQS_MM_RATE\[C\/R\]:([\d\.]+)\/([\d\.]+)\s/ ) ; + + my $cons_nqs_mm_rate = $1; + my $ref_nqs_mm_rate = $2; + + die( "NO NQS_AV_QUAL MATCH in\n$_\n") if ( $_ !~ /\sNQS_AV_QUAL\[C\/R\]:([\d\.]+)\/([\d\.]+)\s/ ) ; + + my $cons_nqs_av_qual = $1; + my $ref_nqs_av_qual = $2; + + + if ( $cons_av_mm < 0 ) { + print STDERR "WARNING: negative mismatch rate in consensus supporting reads:\n"; + print STDERR "$_\n"; + next; + } + if ( $ref_av_mm < 0 ) { + print STDERR "WARNING: negative mismatch rate in reference supporting reads:\n"; + print STDERR "$_\n"; + next; + } + if ( $cons_av_mapq < 0 ) { + print STDERR "WARNING: negative average mapping quality in consensus supporting reads:\n"; + print STDERR "$_\n"; + next; + } + if ( $ref_av_mapq < 0 ) { + print STDERR "WARNING: negative average mapping quality in reference supporting reads:\n"; + print STDERR "$_\n"; + next; + } + + if ( $cons_av_mm > $cons_av_mm_cutoff ) { + # filter indel out: alignments for indel-containing reads are too messy + if ( $mode == 0 ) { # annotate + $annotation .= "\tAUTOFILTER_CONS_AV_MM_$cons_av_mm_cutoff"; + } else { + next; # discard + } + } + + if ( $ref_av_mm > $ref_av_mm_cutoff ) { + # filter indel out: alignments for reference-matching reads are too messy + if ( $mode == 0 ) { # annotate + $annotation .= "\tAUTOFILTER_REF_AV_MM_$ref_av_mm_cutoff"; + } else { + next; # discard + } + } + + + if ( $cons_nqs_av_qual < $cons_nqs_av_qual_cutoff ) { + # filter indel out: alignments for indel-containing reads are too messy + if ( $mode == 0 ) { # annotate + $annotation .= "\tAUTOFILTER_CONS_NQS_AV_QUAL_$cons_nqs_av_qual_cutoff"; + } else { + next; # discard + } + } + + if ( $ref_nqs_av_qual < $ref_nqs_av_qual_cutoff ) { + # filter indel out: alignments for reference-matching reads are too messy + if ( $mode == 0 ) { # annotate + $annotation .= "\tAUTOFILTER_REF_NQS_AV_QUAL_$ref_nqs_av_qual_cutoff"; + } else { + next; # discard + } + } + + + if ( $cons_nqs_mm_rate > $cons_av_nqs_mm_cutoff ) { + # filter indel out: consensus nqs window too messy (probably "strange" indel) + if ( $mode == 0 ) { # annotate + $annotation .= "\tAUTOFILTER_CONS_NQS_MM_$cons_av_nqs_mm_cutoff"; + } else { + next; # discard + } + } + + print "$_$annotation\n"; + + +} + + +close $input_stream if ( $calls ne "-" ); +