Merge pull request #16 from broadinstitute/eb_bqsr_fails_on_RR

Added check that BaseRecalibrator is not being run on a reduced bam.
This commit is contained in:
depristo 2013-02-06 07:20:42 -08:00
commit bee127482b
7 changed files with 53 additions and 9 deletions

View File

@ -214,6 +214,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
}
initializeRecalibrationEngine();
RecalUtils.checkForInvalidRecalBams(getToolkit().getSAMFileHeaders(), getToolkit().getArguments().ALLOW_BQSR_ON_REDUCED_BAMS);
minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
referenceReader = getToolkit().getReferenceDataSource().getReference();
}

View File

@ -242,11 +242,10 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
CompressionStash compressionStash = new CompressionStash();
SortedSet<GenomeLoc> intervalList;
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
// IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
/**

View File

@ -66,6 +66,13 @@ public class BQSRReadTransformer extends ReadTransformer {
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
this.enabled = engine.hasBQSRArgumentSet();
if ( enabled ) {
// TODO -- See important note below about applying BQSR to a reduced BAM file:
// If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) against a reduced bam file,
// we need to figure out how to make this work. The problem is that the ReadTransformers are initialized before the ReadDataSource
// inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is
// a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here).
// Although we could add this check to the apply() method below, it's kind of ugly and inefficient.
// The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS);
final BQSRArgumentSet args = engine.getBQSRArgumentSet();
this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior());
}

View File

@ -46,10 +46,12 @@
package org.broadinstitute.sting.utils.recalibration;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.compression.reducereads.ReduceReads;
import org.broadinstitute.sting.utils.classloader.JVMUtils;
import org.broadinstitute.sting.utils.recalibration.covariates.*;
import org.broadinstitute.sting.utils.BaseUtils;
@ -847,7 +849,6 @@ public class RecalUtils {
}
}
/**
* creates a datum object with one observation and one or zero error
*
@ -858,4 +859,20 @@ public class RecalUtils {
private static RecalDatum createDatumObject(final byte reportedQual, final double isError) {
return new RecalDatum(1, isError, reportedQual);
}
/**
* Checks for invalid BAMs that are being used with BQSR and fails with a UserException if it finds one
*
* @param headers sam file headers being passed into the GATK engine
* @param allowBqsrOnReducedBams should we allow BQSR on reduced bams?
*/
public static void checkForInvalidRecalBams(final List<SAMFileHeader> headers, final boolean allowBqsrOnReducedBams) {
// for now, the only check we make is against reduced bams
if ( !allowBqsrOnReducedBams ) {
for ( final SAMFileHeader header : headers ) {
if ( header.getProgramRecord(ReduceReads.PROGRAM_RECORD_NAME) != null )
throw new UserException.BadInput("base quality score recalibration should absolutely not be run on reduced BAM files! Please run ReduceReads only after BQSR has been performed");
}
}
}
}

View File

@ -170,6 +170,19 @@ public class BQSRIntegrationTest extends WalkerTest {
executeTest("testBQSRFailWithSolidNoCall", spec);
}
@Test
public void testBQSRFailWithReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
" -T BaseRecalibrator" +
" -R " + b37KGReference +
" -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam" +
" -L 1:67,225,396-67,288,518" +
" -o /dev/null",
0,
UserException.class);
executeTest("testBQSRFailWithReducedBam", spec);
}
private static class PRTest {
final String args;
final String md5;
@ -241,5 +254,4 @@ public class BQSRIntegrationTest extends WalkerTest {
UserException.class);
executeTest("testPRFailWithLowMaxCycle", spec);
}
}

View File

@ -884,10 +884,10 @@ public class GenomeAnalysisEngine {
/**
* Returns the unmerged SAM file header for an individual reader.
* @param reader The reader.
* @return Header for that reader.
* @return Header for that reader or null if not available.
*/
public SAMFileHeader getSAMFileHeader(SAMReaderID reader) {
return readsDataSource.getHeader(reader);
return readsDataSource == null ? null : readsDataSource.getHeader(reader);
}
/**

View File

@ -176,7 +176,7 @@ public class GATKArgumentCollection {
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
public boolean FIX_MISENCODED_QUALS = false;
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountering base qualities that are too high and that seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
// --------------------------------------------------------------------------------------------------------------
@ -245,6 +245,14 @@ public class GATKArgumentCollection {
@Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "The global Qscore Bayesian prior to use in the BQSR. If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score", required = false)
public double globalQScorePrior = -1.0;
/**
* For the sake of your data, please only use this option if you know what you are doing. It is absolutely not recommended practice
* to run base quality score recalibration on reduced BAM files.
*/
@Advanced
@Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Do not fail when running base quality score recalibration on a reduced BAM file even though we highly recommend against it", required = false)
public boolean ALLOW_BQSR_ON_REDUCED_BAMS = false;
// --------------------------------------------------------------------------------------------------------------
//
// Other utility arguments