Added checking in the GATK for mis-encoded quality scores.
The check is performed by a Read Transformer that samples (currently set to once every 1000 reads so that we don't hurt overall GATK performance) from the input reads and checks to make sure that none of the base quals is too high (> Q60). If we encounter such a base then we fail with a User Error. * Can be over-ridden with --allow_potentially_misencoded_quality_scores. * Also, the user can choose to fix his quals on the fly (presumably using PrintReads to write out a fixed bam) with the --fix_misencoded_quality_scores argument. Added unit tests.
This commit is contained in:
parent
6f523a1ea0
commit
b6839b3049
|
|
@ -206,6 +206,22 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// quality encoding checking arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at Q64. The idea here is
|
||||
* simple: we just iterate over all reads and subtract 31 from every quality score.
|
||||
*/
|
||||
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
|
||||
public boolean FIX_MISENCODED_QUALS = false;
|
||||
|
||||
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
|
||||
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// performance log arguments
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ abstract public class ReadTransformer {
|
|||
protected ReadTransformer() {}
|
||||
|
||||
/**
|
||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine.
|
||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initializeSub routine.
|
||||
*
|
||||
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
|
||||
* @param engine the engine, for initializing values
|
||||
|
|
@ -59,7 +59,7 @@ abstract public class ReadTransformer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this to initialize themeselves
|
||||
* Subclasses must override this to initialize themselves
|
||||
*
|
||||
* @param engine the engine, for initializing values
|
||||
* @param walker the walker we intend to run
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ public class QualityUtils {
|
|||
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
|
||||
|
||||
public final static double MIN_REASONABLE_ERROR = 0.0001;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 40;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious
|
||||
public final static byte MIN_USABLE_Q_SCORE = 6;
|
||||
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
|
||||
|
||||
|
|
|
|||
|
|
@ -414,7 +414,7 @@ public class BAQ {
|
|||
throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read);
|
||||
// the original quality is too high, almost certainly due to using the wrong encoding in the BAM file
|
||||
if ( tag > Byte.MAX_VALUE )
|
||||
throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores");
|
||||
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score (" + ((int)read.getBaseQualities()[i] - 33) + ") with BAQ correction factor of " + baq_i);
|
||||
bqTag[i] = (byte)tag;
|
||||
}
|
||||
return new String(bqTag);
|
||||
|
|
|
|||
|
|
@ -240,6 +240,16 @@ public class UserException extends ReviewedStingException {
|
|||
}
|
||||
}
|
||||
|
||||
public static class MisencodedBAM extends UserException {
|
||||
public MisencodedBAM(SAMRecord read, String message) {
|
||||
this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message);
|
||||
}
|
||||
|
||||
public MisencodedBAM(String source, String message) {
|
||||
super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message));
|
||||
}
|
||||
}
|
||||
|
||||
public static class MalformedVCF extends UserException {
|
||||
public MalformedVCF(String message, String line) {
|
||||
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with
|
||||
* phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at
|
||||
* Q64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score.
|
||||
* Otherwise, we randomly sample reads (for efficiency) and error out if we encounter a qual that's too high.
|
||||
*/
|
||||
public class MisencodedBaseQualityReadTransformer extends ReadTransformer {
|
||||
|
||||
private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered
|
||||
private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33
|
||||
private static final byte maxAllowedQualByte = QualityUtils.MAX_REASONABLE_Q_SCORE + 33;
|
||||
|
||||
private boolean disabled;
|
||||
private boolean fixQuals;
|
||||
private static int currentReadCounter = 0;
|
||||
|
||||
@Override
|
||||
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
fixQuals = engine.getArguments().FIX_MISENCODED_QUALS;
|
||||
disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS;
|
||||
|
||||
return ReadTransformer.ApplicationTime.ON_INPUT;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean enabled() {
|
||||
return !disabled;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GATKSAMRecord apply(final GATKSAMRecord read) {
|
||||
if ( fixQuals )
|
||||
return fixMisencodedQuals(read);
|
||||
|
||||
checkForMisencodedQuals(read);
|
||||
return read;
|
||||
}
|
||||
|
||||
protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) {
|
||||
final byte[] quals = read.getBaseQualities();
|
||||
for ( int i = 0; i < quals.length; i++ ) {
|
||||
quals[i] -= encodingFixValue;
|
||||
}
|
||||
read.setBaseQualities(quals);
|
||||
return read;
|
||||
}
|
||||
|
||||
protected static void checkForMisencodedQuals(final GATKSAMRecord read) {
|
||||
// sample reads randomly for checking
|
||||
if ( ++currentReadCounter >= samplingFrequency ) {
|
||||
currentReadCounter = 0;
|
||||
|
||||
final byte[] quals = read.getBaseQualities();
|
||||
for ( final byte qual : quals ) {
|
||||
if ( qual > maxAllowedQualByte )
|
||||
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + ((int)qual - 33));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,10 +1,6 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.utils.baq;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -24,7 +20,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|||
import net.sf.samtools.*;
|
||||
|
||||
/**
|
||||
* Basic unit test for GenomeLoc
|
||||
* Basic unit test for BAQ calculation
|
||||
*/
|
||||
public class BAQUnitTest extends BaseTest {
|
||||
private SAMFileHeader header;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic unit test for misencoded quals
|
||||
*/
|
||||
public class MisencodedBaseQualityUnitTest extends BaseTest {
|
||||
|
||||
private static final String readBases = "AAAAAAAAAA";
|
||||
private static final byte[] badQuals = { 'Z', '[', 'c', 'd', 'e', 'a', 'b', 'Z', 'Y', 'X' };
|
||||
private static final byte[] goodQuals = { '[', '[', '[', '[', '[', '[', '[', '[', '[', '[' };
|
||||
private static final byte[] fixedQuals = { ';', '<', 'D', 'E', 'F', 'B', 'C', ';', ':', '9' };
|
||||
private SAMFileHeader header;
|
||||
|
||||
@BeforeMethod
|
||||
public void before() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
}
|
||||
|
||||
private GATKSAMRecord createRead(final boolean useGoodBases) {
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), useGoodBases ? goodQuals : badQuals);
|
||||
read.setCigarString("10M");
|
||||
return read;
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testGoodQuals() {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
|
||||
for ( int i = 0; i < 10000; i++ )
|
||||
reads.add(createRead(true));
|
||||
|
||||
testEncoding(reads);
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = {UserException.class})
|
||||
public void testBadQualsThrowsError() {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
|
||||
for ( int i = 0; i < 10000; i++ )
|
||||
reads.add(createRead(false));
|
||||
|
||||
testEncoding(reads);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testFixBadQuals() {
|
||||
final GATKSAMRecord read = createRead(false);
|
||||
final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read);
|
||||
for ( int i = 0; i < fixedQuals.length; i++ )
|
||||
Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]);
|
||||
}
|
||||
|
||||
private void testEncoding(final List<GATKSAMRecord> reads) {
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue