Merge branch 'master' of github.com:broadinstitute/gsa-unstable
This commit is contained in:
commit
a47da9bb2f
|
|
@ -206,6 +206,22 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// quality encoding checking arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at Q64. The idea here is
|
||||
* simple: we just iterate over all reads and subtract 31 from every quality score.
|
||||
*/
|
||||
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
|
||||
public boolean FIX_MISENCODED_QUALS = false;
|
||||
|
||||
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
|
||||
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// performance log arguments
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ abstract public class ReadTransformer {
|
|||
protected ReadTransformer() {}
|
||||
|
||||
/**
|
||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine.
|
||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initializeSub routine.
|
||||
*
|
||||
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
|
||||
* @param engine the engine, for initializing values
|
||||
|
|
@ -59,7 +59,7 @@ abstract public class ReadTransformer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this to initialize themeselves
|
||||
* Subclasses must override this to initialize themselves
|
||||
*
|
||||
* @param engine the engine, for initializing values
|
||||
* @param walker the walker we intend to run
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ public class QualityUtils {
|
|||
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
|
||||
|
||||
public final static double MIN_REASONABLE_ERROR = 0.0001;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 40;
|
||||
public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious
|
||||
public final static byte MIN_USABLE_Q_SCORE = 6;
|
||||
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
|
||||
|
||||
|
|
|
|||
|
|
@ -414,7 +414,7 @@ public class BAQ {
|
|||
throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read);
|
||||
// the original quality is too high, almost certainly due to using the wrong encoding in the BAM file
|
||||
if ( tag > Byte.MAX_VALUE )
|
||||
throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores");
|
||||
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score (" + (int)read.getBaseQualities()[i] + ") with BAQ correction factor of " + baq_i);
|
||||
bqTag[i] = (byte)tag;
|
||||
}
|
||||
return new String(bqTag);
|
||||
|
|
|
|||
|
|
@ -240,6 +240,16 @@ public class UserException extends ReviewedStingException {
|
|||
}
|
||||
}
|
||||
|
||||
public static class MisencodedBAM extends UserException {
|
||||
public MisencodedBAM(SAMRecord read, String message) {
|
||||
this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message);
|
||||
}
|
||||
|
||||
public MisencodedBAM(String source, String message) {
|
||||
super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message));
|
||||
}
|
||||
}
|
||||
|
||||
public static class MalformedVCF extends UserException {
|
||||
public MalformedVCF(String message, String line) {
|
||||
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,67 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with
|
||||
* phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at
|
||||
* Q64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score.
|
||||
* Otherwise, we randomly sample reads (for efficiency) and error out if we encounter a qual that's too high.
|
||||
*/
|
||||
public class MisencodedBaseQualityReadTransformer extends ReadTransformer {
|
||||
|
||||
private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered
|
||||
private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33
|
||||
|
||||
private boolean disabled;
|
||||
private boolean fixQuals;
|
||||
private static int currentReadCounter = 0;
|
||||
|
||||
@Override
|
||||
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
fixQuals = engine.getArguments().FIX_MISENCODED_QUALS;
|
||||
disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS;
|
||||
|
||||
return ReadTransformer.ApplicationTime.ON_INPUT;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean enabled() {
|
||||
return !disabled;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GATKSAMRecord apply(final GATKSAMRecord read) {
|
||||
if ( fixQuals )
|
||||
return fixMisencodedQuals(read);
|
||||
|
||||
checkForMisencodedQuals(read);
|
||||
return read;
|
||||
}
|
||||
|
||||
protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) {
|
||||
final byte[] quals = read.getBaseQualities();
|
||||
for ( int i = 0; i < quals.length; i++ ) {
|
||||
quals[i] -= encodingFixValue;
|
||||
}
|
||||
read.setBaseQualities(quals);
|
||||
return read;
|
||||
}
|
||||
|
||||
protected static void checkForMisencodedQuals(final GATKSAMRecord read) {
|
||||
// sample reads randomly for checking
|
||||
if ( ++currentReadCounter >= samplingFrequency ) {
|
||||
currentReadCounter = 0;
|
||||
|
||||
final byte[] quals = read.getBaseQualities();
|
||||
for ( final byte qual : quals ) {
|
||||
if ( qual > QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + (int)qual);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,10 +1,6 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.utils.baq;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -24,7 +20,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|||
import net.sf.samtools.*;
|
||||
|
||||
/**
|
||||
* Basic unit test for GenomeLoc
|
||||
* Basic unit test for BAQ calculation
|
||||
*/
|
||||
public class BAQUnitTest extends BaseTest {
|
||||
private SAMFileHeader header;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic unit test for misencoded quals
|
||||
*/
|
||||
public class MisencodedBaseQualityUnitTest extends BaseTest {
|
||||
|
||||
private static final String readBases = "AAAAAAAAAA";
|
||||
private static final byte[] badQuals = { 59, 60, 62, 63, 64, 61, 62, 58, 57, 56 };
|
||||
private static final byte[] goodQuals = { 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 };
|
||||
private static final byte[] fixedQuals = { 28, 29, 31, 32, 33, 30, 31, 27, 26, 25 };
|
||||
private SAMFileHeader header;
|
||||
|
||||
@BeforeMethod
|
||||
public void before() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
}
|
||||
|
||||
private GATKSAMRecord createRead(final boolean useGoodBases) {
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), useGoodBases ? goodQuals : badQuals);
|
||||
read.setCigarString("10M");
|
||||
return read;
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testGoodQuals() {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
|
||||
for ( int i = 0; i < 10000; i++ )
|
||||
reads.add(createRead(true));
|
||||
|
||||
testEncoding(reads);
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = {UserException.class})
|
||||
public void testBadQualsThrowsError() {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
|
||||
for ( int i = 0; i < 10000; i++ )
|
||||
reads.add(createRead(false));
|
||||
|
||||
testEncoding(reads);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testFixBadQuals() {
|
||||
final GATKSAMRecord read = createRead(false);
|
||||
final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read);
|
||||
for ( int i = 0; i < fixedQuals.length; i++ )
|
||||
Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]);
|
||||
}
|
||||
|
||||
private void testEncoding(final List<GATKSAMRecord> reads) {
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue