Merge branch 'master' of github.com:broadinstitute/gsa-unstable
This commit is contained in:
commit
a47da9bb2f
|
|
@ -206,6 +206,22 @@ public class GATKArgumentCollection {
|
||||||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
||||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// quality encoding checking arguments
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Q0 == ASCII 33 according to the SAM specification, whereas older Illumina encoding represents Q0 as ASCII 64. The idea here is
|
||||||
|
* simple: we just iterate over all reads and subtract 31 from every quality score.
|
||||||
|
*/
|
||||||
|
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
|
||||||
|
public boolean FIX_MISENCODED_QUALS = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
|
||||||
|
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// performance log arguments
|
// performance log arguments
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,7 @@ abstract public class ReadTransformer {
|
||||||
protected ReadTransformer() {}
|
protected ReadTransformer() {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine.
|
* Master initialization routine. Called to setup a ReadTransform, using its overloaded initializeSub routine.
|
||||||
*
|
*
|
||||||
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
|
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
|
||||||
* @param engine the engine, for initializing values
|
* @param engine the engine, for initializing values
|
||||||
|
|
@ -59,7 +59,7 @@ abstract public class ReadTransformer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Subclasses must override this to initialize themeselves
|
* Subclasses must override this to initialize themselves
|
||||||
*
|
*
|
||||||
* @param engine the engine, for initializing values
|
* @param engine the engine, for initializing values
|
||||||
* @param walker the walker we intend to run
|
* @param walker the walker we intend to run
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ public class QualityUtils {
|
||||||
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
|
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
|
||||||
|
|
||||||
public final static double MIN_REASONABLE_ERROR = 0.0001;
|
public final static double MIN_REASONABLE_ERROR = 0.0001;
|
||||||
public final static byte MAX_REASONABLE_Q_SCORE = 40;
|
public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious
|
||||||
public final static byte MIN_USABLE_Q_SCORE = 6;
|
public final static byte MIN_USABLE_Q_SCORE = 6;
|
||||||
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
|
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -414,7 +414,7 @@ public class BAQ {
|
||||||
throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read);
|
throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read);
|
||||||
// the original quality is too high, almost certainly due to using the wrong encoding in the BAM file
|
// the original quality is too high, almost certainly due to using the wrong encoding in the BAM file
|
||||||
if ( tag > Byte.MAX_VALUE )
|
if ( tag > Byte.MAX_VALUE )
|
||||||
throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores");
|
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score (" + (int)read.getBaseQualities()[i] + ") with BAQ correction factor of " + baq_i);
|
||||||
bqTag[i] = (byte)tag;
|
bqTag[i] = (byte)tag;
|
||||||
}
|
}
|
||||||
return new String(bqTag);
|
return new String(bqTag);
|
||||||
|
|
|
||||||
|
|
@ -240,6 +240,16 @@ public class UserException extends ReviewedStingException {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class MisencodedBAM extends UserException {
|
||||||
|
public MisencodedBAM(SAMRecord read, String message) {
|
||||||
|
this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public MisencodedBAM(String source, String message) {
|
||||||
|
super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static class MalformedVCF extends UserException {
|
public static class MalformedVCF extends UserException {
|
||||||
public MalformedVCF(String message, String line) {
|
public MalformedVCF(String message, String line) {
|
||||||
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
|
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with
|
||||||
|
* phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas older Illumina encoding
|
||||||
|
* represents Q0 as ASCII 64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score.
|
||||||
|
* Otherwise, we check only one read out of every 1000 encountered (for efficiency) and error out if we encounter a qual that's too high.
|
||||||
|
*/
|
||||||
|
public class MisencodedBaseQualityReadTransformer extends ReadTransformer {
|
||||||
|
|
||||||
|
private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered
|
||||||
|
private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33
|
||||||
|
|
||||||
|
private boolean disabled;
|
||||||
|
private boolean fixQuals;
|
||||||
|
private static int currentReadCounter = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
|
||||||
|
fixQuals = engine.getArguments().FIX_MISENCODED_QUALS;
|
||||||
|
disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS;
|
||||||
|
|
||||||
|
return ReadTransformer.ApplicationTime.ON_INPUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean enabled() {
|
||||||
|
return !disabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public GATKSAMRecord apply(final GATKSAMRecord read) {
|
||||||
|
if ( fixQuals )
|
||||||
|
return fixMisencodedQuals(read);
|
||||||
|
|
||||||
|
checkForMisencodedQuals(read);
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) {
|
||||||
|
final byte[] quals = read.getBaseQualities();
|
||||||
|
for ( int i = 0; i < quals.length; i++ ) {
|
||||||
|
quals[i] -= encodingFixValue;
|
||||||
|
}
|
||||||
|
read.setBaseQualities(quals);
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static void checkForMisencodedQuals(final GATKSAMRecord read) {
|
||||||
|
// for efficiency, only check one read out of every samplingFrequency reads encountered
|
||||||
|
if ( ++currentReadCounter >= samplingFrequency ) {
|
||||||
|
currentReadCounter = 0;
|
||||||
|
|
||||||
|
final byte[] quals = read.getBaseQualities();
|
||||||
|
for ( final byte qual : quals ) {
|
||||||
|
if ( qual > QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||||
|
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + (int)qual);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,10 +1,6 @@
|
||||||
// our package
|
|
||||||
package org.broadinstitute.sting.utils.baq;
|
package org.broadinstitute.sting.utils.baq;
|
||||||
|
|
||||||
|
|
||||||
// the imports for unit testing.
|
|
||||||
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
@ -24,7 +20,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Basic unit test for GenomeLoc
|
* Basic unit test for BAQ calculation
|
||||||
*/
|
*/
|
||||||
public class BAQUnitTest extends BaseTest {
|
public class BAQUnitTest extends BaseTest {
|
||||||
private SAMFileHeader header;
|
private SAMFileHeader header;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,66 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeMethod;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic unit test for misencoded quals
|
||||||
|
*/
|
||||||
|
public class MisencodedBaseQualityUnitTest extends BaseTest {
|
||||||
|
|
||||||
|
private static final String readBases = "AAAAAAAAAA";
|
||||||
|
private static final byte[] badQuals = { 59, 60, 62, 63, 64, 61, 62, 58, 57, 56 };
|
||||||
|
private static final byte[] goodQuals = { 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 };
|
||||||
|
private static final byte[] fixedQuals = { 28, 29, 31, 32, 33, 30, 31, 27, 26, 25 };
|
||||||
|
private SAMFileHeader header;
|
||||||
|
|
||||||
|
@BeforeMethod
|
||||||
|
public void before() {
|
||||||
|
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
private GATKSAMRecord createRead(final boolean useGoodBases) {
|
||||||
|
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), useGoodBases ? goodQuals : badQuals);
|
||||||
|
read.setCigarString("10M");
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testGoodQuals() {
|
||||||
|
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
|
||||||
|
for ( int i = 0; i < 10000; i++ )
|
||||||
|
reads.add(createRead(true));
|
||||||
|
|
||||||
|
testEncoding(reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true, expectedExceptions = {UserException.class})
|
||||||
|
public void testBadQualsThrowsError() {
|
||||||
|
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
|
||||||
|
for ( int i = 0; i < 10000; i++ )
|
||||||
|
reads.add(createRead(false));
|
||||||
|
|
||||||
|
testEncoding(reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testFixBadQuals() {
|
||||||
|
final GATKSAMRecord read = createRead(false);
|
||||||
|
final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read);
|
||||||
|
for ( int i = 0; i < fixedQuals.length; i++ )
|
||||||
|
Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testEncoding(final List<GATKSAMRecord> reads) {
|
||||||
|
for ( final GATKSAMRecord read : reads )
|
||||||
|
MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue