Merge branch 'master' of github.com:broadinstitute/gsa-unstable

This commit is contained in:
Ryan Poplin 2012-12-03 14:30:14 -05:00
commit a47da9bb2f
8 changed files with 164 additions and 9 deletions

View File

@ -206,6 +206,22 @@ public class GATKArgumentCollection {
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
public double BAQGOP = BAQ.DEFAULT_GOP;
// --------------------------------------------------------------------------------------------------------------
//
// quality encoding checking arguments
//
// --------------------------------------------------------------------------------------------------------------
/**
* Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at Q64. The idea here is
* simple: we just iterate over all reads and subtract 31 from every quality score.
*/
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
public boolean FIX_MISENCODED_QUALS = false;
@Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
// --------------------------------------------------------------------------------------------------------------
//
// performance log arguments

View File

@ -41,7 +41,7 @@ abstract public class ReadTransformer {
protected ReadTransformer() {}
/**
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine.
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initializeSub routine.
*
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
* @param engine the engine, for initializing values
@ -59,7 +59,7 @@ abstract public class ReadTransformer {
}
/**
* Subclasses must override this to initialize themeselves
* Subclasses must override this to initialize themselves
*
* @param engine the engine, for initializing values
* @param walker the walker we intend to run

View File

@ -14,7 +14,7 @@ public class QualityUtils {
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
public final static double MIN_REASONABLE_ERROR = 0.0001;
public final static byte MAX_REASONABLE_Q_SCORE = 40;
public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious
public final static byte MIN_USABLE_Q_SCORE = 6;
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;

View File

@ -414,7 +414,7 @@ public class BAQ {
throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read);
// the original quality is too high, almost certainly due to using the wrong encoding in the BAM file
if ( tag > Byte.MAX_VALUE )
throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores");
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score (" + (int)read.getBaseQualities()[i] + ") with BAQ correction factor of " + baq_i);
bqTag[i] = (byte)tag;
}
return new String(bqTag);

View File

@ -240,6 +240,16 @@ public class UserException extends ReviewedStingException {
}
}
public static class MisencodedBAM extends UserException {
public MisencodedBAM(SAMRecord read, String message) {
this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message);
}
public MisencodedBAM(String source, String message) {
super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message));
}
}
public static class MalformedVCF extends UserException {
public MalformedVCF(String message, String line) {
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));

View File

@ -0,0 +1,67 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with
* phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at
* Q64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score.
* Otherwise, we randomly sample reads (for efficiency) and error out if we encounter a qual that's too high.
*/
public class MisencodedBaseQualityReadTransformer extends ReadTransformer {
private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered
private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33
private boolean disabled;
private boolean fixQuals;
private static int currentReadCounter = 0;
@Override
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
fixQuals = engine.getArguments().FIX_MISENCODED_QUALS;
disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS;
return ReadTransformer.ApplicationTime.ON_INPUT;
}
@Override
public boolean enabled() {
return !disabled;
}
@Override
public GATKSAMRecord apply(final GATKSAMRecord read) {
if ( fixQuals )
return fixMisencodedQuals(read);
checkForMisencodedQuals(read);
return read;
}
protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) {
final byte[] quals = read.getBaseQualities();
for ( int i = 0; i < quals.length; i++ ) {
quals[i] -= encodingFixValue;
}
read.setBaseQualities(quals);
return read;
}
protected static void checkForMisencodedQuals(final GATKSAMRecord read) {
// sample reads randomly for checking
if ( ++currentReadCounter >= samplingFrequency ) {
currentReadCounter = 0;
final byte[] quals = read.getBaseQualities();
for ( final byte qual : quals ) {
if ( qual > QualityUtils.MAX_REASONABLE_Q_SCORE )
throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + (int)qual);
}
}
}
}

View File

@ -1,10 +1,6 @@
// our package
package org.broadinstitute.sting.utils.baq;
// the imports for unit testing.
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.Test;
@ -24,7 +20,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.*;
/**
* Basic unit test for GenomeLoc
* Basic unit test for BAQ calculation
*/
public class BAQUnitTest extends BaseTest {
private SAMFileHeader header;

View File

@ -0,0 +1,66 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.List;
/**
* Basic unit test for misencoded quals
*/
public class MisencodedBaseQualityUnitTest extends BaseTest {
private static final String readBases = "AAAAAAAAAA";
private static final byte[] badQuals = { 59, 60, 62, 63, 64, 61, 62, 58, 57, 56 };
private static final byte[] goodQuals = { 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 };
private static final byte[] fixedQuals = { 28, 29, 31, 32, 33, 30, 31, 27, 26, 25 };
private SAMFileHeader header;
@BeforeMethod
public void before() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
}
private GATKSAMRecord createRead(final boolean useGoodBases) {
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), useGoodBases ? goodQuals : badQuals);
read.setCigarString("10M");
return read;
}
@Test(enabled = true)
public void testGoodQuals() {
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
for ( int i = 0; i < 10000; i++ )
reads.add(createRead(true));
testEncoding(reads);
}
@Test(enabled = true, expectedExceptions = {UserException.class})
public void testBadQualsThrowsError() {
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(10000);
for ( int i = 0; i < 10000; i++ )
reads.add(createRead(false));
testEncoding(reads);
}
@Test(enabled = true)
public void testFixBadQuals() {
final GATKSAMRecord read = createRead(false);
final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read);
for ( int i = 0; i < fixedQuals.length; i++ )
Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]);
}
private void testEncoding(final List<GATKSAMRecord> reads) {
for ( final GATKSAMRecord read : reads )
MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read);
}
}