Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-07-05 10:37:40 -04:00
commit c2e1dc63ff
12 changed files with 78 additions and 56 deletions

View File

@ -197,7 +197,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
public void setBaseRecalibration(final File recalFile, final int quantizationLevels, final boolean noIndelQuals) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels, noIndelQuals); }
/**
* Actually run the GATK with the specified walker.
@ -227,7 +227,7 @@ public class GenomeAnalysisEngine {
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels, this.getArguments().noIndelQuals);
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();

View File

@ -209,6 +209,12 @@ public class GATKArgumentCollection {
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
public int quantizationLevels = -1;
/**
* Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced.
*/
@Argument(fullName="no_indel_quals", shortName = "NIQ", doc = "If true, inhibits printing of base insertion and base deletion tags.", required=false)
public boolean noIndelQuals = false;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;

View File

@ -664,8 +664,12 @@ public class SAMDataSource {
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
// **** NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! ****
// (otherwise we will process something that we may end up throwing away)
// *********************************************************************************** //
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// *********************************************************************************** //
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);

View File

@ -48,6 +48,7 @@ import java.util.EnumSet;
public class CycleCovariate implements StandardCovariate {
private static final int MAXIMUM_CYCLE_VALUE = 1000;
private static final int CUSHION_FOR_INDELS = 4;
private static final EnumSet<NGSPlatform> DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS);
private static final EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
@ -79,11 +80,11 @@ public class CycleCovariate implements StandardCovariate {
increment = readOrderFactor;
}
final int CUSHION = 4;
final int MAX_CYCLE = readLength - CUSHION - 1;
final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1;
for (int i = 0; i < readLength; i++) {
final int key = (i<CUSHION || i>MAX_CYCLE) ? -1 : keyFromCycle(cycle);
values.addCovariate(key, key, key, i);
final int substitutionKey = keyFromCycle(cycle);
final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey;
values.addCovariate(substitutionKey, indelKey, indelKey, i);
cycle += increment;
}
}

View File

@ -170,7 +170,7 @@ public class RecalDataManager {
final ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>();
if (argumentCollection.USE_STANDARD_COVARIATES)
if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES)
optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user
if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified
@ -180,7 +180,7 @@ public class RecalDataManager {
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
foundClass = true;
if (!requiredClasses.contains(covClass) &&
(!argumentCollection.USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
(argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
try {
final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it
optionalCovariates.add(covariate);

View File

@ -71,16 +71,19 @@ public class RecalibrationArgumentCollection {
public boolean LIST_ONLY = false;
/**
* Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
* Note that the ReadGroup and QualityScore covariates are required and do not need to be specified.
* Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default.
* Use the --list argument to see the available covariates.
*/
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
@Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false)
public String[] COVARIATES = null;
/*
* Use the standard set of covariates in addition to the ones listed using the -cov argument
* The Cycle and Context covariates are standard and are included by default unless this argument is provided.
* Note that the ReadGroup and QualityScore covariates are required and cannot be excluded.
*/
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
public boolean USE_STANDARD_COVARIATES = true;
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
/////////////////////////////
// Debugging-only Arguments
@ -172,8 +175,8 @@ public class RecalibrationArgumentCollection {
argumentsTable.addColumn(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
argumentsTable.addRowID("covariate", true);
argumentsTable.set("covariate", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
argumentsTable.addRowID("standard_covs", true);
argumentsTable.set("standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, USE_STANDARD_COVARIATES);
argumentsTable.addRowID("no_standard_covs", true);
argumentsTable.set("no_standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
argumentsTable.addRowID("run_without_dbsnp", true);
argumentsTable.set("run_without_dbsnp", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
argumentsTable.addRowID("solid_recal_mode", true);

View File

@ -247,7 +247,7 @@ public class RecalibrationReport {
RAC.COVARIATES = value.toString().split(",");
else if (argument.equals("standard_covs"))
RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
else if (argument.equals("solid_recal_mode"))
RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);

View File

@ -49,6 +49,8 @@ public class BaseRecalibration {
private final RecalibrationTables recalibrationTables;
private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation
private final boolean noIndelQuals;
private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
static {
for (int i = 0; i < EventType.values().length; i++)
@ -58,10 +60,11 @@ public class BaseRecalibration {
/**
* Constructor using a GATK Report file
*
* @param RECAL_FILE a GATK Report file containing the recalibration information
* @param RECAL_FILE a GATK Report file containing the recalibration information
* @param quantizationLevels number of bins to quantize the quality scores
* @param noIndelQuals if true, do not emit base indel qualities
*/
public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) {
public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean noIndelQuals) {
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
recalibrationTables = recalibrationReport.getRecalibrationTables();
@ -73,6 +76,7 @@ public class BaseRecalibration {
quantizationInfo.quantizeQualityScores(quantizationLevels);
readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length);
this.noIndelQuals = noIndelQuals;
}
/**
@ -87,6 +91,7 @@ public class BaseRecalibration {
this.recalibrationTables = recalibrationTables;
this.requestedCovariates = requestedCovariates;
readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length);
noIndelQuals = false;
}
/**
@ -99,6 +104,11 @@ public class BaseRecalibration {
public void recalibrateRead(final GATKSAMRecord read) {
RecalDataManager.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
if (noIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) {
read.setBaseQualities(null, errorModel);
continue;
}
final byte[] quals = read.getBaseQualities(errorModel);
final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model

View File

@ -173,10 +173,10 @@ public class GATKSAMRecord extends BAMRecord {
setBaseQualities(quals);
break;
case BASE_INSERTION:
setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, SAMUtils.phredToFastq(quals) );
setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals) );
break;
case BASE_DELETION:
setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, SAMUtils.phredToFastq(quals) );
setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, quals == null ? null : SAMUtils.phredToFastq(quals) );
break;
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );

View File

@ -84,14 +84,14 @@ public class ReadClipperTestUtils {
}
private static boolean isCigarValid(Cigar cigar) {
if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation)
if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation)
Stack<CigarElement> cigarElementStack = new Stack<CigarElement>(); // Stack to invert cigar string to find ending operator
Stack<CigarElement> cigarElementStack = new Stack<CigarElement>(); // Stack to invert cigar string to find ending operator
CigarOperator startingOp = null;
CigarOperator endingOp = null;
// check if it doesn't start with deletions
boolean readHasStarted = false; // search the list of elements for the starting operator
boolean readHasStarted = false; // search the list of elements for the starting operator
for (CigarElement cigarElement : cigar.getCigarElements()) {
if (!readHasStarted) {
if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) {
@ -102,19 +102,16 @@ public class ReadClipperTestUtils {
cigarElementStack.push(cigarElement);
}
readHasStarted = false; // search the inverted list of elements (stack) for the stopping operator
while (!cigarElementStack.empty()) {
CigarElement cigarElement = cigarElementStack.pop();
if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) {
readHasStarted = true;
endingOp = cigarElement.getOperator();
break;
}
}
// if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION)
if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION)
return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
}
return false;

View File

@ -151,51 +151,40 @@ public class ReadClipperUnitTest extends BaseTest {
final byte LOW_QUAL = 2;
final byte HIGH_QUAL = 30;
// create a read for every cigar permutation
/** create a read for every cigar permutation */
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int readLength = read.getReadLength();
byte[] quals = new byte[readLength];
for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) {
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail
/** create a read with nLowQualBases in the left tail */
Utils.fillArrayWithByte(quals, HIGH_QUAL);
for (int addLeft = 0; addLeft < nLowQualBases; addLeft++)
quals[addLeft] = LOW_QUAL;
read.setBaseQualities(quals);
GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
checkClippedReadsForLowQualEnds(read, clipLeft, LOW_QUAL, nLowQualBases);
assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed
assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail
/** create a read with nLowQualBases in the right tail */
Utils.fillArrayWithByte(quals, HIGH_QUAL);
for (int addRight = 0; addRight < nLowQualBases; addRight++)
quals[readLength - addRight - 1] = LOW_QUAL;
read.setBaseQualities(quals);
GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
checkClippedReadsForLowQualEnds(read, clipRight, LOW_QUAL, nLowQualBases);
// System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed
assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));
/** create a read with nLowQualBases on both tails */
if (nLowQualBases <= readLength / 2) {
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails
Utils.fillArrayWithByte(quals, HIGH_QUAL);
for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) {
quals[addBoth] = LOW_QUAL;
quals[readLength - addBoth - 1] = LOW_QUAL;
}
read.setBaseQualities(quals);
GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed
assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
checkClippedReadsForLowQualEnds(read, clipBoth, LOW_QUAL, 2*nLowQualBases);
}
}
}
@ -209,8 +198,8 @@ public class ReadClipperUnitTest extends BaseTest {
CigarCounter original = new CigarCounter(read);
CigarCounter clipped = new CigarCounter(clippedRead);
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS
}
}
@ -286,11 +275,17 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
private void checkClippedReadsForLowQualEnds(GATKSAMRecord read, GATKSAMRecord clippedRead, byte lowQual, int nLowQualBases) {
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
assertNoLowQualBases(clippedRead, lowQual); // Make sure the low qualities are gone
assertNumberOfBases(read, clippedRead, nLowQualBases); // Make sure only low quality bases were clipped
}
/**
* Asserts that clipping doesn't change the getUnclippedStart / getUnclippedEnd
*
* @param original
* @param clipped
* @param original original read
* @param clipped clipped read
*/
private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) {
if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) {
@ -299,6 +294,12 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
private void assertNumberOfBases(GATKSAMRecord read, GATKSAMRecord clipLeft, int nLowQualBases) {
if (read.getCigarString().contains("M"))
Assert.assertEquals(clipLeft.getReadLength(), read.getReadLength() - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), read.getReadLength() - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));
}
private boolean startsWithInsertion(Cigar cigar) {
return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0;
}

View File

@ -11,11 +11,11 @@ insertions_default_quality 45
low_quality_tail 2
mismatches_context_size 2
mismatches_default_quality -1
no_standard_covs false
quantizing_levels 16
run_without_dbsnp false
solid_nocall_strategy THROW_EXCEPTION
solid_recal_mode SET_Q_ZERO
standard_covs true
#:GATKTable:3:94:::;
#:GATKTable:Quantized:Quality quantization map