From 78c15561860d48741e6c1b45878accb9c02d1dc1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 8 Aug 2012 15:49:31 -0400 Subject: [PATCH 1/3] Fixing ReduceReads downsampling bug -- downsampled reads were not being excluded from the read window, causing them to trail back and get caught by the sliding window exception --- .../compression/reducereads/ReduceReads.java | 74 +------------------ .../reducereads/SlidingWindow.java | 4 +- .../ReduceReadsIntegrationTest.java | 10 +-- 3 files changed, 8 insertions(+), 80 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index d2b9803ef..93d8319b7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -25,9 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; import net.sf.samtools.util.SequenceUtil; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; @@ -183,7 +180,7 @@ public class ReduceReads extends ReadWalker, ReduceRea * A value of 0 turns downsampling off. */ @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) - protected int downsampleCoverage = 0; + protected int downsampleCoverage = 250; @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) @@ -535,81 +532,12 @@ public class ReduceReads extends ReadWalker, ReduceRea if (debugLevel == 1) System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd()); -// if (!DONT_USE_SOFTCLIPPED_BASES) -// reSoftClipBases(read); - if (!DONT_COMPRESS_READ_NAMES) compressReadName(read); out.addAlignment(read); } - private void reSoftClipBases(GATKSAMRecord read) { - Integer left = (Integer) read.getTemporaryAttribute("SL"); - Integer right = (Integer) read.getTemporaryAttribute("SR"); - if (left != null || right != null) { - Cigar newCigar = new Cigar(); - for (CigarElement element : read.getCigar().getCigarElements()) { - newCigar.add(new CigarElement(element.getLength(), element.getOperator())); - } - - if (left != null) { - newCigar = updateFirstSoftClipCigarElement(left, newCigar); - read.setAlignmentStart(read.getAlignmentStart() + left); - } - - if (right != null) { - Cigar invertedCigar = invertCigar(newCigar); - newCigar = invertCigar(updateFirstSoftClipCigarElement(right, invertedCigar)); - } - read.setCigar(newCigar); - } - } - - /** - * Facility routine to revert the first element of a Cigar string (skipping hard clips) into a soft-clip. - * To be used on both ends if provided a flipped Cigar - * - * @param softClipSize the length of the soft clipped element to add - * @param originalCigar the original Cigar string - * @return a new Cigar object with the soft clips added - */ - private Cigar updateFirstSoftClipCigarElement (int softClipSize, Cigar originalCigar) { - Cigar result = new Cigar(); - CigarElement leftElement = new CigarElement(softClipSize, CigarOperator.S); - boolean updated = false; - for (CigarElement element : originalCigar.getCigarElements()) { - if (!updated && element.getOperator() == CigarOperator.M) { - result.add(leftElement); - int newLength = element.getLength() - softClipSize; - if (newLength > 0) - result.add(new CigarElement(newLength, CigarOperator.M)); - updated = true; - } - else - result.add(element); - } - return result; - } - - /** - * Given a cigar string, returns the inverted cigar string. - * - * @param cigar the original cigar - * @return the inverted cigar - */ - private Cigar invertCigar(Cigar cigar) { - Stack stack = new Stack(); - for (CigarElement e : cigar.getCigarElements()) - stack.push(e); - Cigar inverted = new Cigar(); - while (!stack.empty()) { - inverted.add(stack.pop()); - } - return inverted; - } - - /** * Quality control procedure that checks if the consensus reads contains too many * mismatches with the reference. This should never happen and is a good trigger for diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 60d7096a3..6a77b0c65 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -499,7 +499,7 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(0, start)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - for (GATKSAMRecord read : result) { + for (GATKSAMRecord read : allReads) { readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts. } @@ -627,7 +627,7 @@ public class SlidingWindow { int locationIndex = startLocation < 0 ? 0 : readStart - startLocation; if (removeRead && locationIndex < 0) - throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation); + throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation); if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 08f7ddd37..ee894b420 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -21,28 +21,28 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f"); + RRTest("testDefaultCompression ", L, "72eb6db9d7a09a0cc25eaac1aafa97b7"); } @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76"); + RRTest("testMultipleIntervals ", intervals, "104b1a1d9fa5394c6fea95cd32967b78"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "c55140cec60fa8c35161680289d74d47"); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "afd39459c841b68a442abdd5ef5f8f27"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "0f2e57b7f6de03cc4da1ffcc8cf8f1a7"); } @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973"); + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "dda0c95f56f90e5f633c2437c2b21031"); } @Test(enabled = true) From 35cec8530c302d0ad1b8e13e569041c9df899b71 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 8 Aug 2012 21:44:24 -0400 Subject: [PATCH 2/3] Make coverage threshold in FindCoveredIntervals a command-line argument --- .../walkers/diagnostics/targets/FindCoveredIntervals.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 0c856c6df..23e4d5ae0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -46,6 +47,9 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Output(required = true) private PrintStream out; + @Argument(fullName = "coverage_threshold", shortName = "cov", doc = "The minimum allowable coverage to be considered covered", required = false) + private int coverageThreshold = 20; + @Override // Look to see if the region has sufficient coverage public ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { @@ -53,8 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); // note the linear probability scale - int coverageThreshold = 20; - return new ActivityProfileResult(Math.min((double) depth / coverageThreshold, 1)); + return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1)); } From 71ee8d87b3405e80f69da3344dbf8bf0c4a3198e Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 9 Aug 2012 09:58:20 -0400 Subject: [PATCH 3/3] Rename per-sample ML allelic fractions and counts so that they don't have the same name as the per-site INFO fields, and clarify wording in VCF header --- .../GeneralPloidyExactAFCalculationModel.java | 4 ++-- ...UnifiedGenotyperGeneralPloidyIntegrationTest.java | 12 ++++++------ .../gatk/walkers/genotyper/UnifiedGenotyper.java | 4 ++-- .../sting/utils/codecs/vcf/VCFConstants.java | 2 ++ 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java index ba19638e0..78ab11eb1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java @@ -681,8 +681,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula } // per-pool logging of AC and AF - gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); - gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); + gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); + gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); // remove PLs if necessary if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 4d8c6808f..f62b2250e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -46,33 +46,33 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","d76e3b910259da819f1e1b2adc68ba8d"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93"); } @Test public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ffadcdaee613dab975197bed0fc78da3"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b"); } @Test public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","96087fe9240e3656cc2a4e0ff0174d5b"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d"); } @Test public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","6fdae7093831ecfc82a06dd707d62fe9"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4"); } @Test public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","6b27634214530d379db70391a9cfc2d7"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459"); } @Test public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "e74d4c73ece45d7fb676b99364df4f1a"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7"); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index c1c1339f5..507806fbe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -312,8 +312,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif // add the pool values for each genotype if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed, for this pool")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed, for this pool")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")); } if (UAC.referenceSampleName != null) { headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth")); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index 8790a000d..dac58eb10 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -36,6 +36,8 @@ public final class VCFConstants { public static final String MLE_ALLELE_COUNT_KEY = "MLEAC"; public static final String ALLELE_FREQUENCY_KEY = "AF"; public static final String MLE_ALLELE_FREQUENCY_KEY = "MLEAF"; + public static final String MLE_PER_SAMPLE_ALLELE_COUNT_KEY = "MLPSAC"; + public static final String MLE_PER_SAMPLE_ALLELE_FRACTION_KEY = "MLPSAF"; public static final String ALLELE_NUMBER_KEY = "AN"; public static final String RMS_BASE_QUALITY_KEY = "BQ"; public static final String CIGAR_KEY = "CIGAR";