From 5bb069388810f1c0858801379b1fbdc3d56fb8aa Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 28 Jun 2012 12:57:51 -0400 Subject: [PATCH 01/15] Bug fix for HC GGA mode. Shouldn't try to add an indel into the haplotype if that haplotype already contains the event of interest. Misc minor assembly param changes. Turning off capping of base qualities by base indel qualities until we can evaluate that change. --- .../broadinstitute/sting/utils/activeregion/ActiveRegion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 7c956b855..18276f932 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -46,7 +46,7 @@ public class ActiveRegion implements HasGenomeLocation, Comparable } public void hardClipToActiveRegion() { - final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, activeRegionLoc.getStart(), activeRegionLoc.getStop() ); + final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); reads.clear(); reads.addAll(clippedReads); } From 05791ebf801b5621a25a7d9a77ddc7be748985c6 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 28 Jun 2012 13:22:56 -0400 Subject: [PATCH 02/15] Adding the Clipping rank sum test: If alternate-supporting reads have more hard clipping than reference-supporting reads this is evidence for error. --- .../annotator/ClippingRankSumTest.java | 82 +++++++++++++++++++ .../sting/utils/sam/AlignmentUtils.java | 12 +++ 2 files changed, 94 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java new file mode 100644 index 000000000..aeb01bf3e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 6/28/12 + */ + +public class ClippingRankSumTest extends RankSumTest { + + public List getKeyNames() { return Arrays.asList("ClippingRankSum"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); } + + protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + if ( p.getBase() == ref ) { + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); + } else if ( alts.contains(p.getBase()) ) { + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); + } + } + } + } + + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); + else + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); + } + } + } + + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? + HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); + else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index cecbd1bc4..e5e747c2d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -354,6 +354,18 @@ public class AlignmentUtils { return n; } + public static int getNumHardClippedBases(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) + if (e.getOperator() == CigarOperator.H) + n += e.getLength(); + + return n; + } + public static byte[] alignmentToByteArray(final Cigar cigar, final byte[] read, final byte[] ref) { final byte[] alignment = new byte[read.length]; From 480b32e759a5b62a6dcdf9dfa5349d95df263fd9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 1 Jul 2012 14:59:27 -0400 Subject: [PATCH 03/15] BCF2 is now officially zero-based open-interval, and that's how the GATK does it now --- .../broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java | 4 ++-- .../sting/utils/variantcontext/writer/BCF2Writer.java | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index e106e72ac..0503e417a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -239,10 +239,10 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD final String contig = lookupContigName(contigOffset); builder.chr(contig); - this.pos = decoder.decodeInt(BCF2Type.INT32); + this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based final int refLength = decoder.decodeInt(BCF2Type.INT32); builder.start((long)pos); - builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals + builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index e281475f2..7d9a18d14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -84,7 +84,6 @@ import java.util.*; */ class BCF2Writer extends IndexingVariantContextWriter { final protected static Logger logger = Logger.getLogger(BCF2Writer.class); - final private static List MISSING_GENOTYPE = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); final private static boolean ALLOW_MISSING_CONTIG_LINES = false; private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support @@ -203,10 +202,11 @@ class BCF2Writer extends IndexingVariantContextWriter { // note use of encodeRawValue to not insert the typing byte encoder.encodeRawValue(contigIndex, BCF2Type.INT32); - // pos - encoder.encodeRawValue(vc.getStart(), BCF2Type.INT32); + // pos. GATK is 1 based, BCF2 is 0 based + encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32); - // ref length + // ref length. GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1 + // for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1 encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); // qual From 7aff4446d4ecec878894fe5fac58fc54ad4f69f3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 1 Jul 2012 15:38:10 -0400 Subject: [PATCH 04/15] Added unit tests for header repairing capabilities in the GATK engine --- .../variantcontext/writer/VCFWriter.java | 2 +- .../utils/codecs/vcf/VCFIntegrationTest.java | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 2a4c8cac2..8ccb79744 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -564,6 +564,6 @@ class VCFWriter extends IndexingVariantContextWriter { + " at " + vc.getChr() + ":" + vc.getStart() + " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have" + " complete VCF headers by default. This error can be disabled with the engine argument" - + " -U LENIENT_VCF_PROCESSING"); + + " -U LENIENT_VCF_PROCESSING or repair the VCF file header using repairVCFHeader"); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 4d23f0fa5..1eda504db 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.io.File; @@ -66,4 +67,38 @@ public class VCFIntegrationTest extends WalkerTest { WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94")); executeTest("Test reading samtools WEx BCF example", spec1); } + + // + // + // Tests to ensure that -U LENIENT_VCF_PROCESS and header repairs are working + // + // + + @Test + public void testFailingOnVCFWithoutHeaders() { + runVCFWithoutHeaders("", "", UserException.class, false); + } + + @Test + public void testPassingOnVCFWithoutHeadersWithLenientProcessing() { + runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true); + } + + @Test + public void testPassingOnVCFWithoutHeadersRepairingHeaders() { + runVCFWithoutHeaders("-repairVCFHeader " + privateTestDir + "vcfexample2.justHeader.vcf", "ff61e9cad6653c7f93d82d391f7ecdcb", null, false); + } + + private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class expectedException, final boolean disableBCF) { + final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf"; + final String baseCommand = "-R " + b37KGReference + + " --no_cmdline_in_header -o %s " + + "-T VariantsToVCF -V " + testVCF + " " + moreArgs; + WalkerTestSpec spec1 = expectedException != null + ? new WalkerTestSpec(baseCommand, 1, expectedException) + : new WalkerTestSpec(baseCommand, 1, Arrays.asList(expectedMD5)); + if ( disableBCF ) + spec1.disableShadowBCF(); + executeTest("Test reading VCF without header lines with additional args " + moreArgs, spec1); + } } From 01e04992f8a94f84776864e747e889ae6462ce31 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 11:38:59 -0400 Subject: [PATCH 06/15] Fixed compatibilities in AbstractVCFCodec that resulted in key=; being parsed as written as key; in VCF output --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 1d5869349..5ad939e76 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -363,10 +363,10 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec int eqI = infoFieldArray[i].indexOf("="); if ( eqI != -1 ) { key = infoFieldArray[i].substring(0, eqI); - String str = infoFieldArray[i].substring(eqI+1); + String valueString = infoFieldArray[i].substring(eqI+1); // split on the INFO field separator - int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); + int infoValueSplitSize = ParsingUtils.split(valueString, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); if ( infoValueSplitSize == 1 ) { value = infoValueArray[0]; final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); @@ -396,6 +396,9 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec } } + // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING + if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; + attributes.put(key, value); } } From bcd2e13d8bb1e1d4c9493030ef06c8775dadaa35 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 11:39:34 -0400 Subject: [PATCH 07/15] Adding duplicate header line keys is a logger.debug not logger.warn message now --- .../org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 50d46eab3..296be7873 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -214,7 +214,7 @@ public class VCFHeader { private final void addMetaDataMapBinding(final Map map, T line) { final String key = line.getID(); if ( map.containsKey(key) ) - logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" ); + logger.debug("Found duplicate VCF header lines for " + key + "; keeping the first only" ); else map.put(key, line); } From 602729c09d74d6beecbecbf41a96f59a52d13ee1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 11:40:28 -0400 Subject: [PATCH 08/15] Moved parallel tests from SelectVariants to separate SelectVariantsParallelIntegrationTest -- Enabled previous tests -- all now working -- Added modern test against new VCF as well --- .../SelectVariantsIntegrationTest.java | 30 ---------- ...SelectVariantsParallelIntegrationTest.java | 59 +++++++++++++++++++ 2 files changed, 59 insertions(+), 30 deletions(-) create mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 30cdbee36..5e674d6c2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -167,36 +167,6 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testNoGTs--" + testFile, spec); } - @Test(enabled = false) - public void testParallelization2() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - WalkerTestSpec spec; - - spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), - 1, - Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") - ); - spec.disableShadowBCF(); - executeTest("testParallelization (2 threads)--" + testfile, spec); - } - - @Test(enabled = false) - public void testParallelization4() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - WalkerTestSpec spec; - spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), - 1, - Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") - ); - spec.disableShadowBCF(); - - executeTest("testParallelization (4 threads)--" + testfile, spec); - } - @Test public void testSelectFromMultiAllelic() { String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java new file mode 100755 index 000000000..ff3982d24 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SelectVariantsParallelIntegrationTest extends WalkerTest { + + private class ParallelSelectTestProvider extends TestDataProvider { + final String reference; + final String args; + final String md5; + final int nt; + + private ParallelSelectTestProvider(final String reference, final String args, final String md5, final int nt) { + super(ParallelSelectTestProvider.class); + this.reference = reference; + this.args = args; + this.md5 = md5; + this.nt = nt; + } + + public final String getCmdLine() { + return "-T SelectVariants -R " + reference + " -o %s -L 1 --no_cmdline_in_header -nt " + nt + " " + args; + } + + public String toString() { + return String.format("ParallelSelectVariants nt=%d args=%s", nt, args); + } + } + + @DataProvider(name = "ParallelSelectTest") + public Object[][] makeParallelSelectTestProvider() { + for ( int nt : Arrays.asList(1, 2, 4) ) { + { // original MAF test + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + String args = " -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile; + new ParallelSelectTestProvider(b36KGReference, args, "4386fbb258dcef4437495a37f5a83c53", nt); + } + { // new tests on b37 using testdir VCF + final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; + final String args = "-select 'DP > 30' -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "c64b45a14d41b1e5cddbe036b47e7519", nt); + } + } + + return ParallelSelectTestProvider.getTests(ParallelSelectTestProvider.class); + } + + @Test(dataProvider = "ParallelSelectTest") + public void testParallelSelectTestProvider(final ParallelSelectTestProvider cfg) { + final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); + executeTest(cfg.toString(), spec); + } +} From 1b0a7757738f13dfffb98cab27826fe2a1a40680 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 15:55:42 -0400 Subject: [PATCH 09/15] Disabling bcf2 reading from samtools because it's 1 basis; updating select variants integrationtest --- .../walkers/variantutils/SelectVariantsIntegrationTest.java | 2 +- .../sting/utils/codecs/vcf/VCFIntegrationTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 5e674d6c2..59eaa177d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), 1, - Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") ); spec.disableShadowBCF(); executeTest("testComplexSelection--" + testfile, spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 1eda504db..1e3c799fe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -59,7 +59,7 @@ public class VCFIntegrationTest extends WalkerTest { executeTest("Test writing samtools WEx BCF example", spec1); } - @Test + @Test(enabled = false) // TODO disabled because current BCF2 is 1 based public void testReadingSamtoolsWExBCFExample() { String testVCF = privateTestDir + "ex2.bcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; From 9ee58d323aeebe4a331ffd24b61fad8f59036a6a Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 2 Jul 2012 15:37:23 -0400 Subject: [PATCH 10/15] Pass the original GATK unsafe parameter to the VcfGatherFunction --- .../sting/queue/extensions/gatk/VcfGatherFunction.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 7862dec41..739e6cc91 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -52,6 +52,9 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { val sitesOnly = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME) this.sites_only = originalGATK.getFieldValue(sitesOnly).asInstanceOf[Boolean] + // ensure that the gather function receives the same unsafe parameter as the scattered function + this.unsafe = this.originalGATK.unsafe + super.freezeFieldValues() } } From 88a02fa2cb927923519fd354b821ee4cd0fe7913 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 2 Jul 2012 12:28:28 -0400 Subject: [PATCH 11/15] Fixing but for reads with cigars like 9S54H When hard-clipping predict when the read is going to be fully hard clipped to the point where only soft/hard-clips are left in the read and preemptively eliminate the read before the SAMRecord mathematics on malformed cigars kills the GATK. --- .../org/broadinstitute/sting/utils/clipping/ClippingOp.java | 5 ++++- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 2e3978ddb..d12d2c2ec 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -285,7 +285,10 @@ public class ClippingOp { @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { - if (start == 0 && stop == read.getReadLength() - 1) + final int firstBaseAfterSoftClips = read.getAlignmentStart() - read.getSoftStart(); + final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart(); + + if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math). return GATKSAMRecord.emptyRead(read); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 78ce81a7c..6b9ba79b4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -520,7 +520,7 @@ public class ReadUtils { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); } else { - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); } } From 3cea080aa8a0feab210a0c2e00f73cc30a090ef7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 2 Jul 2012 12:29:31 -0400 Subject: [PATCH 12/15] Cache SoftStart() and SoftEnd() in the GATKSAMRecord these are costly operations when done repeatedly on the same read. --- .../sting/utils/clipping/ClippingOp.java | 1 + .../sting/utils/sam/GATKSAMRecord.java | 66 +++++++++++++------ 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index d12d2c2ec..a4383c3ae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -312,6 +312,7 @@ public class ClippingOp { throw new ReviewedStingException("Where did the clone go?"); } + hardClippedRead.resetSoftStartAndEnd(); // reset the cached soft start and end because they may have changed now that the read was hard clipped. No need to calculate them now. They'll be lazily calculated on the next call to getSoftStart()/End() hardClippedRead.setBaseQualities(newQuals); hardClippedRead.setReadBases(newBases); hardClippedRead.setCigar(cigarShift.cigar); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 5fbe12eed..a925c7577 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -59,6 +59,8 @@ public class GATKSAMRecord extends BAMRecord { private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; private byte[] reducedReadCounts = null; + private int softStart = -1; + private int softEnd = -1; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -385,15 +387,17 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ public int getSoftStart() { - int start = this.getUnclippedStart(); - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else - break; + if (softStart < 0) { + int start = this.getUnclippedStart(); + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + softStart = start; } - - return start; + return softStart; } /** @@ -404,23 +408,43 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - int stop = this.getUnclippedStart(); + if (softEnd < 0) { + int stop = this.getUnclippedStart(); - if (ReadUtils.readIsEntirelyInsertion(this)) - return stop; + if (ReadUtils.readIsEntirelyInsertion(this)) + return stop; - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - stop += shift; - lastOperator = cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - shift = 0; + int shift = 0; + CigarOperator lastOperator = null; + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + stop += shift; + lastOperator = cigarElement.getOperator(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) + shift = cigarElement.getLength(); + else + shift = 0; + } + softEnd = (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + return softEnd; + } - return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; + /** + * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache + * so that the next call to getSoftStart/End will recalculate it lazily. + */ + public void resetSoftStartAndEnd() { + softStart = -1; + softEnd = -1; + } + + /** + * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache + * so that the next call to getSoftStart/End will recalculate it lazily. + */ + public void resetSoftStartAndEnd(int softStart, int softEnd) { + this.softStart = softStart; + this.softEnd = softEnd; } /** From b807ff63efe7b9f92afe3dae8302c986e64fea99 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 2 Jul 2012 16:37:39 -0400 Subject: [PATCH 13/15] HaplotypeCaller now creates MNP and complex substitutions by using LD information to decide if events segregate together on haplotypes. Added unit test. --- .../gatk/walkers/annotator/ClippingRankSumTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index aeb01bf3e..f41a40621 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -25,6 +25,9 @@ public class ClippingRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); } protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { + return; + // This working implementation below needs to be tested for the UG pipeline + /* for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( p.getBase() == ref ) { @@ -34,6 +37,7 @@ public class ClippingRankSumTest extends RankSumTest { } } } + */ } protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { @@ -53,6 +57,10 @@ public class ClippingRankSumTest extends RankSumTest { } protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + return; + // This working implementation below needs to be tested for the UG pipeline + + /* // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); for (final PileupElement p: pileup) { @@ -78,5 +86,6 @@ public class ClippingRankSumTest extends RankSumTest { altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); } } + */ } } From f92139dd82fe717f8133adabf06e7102b2339ad9 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 2 Jul 2012 21:12:42 -0400 Subject: [PATCH 15/15] Ooops, UG VA path for rank sum tests aren't happy with empty lists. Disabling clipping rank sum test for now. --- .../sting/gatk/walkers/annotator/ClippingRankSumTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index f41a40621..5403e19dc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -18,7 +18,7 @@ import java.util.*; * Date: 6/28/12 */ -public class ClippingRankSumTest extends RankSumTest { +public class ClippingRankSumTest /*extends RankSumTest*/ { public List getKeyNames() { return Arrays.asList("ClippingRankSum"); }