From ccf0df0fea537e5ea6e117f6d981741d70c89d9d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 24 Jul 2013 11:40:28 -0400 Subject: [PATCH 1/3] Misc. debugging functionality to FS calculation (disabled by default) --- .../gatk/walkers/annotator/FisherStrand.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 876dbf039..cdbd43a7a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -81,6 +81,7 @@ import java.util.*; *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

*/ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static boolean ENABLE_DEBUGGING = false; private final static Logger logger = Logger.getLogger(FisherStrand.class); private static final String FS = "FS"; @@ -99,6 +100,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if (vc.isSNP() && stratifiedContexts != null) { final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); + printTable("unfiltered", tableNoFiltering); + printTable("filtered", tableFiltering); return pValueForBestTable(tableFiltering, tableNoFiltering); } else if (stratifiedPerReadAlleleLikelihoodMap != null) { @@ -203,6 +206,20 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); } + /** + * Printing information to logger.info for debugging purposes + * + * @param name the name of the table + * @param table the table itself + */ + private void printTable(final String name, final int[][] table) { + if ( ENABLE_DEBUGGING ) { + final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); + logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", + name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); + } + } + private static boolean rotateTable(int[][] table) { table[0][0] -= 1; table[1][0] += 1; From b7d1096cedfefa277e3f97746b927390fca038e7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 30 Jul 2013 09:26:18 -0400 Subject: [PATCH 2/3] Added onlyEmitSamples argument to UnifiedGenotyper -- When provided, this argument causes us to only emit the selected samples into the VCF. No INFO field annotations (AC for example) or other features are modified. It's current primary use is for efficiently evaluating joint calling. -- Add integration test for onlyEmitSamples --- .../walkers/genotyper/UnifiedGenotyper.java | 32 ++++++++-- .../UnifiedGenotyperIntegrationTest.java | 60 +++++++++++++++++++ 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 82b93aa55..17d0217f0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -183,6 +183,10 @@ public class UnifiedGenotyper extends LocusWalker, Unif @Output(doc="File to which variants should be written") protected VariantContextWriter writer = null; + @Advanced + @Argument(fullName = "onlyEmitSamples", shortName = "onlyEmitSamples", doc = "If provided, only these samples will be emitted into the VCF, regardless of which samples are present in the BAM file", required = false) + protected Set onlyEmitSamples = Collections.emptySet(); + @Hidden @Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false) protected PrintStream verboseWriter = null; @@ -288,9 +292,16 @@ public class UnifiedGenotyper extends LocusWalker, Unif // and perform any necessary initialization/validation steps annotationEngine.invokeAnnotationInitializationMethods(headerInfo); - writer.writeHeader(new VCFHeader(headerInfo, samples)); - - + final Set samplesForHeader; + if ( ! onlyEmitSamples.isEmpty() ) { + // make sure that onlyEmitSamples is a subset of samples + if ( ! samples.containsAll(onlyEmitSamples) ) + throw new UserException.BadArgumentValue("onlyEmitSamples", "must be a strict subset of the samples in the BAM files but is wasn't"); + samplesForHeader = onlyEmitSamples; + } else { + samplesForHeader = samples; + } + writer.writeHeader(new VCFHeader(headerInfo, samplesForHeader)); } public static Set getHeaderInfo(final UnifiedArgumentCollection UAC, @@ -387,7 +398,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif try { // we are actually making a call sum.nCallsMade++; - writer.add(call); + writer.add(subsetToEmitSamples(call)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage()); } @@ -403,6 +414,19 @@ public class UnifiedGenotyper extends LocusWalker, Unif return sum; } + /** + * Subset the VariantContext down to just the emitting samples, if onlyEmitSamples has been provided + * @param fullVC the VariantContext containing calls for all samples in the BAM files + * @return a VariantContext that has been appropriately reduced to a subset of samples, if required + */ + private VariantContext subsetToEmitSamples(final VariantContext fullVC) { + if ( onlyEmitSamples.isEmpty() ) { + return fullVC; + } else { + return GATKVariantContextUtils.trimAlleles(fullVC.subContextFromSamples(onlyEmitSamples, false), false, true); + } + } + public void onTraversalDone(UGStatistics sum) { if ( metricsWriter != null ) { metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index d64ceb953..ed39d1b56 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -47,16 +47,22 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.util.BlockCompressedInputStream; +import org.apache.commons.collections.IteratorUtils; import org.broad.tribble.readers.AsciiLineReader; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; import java.util.Collections; +import java.util.List; // ********************************************************************************** // // Note that this class also serves as an integration test for the VariantAnnotator! // @@ -324,4 +330,58 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { nLines++; Assert.assertTrue(nLines > 0); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing only emit samples + // + // -------------------------------------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testOnlyEmitSample() throws Exception { + final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I " + + privateTestDir + "AFR.complex.variants.bam --disableDithering" + + " -o %s -L 20:10,000,000-10,100,000"; + final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList("")); + specAllSamples.disableShadowBCF(); + final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0); + List allSampleVCs = IteratorUtils.toList(GATKVCFUtils.readAllVCs(allSamplesVCF, new VCFCodec()).getSecond()); + + final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList("")); + onlyHG01879.disableShadowBCF(); + final File onlyHG01879VCF = executeTest("testOnlyEmitSample", onlyHG01879).first.get(0); + List onlyHG01879VCs = IteratorUtils.toList(GATKVCFUtils.readAllVCs(onlyHG01879VCF, new VCFCodec()).getSecond()); + + Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size()); + for ( int i = 0; i < allSampleVCs.size(); i++ ) { + final VariantContext allSampleVC = allSampleVCs.get(i); + final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i); + + if ( allSampleVC == null ) { + Assert.assertNull(onlyHG01879VC); + } else { + Assert.assertNotNull(onlyHG01879VC); + + Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true); + + Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart()); + Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr()); + Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd()); + Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters()); + Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles()); + Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes()); + Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual()); + + final Genotype allG = allSampleVC.getGenotype("HG01879"); + final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879"); + Assert.assertEquals(allG.getAD(), onlyG.getAD()); + Assert.assertEquals(allG.getDP(), onlyG.getDP()); + Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles()); + Assert.assertEquals(allG.getPL(), onlyG.getPL()); + Assert.assertEquals(allG.toString(), onlyG.toString()); + } + } + } }