Added onlyEmitSamples argument to UnifiedGenotyper

-- When provided, this argument causes us to only emit the selected samples into the VCF.  No INFO field annotations (AC for example) or other features are modified.  It's current primary use is for efficiently evaluating joint calling.
-- Add integration test for onlyEmitSamples
This commit is contained in:
Mark DePristo 2013-07-30 09:26:18 -04:00
parent ccf0df0fea
commit b7d1096ced
2 changed files with 88 additions and 4 deletions

View File

@ -183,6 +183,10 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
@Output(doc="File to which variants should be written")
protected VariantContextWriter writer = null;
@Advanced
@Argument(fullName = "onlyEmitSamples", shortName = "onlyEmitSamples", doc = "If provided, only these samples will be emitted into the VCF, regardless of which samples are present in the BAM file", required = false)
protected Set<String> onlyEmitSamples = Collections.emptySet();
@Hidden
@Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@ -288,9 +292,16 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
// and perform any necessary initialization/validation steps
annotationEngine.invokeAnnotationInitializationMethods(headerInfo);
writer.writeHeader(new VCFHeader(headerInfo, samples));
final Set<String> samplesForHeader;
if ( ! onlyEmitSamples.isEmpty() ) {
// make sure that onlyEmitSamples is a subset of samples
if ( ! samples.containsAll(onlyEmitSamples) )
throw new UserException.BadArgumentValue("onlyEmitSamples", "must be a strict subset of the samples in the BAM files but is wasn't");
samplesForHeader = onlyEmitSamples;
} else {
samplesForHeader = samples;
}
writer.writeHeader(new VCFHeader(headerInfo, samplesForHeader));
}
public static Set<VCFHeaderLine> getHeaderInfo(final UnifiedArgumentCollection UAC,
@ -387,7 +398,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
try {
// we are actually making a call
sum.nCallsMade++;
writer.add(call);
writer.add(subsetToEmitSamples(call));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage());
}
@ -403,6 +414,19 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
return sum;
}
/**
* Subset the VariantContext down to just the emitting samples, if onlyEmitSamples has been provided
* @param fullVC the VariantContext containing calls for all samples in the BAM files
* @return a VariantContext that has been appropriately reduced to a subset of samples, if required
*/
private VariantContext subsetToEmitSamples(final VariantContext fullVC) {
if ( onlyEmitSamples.isEmpty() ) {
return fullVC;
} else {
return GATKVariantContextUtils.trimAlleles(fullVC.subContextFromSamples(onlyEmitSamples, false), false, true);
}
}
public void onTraversalDone(UGStatistics sum) {
if ( metricsWriter != null ) {
metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited));

View File

@ -47,16 +47,22 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.util.BlockCompressedInputStream;
import org.apache.commons.collections.IteratorUtils;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
// ********************************************************************************** //
// Note that this class also serves as an integration test for the VariantAnnotator! //
@ -324,4 +330,58 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
nLines++;
Assert.assertTrue(nLines > 0);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing only emit samples
//
// --------------------------------------------------------------------------------------------------------------
@Test(enabled = true)
public void testOnlyEmitSample() throws Exception {
final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I "
+ privateTestDir + "AFR.complex.variants.bam --disableDithering"
+ " -o %s -L 20:10,000,000-10,100,000";
final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList(""));
specAllSamples.disableShadowBCF();
final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0);
List<VariantContext> allSampleVCs = IteratorUtils.toList(GATKVCFUtils.readAllVCs(allSamplesVCF, new VCFCodec()).getSecond());
final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList(""));
onlyHG01879.disableShadowBCF();
final File onlyHG01879VCF = executeTest("testOnlyEmitSample", onlyHG01879).first.get(0);
List<VariantContext> onlyHG01879VCs = IteratorUtils.toList(GATKVCFUtils.readAllVCs(onlyHG01879VCF, new VCFCodec()).getSecond());
Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size());
for ( int i = 0; i < allSampleVCs.size(); i++ ) {
final VariantContext allSampleVC = allSampleVCs.get(i);
final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i);
if ( allSampleVC == null ) {
Assert.assertNull(onlyHG01879VC);
} else {
Assert.assertNotNull(onlyHG01879VC);
Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't");
Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't");
Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true);
Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart());
Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr());
Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd());
Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters());
Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles());
Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes());
Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual());
final Genotype allG = allSampleVC.getGenotype("HG01879");
final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879");
Assert.assertEquals(allG.getAD(), onlyG.getAD());
Assert.assertEquals(allG.getDP(), onlyG.getDP());
Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles());
Assert.assertEquals(allG.getPL(), onlyG.getPL());
Assert.assertEquals(allG.toString(), onlyG.toString());
}
}
}
}