From 931890915fba0d55e25298afaee2edb69878de45 Mon Sep 17 00:00:00 2001 From: Phillip Dexheimer Date: Fri, 8 Aug 2014 08:26:22 -0400 Subject: [PATCH] Add the --sample_name argument to HaplotypeCaller * This is a shortcut for people who have multi-sample BAMs but would like to use GVCF mode. Rather than creating single-sample BAMs with PrintReads, one could use the --sample_name argument to HaplotypeCaller to specify the single sample to make calls on * Completes PT 73075482 --- .../haplotypecaller/HaplotypeCaller.java | 34 +++++++++++++++++-- .../HaplotypeCallerGVCFIntegrationTest.java | 6 +++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 9037d7f3c..3914f90c1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -341,6 +341,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @ArgumentCollection private HaplotypeCallerArgumentCollection SCAC = new HaplotypeCallerArgumentCollection(); + @Argument(fullName="sample_name", shortName = "sn", doc="Name of single sample to use from a multi-sample bam. The name is case-sensitive", required=false) + protected String sampleNameToUse = null; + // ----------------------------------------------------------------------------------------------- // arguments to control internal behavior of the read threading assembler // ----------------------------------------------------------------------------------------------- @@ -655,8 +658,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final GenomeAnalysisEngine toolkit = getToolkit(); samplesList = toolkit.getReadSampleList(); - final Set sampleSet = SampleListUtils.asSet(samplesList); + Set sampleSet = SampleListUtils.asSet(samplesList); + if (sampleNameToUse != null) { + if (!sampleSet.contains(sampleNameToUse)) + throw new UserException.BadArgumentValue("sample_name", "Specified name does not exist in input bam files"); + if (sampleSet.size() == 1) { + //No reason to incur performance penalty associated with filtering if they specified the name of the only sample + sampleNameToUse = null; + } else { + samplesList = new IndexedSampleList(sampleNameToUse); + sampleSet = SampleListUtils.asSet(samplesList); + } + } + + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested final UnifiedArgumentCollection simpleUAC = SCAC.cloneTo(UnifiedArgumentCollection.class); simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY; @@ -773,7 +789,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); if ( emitReferenceConfidence() ) { if ( samples.sampleCount() != 1 ) - throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); + throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently. Use the sample_name argument"); headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { // a kluge to enforce the use of this indexing strategy @@ -885,6 +901,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return NO_CALLS; + if (sampleNameToUse != null) + removeReadsFromAllSamplesExcept(sampleNameToUse, originalActiveRegion); + if( !originalActiveRegion.isActive() ) // Not active so nothing to do! return referenceModelForNoVariation(originalActiveRegion, true); @@ -1270,4 +1289,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); } } + + private void removeReadsFromAllSamplesExcept(final String targetSample, final ActiveRegion activeRegion) { + final Set readsToRemove = new LinkedHashSet<>(); + for( final GATKSAMRecord rec : activeRegion.getReads() ) { + if( !rec.getReadGroup().getSample().equals(targetSample) ) { + readsToRemove.add(rec); + } + } + activeRegion.removeAll( readsToRemove ); + + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 7b3c8f182..0db64b3cc 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -90,7 +90,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a64e2d8e60a1be9cb0b4306c85e96393"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "d5c07fa3edca496a84fd17cecad06230"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "de763f6c9a5aec4586a2671941e4c96d"}); + + final String NA12878bandedResolutionMD5 = "de763f6c9a5aec4586a2671941e4c96d"; + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); + tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878", + ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); return tests.toArray(new Object[][]{}); }