From d1a7edd796747639051a14dab6374e9af9c58020 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Fri, 15 May 2015 00:44:54 -0400 Subject: [PATCH 01/23] Update pom versions to mark the start of GATK 3.5 development --- pom.xml | 2 +- protected/gatk-package-distribution/pom.xml | 2 +- protected/gatk-queue-extensions-distribution/pom.xml | 2 +- protected/gatk-queue-package-distribution/pom.xml | 2 +- protected/gatk-tools-protected/pom.xml | 2 +- protected/pom.xml | 2 +- public/VectorPairHMM/pom.xml | 2 +- public/external-example/pom.xml | 2 +- public/gatk-engine/pom.xml | 2 +- public/gatk-queue-extensions-generator/pom.xml | 2 +- public/gatk-queue-extensions-public/pom.xml | 2 +- public/gatk-queue/pom.xml | 2 +- public/gatk-root/pom.xml | 2 +- public/gatk-tools-public/pom.xml | 2 +- public/gatk-utils/pom.xml | 2 +- public/gsalib/pom.xml | 2 +- public/package-tests/pom.xml | 2 +- public/pom.xml | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pom.xml b/pom.xml index 794bef1c9..da256634c 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT public/gatk-root diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml index 41029d6a8..dfc00c219 100644 --- a/protected/gatk-package-distribution/pom.xml +++ b/protected/gatk-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml index d64e3f120..6d484c8f7 100644 --- a/protected/gatk-queue-extensions-distribution/pom.xml +++ b/protected/gatk-queue-extensions-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml index b605bf61e..510aa339b 100644 --- a/protected/gatk-queue-package-distribution/pom.xml +++ b/protected/gatk-queue-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml index ce63b594d..9592586d2 100644 --- a/protected/gatk-tools-protected/pom.xml +++ b/protected/gatk-tools-protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/pom.xml b/protected/pom.xml index cae185e15..edfd9e7b3 100644 --- a/protected/pom.xml +++ b/protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT ../public/gatk-root diff --git a/public/VectorPairHMM/pom.xml b/public/VectorPairHMM/pom.xml index 05c6a5851..ccbc99eef 100644 --- a/public/VectorPairHMM/pom.xml +++ b/public/VectorPairHMM/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT ../../public/gatk-root diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml index ac47625c3..02eaa9800 100644 --- a/public/external-example/pom.xml +++ b/public/external-example/pom.xml @@ -9,7 +9,7 @@ External Example - 3.4 + 3.5-SNAPSHOT - 1.132 - 1.131 + 1.134 + 1.133 diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java index d2c11407a..f21975cce 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java @@ -28,7 +28,6 @@ package org.broadinstitute.gatk.utils.refdata; import htsjdk.samtools.util.SequenceUtil; import htsjdk.tribble.Feature; import htsjdk.tribble.annotation.Strand; -import htsjdk.tribble.dbsnp.OldDbSNPFeature; import htsjdk.tribble.gelitext.GeliTextFeature; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -111,139 +110,6 @@ public class VariantContextAdaptors { } } - // -------------------------------------------------------------------------------------------------------------- - // - // dbSNP to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class DBSnpAdaptor implements VCAdaptor { - private static boolean isSNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); - } - - private static boolean isMNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); - } - - private static boolean isInsertion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("insertion"); - } - - private static boolean isDeletion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("deletion"); - } - - private static boolean isIndel(OldDbSNPFeature feature) { - return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); - } - - public static boolean isComplexIndel(OldDbSNPFeature feature) { - return feature.getVariantType().contains("in-del"); - } - - /** - * gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference base. This is returned as a string list with no guarantee ordering - * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest - * frequency). - * - * @return an alternate allele list - */ - public static List getAlternateAlleleList(OldDbSNPFeature feature) { - List ret = new ArrayList(); - for (String allele : getAlleleList(feature)) - if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); - return ret; - } - - /** - * gets the alleles. This method should return all the alleles present at the location, - * including the reference base. The first allele should always be the reference allele, followed - * by an unordered list of alternate alleles. - * - * @return an alternate allele list - */ - public static List getAlleleList(OldDbSNPFeature feature) { - List alleleList = new ArrayList(); - // add ref first - if ( feature.getStrand() == Strand.POSITIVE ) - alleleList = Arrays.asList(feature.getObserved()); - else - for (String str : feature.getObserved()) - alleleList.add(SequenceUtil.reverseComplement(str)); - if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) - && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) - Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); - - return alleleList; - } - - /** - * Converts non-VCF formatted dbSNP records to VariantContext. - * @return OldDbSNPFeature. - */ - @Override - public Class getAdaptableFeatureType() { return OldDbSNPFeature.class; } - - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - OldDbSNPFeature dbsnp = (OldDbSNPFeature)input; - - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - - final byte refBaseForIndel = ref.getBases()[index]; - final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); - - boolean addPaddingBase; - if ( isSNP(dbsnp) || isMNP(dbsnp) ) - addPaddingBase = false; - else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); - else - return null; // can't handle anything else - - Allele refAllele; - if ( refBaseIsDash ) - refAllele = Allele.create(refBaseForIndel, true); - else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) - return null; - else - refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true); - - final List alleles = new ArrayList(); - alleles.add(refAllele); - - // add all of the alt alleles - for ( String alt : getAlternateAlleleList(dbsnp) ) { - if ( Allele.wouldBeNullAllele(alt.getBytes())) - alt = ""; - else if ( ! Allele.acceptableAlleleBases(alt) ) - return null; - - alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false)); - } - - final VariantContextBuilder builder = new VariantContextBuilder(); - builder.source(name).id(dbsnp.getRsID()); - builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 1 : 0)); - builder.alleles(alleles); - return builder.make(); - } - - private static List stripNullDashes(final List alleles) { - final List newAlleles = new ArrayList(alleles.size()); - for ( final String allele : alleles ) { - if ( allele.equals("-") ) - newAlleles.add(""); - else - newAlleles.add(allele); - } - return newAlleles; - } - } // -------------------------------------------------------------------------------------------------------------- // diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 8b093d4d5..2894a8041 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -144,6 +144,6 @@ public final class GATKVCFConstants { public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT"; public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL"; - public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site + public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED = "*:DEL"; + public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED+">", false); // represents any possible spanning deletion allele at this site } From b35085ca283c1d894551459b43c6e2d25eb3ac9b Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 12 Jun 2015 22:03:23 -0400 Subject: [PATCH 14/23] Indexing parameters not required if output file has the g.vcf.gz extensionv --- .../walkers/haplotypecaller/HaplotypeCaller.java | 5 +++-- .../HaplotypeCallerGVCFIntegrationTest.java | 12 ++++++++++++ .../broadinstitute/gatk/engine/GATKVCFUtils.java | 5 +++-- .../gatk/tools/CatVariantsIntegrationTest.java | 16 ++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 3d3299053..4738f01c7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -1204,9 +1204,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In /** * Is writing to an output GVCF file? * - * @return true if the VCF output file has a .g.vcf extension + * @return true if the VCF output file has a .g.vcf or .g.vcf.gz extension */ private boolean isGVCF() { - return ((VariantContextWriterStub) vcfWriter).getOutputFile().getName().endsWith("." + GATKVCFUtils.GVCF_EXT); + String fileName = ((VariantContextWriterStub) vcfWriter).getOutputFile().getName(); + return ( fileName.endsWith("." + GATKVCFUtils.GVCF_EXT) || fileName.endsWith("." + GATKVCFUtils.GVCF_GZ_EXT) ); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 42c8c6285..6f5634b56 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -224,6 +224,18 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { executeTest("testGVCFIndexNoThrow", spec); } + /** + * Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf.gz output file is specified and the indexing arguments are omitted + */ + @Test() + public void testGVCFGzIndexNoThrow() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000000-17000100"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(GATKVCFUtils.GVCF_GZ_EXT), Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testGVCFIndexNoThrow", spec); + } + @Test() public void testWrongParameterGVCFIndexException() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java index da3763f1a..4d05d824e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java @@ -69,8 +69,9 @@ public class GATKVCFUtils { public final static GATKVCFIndexType DEFAULT_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; public final static Integer DEFAULT_GVCF_INDEX_PARAMETER = 128000; - // GVCF file extension + // GVCF file extensions public final static String GVCF_EXT = "g.vcf"; + public final static String GVCF_GZ_EXT = "g.vcf.gz"; // Message for using the deprecated --variant_index_type or --variant_index_parameter arguments. public final static String DEPRECATED_INDEX_ARGS_MSG = "Naming your output file using the .g.vcf extension will automatically set the appropriate values " + @@ -389,7 +390,7 @@ public class GATKVCFUtils { indexType = variantIndexType; indexParameter = variantIndexParameter; logger.warn(DEPRECATED_INDEX_ARGS_MSG); - } else if (outputFile.getName().endsWith("." + GVCF_EXT)) { + } else if (outputFile.getName().endsWith("." + GVCF_EXT) || outputFile.getName().endsWith("." + GVCF_GZ_EXT)) { indexType = DEFAULT_GVCF_INDEX_TYPE; indexParameter = DEFAULT_GVCF_INDEX_PARAMETER; } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java index 18a2d6c12..69ae7a472 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java @@ -246,4 +246,20 @@ public class CatVariantsIntegrationTest { ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine)); pc.execAndCheck(ps); } + + @Test() + public void testCatVariantsGVCFGzIndexCreation() throws IOException{ + + String cmdLine = String.format("java -cp \"%s\" %s -R %s -V %s -V %s -out %s", + StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar), + CatVariants.class.getCanonicalName(), + BaseTest.b37KGReference, + CatVariantsVcf1, + CatVariantsVcf2, + BaseTest.createTempFile("CatVariantsGVCFIndexCreationTest", "." + GATKVCFUtils.GVCF_GZ_EXT)); + + ProcessController pc = ProcessController.getThreadLocal(); + ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine)); + pc.execAndCheck(ps); + } } \ No newline at end of file From ce5ecf1383766bb91aa343142dc69d1ad278020e Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Thu, 21 May 2015 15:57:55 -0400 Subject: [PATCH 15/23] Enable contamination correction via downsampling (as for HaplotypeCaller), added test Add oxoG read count annotation and add as default annotation Add ##SAMPLE VCF header line in accordance with TCGA VCF spec, specifying "File" line in sample header with BAM file name and "SampleName" with BAM sample name (Don't print sample file path if --no_cmdline_in_header is specified to help with test consistency) Turn on active region assembly-based physical phasing for M2 Clean up M2-related annotations so UG doesn't crash if M2 annotations are called --- .../haplotypecaller/HaplotypeCallerGenotypingEngine.java | 2 +- .../gatk/utils/variant/GATKVCFConstants.java | 5 +++++ .../gatk/utils/variant/GATKVCFHeaderLines.java | 9 +++++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 0f6d61ab6..e1e449409 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -85,7 +85,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine Date: Sun, 14 Jun 2015 00:06:38 -0400 Subject: [PATCH 17/23] Handle cases where a given sample has multiple spanning deletions. When a sample has multiple spanning deletions and we are asked to assign likelihoods to the spanning deletion allele, we currently choose the first deletion. Valentin pointed out that this isn't desired behavior. I promised Valentin that I would address this issue, so here it is. I do not believe that the correct thing to do is to sum the likelihoods over all spanning deletions (I came up with problematic cases where this breaks down). So instead I'm using a simple heuristic approach: using the hom alt PLs, find the most likely spanning deletion for this position and use its likelihoods. In the 10K-sample VCF from Monkol there were only 2 cases that this problem popped up. In both cases the heuristic approach works well. --- ...ferenceConfidenceVariantContextMerger.java | 62 ++++++++++++++++++- .../CombineGVCFsIntegrationTest.java | 33 ++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index fcb3dca0a..2780c3806 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; @@ -431,13 +432,72 @@ public class ReferenceConfidenceVariantContextMerger { // create the index mapping, using the allele whenever such a mapping doesn't exist for ( int i = 1; i < targetAlleles.size(); i++ ) { - final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + final Allele targetAllele = targetAlleles.get(i); + + // if there’s more than 1 DEL allele then we need to use the best one + if ( targetAllele == Allele.SPAN_DEL && g.hasPL() ) { + final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL); + if ( occurrences > 1 ) { + final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy()); + indexMapping[i] = ( indexOfBestDel == -1 ? indexOfNonRef : indexOfBestDel ); + continue; + } + } + + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAllele); indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; } return indexMapping; } + /** + * Returns the index of the best spanning deletion allele based on AD counts + * + * @param alleles the list of alleles + * @param PLs the list of corresponding PL values + * @param ploidy the ploidy of the sample + * @return the best index or -1 if not found + */ + private static int indexOfBestDel(final List alleles, final int[] PLs, final int ploidy) { + int bestIndex = -1; + int bestPL = Integer.MAX_VALUE; + + for ( int i = 0; i < alleles.size(); i++ ) { + if ( alleles.get(i) == Allele.SPAN_DEL ) { + final int homAltIndex = findHomIndex(i, ploidy, alleles.size()); + final int PL = PLs[homAltIndex]; + if ( PL < bestPL ) { + bestIndex = i; + bestPL = PL; + } + } + } + + return bestIndex; + } + + /** + * Returns the index of the PL that represents the homozygous genotype of the given i'th allele + * + * @param i the index of the allele with the list of alleles + * @param ploidy the ploidy of the sample + * @param numAlleles the total number of alleles + * @return the hom index + */ + private static int findHomIndex(final int i, final int ploidy, final int numAlleles) { + // some quick optimizations for the common case + if ( ploidy == 2 ) + return i + (i * (i + 1) / 2); // this is straight from the VCF spec on PLs + if ( ploidy == 1 ) + return i; + + final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(ploidy, numAlleles); + final int[] alleleIndexes = new int[ploidy]; + Arrays.fill(alleleIndexes, i); + return calculator.allelesToIndex(alleleIndexes); + } + /** * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current * alleles from the original AD. diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java index c5a1c8c84..8ac40aa74 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -222,6 +222,39 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { executeTest("testSpanningDeletions", spec); } + @Test + public void testMultipleSpanningDeletionsForOneSample() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.many.g.vcf", + 1, + Arrays.asList("5c88e10211def13ba847c29d0fe9e191")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsForOneSample", spec); + } + + @Test + public void testMultipleSpanningDeletionsForOneSampleHaploid() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.many.haploid.g.vcf", + 1, + Arrays.asList("76fc5f6b949ac0b893061828af800bf8")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsForOneSampleHaploid", spec); + } + + @Test + public void testMultipleSpanningDeletionsForOneSampleTetraploid() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.many.tetraploid.g.vcf", + 1, + Arrays.asList("0ec79471550ec5e30540f68cb0651b14")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsForOneSampleTetraploid", spec); + } + @Test public void testWrongReferenceBaseBugFix() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf" From 697c4b0cf149aeea45eaed28121cf3620d6e02ad Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Wed, 10 Jun 2015 11:52:13 -0400 Subject: [PATCH 18/23] Added else clause to handle symbolic alleles Add test for createAlleleMapping --- .../CombineVariantsIntegrationTest.java | 14 ++++++++++++ .../variant/GATKVariantContextUtils.java | 12 +++++----- .../GATKVariantContextUtilsUnitTest.java | 22 +++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java index 293e4b21a..b6d05eefb 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -223,4 +223,18 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combiningGVCFsFails", spec); } catch (Exception e) { } // do nothing } + + @Test + public void combineSymbolicVariants() { + // Just checking that this does not fail, hence no output files and MD5 + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s " + + " -R " + hg19RefereneWithChrPrefixInChromosomeNames + + " -V " + privateTestDir + "WES-chr1.DEL.vcf" + + " -V " + privateTestDir + "WGS-chr1.DEL.vcf" + + " -genotypeMergeOptions UNIQUIFY", + 0, + Arrays.asList("")); + executeTest("combineSymbolicVariants: ", spec); + } } \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 8f3e5c450..445828f58 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -393,7 +393,7 @@ public class GATKVariantContextUtils { if (repetitionCount[0] == 0 || repetitionCount[1] == 0) return null; - if (lengths.size() == 0) { + if (lengths.isEmpty()) { lengths.add(repetitionCount[0]); // add ref allele length only once } lengths.add(repetitionCount[1]); // add this alt allele's length @@ -947,7 +947,7 @@ public class GATKVariantContextUtils { final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC ) { - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + if ( unsortedVCs == null || unsortedVCs.isEmpty() ) return null; if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) @@ -965,7 +965,7 @@ public class GATKVariantContextUtils { VCs.add(vc); } - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + if ( VCs.isEmpty() ) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC @@ -1289,7 +1289,7 @@ public class GATKVariantContextUtils { * @param currentAlleles the list of alleles already created * @return a non-null mapping of original alleles to new (extended) ones */ - private static Map createAlleleMapping(final Allele refAllele, + protected static Map createAlleleMapping(final Allele refAllele, final VariantContext oneVC, final Collection currentAlleles) { final Allele myRef = oneVC.getReference(); @@ -1305,6 +1305,8 @@ public class GATKVariantContextUtils { if ( extended.equals(b) ) extended = b; map.put(a, extended); + } else if ( a.isSymbolic() ) { + map.put(a, a); } } @@ -1696,7 +1698,7 @@ public class GATKVariantContextUtils { // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list vcList.remove(k); // avoid having empty lists - if (vcList.size() == 0) + if (vcList.isEmpty()) mappedVCs.remove(type); if ( !mappedVCs.containsKey(vcType) ) mappedVCs.put(vcType, new ArrayList()); diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index 860c54736..410bd848f 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -46,6 +46,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Allele ATref; Allele Anoref; Allele GT; + Allele Symbolic; private GenomeLocParser genomeLocParser; @@ -63,6 +64,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { ATref = Allele.create("AT",true); Anoref = Allele.create("A",false); GT = Allele.create("GT",false); + Symbolic = Allele.create("", false); genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); } @@ -1607,5 +1609,25 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { BaseUtils.fillWithRandomBases(bases, 1, bases.length); return bases; } + + @Test + public void testCreateAlleleMapping(){ + final List alleles = Arrays.asList(Aref,Symbolic,T); + final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make(); + Map map = GATKVariantContextUtils.createAlleleMapping(ATref, vc, alleles); + + final List expectedAlleles = Arrays.asList(Allele.create("", false), Allele.create("TT", false)); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ){ + Assert.assertEquals(map.get(vc.getAlternateAlleles().get(i)), expectedAlleles.get(i)); + } + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testCreateAlleleMappingException(){ + final List alleles = Arrays.asList(Aref, Symbolic, T); + final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make(); + // Throws an exception if the ref allele length <= ref allele length to extend + Map map = GATKVariantContextUtils.createAlleleMapping(Aref, vc, alleles); + } } From 09686f4595b517a29a3feac6a59b9fe1c947ba9d Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 25 Jun 2015 10:17:29 -0400 Subject: [PATCH 19/23] Make VQSLOD definition accurate --- .../VariantRecalibrator.java | 2 +- ...ntRecalibrationWalkersIntegrationTest.java | 22 +++++++++---------- .../utils/variant/GATKVCFHeaderLines.java | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index 8021db111..dfbc3dc0a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -99,7 +99,7 @@ import java.util.*; * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array (in humans). This adaptive * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is - * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + * the log odds of being a true variant versus being false under the trained Gaussian mixture model. *

* *

VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 3d5463d0f..473ab8cd4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -94,14 +94,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", "41e2d951a17de433fe378bb3d9ec75d4", // tranches - "04336b2453202f286da05b69e57f66ed", // recal file - "d29fd0bdc1c8c3a171e10d29f7ffeaec"); // cut VCF + "19c77724f08d90896914d3d348807399", // recal file + "c6a186a1a9271f5de35f1e5aeb8749a6"); // cut VCF VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches - "1b33c10be7d8bf8e9accd11113835262", // recal file - "4700d52a06f2ef3a5882719b86911e51"); // cut VCF + "b7cad6a0bbbf0330e0ac712a80c3144f", // recal file + "bee399765991636461599565c9634bcf"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -196,8 +196,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", "3ad7f55fb3b072f373cbce0b32b66df4", // tranches - "e747c08131d58d9a4800720f6ca80e0c", // recal file - "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF + "e91a5b25ea1eefdcff488e0326028b51", // recal file + "e6a0c5173d8c8fbd08afdc5e5e7d3a78"); // cut VCF @DataProvider(name = "VRBCFTest") public Object[][] createVRBCFTest() { @@ -251,14 +251,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest indelUnfiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF + "689c7853fe2e63216da3b0d47e27740e", // recal file + "4147373ec8e0aba7ace3658677007990"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF + "689c7853fe2e63216da3b0d47e27740e", // recal file + "8dd8ea31e419f68d80422b34b14e24e4"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -316,7 +316,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); + Arrays.asList("cd42484985179c7f549e652f0f6a94d0")); final List outputFiles = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).getFirst(); setPDFsForDeletion(outputFiles); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index da90edb8f..fff7ea5f1 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -146,7 +146,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(ORIGINAL_DP_KEY, 1, VCFHeaderLineType.Integer, "Original DP")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record")); - addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); + addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds of being a true variant versus being false under the trained gaussian mixture model")); addInfoLine(new VCFInfoHeaderLine(CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); addInfoLine(new VCFInfoHeaderLine(POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); addInfoLine(new VCFInfoHeaderLine(NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); From f994220617bec1bf8d91f71ffac3b64d7856ae54 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 29 Jun 2015 17:54:08 -0400 Subject: [PATCH 20/23] Update the allele remapping code to handle the new spanning deletion allele. Now that Ron updated the GATK so that we use star to represent spanning deletions, we need to catch those cases in the code that remaps alleles. Otherwise, we try to pad the stars and that's just bad. Added test from actual failing data. --- .../ReferenceConfidenceVariantContextMerger.java | 9 +++++++-- .../variantutils/GenotypeGVCFsIntegrationTest.java | 12 ++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index 2780c3806..1bce6000c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -288,10 +288,15 @@ public class ReferenceConfidenceVariantContextMerger { for (final Allele a : vc.getAlternateAlleles()) { if (a.isSymbolic()) { result.add(a); - // we always skip when adding to finalAlleles; this is done outside if applies. - // we also skip <*DEL> if there isn't a real alternate allele. + // we always skip when adding to finalAlleles; this is done outside if it applies. + // we also skip <*:DEL> if there isn't a real alternate allele. if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() ) finalAlleles.add(a); + } else if ( a == Allele.SPAN_DEL ) { + result.add(a); + // we skip * if there isn't a real alternate allele. + if ( !vc.isBiallelic() ) + finalAlleles.add(a); } else if (a.isCalled()) { final Allele newAllele; if (extraBaseCount > 0) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 601183705..73f410786 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -502,4 +502,16 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); } + + @Test(enabled = true) + public void testGenotypingSpanningDeletionOverSpan() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.delOverSpan.1.g.vcf -V " + + privateTestDir + "spanningDel.delOverSpan.2.g.vcf", + 0, + Arrays.asList("")); // we do not care about the md5; we just want to make sure it doesn't blow up with an error + spec.disableShadowBCF(); + executeTest("testGenotypingSpanningDeletionOverSpan", spec); + } } \ No newline at end of file From 1a7e83fa50e091a68976fa1a50e42194c131b0cb Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 26 Jun 2015 13:12:55 -0400 Subject: [PATCH 21/23] Merge if both GT are phased --- .../gatk/tools/walkers/phasing/PhasingUtils.java | 6 +++++- .../walkers/phasing/PhasingUtilsUnitTest.java | 7 +++++-- .../ReadBackedPhasingIntegrationTest.java | 16 +++++++++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java index 686cd6d87..0dc151895 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java @@ -363,7 +363,7 @@ class PhasingUtils { * * @param gt1 genotype 1 * @param gt2 genotype 2 - * @return true if genotypes have the same number of chromosomes, haplotype, number of attributes + * @return true if genotypes have the same number of chromosomes, haplotype, phased, number of attributes * as chromosomes, and either genotype is homozygous, false otherwise */ static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { @@ -371,6 +371,10 @@ class PhasingUtils { if (gt1.getPloidy() != gt2.getPloidy()) return false; + // If gt1 or gt2 are not phased, then can not be merged. + if (!gt1.isPhased() || !gt2.isPhased()) + return false; + // If gt1 or gt2 are homozygous, then could be merged. if (gt1.isHom() || gt2.isHom()) return true; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java index 26a4a92f7..740f2759c 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java @@ -81,6 +81,7 @@ public class PhasingUtilsUnitTest extends BaseTest { private ReferenceSequenceFile referenceFile; private Genotype genotype1; private Genotype genotype2; + private Genotype genotype2unphased; private String contig; private List alleleList1; private List alleleList2; @@ -93,8 +94,9 @@ public class PhasingUtilsUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(referenceFile); alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false)); alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false)); - genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make(); - genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make(); + genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).phased(true).make(); + genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(true).make(); + genotype2unphased = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(false).make(); contig = new String("1"); vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make(); vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make(); @@ -184,6 +186,7 @@ public class PhasingUtilsUnitTest extends BaseTest { @Test public void TestAlleleSegregationIsKnown(){ Assert.assertTrue(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2)); + Assert.assertFalse(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2unphased)); } @Test diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java index 88a0ee7fe..8933314c8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -138,7 +138,21 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { + " -L chr20:332341-802503", 1, Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574")); - executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); + executeTest("Use trio-phased VCF, adding read-backed phasing information in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); } + @Test + public void testDoNotMergeUnphasedSNPs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T ReadBackedPhasing" + + " -R " + hg19Reference + + " -I " + privateTestDir + "S17C1-8.KRAS.bam" + + " --variant " + privateTestDir + "S17C1-8_bwa_mutect_filtered.vcf" + + " --phaseQualityThresh 20.0 -enableMergeToMNP -maxDistMNP 3 -L 12:25398281-25398284" + + " -o %s" + + " --no_cmdline_in_header", + 1, + Arrays.asList("59ee67d657ee955477bca94d07014ac3")); + executeTest("Do not merge unphased SNPs", spec); + } }