diff --git a/pom.xml b/pom.xml index 794bef1c9..da256634c 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT public/gatk-root diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml index 41029d6a8..dfc00c219 100644 --- a/protected/gatk-package-distribution/pom.xml +++ b/protected/gatk-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml index d64e3f120..6d484c8f7 100644 --- a/protected/gatk-queue-extensions-distribution/pom.xml +++ b/protected/gatk-queue-extensions-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml index b605bf61e..510aa339b 100644 --- a/protected/gatk-queue-package-distribution/pom.xml +++ b/protected/gatk-queue-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml index ce63b594d..9592586d2 100644 --- a/protected/gatk-tools-protected/pom.xml +++ b/protected/gatk-tools-protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java index 88390ad88..adff9dc33 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java @@ -75,10 +75,18 @@ import java.util.*; /** * Variant confidence normalized by unfiltered depth of variant samples * - *

This annotation puts the variant confidence QUAL score in perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.

+ *

This annotation puts the variant confidence QUAL score into perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.

* *

Statistical notes

- *

The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.

+ *

The QD is the QUAL score normalized by allele depth (AD) for a variant. For a single sample, the HaplotypeCaller calculates the QD by taking QUAL/AD. For multiple samples, HaplotypeCaller and GenotypeGVCFs calculate the QD by taking QUAL/AD of samples with a non hom-ref genotype call. The reason we leave out the samples with a hom-ref call is to not penalize the QUAL for the other samples with the variant call.

+ *

Here is a single sample example:

+ *
2	37629	.	C	G	1063.77	.	AC=2;AF=1.00;AN=2;DP=31;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.50;QD=34.32;SOR=2.376	GT:AD:DP:GQ:PL:QSS	1/1:0,31:31:93:1092,93,0:0,960
+

QUAL/AD = 1063.77/31 = 34.32 = QD

+ *

Here is a multi-sample example:

+ *
10	8046	.	C	T	4107.13	.	AC=1;AF=0.167;AN=6;BaseQRankSum=-3.717;DP=1063;FS=1.616;MLEAC=1;MLEAF=0.167;QD=11.54
+   GT:AD:DP:GQ:PL:QSS	0/0:369,4:373:99:0,1007,12207:10548,98	    0/0:331,1:332:99:0,967,11125:9576,27	    0/1:192,164:356:99:4138,0,5291:5501,4505
+ *

QUAL/AD = 4107.13/356 = 11.54 = QD

+ *

Note that currently, when HaplotypeCaller is run with `-ERC GVCF`, the QD calculation is invoked before AD itself has been calculated, due to a technical constraint. In that case, HaplotypeCaller uses the number of overlapping reads from the haplotype likelihood calculation in place of AD to calculate QD, which generally yields a very similar number. This does not cause any measurable problems, but can cause some confusion since the number may be slightly different than what you would expect to get if you did the calculation manually. For that reason, this behavior will be modified in an upcoming version.

* *

Caveat

*

This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.

diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 8dd616f41..2439e6219 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -250,12 +250,8 @@ public abstract class GenotypingEngine outputAlleles = outputAlternativeAlleles.outputAlleles(vc.getReference()); final VariantContextBuilder builder = new VariantContextBuilder(callSourceString(), loc.getContig(), loc.getStart(), loc.getStop(), outputAlleles); - - // Seems that when log10PError is 0.0, you must pass -0.0 to get a nice output at the other end otherwise is a "-0". - // Truth is that this should be fixed in the "variant" dependency code but perhaps it can be amended also in the VariantContextWriter. - //TODO Please remove this comment when this has been fixed (PT https://www.pivotaltracker.com/story/show/69492530) - //TODO and change the code below accordingly. - builder.log10PError(log10Confidence == 0.0 ? -0.0 : log10Confidence); + + builder.log10PError(log10Confidence); if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 3d3299053..4738f01c7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -1204,9 +1204,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In /** * Is writing to an output GVCF file? * - * @return true if the VCF output file has a .g.vcf extension + * @return true if the VCF output file has a .g.vcf or .g.vcf.gz extension */ private boolean isGVCF() { - return ((VariantContextWriterStub) vcfWriter).getOutputFile().getName().endsWith("." + GATKVCFUtils.GVCF_EXT); + String fileName = ((VariantContextWriterStub) vcfWriter).getOutputFile().getName(); + return ( fileName.endsWith("." + GATKVCFUtils.GVCF_EXT) || fileName.endsWith("." + GATKVCFUtils.GVCF_GZ_EXT) ); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 0f6d61ab6..e1e449409 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -85,7 +85,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine * *

VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index fffd52e4f..50b4c98d2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -165,7 +165,6 @@ public class CombineGVCFs extends RodWalker vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); - headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location")); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 9e71a0818..b0bbe02ec 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -219,7 +219,6 @@ public class GenotypeGVCFs extends RodWalker(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet))); } // Add and to the end if at all required in in the output. - if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE); + if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(Allele.SPAN_DEL); if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final List allelesList = new ArrayList<>(finalAlleleSet); @@ -286,10 +288,15 @@ public class ReferenceConfidenceVariantContextMerger { for (final Allele a : vc.getAlternateAlleles()) { if (a.isSymbolic()) { result.add(a); - // we always skip when adding to finalAlleles; this is done outside if applies. - // we also skip <*DEL> if there isn't a real alternate allele. + // we always skip when adding to finalAlleles; this is done outside if it applies. + // we also skip <*:DEL> if there isn't a real alternate allele. if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() ) finalAlleles.add(a); + } else if ( a == Allele.SPAN_DEL ) { + result.add(a); + // we skip * if there isn't a real alternate allele. + if ( !vc.isBiallelic() ) + finalAlleles.add(a); } else if (a.isCalled()) { final Allele newAllele; if (extraBaseCount > 0) { @@ -328,7 +335,7 @@ public class ReferenceConfidenceVariantContextMerger { if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ) replacement = allele; else if ( allele.length() < vc.getReference().length() ) - replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE; + replacement = Allele.SPAN_DEL; else replacement = Allele.NO_CALL; @@ -430,13 +437,72 @@ public class ReferenceConfidenceVariantContextMerger { // create the index mapping, using the allele whenever such a mapping doesn't exist for ( int i = 1; i < targetAlleles.size(); i++ ) { - final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + final Allele targetAllele = targetAlleles.get(i); + + // if there’s more than 1 DEL allele then we need to use the best one + if ( targetAllele == Allele.SPAN_DEL && g.hasPL() ) { + final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL); + if ( occurrences > 1 ) { + final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy()); + indexMapping[i] = ( indexOfBestDel == -1 ? indexOfNonRef : indexOfBestDel ); + continue; + } + } + + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAllele); indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; } return indexMapping; } + /** + * Returns the index of the best spanning deletion allele based on AD counts + * + * @param alleles the list of alleles + * @param PLs the list of corresponding PL values + * @param ploidy the ploidy of the sample + * @return the best index or -1 if not found + */ + private static int indexOfBestDel(final List alleles, final int[] PLs, final int ploidy) { + int bestIndex = -1; + int bestPL = Integer.MAX_VALUE; + + for ( int i = 0; i < alleles.size(); i++ ) { + if ( alleles.get(i) == Allele.SPAN_DEL ) { + final int homAltIndex = findHomIndex(i, ploidy, alleles.size()); + final int PL = PLs[homAltIndex]; + if ( PL < bestPL ) { + bestIndex = i; + bestPL = PL; + } + } + } + + return bestIndex; + } + + /** + * Returns the index of the PL that represents the homozygous genotype of the given i'th allele + * + * @param i the index of the allele with the list of alleles + * @param ploidy the ploidy of the sample + * @param numAlleles the total number of alleles + * @return the hom index + */ + private static int findHomIndex(final int i, final int ploidy, final int numAlleles) { + // some quick optimizations for the common case + if ( ploidy == 2 ) + return i + (i * (i + 1) / 2); // this is straight from the VCF spec on PLs + if ( ploidy == 1 ) + return i; + + final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(ploidy, numAlleles); + final int[] alleleIndexes = new int[ploidy]; + Arrays.fill(alleleIndexes, i); + return calculator.allelesToIndex(alleleIndexes); + } + /** * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current * alleles from the original AD. diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index 0816b00f2..6c139b2e9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -147,11 +147,19 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testInvertFilter() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1, + baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invertFilterExpression", 1, Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765")); executeTest("test inversion of selection of filter with separate names #2", spec); } + @Test + public void testInvertJexlFilter() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --filterName ABF -filter 'AlleleBalance >= 0.7' --filterName FSF -filter 'FisherStrand != 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + Arrays.asList("6fa6cd89bfc8b6b4dfc3da25eb36d08b")); // Differs from testInvertFilter() because their VCF header FILTER description uses the -filter argument. Their filter statuses are identical. + executeTest("test inversion of selection of filter via JEXL with separate names #2", spec); + } + @Test public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( @@ -207,8 +215,40 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testInvertGenotypeFilterExpression() { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference - + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1, + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invertGenotypeFilterExpression", 1, Arrays.asList("d2664870e7145eb73a2295766482c823")); executeTest("testInvertGenotypeFilterExpression", spec); } + + @Test + public void testInvertJexlGenotypeFilterExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP >= 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1, + Arrays.asList("8ddd8f3b5ee351c4ab79cb186b1d45ba")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument + executeTest("testInvertJexlGenotypeFilterExpression", spec); + } + + @Test + public void testSetFilteredGtoNocall() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf --setFilteredGtToNocall", 1, + Arrays.asList("9ff801dd726eb4fc562b278ccc6854b1")); + executeTest("testSetFilteredGtoNocall", spec); + } + + @Test + public void testSetVcfFilteredGtoNocall() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("81b99386a64a8f2b857a7ef2bca5856e") + ); + + spec.disableShadowBCF(); + executeTest("testSetVcfFilteredGtoNocall--" + testfile, spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index e19cfca29..778b38093 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -129,7 +129,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { eventLength = 5; alleles = getConsensusAlleles(eventLength,true,10,0.1, altBases); - Assert.assertEquals(alleles.size(),0); + Assert.assertEquals(alleles.size(),2); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 42c8c6285..6f5634b56 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -224,6 +224,18 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { executeTest("testGVCFIndexNoThrow", spec); } + /** + * Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf.gz output file is specified and the indexing arguments are omitted + */ + @Test() + public void testGVCFGzIndexNoThrow() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000000-17000100"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(GATKVCFUtils.GVCF_GZ_EXT), Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testGVCFIndexNoThrow", spec); + } + @Test() public void testWrongParameterGVCFIndexException() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java index 26a4a92f7..740f2759c 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java @@ -81,6 +81,7 @@ public class PhasingUtilsUnitTest extends BaseTest { private ReferenceSequenceFile referenceFile; private Genotype genotype1; private Genotype genotype2; + private Genotype genotype2unphased; private String contig; private List alleleList1; private List alleleList2; @@ -93,8 +94,9 @@ public class PhasingUtilsUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(referenceFile); alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false)); alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false)); - genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make(); - genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make(); + genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).phased(true).make(); + genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(true).make(); + genotype2unphased = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(false).make(); contig = new String("1"); vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make(); vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make(); @@ -184,6 +186,7 @@ public class PhasingUtilsUnitTest extends BaseTest { @Test public void TestAlleleSegregationIsKnown(){ Assert.assertTrue(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2)); + Assert.assertFalse(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2unphased)); } @Test diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java index 88a0ee7fe..8933314c8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -138,7 +138,21 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { + " -L chr20:332341-802503", 1, Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574")); - executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); + executeTest("Use trio-phased VCF, adding read-backed phasing information in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); } + @Test + public void testDoNotMergeUnphasedSNPs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T ReadBackedPhasing" + + " -R " + hg19Reference + + " -I " + privateTestDir + "S17C1-8.KRAS.bam" + + " --variant " + privateTestDir + "S17C1-8_bwa_mutect_filtered.vcf" + + " --phaseQualityThresh 20.0 -enableMergeToMNP -maxDistMNP 3 -L 12:25398281-25398284" + + " -o %s" + + " --no_cmdline_in_header", + 1, + Arrays.asList("59ee67d657ee955477bca94d07014ac3")); + executeTest("Do not merge unphased SNPs", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 3d5463d0f..473ab8cd4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -94,14 +94,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", "41e2d951a17de433fe378bb3d9ec75d4", // tranches - "04336b2453202f286da05b69e57f66ed", // recal file - "d29fd0bdc1c8c3a171e10d29f7ffeaec"); // cut VCF + "19c77724f08d90896914d3d348807399", // recal file + "c6a186a1a9271f5de35f1e5aeb8749a6"); // cut VCF VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches - "1b33c10be7d8bf8e9accd11113835262", // recal file - "4700d52a06f2ef3a5882719b86911e51"); // cut VCF + "b7cad6a0bbbf0330e0ac712a80c3144f", // recal file + "bee399765991636461599565c9634bcf"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -196,8 +196,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", "3ad7f55fb3b072f373cbce0b32b66df4", // tranches - "e747c08131d58d9a4800720f6ca80e0c", // recal file - "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF + "e91a5b25ea1eefdcff488e0326028b51", // recal file + "e6a0c5173d8c8fbd08afdc5e5e7d3a78"); // cut VCF @DataProvider(name = "VRBCFTest") public Object[][] createVRBCFTest() { @@ -251,14 +251,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest indelUnfiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF + "689c7853fe2e63216da3b0d47e27740e", // recal file + "4147373ec8e0aba7ace3658677007990"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF + "689c7853fe2e63216da3b0d47e27740e", // recal file + "8dd8ea31e419f68d80422b34b14e24e4"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -316,7 +316,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); + Arrays.asList("cd42484985179c7f549e652f0f6a94d0")); final List outputFiles = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).getFirst(); setPDFsForDeletion(outputFiles); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java index c9fe0eda2..8ac40aa74 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -100,7 +100,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", 1, - Arrays.asList("ebe26077809961f53d5244643d24fd45")); + Arrays.asList("7b3153135e4f8e1d137d3f4beb46f182")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -112,7 +112,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", 1, - Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e")); + Arrays.asList("4f546634213ece6f08ec9258620b92bb")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -190,7 +190,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testMD5s() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b7c753452ab0c05f9cee538e420b87fa")); spec.disableShadowBCF(); executeTest("testMD5s", spec); } @@ -198,7 +198,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testBasepairResolutionOutput() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("bb6420ead95da4c72e76ca4bf5860ef0")); spec.disableShadowBCF(); executeTest("testBasepairResolutionOutput", spec); } @@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testBreakBlocks() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("dd31182124c4b78a8a03edb1e0cf618b")); spec.disableShadowBCF(); executeTest("testBreakBlocks", spec); } @@ -217,16 +217,49 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf", 1, - Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f")); + Arrays.asList("58984edf9a3a92c9fc97039b97755861")); spec.disableShadowBCF(); executeTest("testSpanningDeletions", spec); } + @Test + public void testMultipleSpanningDeletionsForOneSample() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.many.g.vcf", + 1, + Arrays.asList("5c88e10211def13ba847c29d0fe9e191")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsForOneSample", spec); + } + + @Test + public void testMultipleSpanningDeletionsForOneSampleHaploid() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.many.haploid.g.vcf", + 1, + Arrays.asList("76fc5f6b949ac0b893061828af800bf8")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsForOneSampleHaploid", spec); + } + + @Test + public void testMultipleSpanningDeletionsForOneSampleTetraploid() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.many.tetraploid.g.vcf", + 1, + Arrays.asList("0ec79471550ec5e30540f68cb0651b14")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsForOneSampleTetraploid", spec); + } + @Test public void testWrongReferenceBaseBugFix() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf" + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("c0fdba537399cf28b28771963e2c5174")); spec.disableShadowBCF(); executeTest("testWrongReferenceBaseBugFix",spec); @@ -235,7 +268,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testBasepairResolutionInput() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6aeb88ca94cb5223f26175da72b985f2")); spec.disableShadowBCF(); executeTest("testBasepairResolutionInput", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java index 293e4b21a..b6d05eefb 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -223,4 +223,18 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combiningGVCFsFails", spec); } catch (Exception e) { } // do nothing } + + @Test + public void combineSymbolicVariants() { + // Just checking that this does not fail, hence no output files and MD5 + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants --no_cmdline_in_header -o %s " + + " -R " + hg19RefereneWithChrPrefixInChromosomeNames + + " -V " + privateTestDir + "WES-chr1.DEL.vcf" + + " -V " + privateTestDir + "WGS-chr1.DEL.vcf" + + " -genotypeMergeOptions UNIQUIFY", + 0, + Arrays.asList("")); + executeTest("combineSymbolicVariants: ", spec); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index d1ac6eca0..73f410786 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -82,7 +82,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference), 1, - Arrays.asList("4dfea9a9b1a77c4c6b9edc61f9ea8da2")); + Arrays.asList("23ff3e22262929138ca1f00fc111cadf")); executeTest("testUpdatePGT", spec); } @@ -91,7 +91,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference), 1, - Arrays.asList("a96b79e7c3689c8d5506083cb6d27390")); + Arrays.asList("88fa4a021e4aac9a0e48bd54b2949ece")); executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec); } @@ -103,7 +103,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("bf3c1982ab6ffee410cb6a1fff6e7105")); + Arrays.asList("06b4e2589c5b903f7c51ae9968bebe77")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -115,7 +115,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("47d454936dc1f17cf4c4f84f02841346")); + Arrays.asList("599394c205c1d6641b9bebabbd29e13c")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -127,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("5d79ea9de8ada8520d01284cf0c9f720")); + Arrays.asList("f7d5344a85e6d7fc2437d4253b424cb0")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -139,7 +139,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Arrays.asList("d69b43cac448f45218e77308fc01e9e6")); + Arrays.asList("c9e4d1e52ee1f3a5233f1fb100f24d5e")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -152,7 +152,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("7c93d82758bfb6e7efec257ef8a46217")); + Arrays.asList("aa19980b9a525afed43e98c821114ae5")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -164,7 +164,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("5b60a7a9575ea83407aa61123960a0cc")); + Arrays.asList("f23c9d62542a69b5cbf0e9f89fdd235d")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -174,7 +174,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Arrays.asList("9e59b94c84dd673b8db9d35cae7e0f68")); + Arrays.asList("d602d9e5d336798e4ccb52d2b5f91677")); executeTest("testJustOneSample", spec); } @@ -185,14 +185,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Arrays.asList("8407cb9a1ab34e705e5a54a0d4146d84")); + Arrays.asList("6c6d6ef90386eb6c6ed649379aac0c13")); executeTest("testSamplesWithDifferentLs", spec); } @Test(enabled = true) public void testNoPLsException() { // Test with input files with (1) 0/0 and (2) ./. - final String md5 = "3e69805dc1c0ada0a050a65b89ecab30"; + final String md5 = "d04b32cf2fa97d303ff7fdc779a653d4"; WalkerTestSpec spec1 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf", @@ -212,7 +212,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-nda"), 1, - Arrays.asList("5a036de16b7a87626d2b76727376d9df")); + Arrays.asList("7132a43d93a9855d03b27b4b0381194c")); executeTest("testNDA", spec); } @@ -221,7 +221,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-maxAltAlleles 1"), 1, - Arrays.asList("2f3e6879fa27128a8be7b067ded78966")); + Arrays.asList("07844593a4e1ff1110ef8c1de42cc290")); executeTest("testMaxAltAlleles", spec); } @@ -230,7 +230,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"), 1, - Arrays.asList("2e4a1ad71e8fc127b594077166c0344b")); + Arrays.asList("56caad762b26479ba5e2cc99222b9030")); executeTest("testStandardConf", spec); } @@ -274,7 +274,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + " --uniquifySamples", b37KGReference), 1, - Arrays.asList("9a472c4e101fff4892efb9255c5cd8b3")); + Arrays.asList("ba36b36145e038e3cb004adf11bce96c")); executeTest("testUniquifiedSamples", spec); } @@ -446,7 +446,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { } - private static final String simpleSpanningDeletionsMD5 = "e8616a396d40b4918ad30189856ceb01"; + private static final String simpleSpanningDeletionsMD5 = "1cf4ea1da40306741ec4b9a5fe1568b9"; @Test(enabled = true) public void testSpanningDeletionsMD5() { @@ -476,7 +476,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf", 1, - Arrays.asList("1c418229117bc8f148a69eda9c496309")); + Arrays.asList("0aa7ceae6af1dc4fda6732e978ace864")); spec.disableShadowBCF(); executeTest("testMultipleSpanningDeletionsMD5", spec); } @@ -487,8 +487,31 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.delOnly.g.vcf", 1, + Arrays.asList("02cca337e097b86c5471929036ad4b64")); + spec.disableShadowBCF(); + executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); + } + + @Test(enabled = true) + public void testDeprecatedSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.depr.delOnly.g.vcf", + 1, Arrays.asList("46169d08f93e5ff57856c7b64717314b")); spec.disableShadowBCF(); executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); } + + @Test(enabled = true) + public void testGenotypingSpanningDeletionOverSpan() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.delOverSpan.1.g.vcf -V " + + privateTestDir + "spanningDel.delOverSpan.2.g.vcf", + 0, + Arrays.asList("")); // we do not care about the md5; we just want to make sure it doesn't blow up with an error + spec.disableShadowBCF(); + executeTest("testGenotypingSpanningDeletionOverSpan", spec); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index e0ce9405e..dce5ff522 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -64,8 +64,10 @@ public class SelectVariantsIntegrationTest extends WalkerTest { return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; } - private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537"; - private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b"; + private static final String SAMPLE_EXCLUSION_MD5 = "eea22fbf1e490e59389a663c3d6a6537"; + private static final String INVERT_SELECTION_MD5 = "831bc0a5a723b0681a910d668ff3757b"; + private static final String MAX_FILTERED_GT_SELECTION_MD5 = "0365de1bbf7c037be00badace0a74d02"; + private static final String MIN_FILTERED_GT_SELECTION_MD5 = "fcee8c8caa0696a6675961bb12664878"; @Test public void testDiscordanceNoSampleSpecified() { @@ -199,7 +201,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile, 1, - Arrays.asList(sampleExclusionMD5) + Arrays.asList(SAMPLE_EXCLUSION_MD5) ); spec.disableShadowBCF(); @@ -216,7 +218,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile, 1, - Arrays.asList(sampleExclusionMD5) + Arrays.asList(SAMPLE_EXCLUSION_MD5) ); spec.disableShadowBCF(); @@ -557,7 +559,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { } /** - * Test inverting the variant selection criteria + * Test inverting the variant selection criteria by the -invertSelect argument */ @Test public void testInvertSelection() { @@ -565,16 +567,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' -invertSelect --variant " + testfile), 1, - Arrays.asList(invertSelectionMD5) + Arrays.asList(INVERT_SELECTION_MD5) ); spec.disableShadowBCF(); executeTest("testInvertSelection--" + testfile, spec); } /** - * Test inverting the variant JEXL selection criteria + * Test inverting the variant selection criteria by inverting the JEXL expression logic following -select */ @Test public void testInvertJexlSelection() { @@ -584,7 +586,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile), 1, - Arrays.asList(invertSelectionMD5) + Arrays.asList(INVERT_SELECTION_MD5) ); spec.disableShadowBCF(); executeTest("testInvertJexlSelection--" + testfile, spec); @@ -659,10 +661,76 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String pedFile = privateTestDir + "CEUtrio.ped"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -invMv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", 1, Arrays.asList("35921fb2dedca0ead83027a66b725794")); executeTest("testMendelianViolationSelection--" + testFile, spec); } + + @Test + public void testMaxFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --maxFilteredGenotypes 1 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MAX_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMaxFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testMinFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --minFilteredGenotypes 2 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MIN_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMinFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testMaxFractionFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --maxFractionFilteredGenotypes 0.4 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MAX_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMaxFractionFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testMinFractionFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --minFractionFilteredGenotypes 0.6 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MIN_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMinFractionFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testSetFilteredGtoNocall() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("81b99386a64a8f2b857a7ef2bca5856e") + ); + + spec.disableShadowBCF(); + executeTest("testSetFilteredGtoNocall--" + testfile, spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index 6290fdd96..642733d8f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -95,7 +95,7 @@ public class VariantContextMergerUnitTest extends BaseTest { ATCATCT = Allele.create("ATCATCT"); ATref = Allele.create("AT",true); Anoref = Allele.create("A",false); - del = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE; + del = Allele.SPAN_DEL; GT = Allele.create("GT",false); genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCFIntegrationTest.java index 5d8c21926..ff9b45b98 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -67,23 +67,6 @@ import java.util.ArrayList; */ public class VariantsToVCFIntegrationTest extends WalkerTest { - @Test - public void testVariantsToVCFUsingDbsnpInput() { - List md5 = new ArrayList(); - md5.add("72e6ce7aff7dec7ca9e7580be7ddd435"); - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " --variant:OldDbsnp " + GATKDataLocation + "Comparisons/Validated/dbSNP/dbsnp_129_b36.rod" + - " -T VariantsToVCF" + - " -L 1:1-30,000,000" + - " -o %s" + - " --no_cmdline_in_header", - 1, // just one output file - md5); - executeTest("testVariantsToVCFUsingDbsnpInput", spec).getFirst(); - } - @Test public void testVariantsToVCFUsingGeliInput() { List md5 = new ArrayList(); diff --git a/protected/pom.xml b/protected/pom.xml index cae185e15..edfd9e7b3 100644 --- a/protected/pom.xml +++ b/protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT ../public/gatk-root diff --git a/public/VectorPairHMM/pom.xml b/public/VectorPairHMM/pom.xml index 05c6a5851..ccbc99eef 100644 --- a/public/VectorPairHMM/pom.xml +++ b/public/VectorPairHMM/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT ../../public/gatk-root diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml index ac47625c3..02eaa9800 100644 --- a/public/external-example/pom.xml +++ b/public/external-example/pom.xml @@ -9,7 +9,7 @@ External Example - 3.4 + 3.5-SNAPSHOT - 1.132 - 1.131 + 1.134 + 1.133 diff --git a/public/gatk-tools-public/pom.xml b/public/gatk-tools-public/pom.xml index e5af4d8ff..347839ec7 100644 --- a/public/gatk-tools-public/pom.xml +++ b/public/gatk-tools-public/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java index e71860794..e027464d4 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java @@ -137,6 +137,9 @@ public class CoverageUtils { public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { Map countsByRG = new HashMap(); + Map countsByRGName = new HashMap(); + Map RGByName = new HashMap(); + List countPileup = new LinkedList(); FragmentCollection fpile; @@ -202,10 +205,20 @@ public class CoverageUtils { for (PileupElement e : countPileup) { SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); - if (!countsByRG.keySet().contains(readGroup)) - countsByRG.put(readGroup, new int[6]); - updateCounts(countsByRG.get(readGroup), e); + String readGroupId = readGroup.getSample() + "_" + readGroup.getReadGroupId(); + int[] counts = countsByRGName.get(readGroupId); + if (counts == null) { + counts = new int[6]; + countsByRGName.put(readGroupId, counts); + RGByName.put(readGroupId, readGroup); + } + + updateCounts(counts, e); + } + + for (String readGroupId : RGByName.keySet()) { + countsByRG.put(RGByName.get(readGroupId), countsByRGName.get(readGroupId)); } return countsByRG; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index aa2e96f1a..4c4d6f02d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -179,15 +179,21 @@ public class VariantFiltration extends RodWalker { /** * Invert the selection criteria for --filterExpression */ - @Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false) + @Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false) protected boolean invertFilterExpression = false; /** * Invert the selection criteria for --genotypeFilterExpression */ - @Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) + @Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) protected boolean invertGenotypeFilterExpression = false; + /** + * If this argument is provided, set filtered genotypes to no-call (./.). + */ + @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call") + private boolean setFilteredGenotypesToNocall = false; + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; @@ -201,8 +207,10 @@ public class VariantFiltration extends RodWalker { private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one private ArrayList windowInitializer = new ArrayList(); + private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + /** - * Prepend inverse phrase to description if --invert_filter_expression + * Prepend inverse phrase to description if --invertFilterExpression * * @param description the description * @return the description with inverse prepended if --invert_filter_expression @@ -379,7 +387,7 @@ public class VariantFiltration extends RodWalker { final VariantContextBuilder builder = new VariantContextBuilder(vc); // make new Genotypes based on filters - if ( !genotypeFilterExps.isEmpty() ) { + if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) { GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); // for each genotype, check filters then create a new object @@ -388,12 +396,17 @@ public class VariantFiltration extends RodWalker { final List filters = new ArrayList(); if ( g.isFiltered() ) filters.add(g.getFilters()); + // Add if expression filters the variant context for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) ) filters.add(exp.name); } - genotypes.add(new GenotypeBuilder(g).filters(filters).make()); + // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call + if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) + genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make()); + else + genotypes.add(new GenotypeBuilder(g).filters(filters).make()); } else { genotypes.add(g); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 63971c497..d44fbc84f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -146,7 +146,7 @@ import java.util.*; * -o output.vcf \ * -se 'SAMPLE.+PARC' \ * -select "QD > 10.0" - * --invert_selection + * -invertSelect * * *

Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):

@@ -207,7 +207,7 @@ import java.util.*; * -sn mySample * * - *

Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more

+ *

Generating a VCF of all the variants that are mendelian violations. The optional argument '-mvq' restricts the selection to sites that have a QUAL score of 50 or more

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -218,14 +218,14 @@ import java.util.*;
  *   -o violations.vcf
  * 
* - *

Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less

+ *

Generating a VCF of all the variants that are not mendelian violations. The optional argument '-mvq' together with '-invMv' restricts the selection to sites that have a QUAL score of 50 or less

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
  *   -R reference.fasta \
  *   -V input.vcf \
  *   -ped family.ped \
- *   -mv -mvq 50 -inv_mv \
+ *   -mv -mvq 50 -invMv \
  *   -o violations.vcf
  * 
* @@ -275,7 +275,7 @@ import java.util.*; * *

Select IDs in fileKeep and exclude IDs in fileExclude:

*
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * java -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
  *   -T SelectVariants \
  *   --variant input.vcf \
@@ -284,9 +284,35 @@ import java.util.*;
  *   -excludeIDs fileExclude
  * 
* + *

Select sites where there are between 2 and 5 samples and between 10 and 50 percent of the sample genotypes are filtered:

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   --maxFilteredGenotypes 5
+ *   --minFilteredGenotypes 2
+ *   --maxFractionFilteredGenotypes 0.60
+ *   --minFractionFilteredGenotypes 0.10
+ * 
+ * + *

Set filtered genotypes to no-call (./.):

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   --setFilteredGenotypesToNocall
+ * 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class SelectVariants extends RodWalker implements TreeReducible { + static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE; + static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0; + static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0; + static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0; + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -359,10 +385,10 @@ public class SelectVariants extends RodWalker implements TreeR public ArrayList selectExpressions = new ArrayList<>(); /** - * Invert the selection criteria for --select. + * Invert the selection criteria for -select. */ - @Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false) - protected boolean invertSelection = false; + @Argument(shortName="invertSelect", doc="Invert the selection criteria for -select", required=false) + protected boolean invertSelect = false; /* * If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none @@ -436,8 +462,8 @@ public class SelectVariants extends RodWalker implements TreeR * determined on the basis of family structure. Requires passing a pedigree file using the engine-level * `-ped` argument. */ - @Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false) - private Boolean invert_mendelianViolations = false; + @Argument(fullName="invertMendelianViolation", shortName="invMv", doc="Output non-mendelian violation sites only", required=false) + private Boolean invertMendelianViolations = false; /** * This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order @@ -514,6 +540,36 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include") private int minIndelSize = 0; + /** + * If this argument is provided, select sites where at most a maximum number of samples are filtered at the genotype level. + */ + @Argument(fullName="maxFilteredGenotypes", required=false, doc="Maximum number of samples filtered at the genotype level") + private int maxFilteredGenotypes = MAX_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where at least a minimum number of samples are filtered at the genotype level. + */ + @Argument(fullName="minFilteredGenotypes", required=false, doc="Minimum number of samples filtered at the genotype level") + private int minFilteredGenotypes = MIN_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where a fraction or less of the samples are filtered at the genotype level. + */ + @Argument(fullName="maxFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level") + private double maxFractionFilteredGenotypes = MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where a fraction or more of the samples are filtered at the genotype level. + */ + @Argument(fullName="minFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level") + private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, set filtered genotypes to no-call (./.). + */ + @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call") + private boolean setFilteredGenotypesToNocall = false; + @Hidden @Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.") private boolean allowNonOverlappingCommandLineSamples = false; @@ -547,6 +603,8 @@ public class SelectVariants extends RodWalker implements TreeR private Set IDsToRemove = null; private Map vcfRods; + private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ @@ -619,7 +677,7 @@ public class SelectVariants extends RodWalker implements TreeR selectedTypes.add(t); } - // remove specifed types + // remove specified types for (VariantContext.Type t : typesToExclude) selectedTypes.remove(t); @@ -713,16 +771,16 @@ public class SelectVariants extends RodWalker implements TreeR for (VariantContext vc : vcs) { // an option for performance testing only - if ( fullyDecode ) - vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() ); + if (fullyDecode) + vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing()); - if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) ) + if (IDsToKeep != null && !IDsToKeep.contains(vc.getID())) continue; - if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) ) + if (IDsToRemove != null && IDsToRemove.contains(vc.getID())) continue; - if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) ) + if (mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invertMendelianViolations)) break; if (discordanceOnly) { @@ -745,20 +803,30 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) ) + if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize)) continue; + if ( needNumFilteredGenotypes()) { + int numFilteredSamples = numFilteredGenotypes(vc); + double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / samples.size(); + if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes || + fractionFilteredGenotypes > maxFractionFilteredGenotypes || fractionFilteredGenotypes < minFractionFilteredGenotypes) + continue; + } + VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); + VariantContext filteredGenotypeToNocall = setFilteredGenotypeToNocall(sub, setFilteredGenotypesToNocall); + // Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered - if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) { + if ( (!XLnonVariants || filteredGenotypeToNocall.isPolymorphicInSamples()) && (!XLfiltered || !filteredGenotypeToNocall.isFiltered()) ) { // Write the subsetted variant if it matches all of the expressions boolean failedJexlMatch = false; try { for (VariantContextUtils.JexlVCMatchExp jexl : jexls) { - if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){ + if ( Utils.invertLogic(!VariantContextUtils.match(filteredGenotypeToNocall, jexl), invertSelect) ){ failedJexlMatch = true; break; } @@ -771,7 +839,7 @@ public class SelectVariants extends RodWalker implements TreeR if ( !failedJexlMatch && !justRead && ( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) { - vcfWriter.add(sub); + vcfWriter.add(filteredGenotypeToNocall); } } } @@ -800,6 +868,26 @@ public class SelectVariants extends RodWalker implements TreeR return false; } + /** + * Find the number of filtered samples + * + * @param vc the variant rod VariantContext + * @return number of filtered samples + */ + private int numFilteredGenotypes(final VariantContext vc){ + if (vc == null) + return 0; + + int numFiltered = 0; + // check if we find it in the variant rod + GenotypesContext genotypes = vc.getGenotypes(samples); + for (final Genotype g : genotypes) + if ( g.isFiltered() && !g.getFilters().isEmpty()) + numFiltered++; + + return numFiltered; + } + /** * Checks if vc has a variant call for (at least one of) the samples. * @@ -807,7 +895,7 @@ public class SelectVariants extends RodWalker implements TreeR * @param compVCs the comparison VariantContext (discordance) * @return true VariantContexts are discordant, false otherwise */ - private boolean isDiscordant (VariantContext vc, Collection compVCs) { + private boolean isDiscordant (final VariantContext vc, final Collection compVCs) { if (vc == null) return false; @@ -845,7 +933,7 @@ public class SelectVariants extends RodWalker implements TreeR * @param compVCs the comparison VariantContext * @return true if VariantContexts are concordant, false otherwise */ - private boolean isConcordant (VariantContext vc, Collection compVCs) { + private boolean isConcordant (final VariantContext vc, final Collection compVCs) { if (vc == null || compVCs == null || compVCs.isEmpty()) return false; @@ -876,7 +964,7 @@ public class SelectVariants extends RodWalker implements TreeR return true; } - private boolean sampleHasVariant(Genotype g) { + private boolean sampleHasVariant(final Genotype g) { return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered))); } @@ -945,8 +1033,7 @@ public class SelectVariants extends RodWalker implements TreeR for ( Genotype genotype : newGC ) { //Set genotype to no call if it falls in the fraction. if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make()); + genotypes.add(new GenotypeBuilder(genotype).alleles(diploidNoCallAlleles).noGQ().make()); } else{ genotypes.add(genotype); @@ -966,6 +1053,30 @@ public class SelectVariants extends RodWalker implements TreeR return trimmed; } + /** + * If --setFilteredGtToNocall, set filtered genotypes to no-call + * + * @param vc the VariantContext record to set filtered genotypes to no-call + * @param filteredGenotypesToNocall set filtered genotypes to non-call? + * @return the VariantContext with no-call genotypes if the sample was filtered + */ + private VariantContext setFilteredGenotypeToNocall(final VariantContext vc, final boolean filteredGenotypesToNocall) { + + if ( !filteredGenotypesToNocall ) + return vc; + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); + + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.isCalled() && g.isFiltered() ) + genotypes.add(new GenotypeBuilder(g).alleles(diploidNoCallAlleles).make()); + else + genotypes.add(g); + } + + return builder.genotypes(genotypes).make(); + } /* * Add annotations to the new VC * @@ -1061,4 +1172,16 @@ public class SelectVariants extends RodWalker implements TreeR } return result; } -} + + /** + * Need the number of filtered genotypes samples? + * + * @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise + */ + private boolean needNumFilteredGenotypes(){ + return maxFilteredGenotypes != MAX_FILTERED_GENOTYPES_DEFAULT_VALUE || + minFilteredGenotypes != MIN_FILTERED_GENOTYPES_DEFAULT_VALUE || + maxFractionFilteredGenotypes != MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE || + minFractionFilteredGenotypes != MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java index 18a2d6c12..69ae7a472 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java @@ -246,4 +246,20 @@ public class CatVariantsIntegrationTest { ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine)); pc.execAndCheck(ps); } + + @Test() + public void testCatVariantsGVCFGzIndexCreation() throws IOException{ + + String cmdLine = String.format("java -cp \"%s\" %s -R %s -V %s -V %s -out %s", + StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar), + CatVariants.class.getCanonicalName(), + BaseTest.b37KGReference, + CatVariantsVcf1, + CatVariantsVcf2, + BaseTest.createTempFile("CatVariantsGVCFIndexCreationTest", "." + GATKVCFUtils.GVCF_GZ_EXT)); + + ProcessController pc = ProcessController.getThreadLocal(); + ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine)); + pc.execAndCheck(ps); + } } \ No newline at end of file diff --git a/public/gatk-utils/pom.xml b/public/gatk-utils/pom.xml index fce54e9cc..25241183c 100644 --- a/public/gatk-utils/pom.xml +++ b/public/gatk-utils/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java index 22c7127c2..d0eb0b9f8 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java @@ -165,7 +165,12 @@ public class SequenceDictionaryUtils { } case OUT_OF_ORDER: { - UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2); + UserException ex = new UserException.IncompatibleSequenceDictionaries( + "The contig order in " + name1 + " and " + name2 + "is not " + + "the same; to fix this please see: " + + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), " + + " which describes reordering contigs in BAM and VCF files.", + name1, dict1, name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); else diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java index d2c11407a..f21975cce 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java @@ -28,7 +28,6 @@ package org.broadinstitute.gatk.utils.refdata; import htsjdk.samtools.util.SequenceUtil; import htsjdk.tribble.Feature; import htsjdk.tribble.annotation.Strand; -import htsjdk.tribble.dbsnp.OldDbSNPFeature; import htsjdk.tribble.gelitext.GeliTextFeature; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -111,139 +110,6 @@ public class VariantContextAdaptors { } } - // -------------------------------------------------------------------------------------------------------------- - // - // dbSNP to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class DBSnpAdaptor implements VCAdaptor { - private static boolean isSNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); - } - - private static boolean isMNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); - } - - private static boolean isInsertion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("insertion"); - } - - private static boolean isDeletion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("deletion"); - } - - private static boolean isIndel(OldDbSNPFeature feature) { - return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); - } - - public static boolean isComplexIndel(OldDbSNPFeature feature) { - return feature.getVariantType().contains("in-del"); - } - - /** - * gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference base. This is returned as a string list with no guarantee ordering - * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest - * frequency). - * - * @return an alternate allele list - */ - public static List getAlternateAlleleList(OldDbSNPFeature feature) { - List ret = new ArrayList(); - for (String allele : getAlleleList(feature)) - if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); - return ret; - } - - /** - * gets the alleles. This method should return all the alleles present at the location, - * including the reference base. The first allele should always be the reference allele, followed - * by an unordered list of alternate alleles. - * - * @return an alternate allele list - */ - public static List getAlleleList(OldDbSNPFeature feature) { - List alleleList = new ArrayList(); - // add ref first - if ( feature.getStrand() == Strand.POSITIVE ) - alleleList = Arrays.asList(feature.getObserved()); - else - for (String str : feature.getObserved()) - alleleList.add(SequenceUtil.reverseComplement(str)); - if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) - && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) - Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); - - return alleleList; - } - - /** - * Converts non-VCF formatted dbSNP records to VariantContext. - * @return OldDbSNPFeature. - */ - @Override - public Class getAdaptableFeatureType() { return OldDbSNPFeature.class; } - - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - OldDbSNPFeature dbsnp = (OldDbSNPFeature)input; - - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - - final byte refBaseForIndel = ref.getBases()[index]; - final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); - - boolean addPaddingBase; - if ( isSNP(dbsnp) || isMNP(dbsnp) ) - addPaddingBase = false; - else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); - else - return null; // can't handle anything else - - Allele refAllele; - if ( refBaseIsDash ) - refAllele = Allele.create(refBaseForIndel, true); - else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) - return null; - else - refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true); - - final List alleles = new ArrayList(); - alleles.add(refAllele); - - // add all of the alt alleles - for ( String alt : getAlternateAlleleList(dbsnp) ) { - if ( Allele.wouldBeNullAllele(alt.getBytes())) - alt = ""; - else if ( ! Allele.acceptableAlleleBases(alt) ) - return null; - - alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false)); - } - - final VariantContextBuilder builder = new VariantContextBuilder(); - builder.source(name).id(dbsnp.getRsID()); - builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 1 : 0)); - builder.alleles(alleles); - return builder.make(); - } - - private static List stripNullDashes(final List alleles) { - final List newAlleles = new ArrayList(alleles.size()); - for ( final String allele : alleles ) { - if ( allele.equals("-") ) - newAlleles.add(""); - else - newAlleles.add(allele); - } - return newAlleles; - } - } // -------------------------------------------------------------------------------------------------------------- // diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 95cf3e593..a7e9bf0de 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -51,10 +51,14 @@ public final class GATKVCFConstants { public static final String SPANNING_DELETIONS_KEY = "Dels"; public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants public static final String DOWNSAMPLED_KEY = "DS"; + public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNT"; //M2 + public static final String EVENT_DISTANCE_MAX_KEY = "MAX_ED"; //M2 + public static final String EVENT_DISTANCE_MIN_KEY = "MIN_ED"; //M2 public static final String FISHER_STRAND_KEY = "FS"; public static final String GC_CONTENT_KEY = "GC"; public static final String GQ_MEAN_KEY = "GQ_MEAN"; public static final String GQ_STDEV_KEY = "GQ_STDDEV"; + public static final String HAPLOTYPE_COUNT_KEY = "HCNT"; //M2 public static final String HAPLOTYPE_SCORE_KEY = "HaplotypeScore"; public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo"; public static final String HOMOPOLYMER_RUN_KEY = "HRun"; @@ -80,8 +84,10 @@ public final class GATKVCFConstants { public static final String ORIGINAL_CONTIG_KEY = "OriginalChr"; //LiftoverVariants public static final String ORIGINAL_START_KEY = "OriginalStart"; //LiftoverVariants public static final String N_BASE_COUNT_KEY = "PercentNBase"; + public static final String NORMAL_LOD_KEY = "NLOD"; //M2 public static final String RBP_INCONSISTENT_KEY = "PhasingInconsistent"; //ReadBackedPhasing public static final String GENOTYPE_PRIOR_KEY = "PG"; + public static final String PANEL_OF_NORMALS_COUNT_KEY = "PON"; //M2 public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; public static final String QUAL_BY_DEPTH_KEY = "QD"; public static final String BEAGLE_R2_KEY = "R2"; //BeagleOutputToVCF @@ -93,11 +99,18 @@ public final class GATKVCFConstants { public static final String STRAND_ODDS_RATIO_KEY = "SOR"; public static final String STR_PRESENT_KEY = "STR"; public static final String TRANSMISSION_DISEQUILIBRIUM_KEY = "TDT"; + public static final String TUMOR_LOD_KEY = "TLOD"; //M2 public static final String VARIANT_TYPE_KEY = "VariantType"; public static final String VQS_LOD_KEY = "VQSLOD"; + public static final String OXOG_ALT_F1R2_KEY = "ALT_F1R2"; + public static final String OXOG_ALT_F2R1_KEY = "ALT_F2R1"; + public static final String OXOG_REF_F1R2_KEY = "REF_F1R2"; + public static final String OXOG_REF_F2R1_KEY = "REF_F2R1"; + public static final String OXOG_FRACTION_KEY = "FOXOG"; //FORMAT keys public static final String ALLELE_BALANCE_KEY = "AB"; + public static final String ALLELE_FRACTION_KEY = "AF"; //M2 public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; public static final String RBP_HAPLOTYPE_KEY = "HP"; //ReadBackedPhasing public static final String AVG_INTERVAL_DP_BY_SAMPLE_KEY = "IDP"; //DiagnoseTargets @@ -110,6 +123,7 @@ public final class GATKVCFConstants { public static final String HAPLOTYPE_CALLER_PHASING_GT_KEY = "PGT"; public static final String HAPLOTYPE_CALLER_PHASING_ID_KEY = "PID"; public static final String PHRED_SCALED_POSTERIORS_KEY = "PP"; //FamilyLikelihoodsUtils / PosteriorLikelihoodsUtils + public static final String QUALITY_SCORE_SUM_KEY = "QSS"; //M2 public static final String REFERENCE_GENOTYPE_QUALITY = "RGQ"; public static final String STRAND_COUNT_BY_SAMPLE_KEY = "SAC"; public static final String STRAND_BIAS_BY_SAMPLE_KEY = "SB"; @@ -120,13 +134,21 @@ public final class GATKVCFConstants { /* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic, their names (or descriptions) depend on some threshold. Those filters are not included here */ - public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC"; - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String ALT_ALLELE_IN_NORMAL_FILTER_NAME = "alt_allele_in_normal"; //M2 + public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC"; + public static final String CLUSTERED_EVENTS_FILTER_NAME = "clustered_events"; //M2 + public static final String GERMLINE_RISK_FILTER_NAME = "germline_risk"; //M2 + public static final String HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME = "homologous_mapping_event"; //M2 + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME = "multi_event_alt_allele_in_normal"; //M2 + public static final String PON_FILTER_NAME = "panel_of_normals"; //M2 + public static final String STR_CONTRACTION_FILTER_NAME = "str_contraction"; //M2 + public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2 // Symbolic alleles public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT"; public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL"; - public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site + public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED = "*:DEL"; + public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED+">", false); // represents any possible spanning deletion allele at this site } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index 89b9510d2..fff7ea5f1 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -25,16 +25,11 @@ package org.broadinstitute.gatk.utils.variant; -import htsjdk.variant.vcf.VCFFilterHeaderLine; -import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.*; import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** * This class contains the VCFHeaderLine definitions for the annotation keys in GATKVCFConstants. @@ -66,6 +61,16 @@ public class GATKVCFHeaderLines { addFilterLine(new VCFFilterHeaderLine(LOW_QUAL_FILTER_NAME, "Low quality")); addFilterLine(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_NAME, "This site was set to monomorphic by Beagle")); + // M2-related filters + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Evidence seen in the normal sample")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME, "Clustered events observed in the tumor")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME, "Evidence indicates this site is germline, not somatic")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME, "More than three events were observed in the tumor")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Multiple events observed in tumor and normal")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.PON_FILTER_NAME, "Seen in at least 2 samples in the panel of normals")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short tandem repeat region")); + addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype")); addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); addFormatLine(new VCFFormatHeaderLine(MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); @@ -89,6 +94,15 @@ public class GATKVCFHeaderLines { addFormatLine(new VCFFormatHeaderLine(JOINT_POSTERIOR_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint posterior probability of the genotype combination (after applying family priors)")); addFormatLine(new VCFFormatHeaderLine(ORIGINAL_GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Original Genotype input to Beagle")); + // M2-related info lines + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.ALLELE_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Allele fraction of the event in the tumor")); + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_ALT_F1R2_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting the alternate allele")); + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_ALT_F2R1_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting the alternate allele")); + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_REF_F1R2_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting the reference allele")); + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_REF_F2R1_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting the reference allele")); + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Fraction of alt reads indicating OxoG error")); + + addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed")); addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed")); addInfoLine(new VCFInfoHeaderLine(DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); @@ -132,7 +146,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(ORIGINAL_DP_KEY, 1, VCFHeaderLineType.Integer, "Original DP")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record")); - addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); + addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds of being a true variant versus being false under the trained gaussian mixture model")); addInfoLine(new VCFInfoHeaderLine(CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); addInfoLine(new VCFInfoHeaderLine(POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); addInfoLine(new VCFInfoHeaderLine(NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); @@ -147,5 +161,15 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(BEAGLE_AC_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site")); addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site")); addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site")); + + // M2-related info lines + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, 1, VCFHeaderLineType.Integer, "Maximum distance between events in this active region")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, 1, VCFHeaderLineType.String, "Number of haplotypes that support this variant")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.NORMAL_LOD_KEY, 1, VCFHeaderLineType.String, "Normal LOD score")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY, 1, VCFHeaderLineType.String, "Count from Panel of Normals")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_LOD_KEY, 1, VCFHeaderLineType.String, "Tumor LOD score")); + } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 8f3e5c450..445828f58 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -393,7 +393,7 @@ public class GATKVariantContextUtils { if (repetitionCount[0] == 0 || repetitionCount[1] == 0) return null; - if (lengths.size() == 0) { + if (lengths.isEmpty()) { lengths.add(repetitionCount[0]); // add ref allele length only once } lengths.add(repetitionCount[1]); // add this alt allele's length @@ -947,7 +947,7 @@ public class GATKVariantContextUtils { final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC ) { - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + if ( unsortedVCs == null || unsortedVCs.isEmpty() ) return null; if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) @@ -965,7 +965,7 @@ public class GATKVariantContextUtils { VCs.add(vc); } - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + if ( VCs.isEmpty() ) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC @@ -1289,7 +1289,7 @@ public class GATKVariantContextUtils { * @param currentAlleles the list of alleles already created * @return a non-null mapping of original alleles to new (extended) ones */ - private static Map createAlleleMapping(final Allele refAllele, + protected static Map createAlleleMapping(final Allele refAllele, final VariantContext oneVC, final Collection currentAlleles) { final Allele myRef = oneVC.getReference(); @@ -1305,6 +1305,8 @@ public class GATKVariantContextUtils { if ( extended.equals(b) ) extended = b; map.put(a, extended); + } else if ( a.isSymbolic() ) { + map.put(a, a); } } @@ -1696,7 +1698,7 @@ public class GATKVariantContextUtils { // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list vcList.remove(k); // avoid having empty lists - if (vcList.size() == 0) + if (vcList.isEmpty()) mappedVCs.remove(type); if ( !mappedVCs.containsKey(vcType) ) mappedVCs.put(vcType, new ArrayList()); diff --git a/public/gatk-utils/src/main/resources/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so b/public/gatk-utils/src/main/resources/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so index 7913a51f8..e734211ba 100644 Binary files a/public/gatk-utils/src/main/resources/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so and b/public/gatk-utils/src/main/resources/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so differ diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index 860c54736..410bd848f 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -46,6 +46,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Allele ATref; Allele Anoref; Allele GT; + Allele Symbolic; private GenomeLocParser genomeLocParser; @@ -63,6 +64,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { ATref = Allele.create("AT",true); Anoref = Allele.create("A",false); GT = Allele.create("GT",false); + Symbolic = Allele.create("", false); genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); } @@ -1607,5 +1609,25 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { BaseUtils.fillWithRandomBases(bases, 1, bases.length); return bases; } + + @Test + public void testCreateAlleleMapping(){ + final List alleles = Arrays.asList(Aref,Symbolic,T); + final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make(); + Map map = GATKVariantContextUtils.createAlleleMapping(ATref, vc, alleles); + + final List expectedAlleles = Arrays.asList(Allele.create("", false), Allele.create("TT", false)); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ){ + Assert.assertEquals(map.get(vc.getAlternateAlleles().get(i)), expectedAlleles.get(i)); + } + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testCreateAlleleMappingException(){ + final List alleles = Arrays.asList(Aref, Symbolic, T); + final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make(); + // Throws an exception if the ref allele length <= ref allele length to extend + Map map = GATKVariantContextUtils.createAlleleMapping(Aref, vc, alleles); + } } diff --git a/public/gsalib/pom.xml b/public/gsalib/pom.xml index 00a84c6a5..5546ac79c 100644 --- a/public/gsalib/pom.xml +++ b/public/gsalib/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.4 + 3.5-SNAPSHOT ../.. diff --git a/public/package-tests/pom.xml b/public/package-tests/pom.xml index 3683ccb67..2f2e74940 100644 --- a/public/package-tests/pom.xml +++ b/public/package-tests/pom.xml @@ -9,7 +9,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT ../gatk-root diff --git a/public/pom.xml b/public/pom.xml index f4cea4f85..a166255a1 100644 --- a/public/pom.xml +++ b/public/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.4 + 3.5-SNAPSHOT gatk-root