diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index b22ea7931..0da865a85 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -112,18 +112,18 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { - HashMap alleleCounts = new HashMap(); - for ( Allele allele : vc.getAlleles() ) + final HashMap alleleCounts = new HashMap<>(); + for ( final Allele allele : vc.getAlleles() ) alleleCounts.put(allele.getBases()[0], 0); - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - for ( PileupElement p : pileup ) { + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); + for ( final PileupElement p : pileup ) { if ( alleleCounts.containsKey(p.getBase()) ) alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); } // we need to add counts in the correct order - int[] counts = new int[alleleCounts.size()]; + final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); @@ -141,7 +141,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final HashMap alleleCounts = new HashMap<>(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final 
MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); if (! a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index cdbd43a7a..95be967a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -58,6 +58,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -97,6 +99,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( !vc.isVariant() ) return null; + if ( vc.hasGenotypes() ) { + final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); + if ( tableFromPerSampleAnnotations != null ) { + return pValueForBestTable(tableFromPerSampleAnnotations, null); + } + } + if (vc.isSNP() && stratifiedContexts != null) { final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); @@ -117,6 +126,32 @@ public class FisherStrand extends InfoFieldAnnotation 
implements StandardAnnotat return null; } + /** + * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together + * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation + * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation + */ + private int[][] getTableFromSamples( final GenotypesContext genotypes ) { + if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } + + final int[] sbArray = {0,0,0,0}; // forward-reverse -by- alternate-reference + boolean foundData = false; + + for( final Genotype g : genotypes ) { + if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) + continue; + + foundData = true; + final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); + final int[] data = encodeSBBS(sbbsString); + for( int index = 0; index < sbArray.length; index++ ) { + sbArray[index] += data[index]; + } + } + + return ( foundData ? 
decodeSBBS(sbArray) : null ); + } + /** * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 * @@ -148,12 +183,56 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } public List getKeyNames() { - return Arrays.asList(FS); + return Collections.singletonList(FS); } public List getDescriptions() { - return Arrays.asList( - new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + } + + /** + * Helper function to turn the FisherStrand table into the SB annotation array + * @param table the table used by the FisherStrand annotation + * @return the array used by the per-sample Strand Bias annotation + */ + public static int[] getContingencyArray( final int[][] table ) { + if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + final int[] array = new int[4]; // TODO - if we ever want to do something clever with multi-allelic sites this will need to change + array[0] = table[0][0]; + array[1] = table[0][1]; + array[2] = table[1][0]; + array[3] = table[1][1]; + return array; + } + + /** + * Helper function to parse the genotype annotation into the SB annotation array + * @param string the string that is returned by genotype.getAnnotation("SB") + * @return the array used by the per-sample Strand Bias annotation + */ + private static int[] encodeSBBS( final String string ) { + final int[] array = new int[4]; + final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); + for( int index = 0; index < 4; index++ ) { + array[index] = Integer.parseInt(tokenizer.nextToken()); + } + return array; + } + + /** + * 
Helper function to turn the SB annotation array into the FisherStrand table + * @param array the array used by the per-sample Strand Bias annotation + * @return the table used by the FisherStrand annotation + */ + private static int[][] decodeSBBS( final int[] array ) { + if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } + final int[][] table = new int[2][2]; + table[0][0] = array[0]; + table[0][1] = array[1]; + table[1][0] = array[2]; + table[1][1] = array[3]; + return table; } private Double pValueForContingencyTable(int[][] originalTable) { @@ -284,13 +363,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } + if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + final Allele ref = vc.getReference(); final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); - int[][] table = new int[2][2]; + final int[][] table = new int[2][2]; - for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); final int representativeCount = read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index bdf37df71..da2143ec1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -76,8 +76,10 @@ import java.util.*; public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; + private static final String INBREEDING_COEFFICIENT_KEY_NAME = "InbreedingCoeff"; private Set founderIds; + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -92,15 +94,15 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) + if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) return null; int idxAA = 0, idxAB = 1, idxBB = 2; if (!vc.isBiallelic()) { // for non-bliallelic case, do test with most common alt allele. - // Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB. + // Get the corresponding indices in GL vectors to retrieve GL of AA,AB and BB. 
+ final int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); idxAA = idxVector[0]; idxAB = idxVector[1]; idxBB = idxVector[2]; @@ -132,12 +134,12 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno final double q = 1.0 - p; // expected alternative allele frequency final double F = 1.0 - ( hetCount / ( 2.0 * p * q * (double)N ) ); // inbreeding coefficient - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.4f", F)); - return map; + return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F)); } - public List getKeyNames() { return Arrays.asList("InbreedingCoeff"); } + @Override + public List getKeyNames() { return Collections.singletonList(INBREEDING_COEFFICIENT_KEY_NAME); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("InbreedingCoeff", 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } + @Override + public List getDescriptions() { return Collections.singletonList(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY_NAME, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java new file mode 100644 index 000000000..fde344e9f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java @@ -0,0 +1,100 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad 
Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; + +import java.util.*; + +/** + * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias + * User: rpoplin + * Date: 8/28/13 + */ + +public class StrandBiasBySample extends GenotypeAnnotation implements ExperimentalAnnotation { + + public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; + + @Override + public void annotate(final 
RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + return; + + if (alleleLikelihoodMap == null ) + throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller"); + + final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); + + gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); + } + + @Override + public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } + + @Override + public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index ddf47805f..6f16a704f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -76,16 +76,13 @@ public class ConsensusAlleleCounter { private final int minIndelCountForGenotyping; private final boolean doMultiAllelicCalls; private final double minFractionInOneSample; - private final GenomeLocParser locParser; - public ConsensusAlleleCounter(final GenomeLocParser locParser, - final boolean doMultiAllelicCalls, + public ConsensusAlleleCounter(final boolean doMultiAllelicCalls, final 
int minIndelCountForGenotyping, final double minFractionInOneSample) { this.minIndelCountForGenotyping = minIndelCountForGenotyping; this.doMultiAllelicCalls = doMultiAllelicCalls; this.minFractionInOneSample = minFractionInOneSample; - this.locParser = locParser; } /** @@ -289,7 +286,7 @@ public class ConsensusAlleleCounter { if (vcs.isEmpty()) return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false); return mergedVC.getAlleles(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index 9c4694955..3cee8f2d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -108,7 +108,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener final List allAllelesToUse){ - List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true); + List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, UAC,true); if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE) alleles = 
alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 0f3f7739d..4a3231b3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -89,9 +89,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected static List computeConsensusAlleles(final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenomeLocParser locParser, final UnifiedArgumentCollection UAC) { - ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); return counter.computeConsensusAlleles(ref, contexts, contextType); } @@ -113,7 +112,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // starting a new site: clear allele list haplotypeMap.clear(); perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods - alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); + alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels); if (alleleList.isEmpty()) return null; } @@ -212,7 +211,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - 
final GenomeLocParser locParser, final UnifiedArgumentCollection UAC, final boolean ignoreSNPAllelesWhenGenotypingIndels) { @@ -244,7 +242,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } else { - alleles = computeConsensusAlleles(ref, contexts, contextType, locParser, UAC); + alleles = computeConsensusAlleles(ref, contexts, contextType, UAC); } return alleles; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java index 063e3b218..4de90b337 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -101,24 +101,26 @@ class ActiveRegionTrimmer { * * @param region our full active region * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position + * @param emitReferenceConfidence are we going to estimate the reference confidence with this active region? 
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully */ - public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) { + public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) { if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region return null; - final List withinActiveRegion = new LinkedList(); - int pad = snpPadding; + final List withinActiveRegion = new LinkedList<>(); + boolean foundNonSnp = false; GenomeLoc trimLoc = null; for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { final GenomeLoc vcLoc = parser.createGenomeLoc(vc); if ( region.getLocation().overlapsP(vcLoc) ) { if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding - pad = nonSnpPadding; + foundNonSnp = true; trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); withinActiveRegion.add(vc); } } + final int pad = ( emitReferenceConfidence || foundNonSnp ? 
nonSnpPadding : snpPadding ); // we don't actually have anything in the region after removing variants that don't overlap the region's full location if ( trimLoc == null ) return null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index cb3b9b65f..cc89792d5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -183,7 +183,7 @@ public class GenotypingEngine { final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false); if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 0b95ed07e..8776a5e4b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -290,7 +290,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * B <= X < C * X >= C * - * The default 
bands give the following GQ blocks: + * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: * * [0, 0] * (0, 10] @@ -304,7 +304,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) - protected List GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50); + protected List GVCFGQBands = Arrays.asList(5, 20, 60); /** * This parameter determines the maximum size of an indel considered as potentially segregating in the @@ -541,7 +541,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this? + // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); @@ -553,11 +553,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; - simpleUAC.CONTAMINATION_FRACTION_FILE=null; + simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - if( UAC.CONTAMINATION_FRACTION_FILE !=null) { + if( UAC.CONTAMINATION_FRACTION_FILE != null ) { UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); } @@ -867,17 +867,17 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Create ReadErrorCorrector object if requested - will be used within assembly engine. 
ReadErrorCorrector readErrorCorrector = null; if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG,fullReferenceWithPadding); + readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); try { - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); - if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) { + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype, readErrorCorrector ); + if ( ! dontTrimActiveRegions ) { return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); } else { // we don't want to trim active regions, so go ahead and use the old one return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); } - } catch ( Exception e ) { + } catch ( final Exception e ) { // Capture any exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); @@ -969,7 +969,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG); final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes); allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype); - final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, 
allVariantsWithinFullActiveRegion); + final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion, false); // TODO -- should pass emitReferenceConfidence() if ( trimmedActiveRegion == null ) { // there were no variants found within the active region itself, so just return null @@ -1001,7 +1001,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } - // trim down the reads and add them to the trimmed active region final List trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size()); for( final GATKSAMRecord read : originalActiveRegion.getReads() ) { @@ -1096,7 +1095,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In activeRegion.addAll(downsampledReads); } - private Set filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private Set filterNonPassingReads( final ActiveRegion activeRegion ) { final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { @@ -1107,7 +1106,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return readsToRemove; } - private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java index ee7565282..8fb7afec7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java @@ -68,5 +68,13 @@ final class RefVsAnyResult { /** * @return Get the DP (sum of AD values) */ - public int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } + protected int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } + + /** + * Cap the het and hom var likelihood values by the hom ref likelihood. + */ + protected void capByHomRefLikelihood() { + genotypeLikelihoods[1] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[1]); + genotypeLikelihoods[2] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[2]); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java index 98264d4c2..2c123880d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -61,6 +61,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; import org.broadinstitute.variant.vcf.VCFHeaderLine; @@ -81,10 +82,9 @@ import java.util.*; * Time: 12:52 PM */ public class 
ReferenceConfidenceModel { - public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; - public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - public final static String INDEL_INFORMATIVE_DEPTH = "CD"; + //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now + public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele private final GenomeLocParser genomeLocParser; private final Set samples; @@ -94,6 +94,8 @@ public class ReferenceConfidenceModel { private final static boolean WRITE_DEBUGGING_BAM = false; private final SAMFileWriter debuggingWriter; + private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; + /** * Create a new ReferenceConfidenceModel * @@ -124,6 +126,8 @@ public class ReferenceConfidenceModel { } else { debuggingWriter = null; } + + initializeIndelPLCache(); } /** @@ -132,8 +136,9 @@ public class ReferenceConfidenceModel { */ public Set getVCFHeaderLines() { final Set headerLines = new LinkedHashSet<>(); - headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); - headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? 
+ headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); + //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); return headerLines; } @@ -161,7 +166,7 @@ public class ReferenceConfidenceModel { * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence - * under any position is covers (for snps that 1 bp, but for deletion its the entire ref span) + * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence * contexts as well as calls from variantCalls if any were provided */ @@ -181,7 +186,7 @@ public class ReferenceConfidenceModel { if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); final GenomeLoc refSpan = activeRegion.getLocation(); - final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); final byte[] ref = refHaplotype.getBases(); final List results = new ArrayList<>(refSpan.size()); final String sampleName = 
stratifiedReadMap.keySet().iterator().next(); @@ -201,9 +206,10 @@ public class ReferenceConfidenceModel { final int refOffset = offset + globalRefOffset; final byte refBase = ref[refOffset]; final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + homRefCalc.capByHomRefLikelihood(); final Allele refAllele = Allele.create(refBase, true); - final List refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE); + final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles); final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); gb.AD(homRefCalc.AD_Ref_Any); @@ -224,7 +230,7 @@ public class ReferenceConfidenceModel { gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); gb.PL(leastConfidenceGLs.getAsPLs()); - gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); + //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); vcb.genotypes(gb.make()); results.add(vcb.make()); @@ -252,14 +258,21 @@ public class ReferenceConfidenceModel { * @return non-null GenotypeLikelihoods given N */ protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { - // TODO -- optimization -- this could easily be optimized with some caching - final double homRef = 0.0; - final double het = - LOG10_2 * nInformativeReads; - final double homVar = INDEL_ERROR_RATE * nInformativeReads; - return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? 
MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; + } + + protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway + private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + + private void initializeIndelPLCache() { + for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { + final double homRef = 0.0; + final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; + final double homVar = INDEL_ERROR_RATE * nInformativeReads; + indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + } } - private final static double LOG10_2 = Math.log10(2); - private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp /** * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt @@ -274,8 +287,8 @@ public class ReferenceConfidenceModel { final RefVsAnyResult result = new RefVsAnyResult(); for( final PileupElement p : pileup ) { - final byte qual = p.getQual(); - if( p.isDeletion() || qual > minBaseQual) { + final byte qual = (p.isDeletion() ? 
REF_MODEL_DELETION_QUAL : p.getQual()); + if( p.isDeletion() || qual > minBaseQual ) { int AA = 0; final int AB = 1; int BB = 2; if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { AA = 2; @@ -302,20 +315,37 @@ public class ReferenceConfidenceModel { private List getPileupsOverReference(final Haplotype refHaplotype, final Collection calledHaplotypes, final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, final GenomeLoc activeRegionSpan, final Map stratifiedReadMap) { - final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); - final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); - writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves - writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? 
Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); - final List realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + + List realignedReads; + + if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner + realignedReads = activeRegion.getReads(); + } else { + final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); + final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); + writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves + writer.setOnlyRealignInformativeReads(true); + writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); + realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + } if ( debuggingWriter != null ) for ( final GATKSAMRecord read : realignedReads ) debuggingWriter.addAlignment(read); final 
LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, - false, genomeLocParser, samples, false); + true, genomeLocParser, samples, false); final List pileups = new LinkedList<>(); final int startPos = activeRegionSpan.getStart(); @@ -378,7 +408,7 @@ public class ReferenceConfidenceModel { final byte refBase = refBases[refStart + i]; if ( readBase != refBase ) { sum += readQuals[readStart + i]; - if ( sum > maxSum ) + if ( sum > maxSum ) // abort early return sum; } } @@ -403,7 +433,10 @@ public class ReferenceConfidenceModel { final byte[] refBases, final int refStart, final int maxIndelSize) { - // todo -- fast exit when n bases left < maxIndelSize + // fast exit when n bases left < maxIndelSize + if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { + return false; + } final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); @@ -445,12 +478,16 @@ public class ReferenceConfidenceModel { final int offset = p.getOffset(); // doesn't count as evidence - if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() ) + if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) continue; // todo -- this code really should handle CIGARs directly instead of relying on the above tests - if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize)) + if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { nInformative++; + if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { + return MAX_N_INDEL_INFORMATIVE_READS; + } + } } return nInformative; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java index 8f509b36b..8ee3c166c 100644 --- 
a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -230,6 +230,7 @@ public class GVCFWriter implements VariantContextWriter { gb.DP(block.getMedianDP()); gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); + gb.PL(block.getMinPLs()); return vcb.genotypes(gb.make()).make(); } diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java index 282e49217..ebd167a31 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -69,10 +69,11 @@ import java.util.List; */ final class HomRefBlock { private final VariantContext startingVC; - int stop; + private int stop; private final int minGQ, maxGQ; - private List GQs = new ArrayList<>(100); - private List DPs = new ArrayList<>(100); + private int[] minPLs = null; + final private List GQs = new ArrayList<>(100); + final private List DPs = new ArrayList<>(100); private final Allele ref; /** @@ -116,9 +117,23 @@ final class HomRefBlock { public void add(final int pos, final Genotype g) { if ( g == null ) throw new IllegalArgumentException("g cannot be null"); if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); + if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); if ( ! 
g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); + if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs = PL.clone(); + } + } else { // otherwise take the min with the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs[0] = Math.min(minPLs[0], PL[0]); + minPLs[1] = Math.min(minPLs[1], PL[1]); + minPLs[2] = Math.min(minPLs[2], PL[2]); + } + } stop = pos; GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission DPs.add(g.getDP()); @@ -141,6 +156,8 @@ final class HomRefBlock { public int getMinDP() { return MathUtils.arrayMin(DPs); } /** Get the median DP observed within this band */ public int getMedianDP() { return MathUtils.median(DPs); } + /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */ + public int[] getMinPLs() { return minPLs; } protected int getGQUpperBound() { return maxGQ; } protected int getGQLowerBound() { return minGQ; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index c298485f6..1e6d7c13c 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -46,7 +46,6 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; -import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; diff 
--git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 509399fd9..df523b74d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -77,8 +77,9 @@ public abstract class HaplotypeBAMWriter { protected final static String READ_GROUP_ID = "ArtificialHaplotype"; protected final static String HAPLOTYPE_TAG = "HC"; - final ReadDestination output; - boolean writeHaplotypesAsWell = true; + private final ReadDestination output; + private boolean writeHaplotypesAsWell = true; + private boolean onlyRealignInformativeReads = false; /** * Possible modes for writing haplotypes to BAMs @@ -181,9 +182,16 @@ public abstract class HaplotypeBAMWriter { final Haplotype haplotype, final int referenceStart, final boolean isInformative) { - final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); - if ( alignedToRef != null ) - output.add(alignedToRef); + if( onlyRealignInformativeReads && !isInformative ) { + if( originalRead != null ) { + output.add(originalRead); + } + } else { + final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); + if ( alignedToRef != null ) { + output.add(alignedToRef); + } + } } /** @@ -305,7 +313,15 @@ public abstract class HaplotypeBAMWriter { return writeHaplotypesAsWell; } - public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) { + public void setWriteHaplotypesAsWell(final boolean writeHaplotypesAsWell) { this.writeHaplotypesAsWell = writeHaplotypesAsWell; } + + public boolean getOnlyRealignInformativeReads() { + return onlyRealignInformativeReads; + } + + public void setOnlyRealignInformativeReads(final boolean 
onlyRealignInformativeReads) { + this.onlyRealignInformativeReads = onlyRealignInformativeReads; + } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 37dc7adba..9f8b72c1d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -46,14 +46,25 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.Arrays; public class VariantAnnotatorIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static String baseTestString() { return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; } @@ -290,4 +301,48 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("Testing InbreedingCoeff annotation with PED file", spec); } + @Test + public void testStrandBiasBySample() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, 
Arrays.asList("")); + final File outputVCF = executeTest("testStrandBiasBySample", spec).getFirst().get(0); + + final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; + final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); + specNoFS.disableShadowBCF(); + final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); + + // confirm that the FisherStrand values are identical for the two pipelines + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("FS")); + Assert.assertTrue(vcAnn.hasAttribute("FS")); + 
Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), vcAnn.getAttributeAsDouble("FS", -1.0)); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index 355a47cbc..445864380 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -129,7 +129,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { } private List getConsensusAlleles(int eventLength, boolean isInsertion, int minCnt, double minFraction, String altBases) { - final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(pileupProvider.genomeLocParser, true, minCnt, minFraction); + final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, minCnt, minFraction); return counter.computeConsensusAlleles(pileupProvider.referenceContext, pileupProvider.getAlignmentContextFromAlleles(isInsertion?eventLength:-eventLength,altBases,numReadsPerAllele), AlignmentContextUtils.ReadOrientation.COMPLETE); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index e73c04d2c..e430cd8d1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -57,18 +57,18 @@ import java.util.List; public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { 
@DataProvider(name = "MyDataProvider") public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e32b7fc4de29ed141dcafc0d789d5ed6"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ecac86e8ef4856e6dfa306c436e9b545"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9b9923ef41bfc7346c905fdecf918f92"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "7828256b82df377cc3a26a55dbf68f91"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e41e0acf172a994e938a150390badd39"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"}); return tests.toArray(new Object[][]{}); diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5824e905f..f7056ef58 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -136,16 +136,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { - final List> VCs = new ArrayList>(); + final List> VCs = new ArrayList<>(); try { for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { - VCs.add(new Pair(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); } } catch( IOException e ) { throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); } - final Set> VCsAsSet = new HashSet>(VCs); + final Set> VCsAsSet = new HashSet<>(VCs); return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. 
} @@ -233,7 +233,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("f3e636d64042e766cc6515987e85a968")); + Arrays.asList("a43d6226a51eb525f0774f88e3778189")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -256,7 +256,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb")); + Arrays.asList("19c2992541ede7407192660fdc1fadbf")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @@ -264,7 +264,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("16f7ffa063511c70bad795639a1c2638")); + Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 
3402b73f0..21648b2b9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -58,10 +58,10 @@ import java.util.List; public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @DataProvider(name = "NCTDataProvider") public Object[][] makeNCTDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "e4bf389676fa090c95980349310ba5ca"}); + tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"}); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 3a4ed7e59..5c6ae93e5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -99,7 +99,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { @DataProvider(name = "CalcNIndelInformativeReadsData") public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); { // very basic testing final String ref = "ACGT"; @@ -187,7 +187,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); - for ( int i = 1; i < 10000; i++ ) { + for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { final GenotypeLikelihoods current = model.getIndelPLs(i); final double prevGQ = -10 * 
prev.getLog10GQ(GenotypeType.HOM_REF); final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); @@ -379,7 +379,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); Assert.assertFalse(refModel.hasLog10PError()); Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); - Assert.assertEquals(refModel.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); Assert.assertTrue(refModel.hasGenotype(sample)); final Genotype g = refModel.getGenotype(sample); @@ -388,7 +388,6 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(g.getDP(), expectedDP); Assert.assertTrue(g.hasGQ()); Assert.assertTrue(g.hasPL()); - Assert.assertTrue(g.hasExtendedAttribute(ReferenceConfidenceModel.INDEL_INFORMATIVE_DEPTH)); } final VariantContext vc = call == null ? refModel : call; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 917cbd542..66bc74caa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -67,7 +67,11 @@ public class CombineVariantsIntegrationTest extends WalkerTest { // TODO TODO TODO TODO TODO TODO TODO TODO // private static String baseTestString(String args) { - return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; + return baseTestString(args, b36KGReference); + } + + private static String baseTestString(String args, String ref) { + return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + 
args; //return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; } @@ -181,6 +185,19 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } + @Test public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -multipleAllelesMergeType MIX_TYPES" + + " --excludeNonVariants -combineAnnotations -setKey null" + + " -L 20:10,000,000-10,001,000", b37KGReference), + 1, + Arrays.asList("2e15db35359144683f1e58e147362679")); + cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); + } + @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java index ffbc3c43f..e353739e5 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -49,7 +49,6 @@ package org.broadinstitute.sting.utils.gvcf; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import 
org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; @@ -91,7 +90,7 @@ public class GVCFWriterUnitTest extends BaseTest { private List standardPartition = Arrays.asList(1, 10, 20); private Allele REF = Allele.create("N", true); private Allele ALT = Allele.create("A"); - private List ALLELES = Arrays.asList(REF, ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); private final String SAMPLE_NAME = "XXYYZZ"; @BeforeMethod @@ -223,10 +222,10 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getStart(), start); Assert.assertEquals(vc.getEnd(), stop); if ( nonRef ) { - Assert.assertNotEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); } else { Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1); Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); Assert.assertTrue(vc.hasGenotypes()); @@ -234,8 +233,9 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getGenotypes().size(), 1); final Genotype g = vc.getGenotype(SAMPLE_NAME); Assert.assertEquals(g.hasAD(), false); - Assert.assertEquals(g.hasLikelihoods(), false); - Assert.assertEquals(g.hasPL(), false); + Assert.assertEquals(g.hasLikelihoods(), true); + Assert.assertEquals(g.hasPL(), true); + Assert.assertEquals(g.getPL().length == 3, true); Assert.assertEquals(g.hasDP(), true); Assert.assertEquals(g.hasGQ(), true); } @@ -307,7 +307,7 @@ public class GVCFWriterUnitTest extends 
BaseTest { @DataProvider(name = "BandPartitionData") public Object[][] makeBandPartitionData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); tests.add(new Object[]{null, false}); tests.add(new Object[]{Collections.emptyList(), false}); diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java index 239aa93b5..ec4797f3d 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java @@ -85,32 +85,34 @@ public class HomRefBlockUnitTest extends BaseTest { @Test public void testMinMedian() { + //TODO - might be better to make this test use a data provider? final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); int pos = vc.getStart(); - band.add(pos++, gb.DP(10).GQ(11).make()); + band.add(pos++, gb.DP(10).GQ(11).PL(new int[]{0,11,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 10, 11, 11); - band.add(pos++, gb.DP(11).GQ(10).make()); + band.add(pos++, gb.DP(11).GQ(10).PL(new int[]{0,10,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 11, 10, 11); - band.add(pos++, gb.DP(12).GQ(12).make()); + band.add(pos++, gb.DP(12).GQ(12).PL(new int[]{0,12,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 11, 10, 11); - band.add(pos++, gb.DP(13).GQ(15).make()); + band.add(pos++, gb.DP(13).GQ(15).PL(new int[]{0,15,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(14).GQ(16).make()); + band.add(pos++, gb.DP(14).GQ(16).PL(new int[]{0,16,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(15).GQ(17).make()); + band.add(pos++, gb.DP(15).GQ(17).PL(new int[]{0,17,100}).make()); 
Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(16).GQ(18).make()); + band.add(pos++, gb.DP(16).GQ(18).PL(new int[]{0,18,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 13, 10, 15); Assert.assertEquals(band.getSize(), pos - vc.getStart()); + Assert.assertTrue(Arrays.equals(band.getMinPLs(), new int[]{0,10,100})); } @Test @@ -118,7 +120,7 @@ public class HomRefBlockUnitTest extends BaseTest { final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); - band.add(vc.getStart(), gb.DP(1000).GQ(1000).make()); + band.add(vc.getStart(), gb.DP(1000).GQ(1000).PL(new int[]{0,10,100}).make()); assertValues(band, 1000, 1000, 99, 99); } @@ -127,7 +129,7 @@ public class HomRefBlockUnitTest extends BaseTest { final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); - band.add(vc.getStart() + 10, gb.DP(10).GQ(11).make()); + band.add(vc.getStart() + 10, gb.DP(10).GQ(11).PL(new int[]{0,10,100}).make()); } private void assertValues(final HomRefBlock band, final int minDP, final int medianDP, final int minGQ, final int medianGQ) { @@ -140,7 +142,7 @@ public class HomRefBlockUnitTest extends BaseTest { @DataProvider(name = "ContiguousData") public Object[][] makeContiguousData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final String chrMod : Arrays.asList("", ".mismatch") ) { for ( final int offset : Arrays.asList(-10, -1, 0, 1, 10) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index b85365366..f8628bb78 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -86,7 +86,7 @@ public final class TraverseActiveRegions 
extends TraversalEngine workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList<>(); private TAROrderedReadCache myReads = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 10ba4ca17..f2f808cad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; @@ -83,6 +84,7 @@ import java.util.*; @Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) @By(DataSource.REFERENCE) public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @@ -132,21 +134,21 @@ public class VariantAnnotator extends RodWalker implements Ann * See the -list argument to view available annotations. 
*/ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList(); + protected List annotationsToUse = new ArrayList<>(); /** * Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. */ @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(); + protected List annotationsToExclude = new ArrayList<>(); /** * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups. * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs. */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected List annotationGroupsToUse = new ArrayList(); + protected List annotationGroupsToUse = new ArrayList<>(); /** * This option enables you to add annotations from one VCF to another. 
@@ -193,8 +195,8 @@ public class VariantAnnotator extends RodWalker implements Ann } // get the list of all sample names from the variant VCF input rod, if applicable - List rodName = Arrays.asList(variantCollection.variants.getName()); - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); + final List rodName = Arrays.asList(variantCollection.variants.getName()); + final Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); if ( USE_ALL_ANNOTATIONS ) engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit()); @@ -204,23 +206,23 @@ public class VariantAnnotator extends RodWalker implements Ann // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones - Set hInfo = new HashSet(); + final Set hInfo = new HashSet<>(); hInfo.addAll(engine.getVCFAnnotationDescriptions()); - for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } // for the expressions, pull the info header line from the header of the resource rod - for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { + for ( final VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { // special case the ID field if ( expression.fieldName.equals("ID") ) { hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource")); continue; } VCFInfoHeaderLine targetHeaderLine = null; - for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), 
Arrays.asList(expression.binding.getName())) ) { if ( line instanceof VCFInfoHeaderLine ) { - VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; + final VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; if ( infoline.getID().equals(expression.fieldName) ) { targetHeaderLine = infoline; break; @@ -285,7 +287,7 @@ public class VariantAnnotator extends RodWalker implements Ann Map stratifiedContexts; if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup()); - annotatedVCs = new ArrayList(VCs.size()); + annotatedVCs = new ArrayList<>(VCs.size()); for ( VariantContext vc : VCs ) annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 078a36dd9..25e683c2f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -58,15 +58,15 @@ public class VariantAnnotatorEngine { public RodBinding binding; public VAExpression(String fullExpression, List> bindings) { - int indexOfDot = fullExpression.lastIndexOf("."); + final int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); fullName = fullExpression; fieldName = fullExpression.substring(indexOfDot+1); - String bindingName = fullExpression.substring(0, indexOfDot); - for ( RodBinding rod : bindings ) { + final String bindingName = fullExpression.substring(0, indexOfDot); + for ( final RodBinding rod : bindings ) { if ( rod.getName().equals(bindingName) ) { binding = rod; break; @@ -96,7 +96,7 @@ public class VariantAnnotatorEngine { // 
select specific expressions to use public void initializeExpressions(Set expressionsToUse) { // set up the expressions - for ( String expression : expressionsToUse ) + for ( final String expression : expressionsToUse ) requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } @@ -113,15 +113,15 @@ public class VariantAnnotatorEngine { if ( annotationsToExclude.size() == 0 ) return; - List tempRequestedInfoAnnotations = new ArrayList(requestedInfoAnnotations.size()); - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) { + final List tempRequestedInfoAnnotations = new ArrayList<>(requestedInfoAnnotations.size()); + for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) { if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) tempRequestedInfoAnnotations.add(annotation); } requestedInfoAnnotations = tempRequestedInfoAnnotations; - List tempRequestedGenotypeAnnotations = new ArrayList(requestedGenotypeAnnotations.size()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + final List tempRequestedGenotypeAnnotations = new ArrayList<>(requestedGenotypeAnnotations.size()); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) tempRequestedGenotypeAnnotations.add(annotation); } @@ -143,24 +143,24 @@ public class VariantAnnotatorEngine { variantOverlapAnnotator = new VariantOverlapAnnotator(dbSNPBinding, overlapBindings, engine.getGenomeLocParser()); } - public void invokeAnnotationInitializationMethods( Set headerLines ) { - for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { + public void invokeAnnotationInitializationMethods( final Set headerLines ) { + for ( final VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { annotation.initialize(walker, toolkit, headerLines); } - for ( VariantAnnotatorAnnotation annotation : 
requestedGenotypeAnnotations ) { + for ( final VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) { annotation.initialize(walker, toolkit, headerLines); } } public Set getVCFAnnotationDescriptions() { - Set descriptions = new HashSet(); + final Set descriptions = new HashSet<>(); - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) + for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( String db : variantOverlapAnnotator.getOverlapNames() ) { + for ( final String db : variantOverlapAnnotator.getOverlapNames() ) { if ( VCFStandardHeaderLines.getInfoLine(db, false) != null ) descriptions.add(VCFStandardHeaderLines.getInfoLine(db)); else @@ -170,10 +170,10 @@ public class VariantAnnotatorEngine { return descriptions; } - public VariantContext annotateContext(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map stratifiedContexts, - VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { return annotateContext(tracker, ref, stratifiedContexts, vc, null); } @@ -182,20 +182,20 @@ public class VariantAnnotatorEngine { final Map stratifiedContexts, final VariantContext vc, final Map perReadAlleleLikelihoodMap) { - Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // annotate expressions where available annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = 
annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); + for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + final Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); if ( annotationsFromCurrentType != null ) infoAnnotations.putAll(annotationsFromCurrentType); } // generate a new annotated VC - VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process final VariantContext annotated = builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); @@ -210,11 +210,11 @@ public class VariantAnnotatorEngine { final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { if ( !(annotationType instanceof ActiveRegionBasedAnnotation) ) continue; - Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); + final Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); } @@ -244,12 +244,12 @@ public class VariantAnnotatorEngine { } private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { - for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, loc); + for ( final VAExpression expression : requestedExpressions ) { + final Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) 
continue; - VariantContext vc = VCs.iterator().next(); + final VariantContext vc = VCs.iterator().next(); // special-case the ID field if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 45dbc937d..396d5686b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -164,6 +164,9 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) public boolean minimalVCF = false; + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) + public boolean EXCLUDE_NON_VARIANTS = false; + /** * Set to 'null' if you don't want the set field emitted. */ @@ -171,7 +174,7 @@ public class CombineVariants extends RodWalker implements Tree public String SET_KEY = "set"; /** - * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.. + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
*/ @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) public boolean ASSUME_IDENTICAL_SAMPLES = false; @@ -188,6 +191,9 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; + @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false) + public boolean COMBINE_ANNOTATIONS = false; + private List priority = null; /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ @@ -238,7 +244,7 @@ public class CombineVariants extends RodWalker implements Tree throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); if ( PRIORITY_STRING != null){ - priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); + priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); if ( rodNames.size() != priority.size() ) throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); @@ -252,13 +258,16 @@ public class CombineVariants extends RodWalker implements Tree if ( tracker == null ) // RodWalkers can make funky map calls return 0; - Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); // get all of the vcf rods at this locus // 
Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); + Collection potentialRefVCs = tracker.getValues(variants); + potentialRefVCs.removeAll(vcs); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); + potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs); } if ( ASSUME_IDENTICAL_SAMPLES ) { @@ -270,7 +279,7 @@ public class CombineVariants extends RodWalker implements Tree } int numFilteredRecords = 0; - for (VariantContext vc : vcs) { + for (final VariantContext vc : vcs) { if (vc.filtersWereApplied() && vc.isFiltered()) numFilteredRecords++; } @@ -278,16 +287,16 @@ public class CombineVariants extends RodWalker implements Tree if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - List mergedVCs = new ArrayList(); + final List mergedVCs = new ArrayList<>(); if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { - Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); + final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); // TODO -- clean this up in a refactoring // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); - for ( VariantContext.Type type : VariantContext.Type.values() ) { + for ( final VariantContext.Type type : VariantContext.Type.values() ) { if ( VCsByType.containsKey(type) ) { VCsByType.get(type).addAll(refs); break; @@ -296,23 +305,27 @@ public class CombineVariants extends RodWalker implements Tree } // iterate over the types so that it's deterministic - for (VariantContext.Type type : VariantContext.Type.values()) { - if (VCsByType.containsKey(type)) - 
mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + for (final VariantContext.Type type : VariantContext.Type.values()) { + // make sure that it is a variant or in case it is not, that we want to include the sites with no variants + if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { + if (VCsByType.containsKey(type)) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs, + priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + } + } } } else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs, priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); } else { logger.warn("Ignoring all records at site " + ref.getLocus()); } - for ( VariantContext mergedVC : mergedVCs ) { + for ( final VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; @@ -320,9 +333,12 @@ public class CombineVariants extends RodWalker implements Tree final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); // re-compute chromosome counts VariantContextUtils.calculateChromosomeCounts(builder, false); + if ( minimalVCF ) GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(builder.make()); + final VariantContext vc = builder.make(); + if( !EXCLUDE_NON_VARIANTS || 
vc.isPolymorphicInSamples() ) + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3af71eabb..bfae7e94c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -845,17 +845,29 @@ public class MathUtils { } /** - * Compute the median element of the array of integers + * Compute the median element of the list of integers * @param array a list of integers * @return the median element */ - public static int median(final List array) { + public static > T median(final List array) { + /* TODO -- from Valentin + the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). + + But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] + + My suggestion for a solution is then: + + unify median and medianDoubles to public static T median(Collection) + check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. + relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) + In addition, the current median implementation sorts the whole input list which is O(n log n). 
However, finding the ith element (and thus calculating the median) can be done in O(n) + */ if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); final int size = array.size(); if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); else if ( size == 1 ) return array.get(0); else { - final ArrayList sorted = new ArrayList<>(array); + final ArrayList sorted = new ArrayList<>(array); Collections.sort(sorted); return sorted.get(size / 2); } @@ -1405,7 +1417,7 @@ public class MathUtils { * @return */ public static List log10LinearRange(final int start, final int stop, final double eps) { - final LinkedList values = new LinkedList(); + final LinkedList values = new LinkedList<>(); final double log10range = Math.log10(stop - start); if ( start == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index f4c673e61..8a034dde0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -46,7 +46,7 @@ import java.util.List; * Time: 8:54:05 AM */ public class PileupElement implements Comparable { - private final static LinkedList EMPTY_LINKED_LIST = new LinkedList(); + private final static LinkedList EMPTY_LINKED_LIST = new LinkedList<>(); private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java index 3a8afca8c..4cf39d6be 100644 --- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java @@ -30,7 +30,7 @@ import net.sf.samtools.Cigar; /** * Generic 
interface for SmithWaterman calculations * - * This interface allows clients to use a generic SmithWaterman variable, without propogating the specific + * This interface allows clients to use a generic SmithWaterman variable, without propagating the specific * implementation of SmithWaterman throughout their code: * * SmithWaterman sw = new SpecificSmithWatermanImplementation(ref, read, params) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 3bc5da82f..e8c438a53 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -45,7 +45,11 @@ public class GATKVariantContextUtils { public static final int DEFAULT_PLOIDY = 2; public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
- protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + public final static String MERGE_FILTER_PREFIX = "filterIn"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; @@ -108,7 +112,7 @@ public class GATKVariantContextUtils { int averageLengthNum = 0; int averageLengthDenom = 0; int refLength = vc.getReference().length(); - for ( Allele a : vc.getAlternateAlleles() ) { + for ( final Allele a : vc.getAlternateAlleles() ) { int numAllele = vc.getCalledChrCount(a); int alleleSize; if ( a.length() == refLength ) { @@ -182,8 +186,8 @@ public class GATKVariantContextUtils { */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - for ( Allele originalAllele : vc.getAlleles() ) { + HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); + for ( final Allele originalAllele : vc.getAlleles() ) { Allele newAllele; if ( originalAllele.isNoCall() ) newAllele = originalAllele; @@ -195,8 +199,8 @@ public class GATKVariantContextUtils { // create new Genotype objects GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getAlleles() ) { + List newAlleles = new ArrayList<>(); + for ( final Allele allele : genotype.getAlleles() ) { Allele newAllele = alleleMap.get(allele); if ( newAllele == null ) newAllele = Allele.NO_CALL; @@ -267,7 +271,7 @@ 
public class GATKVariantContextUtils { final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList(); + final ArrayList lengths = new ArrayList<>(); for ( final Allele allele : vc.getAlternateAlleles() ) { Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); @@ -317,7 +321,7 @@ public class GATKVariantContextUtils { repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - return new Pair(repetitionCount, repeatUnit); + return new Pair<>(repetitionCount, repeatUnit); } @@ -528,7 +532,7 @@ public class GATKVariantContextUtils { } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) + for ( final int oldIndex : likelihoodIndexesToUse ) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; // might need to re-normalize @@ -718,6 +722,7 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. 
* @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, @@ -728,9 +733,10 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { + final boolean mergeInfoWithMaxAC, + final boolean combineAnnotations ) { int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); + return simpleMerge(unsortedVCs, Collections.emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations); } /** @@ -738,11 +744,12 @@ public class GATKVariantContextUtils { * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name. * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
* * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ * * @param unsortedVCs collection of unsorted VCs + * @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes @@ -751,9 +758,11 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, + final Collection potentialRefVCs, final List priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, @@ -762,7 +771,8 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { + final boolean mergeInfoWithMaxAC, + final boolean combineAnnotations ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -775,12 +785,16 @@ public class GATKVariantContextUtils { final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary - final List VCs = new ArrayList(); + List VCs = new ArrayList<>(); for (final VariantContext vc : preFilteredVCs) { if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) VCs.add(vc); } + + // cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible + VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs); + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -789,17 +803,18 @@ public class GATKVariantContextUtils { final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - final Set alleles = new LinkedHashSet(); - final Set filters = new HashSet(); - final Map attributes = new LinkedHashMap(); - final Set inconsistentAttributes = new HashSet(); - final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id + final Set alleles = new LinkedHashSet<>(); + final Set filters = new HashSet<>(); + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id VariantContext longestVC = first; int depth = 0; int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap(); + final Map attributesWithMaxAC = new LinkedHashMap<>(); + final Map> annotationMap = new LinkedHashMap<>(); double log10PError = CommonInfo.NO_LOG10_PERROR; boolean anyVCHadFiltersApplied = false; VariantContext vcWithMaxAC = null; @@ -811,7 +826,6 @@ public class GATKVariantContextUtils { boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( final VariantContext vc : VCs ) { if ( longestVC.getStart() != vc.getStart() ) throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + 
vc.toString()); @@ -846,10 +860,10 @@ public class GATKVariantContextUtils { if ( vc.hasID() ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a , separator + // lets see if the string contains a "," separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (String alleleCount : alleleCountArray) { + final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (final String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; @@ -866,21 +880,36 @@ public class GATKVariantContextUtils { } for (final Map.Entry p : vc.getAttributes().entrySet()) { - String key = p.getKey(); - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + final String key = p.getKey(); + final Object value = p.getValue(); + boolean badAnnotation = false; + if ( combineAnnotations ) { // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + values.add(stringValue.contains(".") ? Double.parseDouble(stringValue) : Integer.parseInt(stringValue)); + } catch (NumberFormatException e) { + badAnnotation = true; + } + } + if ( ! 
combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); - attributes.put(key, p.getValue()); + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); + } } } } @@ -906,6 +935,12 @@ public class GATKVariantContextUtils { // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! 
p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } } // if at least one record was unfiltered and we want a union, clear all of the filters @@ -922,7 +957,7 @@ public class GATKVariantContextUtils { else if ( variantSources.isEmpty() ) // everyone was reference setValue = MERGE_REF_IN_ALL; else { - final LinkedHashSet s = new LinkedHashSet(); + final LinkedHashSet s = new LinkedHashSet<>(); for ( final VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); @@ -950,7 +985,12 @@ public class GATKVariantContextUtils { if ( anyVCHadFiltersApplied ) { builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); } - builder.attributes(new TreeMap(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + if( combineAnnotations ) { + // unfortunately some attributes are just too dangerous to try to combine together + builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); + builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + } // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); @@ -958,6 +998,68 @@ public class GATKVariantContextUtils { return merged; } + private static final Comparable combineAnnotationValues( final List array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible + * @param VCs the list of VCs in which to fill in symbolic alleles + * @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles + * @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate allele + 
*/ + protected static final List fillInNonRefSymbolicAlleles( final List VCs, final Collection potentialRefVCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); } + if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); } + + final List VCsToReturn = new ArrayList<>(VCs.size()); + boolean containsNonRefSymbolicAllele = false; + VariantContext nonRefVC = null; + for( final VariantContext vc : VCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + containsNonRefSymbolicAllele = true; + } else if ( nonRefVC == null ) { + nonRefVC = vc; + } + if( nonRefVC != null && containsNonRefSymbolicAllele == true ) { + break; // break out so that we don't run over the whole list unnecessarily + } + } + for( final VariantContext vc : potentialRefVCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + containsNonRefSymbolicAllele = true; + VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below + } + } + + if( !containsNonRefSymbolicAllele ) { + return VCs; + } + + for( final VariantContext vc : VCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site + if( nonRefVC != null ) { + final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size()); + int depth = 0; + for( final String sample : vc.getSampleNames() ) { + final Genotype gt = vc.getGenotype(sample); + final ArrayList refAlleles = new ArrayList<>(2); + refAlleles.add(nonRefVC.getReference()); + refAlleles.add(nonRefVC.getReference()); + final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now + depth += ( gt.hasDP() ? 
gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here + genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make()); + } + VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make()); + } + } else { + VCsToReturn.add(vc); + } + } + + return VCsToReturn; + } + private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); @@ -989,8 +1091,8 @@ public class GATKVariantContextUtils { static private Allele determineReferenceAllele(List VCs) { Allele ref = null; - for ( VariantContext vc : VCs ) { - Allele myRef = vc.getReference(); + for ( final VariantContext vc : VCs ) { + final Allele myRef = vc.getReference(); if ( ref == null || ref.length() < myRef.length() ) ref = myRef; else if ( ref.length() == myRef.length() && ! 
ref.equals(myRef) ) @@ -1024,13 +1126,13 @@ public class GATKVariantContextUtils { // System.out.printf("myref %s%n", myRef ); // System.out.printf("extrabases %s%n", new String(extraBases)); - Map map = new HashMap(); - for ( Allele a : vc.getAlleles() ) { + Map map = new HashMap<>(); + for ( final Allele a : vc.getAlleles() ) { if ( a.isReference() ) map.put(a, refAllele); else { Allele extended = Allele.extend(a, extraBases); - for ( Allele b : allAlleles ) + for ( final Allele b : allAlleles ) if ( extended.equals(b) ) extended = b; // System.out.printf(" Extending %s => %s%n", a, extended); @@ -1050,23 +1152,23 @@ public class GATKVariantContextUtils { throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList(unsortedVCs); + return new ArrayList<>(unsortedVCs); else { - ArrayList sorted = new ArrayList(unsortedVCs); + ArrayList sorted = new ArrayList<>(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); return sorted; } } - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( Genotype g : oneVC.getGenotypes() ) { - String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); + for ( final Genotype g : oneVC.getGenotypes() ) { + final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); if ( ! 
mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; - if ( uniqifySamples || alleleMapping.needsRemapping() ) { + if ( uniquifySamples || alleleMapping.needsRemapping() ) { final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); } @@ -1076,8 +1178,8 @@ public class GATKVariantContextUtils { } } - public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { - return uniqify ? sampleName + "." + trackName : sampleName; + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { + return uniquify ? sampleName + "." + trackName : sampleName; } /** @@ -1104,8 +1206,8 @@ public class GATKVariantContextUtils { * Trim the alleles in inputVC forward and reverse, as requested * * @param inputVC a non-null input VC whose alleles might need a haircut - * @param trimForward should we trim up the alleles from the foward direction? - * @param trimReverse shold we trim up the alleles from the reverse direction? + * @param trimForward should we trim up the alleles from the forward direction? + * @param trimReverse should we trim up the alleles from the reverse direction? 
* @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles */ @Ensures("result != null") @@ -1140,8 +1242,8 @@ public class GATKVariantContextUtils { if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified return inputVC; - final List alleles = new LinkedList(); - final Map originalToTrimmedAlleleMap = new HashMap(); + final List alleles = new LinkedList<>(); + final Map originalToTrimmedAlleleMap = new HashMap<>(); for (final Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { @@ -1300,7 +1402,7 @@ public class GATKVariantContextUtils { } private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap(keysToPreserve.size()); + Map attributes = new HashMap<>(keysToPreserve.size()); for ( final String key : keysToPreserve ) { if ( igc.hasAttribute(key) ) attributes.put(key, igc.getAttribute(key)); @@ -1343,7 +1445,7 @@ public class GATKVariantContextUtils { if (!vc1.getReference().equals(vc2.getReference())) return false; - for (Allele a :vc1.getAlternateAlleles()) { + for (final Allele a :vc1.getAlternateAlleles()) { if (!vc2.getAlternateAlleles().contains(a)) return false; } @@ -1351,17 +1453,24 @@ public class GATKVariantContextUtils { return true; } - public static Map> separateVariantContextsByType(Collection VCs) { - HashMap> mappedVCs = new HashMap>(); - for ( VariantContext vc : VCs ) { + public static Map> separateVariantContextsByType( final Collection VCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } + + final HashMap> mappedVCs = new HashMap<>(); + for ( final VariantContext vc : VCs ) { + VariantContext.Type vcType = vc.getType(); + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } + vcType = VariantContext.Type.NO_VARIATION; + } // look at 
previous variant contexts of different type. If: // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) // c) neither: do nothing, just add vc to its own list boolean addtoOwnList = true; - for (VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vc.getType())) + for (final VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vcType)) continue; if (!mappedVCs.containsKey(type)) @@ -1376,9 +1485,9 @@ public class GATKVariantContextUtils { // avoid having empty lists if (vcList.size() == 0) mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(otherVC); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(otherVC); break; } else if (allelesAreSubset(vc,otherVC)) { @@ -1390,9 +1499,9 @@ public class GATKVariantContextUtils { } } if (addtoOwnList) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(vc); } } @@ -1403,10 +1512,10 @@ public class GATKVariantContextUtils { if ( allowedAttributes == null ) return vc; - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + final Map attrs = new HashMap<>(); + for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { if ( 
allowedAttributes.contains(attr.getKey()) ) attrs.put(attr.getKey(), attr.getValue()); } @@ -1427,8 +1536,8 @@ public class GATKVariantContextUtils { public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List remap(List as) { - List newAs = new ArrayList(); - for ( Allele a : as ) { + List newAs = new ArrayList<>(); + for ( final Allele a : as ) { //System.out.printf(" Remapping %s => %s%n", a, remap(a)); newAs.add(remap(a)); } @@ -1467,7 +1576,7 @@ public class GATKVariantContextUtils { if ( alleleStrings == null || alleleStrings.isEmpty() ) throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); - final List alleles = new LinkedList(); + final List alleles = new LinkedList<>(); final int length = alleleStrings.get(0).length(); boolean first = true; @@ -1503,7 +1612,7 @@ public class GATKVariantContextUtils { if ( ref.length != alt.length ) throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); - final List result = new ArrayList(ref.length); + final List result = new ArrayList<>(ref.length); for ( int i = 0; i < ref.length; i++ ) { @@ -1518,7 +1627,7 @@ public class GATKVariantContextUtils { final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); // create new genotypes with updated alleles - final Map alleleMap = new HashMap(); + final Map alleleMap = new HashMap<>(); alleleMap.put(vc.getReference(), newRefAllele); alleleMap.put(vc.getAlternateAllele(0), newAltAllele); final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index f2718fb8c..a13797523 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -500,7 +500,7 @@ public class MathUtilsUnitTest extends BaseTest { @DataProvider(name = "MedianData") public Object[][] makeMedianData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{Arrays.asList(10), 10}); @@ -510,12 +510,16 @@ public class MathUtilsUnitTest extends BaseTest { tests.add(new Object[]{values, 1}); } + for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { + tests.add(new Object[]{values, 1.1}); + } + return tests.toArray(new Object[][]{}); } @Test(dataProvider = "MedianData") - public void testMedian(final List values, final int expected) { - final int actual = MathUtils.median(values); + public void testMedian(final List values, final Comparable expected) { + final Comparable actual = MathUtils.median(values); Assert.assertEquals(actual, expected, "Failed with " + values); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 937698d82..220e64f7d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -107,7 +107,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { private MergeAllelesTest(List... 
arg) { super(MergeAllelesTest.class); - LinkedList> all = new LinkedList>(Arrays.asList(arg)); + LinkedList> all = new LinkedList<>(Arrays.asList(arg)); expected = all.pollLast(); inputs = all; } @@ -185,7 +185,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -243,7 +243,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false); Assert.assertEquals(merged.getID(), cfg.expected); } @@ -358,7 +358,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -485,7 +485,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = GATKVariantContextUtils.simpleMerge( cfg.inputs, cfg.priority, 
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -526,10 +526,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false); // test genotypes - Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); + Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); } // TODO: remove after testing @@ -540,7 +540,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // final VariantContext merged = VariantContextUtils.simpleMerge( // Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); // } // -------------------------------------------------------------------------------- @@ -559,7 +559,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, 
set, false, false, false); if ( annotate ) Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); @@ -570,7 +570,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } private static final List vcs2priority(final Collection vcs) { - final List priority = new ArrayList(); + final List priority = new ArrayList<>(); for ( final VariantContext vc : vcs ) { priority.add(vc.getSource()); @@ -997,7 +997,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "PrimitiveAlleleSplittingData") public Object[][] makePrimitiveAlleleSplittingData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); // no split tests.add(new Object[]{"A", "C", 0, null}); @@ -1039,6 +1039,26 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } + @Test(enabled = !DEBUG) + public void testFillInNonRefSymbolicAlleles() { + final int start = 10; + final String ref = "A"; + final String alt = "C"; + final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">")); + + List VCs = Arrays.asList(vcAlt, vcRef); + VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.emptyList()); + + // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele + for( final VariantContext vc : VCs ) { + Assert.assertTrue(vc.getAlternateAlleles().size() == 1); + Assert.assertTrue(vc.getAlternateAllele(0).isNonReference()); + Assert.assertTrue(!vc.getReference().isSymbolic()); + Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic()); + } + } + // -------------------------------------------------------------------------------- // // test allele remapping @@ -1047,7 +1067,7 @@ public class GATKVariantContextUtilsUnitTest 
extends BaseTest { @DataProvider(name = "AlleleRemappingData") public Object[][] makeAlleleRemappingData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final Allele originalBase1 = Allele.create((byte)'A'); final Allele originalBase2 = Allele.create((byte)'T'); @@ -1055,7 +1075,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { for ( final byte base1 : BaseUtils.BASES ) { for ( final byte base2 : BaseUtils.BASES ) { for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { - Map map = new HashMap(2); + Map map = new HashMap<>(2); map.put(originalBase1, Allele.create(base1)); map.put(originalBase2, Allele.create(base2)); diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java index 381b282e0..a1b75a3f1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -147,7 +147,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { Set samples; public void run(final VariantContext vc) { if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); VariantContext sub = vc.subContextFromSamples(samples); sub.getNSamples(); } @@ -176,7 +176,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { Set samples; public void run(final VariantContext vc) { if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); vc.getGenotypes(samples).size(); } }; @@ -221,7 +221,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { case MERGE: return new FunctionToBenchmark() 
{ public void run(final VariantContext vc) { - List toMerge = new ArrayList(); + List toMerge = new ArrayList<>(); for ( int i = 0; i < dupsToMerge; i++ ) { GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); @@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { GATKVariantContextUtils.simpleMerge(toMerge, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true); + true, false, "set", false, true, false); } }; @@ -363,7 +363,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { // toMerge, null, // org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, // org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, -// true, false, "set", false, true); +// true, false, "set", false, true, false); // } // }; //