diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java index fde344e9f..4b1e48a36 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java @@ -80,12 +80,9 @@ public class StrandBiasBySample extends GenotypeAnnotation implements Experiment final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + if ( ! isAppropriateInput(alleleLikelihoodMap, g) ) return; - if (alleleLikelihoodMap == null ) - throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller"); - final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); @@ -97,4 +94,7 @@ public class StrandBiasBySample extends GenotypeAnnotation implements Experiment @Override public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } + private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) { + return ! 
(map == null || g == null || !g.isCalled()); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index 6f16a704f..7457acb22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -286,7 +286,7 @@ public class ConsensusAlleleCounter { if (vcs.isEmpty()) return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); return mergedVC.getAlleles(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 697d162fd..139f2e07d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -183,7 +183,7 @@ public class GenotypingEngine { final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, 
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index a273cf01d..0b0fa020e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -262,7 +262,7 @@ public class PairHMMIndelErrorModel { * @return true if the read needs to be clipped, false otherwise */ protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) { - return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() > refWindowStop ); + return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() - 1 > refWindowStop ); } /** @@ -316,7 +316,7 @@ public class PairHMMIndelErrorModel { // if the read extends beyond the downstream (right) end of the reference window, clip it if ( mustClipDownstream(read, refWindowStop) ) - read = ReadClipper.hardClipByReadCoordinates(read, read.getSoftStart() + read.getReadLength() - refWindowStop + 1, read.getReadLength() - 1); + read = ReadClipper.hardClipByReadCoordinates(read, refWindowStop - read.getSoftStart() + 1, read.getReadLength() - 1); // if the read extends beyond the upstream (left) end of the reference window, clip it if ( mustClipUpstream(read, refWindowStart) ) diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java new file mode 100644 index 000000000..223745431 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java @@ -0,0 +1,266 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.util.*; + +/** + * + * Splits reads that contain Ns in their cigar string (e.g. spanning splicing events). + * + * Identifies all N cigar elements and creates k+1 new reads (where k is the number of N cigar elements). + * The first read includes the bases that are to the left of the first N element, while the part of the read that is to the right of the N + * (including the Ns) is hard clipped and so on for the rest of the new reads. 
+ * + * + * User: ami + * Date: 11/14/13 + * Time: 11:52 AM + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class SplitNCigarReads extends ReadWalker, SAMFileWriter> { + + @Output(doc="Write output to this BAM filename instead of STDOUT") + StingSAMFileWriter out; + + @Argument(required = false) + PrintStream splitPositionsOutput = System.out; + + @Argument(fullName="outputAsBED", shortName="bed", doc="Output as BED file", required=false) + boolean outputAsBED = false; + + @Argument(fullName="printSplitPositions", shortName="splitPosition", doc="print the split positions", required=false) + boolean printSplitPositions = false; + + public static final String PROGRAM_RECORD_NAME = "GATK SplitNCigarReads"; // The name that will go in the @PG tag + // public static SplitPositions splitPositions = null; + public static String results = ""; + + /** + * The initialize function. + */ + public void initialize() { + final GenomeAnalysisEngine toolkit = getToolkit(); + + final boolean preSorted = false; + if (getToolkit() != null) { + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME); + } + out.setPresorted(preSorted); + // splitPositions = new SplitPositions(); + } + + /** + * The reads map function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param read the read itself, as a GATKSAMRecord + * + * @return a list of split read if there are N's in the cigar string, or the read itself. + */ + public List map(final ReferenceContext ref,final GATKSAMRecord read,final RefMetaDataTracker metaDataTracker) { + return splitNCigarRead(read); + } + + /** + * Goes through the cigar string of the read and create new reads for each consecutive non-N elements (while hard clipping the rest of the read). 
+ * For example: for a read with cigar '1H2M2D1M2N1M2I1N1M2S' 3 new reads will be created with cigar strings: 1H2M2D1M, 1M2I and 1M2S + * + * @param read the read to split, as a GATKSAMRecord + * @return a list of split reads if there are N's in the cigar string, or the read itself. + */ + + public static List splitNCigarRead(final GATKSAMRecord read){ + final List splitReads = new ArrayList<>(); + int firstCigarIndex = 0; + for (int i = 0; i < read.getCigar().numCigarElements(); i ++){ + final CigarElement cigarElement = read.getCigar().getCigarElement(i); + if(cigarElement.getOperator() == CigarOperator.N){ + final boolean addToSplitPositions = true; + splitReads.add(splitReadBasedOnCigar(read,firstCigarIndex,i, addToSplitPositions)); + firstCigarIndex = i+1; + } + } + //if there are no N's in the read + if (firstCigarIndex == 0) + splitReads.add(read); + + //add the last section of the read: from the last N to the end of the read + // (it will be done for all the usual cigar strings that do not end with N) + else if(firstCigarIndex < read.getCigar().numCigarElements()){ + final boolean addToSplitPositions = false; + splitReads.add(splitReadBasedOnCigar(read,firstCigarIndex,read.getCigar().numCigarElements(), addToSplitPositions)); + } + return splitReads; + } + + + /** + * reduceInit is called once before any calls to the map function.
We use it here to setup the splitPositionsOutput + * bam file, if it was specified on the command line + * + * @return SAMFileWriter, set to the BAM splitPositionsOutput file if the command line option was set, null otherwise + */ + public SAMFileWriter reduceInit() { + return out; + } + + /** + * given a read and a splitPositionsOutput location, reduce by emitting the read + * + * @param reads the split reads itself + * @param output the splitPositionsOutput source + * @return the SAMFileWriter, so that the next reduce can emit to the same source + */ + public SAMFileWriter reduce(final List reads,final SAMFileWriter output ) { + for (final GATKSAMRecord read: reads) + output.addAlignment(read); + return output; + } + + public void onTraversalDone(SAMFileWriter readResult) { + super.onTraversalDone(readResult); + if(printSplitPositions) + splitPositionsOutput.println(results); + // splitPositionsOutput.println(splitPositions); + + } + private static GATKSAMRecord splitReadBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex, final boolean addToSplitPositions){ + int cigarFirstIndex = cigarStartIndex; + int cigarSecondIndex = cigarEndIndex; + + //in case a section of the read is end or start with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element + // it can be if, but it was kept as while to make sure the code can work with Cigar string that were not "cleaned" + while(read.getCigar().getCigarElement(cigarFirstIndex).getOperator().equals(CigarOperator.D)) + cigarFirstIndex++; + while(read.getCigar().getCigarElement(cigarSecondIndex-1).getOperator().equals(CigarOperator.D)) + cigarSecondIndex--; + if(cigarFirstIndex > cigarSecondIndex) + throw new UserException.BadInput("Cannot split this read (might be an empty section between Ns, for example 1N1D1N): "+read.getCigarString()); + + // we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex 
(inclusive). + // the other sections of the read are clipped: + final int startRefIndex = read.getOriginalAlignmentStart() + CigarUtils.countRefBasesBasedOnCigar(read,0,cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index. + final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index. + + if(addToSplitPositions){ + final int splitPosition = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end. + final String contig = read.getReferenceName(); +// results += String.format("%s:%d-%d\n", contig, splitPosition, splitPosition ); +// splitPositions.addSplitPosition(contig,splitPosition); + } + + return ReadClipper.hardClipToRegionIncludingClippedBases(read, startRefIndex, stopRefIndex); + + } + + private class SplitPosition { + public final String contig; + public final int start; + public final int end; + + public SplitPosition(final String c, final int position) { + contig = c; + start = position; + end = position; + } + } + + + private class SplitPositions { + private final HashSet splitPositions; + + public SplitPositions() { + splitPositions = new HashSet<>(); + } + + public void addSplitPosition(final String contig, final int position) { + final SplitPosition newSplitPosition = new SplitPosition(contig, position); + splitPositions.add(newSplitPosition); + } + + public String toString() { + String result = ""; // = "Contig\tstart\tstop\n"; + + for (SplitPosition position: splitPositions) { + if (outputAsBED) + result += String.format("%s\t%d\t%d\n", position.contig, position.start-1, position.end ); + else + result += String.format("%s:%d-%d\n", position.contig, position.start, position.end ); + } + return 
result; + } + } + + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index ac4654f73..1f355359d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; +import it.unimi.dsi.fastutil.booleans.BooleanLists; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -71,7 +72,7 @@ import java.util.*; */ public class VariantDataManager { - private List data; + private List data = Collections.emptyList(); private double[] meanVector; private double[] varianceVector; // this is really the standard deviation public List annotationKeys; @@ -80,7 +81,7 @@ public class VariantDataManager { protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { - this.data = null; + this.data = Collections.emptyList(); this.annotationKeys = new ArrayList<>( annotationKeys ); this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; @@ -279,6 +280,19 @@ public class VariantDataManager { return evaluationData; } + /** + * Remove all VariantDatum's from the data list which are marked as aggregate data + */ + public void dropAggregateData() { + final Iterator iter = data.iterator(); + while (iter.hasNext()) { + final VariantDatum datum = iter.next(); + if( datum.isAggregate ) { + iter.remove(); + } + } + } + public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { final List returnData = new ExpandingArrayList<>(); 
Collections.shuffle(trainingData); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index 905c97df4..41b27949d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -74,7 +74,8 @@ public class VariantDatum { public int consensusCount; public GenomeLoc loc; public int worstAnnotation; - public MultivariateGaussian assignment; // used in K-means implementation + public MultivariateGaussian assignment; // used in K-means implementation + public boolean isAggregate; // this datum was provided to aid in modeling but isn't part of the input callset public static class VariantDatumLODComparator implements Comparator, Serializable { @Override diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 1c32b852b..5da7b4219 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -152,11 +152,17 @@ public class VariantRecalibrator extends RodWalker> input; + /** + * These additional calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. + */ + @Input(fullName="aggregate", shortName = "aggregate", doc="Additional raw input variants to be used in building the model", required=false) + public List> aggregate; + /** * Any set of VCF files to use as lists of training, truth, or known sites. 
* Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. @@ -290,29 +296,53 @@ public class VariantRecalibrator extends RodWalker addOverlappingVariants( final List> rods, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context ) { + if( rods == null ) { throw new IllegalArgumentException("rods cannot be null."); } + if( tracker == null ) { throw new IllegalArgumentException("tracker cannot be null."); } + if( context == null ) { throw new IllegalArgumentException("context cannot be null."); } + + final ExpandingArrayList variants = new ExpandingArrayList<>(); + + for( final VariantContext vc : tracker.getValues(rods, context.getLocation()) ) { if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) { if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) { final VariantDatum datum = new VariantDatum(); // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need. dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine - datum.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vc); + datum.loc = ( isInput ? 
getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null ); datum.originalQual = vc.getPhredScaledQual(); datum.isSNP = vc.isSNP() && vc.isBiallelic(); datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc); + datum.isAggregate = !isInput; // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC ); final double priorFactor = QualityUtils.qualToProb( datum.prior ); datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor ); - mapList.add( datum ); + variants.add( datum ); } } } - return mapList; + return variants; } //--------------------------------------------------------------------------------------------------------------- @@ -357,6 +387,7 @@ public class VariantRecalibrator extends RodWalker negativeTrainingData = dataManager.selectWorstVariants(); final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS)); + dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory engine.evaluateData( dataManager.getData(), badModel, true ); if( badModel.failedToConverge || goodModel.failedToConverge ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index 3828e6e20..dae3bffa5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -80,6 +80,9 @@ public class VariantRecalibratorEngine { } public GaussianMixtureModel generateModel( final List data, final int maxGaussians ) { + 
if( data == null || data.isEmpty() ) { throw new IllegalArgumentException("No data found."); } + if( maxGaussians <= 0 ) { throw new IllegalArgumentException("maxGaussians must be a positive integer but found: " + maxGaussians); } + final GaussianMixtureModel model = new GaussianMixtureModel( maxGaussians, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS ); variationalBayesExpectationMaximization( model, data ); return model; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java new file mode 100644 index 000000000..a587b0250 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java @@ -0,0 +1,228 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Combines gVCF records that were produced by the Haplotype Caller from single sample sources. + * + *

+ * CombineReferenceCalculationVariants combines gVCF records that were produced as part of the "single sample discovery" + * pipeline using the '-ERC GVCF' mode of the Haplotype Caller. This tool performs the multi-sample joint aggregation + * step and merges the records together in a sophisticated manner. + * + * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, + * re-genotype the newly merged record, and then re-annotate it. + * + * + *

Input

+ *

+ * One or more Haplotype Caller gVCFs to combine. + *

+ * + *

Output

+ *

+ * A combined VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineReferenceCalculationVariants \
+ *   --variant input1.vcf \
+ *   --variant input2.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-10,stop=10)) +public class CombineReferenceCalculationVariants extends RodWalker implements AnnotatorCompatible, TreeReducible { + + // TODO -- allow a file of VCF paths to be entered? + + /** + * The VCF files to merge together + */ + @Input(fullName="variant", shortName = "V", doc="One or more input VCF files", required=true) + public List> variants; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + @Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false) + public boolean INCLUDE_NON_VARIANTS = false; + + /** + * Which annotations to recompute for the combined output VCF file. + */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"})); + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. 
+ */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + // the genotyping engine + private UnifiedGenotyperEngine genotypingEngine; + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + public List> getCompRodBindings() { return Collections.emptyList(); } + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + + public void initialize() { + // take care of the VCF headers + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + + // create the genotyping engine + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH; + UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; + UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + // create the annotation engine + annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); + } + + public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + final GenomeLoc loc = 
ref.getLocus(); + final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null); + if ( combinedVC == null ) + return null; + + return regenotypeVC(tracker, ref, combinedVC); + } + + /** + * Re-genotype (and re-annotate) a combined genomic VC + * + * @param tracker the ref tracker + * @param ref the ref context + * @param combinedVC the combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext combinedVC) { + if ( combinedVC == null ) throw new IllegalArgumentException("combinedVC cannot be null"); + + VariantContext result = combinedVC; + + // only re-genotype polymorphic sites + if ( combinedVC.isVariant() ) + result = genotypingEngine.calculateGenotypes(result); + + // if it turned monomorphic and we don't want such sites, quit + if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() ) + return null; + + // re-annotate it + return annotationEngine.annotateContext(tracker, ref, null, result); + } + + public VariantContextWriter reduceInit() { + return vcfWriter; + } + + public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) { + if ( vc != null ) + writer.add(vc); + return writer; + } + + @Override + public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) { + return lhs; + } + + @Override + public void onTraversalDone(final VariantContextWriter writer) {} +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 9556f9bf1..6219eb578 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("3d12bdb816d27bf7c9efb4c13dc2aec7")); + Arrays.asList("e10c49fcf9a128745c2b050a52798e58")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("a2c8e83f37cd1e114b42af4b873f57bc")); + Arrays.asList("903af514f70db9238064da311c4ea0de")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 2cdddd49f..dcaed8bf2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("150b31ba05113ca1996b548be5170d6d")); + Arrays.asList("c4248f02103e37e89b0f22c0d9c98492")); executeTest(String.format("test multiple technologies"), spec); } @@ -279,7 +279,7 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("7d0ee85cd89f4addd84c5511daaaa5c5")); + Arrays.asList("96c7862d55e933b274cabe45c9c443d9")); executeTest(String.format("test calling with BAQ"), spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 18554e157..01aab8ae3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("ec0977e3fd3e2ac29c9821f0ca830455")); + Arrays.asList("710d379607129935b1b7b6960ca7b213")); executeTest("test MultiSample Pilot1", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java index 38c06c25f..bbbef43d3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java @@ -130,4 +130,12 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest { final boolean result = PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd); Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && 
read.getSoftStart() + readLength > refWindowEnd); } + + @Test + public void clipDownstreamAtBorderTest() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 5, 10); + read.setCigarString("10M"); + Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 13), true); + Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 14), false); + } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java new file mode 100644 index 000000000..398fe221c --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java @@ -0,0 +1,83 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; +import org.broadinstitute.sting.BaseTest; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: ami + * Date: 12/5/13 + * Time: 1:04 PM + */ +public class SplitNCigarReadsIntegrationTests extends WalkerTest { + + @Test + // contain reads without N's, with N's and with N's and I's + public void testSplitWithInsertions() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + BaseTest.b37KGReference + " -I " + BaseTest.privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withI.bam -o %s -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("037c72fe1572efb63cccbe0a8dda3cb1")); + executeTest("test split N cigar reads with insertions", spec); + } + + @Test + // contain reads without N's, with N's and with N's and D's, and also with more then one N element in the cigar. + public void testSplitWithDeletions() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + BaseTest.b37KGReference + " -I " + BaseTest.privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withD.bam -o %s -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("8472005c16353715025353d6d453faf4")); + executeTest("test split N cigar reads with deletions", spec); + } + + + + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java similarity index 55% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java index 36da92b4f..7eb95877d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java @@ -44,189 +44,139 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +package org.broadinstitute.sting.gatk.walkers.rnaseq; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.walkers.rnaseq.SplitNCigarReads; +import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; -import java.util.HashMap; -import java.util.Map; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; /** - * Given two BAMs with different read groups, it compares them based on ReduceReads metrics. - *

- * This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files. - *

- *

Input

- *

- * Two BAM files (using -I) with different read group IDs - *

- *

Output

- *

- * [Output description] - *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T $WalkerName
- *  
* - * @author carneiro - * @since 10/30/11 + * Tests all possible (and valid) cigar strings that might contain any cigar elements. It uses a code that were written to test the ReadClipper walker. + * For valid cigar sting in length 8 there are few thousands options, with N in every possible option and with more than one N (for example 1M1N1M1N1M1N2M). + * The cigarElements array is used to provide all the possible cigar element that might be included. + * + * User: ami + * Date: 11/14/13 + * Time: 6:49 PM */ +public class SplitNCigarReadsUnitTest { + final static CigarElement[] cigarElements = { + new CigarElement(1, CigarOperator.HARD_CLIP), + new CigarElement(1, CigarOperator.SOFT_CLIP), + new CigarElement(1, CigarOperator.INSERTION), + new CigarElement(1, CigarOperator.DELETION), + new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH), + new CigarElement(1, CigarOperator.SKIPPED_REGION) + }; -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) -public class CompareBAM extends LocusWalker, CompareBAM.TestResults> { - @Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID; - @Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false; - @Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false; - @Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20; - @Argument(required = false, shortName = "mmq", fullName = 
"min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20; + @Test(enabled = true) + public void splitReadAtN() { + final int cigarStringLength = 10; + final List cigarList = ReadClipperTestUtils.generateCigarList(cigarStringLength,cigarElements); + + // For Debugging use those lines (instead of above cigarList) to create specific read: + //------------------------------------------------------------------------------------ + // final GATKSAMRecord tmpRead = GATKSAMRecord.createRandomRead(6); + // tmpRead.setCigarString("1M1N1M"); + + // final List cigarList = new ArrayList<>(); + // cigarList.add(tmpRead.getCigar()); + + for(Cigar cigar: cigarList){ - @Override - public Map map (RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Map result = new HashMap(); + final int numOfSplits = numOfNElements(cigar.getCigarElements()); - if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context)); - if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context)); + if(numOfSplits != 0 && isCigarDoesNotHaveEmptyRegionsBetweenNs(cigar)){ - return result; - } - - @Override - public TestResults reduceInit () { - TestResults sum = new TestResults(); // a fresh new TestResults object to sum up the results of every object passed by MAP. 
- - if (TEST_EQUAL_BASES) sum.createTest(TestName.EQUAL_BASES); - if (TEST_BASE_COUNTS) sum.createTest(TestName.BASE_COUNTS); - - return sum; - } - - @Override - public TestResults reduce (Map mapResult, TestResults sum) { - for (TestName test : mapResult.keySet()) { - if (mapResult.get(test)) - sum.reportSuccess(test); - else - sum.reportFailed(test); - } - - return sum; - } - - public void onTraversalDone (TestResults finalResults) { - finalResults.report(); - } - - private boolean testEqualBases (ReferenceContext ref, AlignmentContext context) { - return true; - } - - private boolean testBaseCounts (ReferenceContext ref, AlignmentContext context) { - - return true; - } - - public enum TestName { - EQUAL_BASES ("testEqualBases"), - BASE_COUNTS ("testBaseCounts"); - - private String testName; - - TestName(String testName) { - this.testName = testName; - } - - public String getTestName() { - return testName; - } - } - - public class TestResults { - private Map testStats = new HashMap(); - - public void createTest (TestName test) { - testStats.put(test, new TestOutcome()); - } - - public void reportSuccess(TestName test) { - if (testStats.containsKey(test)) - testStats.get(test).incPassed(); - else - throw new ReviewedStingException("No such test: " + test); - } - - public void reportFailed(TestName test) { - if (testStats.containsKey(test)) - testStats.get(test).incFailed(); - else - throw new ReviewedStingException("No such test: " + test); - } - - public void report() { - System.out.println(); - System.out.println(String.format("%20s\tPASS\tFAIL", "")); - for (TestName test : testStats.keySet()) - System.out.println(String.format("%20s\t%d\t%d", test.getTestName(), testStats.get(test).getPassed(), testStats.get(test).getFailed())); - System.out.println(); - } - } - - private class TestOutcome { - private long passed; - private long failed; - - public long getPassed() { - return passed; - } - - public void incPassed() { - this.passed++; - } - - public long 
getFailed() { - return failed; - } - - public void incFailed() { - this.failed++; - } - } - - private BaseCounts getFilteredBaseCounts(AlignmentContext context) { - return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL); - } - - private BaseCounts getFullBaseCounts(AlignmentContext context) { - return getBaseCounts(context, 3, 0); - } - - private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) { - BaseCounts fullBaseCounts = new BaseCounts(); - for (String rg : context.getBasePileup().getReadGroups()) { - if (!rg.equals(reducedReadGroupID)) { - BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts()); - fullBaseCounts.add(b); + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + List splitReads = SplitNCigarReads.splitNCigarRead(read); + final int expectedReads = numOfSplits+1; + Assert.assertEquals(splitReads.size(),expectedReads,"wrong number of reads after split read with cigar: "+cigar+" at Ns [expected]: "+expectedReads+" [actual value]: "+splitReads.size()); + final List readLengths = consecutiveNonNElements(read.getCigar().getCigarElements()); + int index = 0; + int offsetFromStart = 0; + for(GATKSAMRecord splitRead: splitReads){ + int expectedLength = readLengths.get(index); + Assert.assertTrue(splitRead.getReadLength() == expectedLength, + "the "+index+" (starting with 0) split read has a wrong length.\n" + + "cigar of original read: "+cigar+"\n"+ + "expected length: "+expectedLength+"\n"+ + "actual length: "+splitRead.getReadLength()+"\n"); + assertBases(splitRead.getReadBases(), read.getReadBases(), offsetFromStart); + index++; + offsetFromStart += expectedLength; + } } } - return fullBaseCounts; } + private int numOfNElements(final List cigarElements){ + int numOfNElements = 0; + for (CigarElement element: cigarElements){ + if (element.getOperator() == CigarOperator.SKIPPED_REGION) + numOfNElements++; + } + return 
numOfNElements; + } + + private static boolean isCigarDoesNotHaveEmptyRegionsBetweenNs(final Cigar cigar) { + boolean sawM = false; + boolean sawS = false; + + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (cigarElement.getOperator().equals(CigarOperator.SKIPPED_REGION)) { + if(!sawM && !sawS) + return false; + sawM = false; + sawS = false; + } + if (cigarElement.getOperator().equals(CigarOperator.MATCH_OR_MISMATCH)) + sawM = true; + if (cigarElement.getOperator().equals(CigarOperator.SOFT_CLIP)) + sawS = true; + + } + if(!sawS && !sawM) + return false; + return true; + } + + private List consecutiveNonNElements(final List cigarElements){ + final LinkedList results = new LinkedList<>(); + int consecutiveLength = 0; + for(CigarElement element: cigarElements){ + final CigarOperator op = element.getOperator(); + if(op.equals(CigarOperator.MATCH_OR_MISMATCH) || op.equals(CigarOperator.SOFT_CLIP) || op.equals(CigarOperator.INSERTION)){ + consecutiveLength += element.getLength(); + } + else if(op.equals(CigarOperator.SKIPPED_REGION)) + { + if(consecutiveLength != 0){ + results.addLast(consecutiveLength); + consecutiveLength = 0; + } + } + } + if(consecutiveLength != 0) + results.addLast(consecutiveLength); + return results; + } + + private void assertBases(final byte[] actualBase, final byte[] expectedBase, final int startIndex) { + for (int i = 0; i < actualBase.length; i++) { + Assert.assertEquals(actualBase[i], expectedBase[startIndex + i],"unmatched bases between: "+ Arrays.toString(actualBase)+"\nand:\n"+Arrays.toString(expectedBase)+"\nat position: "+i); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java index 754fe30a2..9a1422608 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java 
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java @@ -142,4 +142,39 @@ public class VariantDataManagerUnitTest extends BaseTest { Assert.assertTrue( trainingData.size() == MAX_NUM_TRAINING_DATA ); } + + @Test + public final void testDropAggregateData() { + final int MAX_NUM_TRAINING_DATA = 5000; + final double passingQual = 400.0; + final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA; + + VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); + final List theData = new ArrayList<>(); + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 10; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = true; + datum.isAggregate = false; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 2; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = false; + datum.isAggregate = true; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + vdm.setData(theData); + vdm.dropAggregateData(); + + for( final VariantDatum datum : vdm.getData() ) { + Assert.assertFalse( datum.isAggregate ); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index f3e57b48a..225000775 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -62,9 +62,11 @@ import java.util.List; public class 
VariantRecalibrationWalkersIntegrationTest extends WalkerTest { private static class VRTest { String inVCF; + String aggregateVCF; String tranchesMD5; String recalMD5; String cutVCFMD5; + public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { this.inVCF = inVCF; this.tranchesMD5 = tranchesMD5; @@ -72,6 +74,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { this.cutVCFMD5 = cutVCFMD5; } + public VRTest(String inVCF, String aggregateVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { + this.inVCF = inVCF; + this.aggregateVCF = aggregateVCF; + this.tranchesMD5 = tranchesMD5; + this.recalMD5 = recalMD5; + this.cutVCFMD5 = cutVCFMD5; + } + @Override public String toString() { return "VRTest{inVCF='" + inVCF +"'}"; @@ -83,10 +93,20 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { "73c7897441622c9b37376eb4f071c560", // recal file "11a28df79b92229bd317ac49a3ed0fa1"); // cut VCF + VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", + "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches + "1b33c10be7d8bf8e9accd11113835262", // recal file + "4700d52a06f2ef3a5882719b86911e51"); // cut VCF + @DataProvider(name = "VRTest") public Object[][] createData1() { return new Object[][]{ {lowPass} }; - //return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here + } + + @DataProvider(name = "VRAggregateTest") + public Object[][] createData2() { + return new Object[][]{ {lowPassPlusExomes} }; } @Test(dataProvider = "VRTest") @@ -125,6 +145,43 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { executeTest("testApplyRecalibration-"+params.inVCF, spec); } + @Test(dataProvider = "VRAggregateTest") + public void testVariantRecalibratorAggregate(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, 
clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -aggregate " + params.aggregateVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an HaplotypeScore -an MQ" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorAggregate-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRAggregateTest",dependsOnMethods="testVariantRecalibratorAggregate") + public void testApplyRecalibrationAggregate(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " --no_cmdline_in_header" + + " -input " + params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles + executeTest("testApplyRecalibrationAggregate-"+params.inVCF, spec); + } + VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", "3ad7f55fb3b072f373cbce0b32b66df4", // tranches "e747c08131d58d9a4800720f6ca80e0c", // recal file @@ -133,7 +190,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @DataProvider(name = "VRBCFTest") public Object[][] createVRBCFTest() { 
return new Object[][]{ {bcfTest} }; - //return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here } @Test(dataProvider = "VRBCFTest") diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java new file mode 100644 index 000000000..7b546b8db --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java @@ -0,0 +1,72 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class CombineReferenceCalculationVariantsIntegrationTest extends WalkerTest { + + private static String baseTestString(String args, String ref) { + return "-T CombineReferenceCalculationVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args; + } + + // TODO -- enable this test (and create others) once the Haplotype Caller produces appropriate gVCFs (with for every record) + @Test(enabled = false) + public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -L 20:10,000,000-10,001,000", b37KGReference), + 1, + Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); + executeTest("combineSingleSamplePipelineGVCF", spec); + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 2eeb9221e..fb54ab400 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -185,19 +185,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } - @Test public void 
combineSingleSamplePipelineGVCF() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + - " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + - " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + - " -multipleAllelesMergeType MIX_TYPES" + - " --excludeNonVariants -combineAnnotations -setKey null" + - " -L 20:10,000,000-10,001,000", b37KGReference), - 1, - Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); - cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); - } - @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 5a1b015fe..e194a9c43 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -170,7 +170,36 @@ public class RefMetaDataTracker { @Requires({"type != null", "onlyAtThisLoc != null"}) public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { return safeGetFirst(getValues(type, onlyAtThisLoc)); + } + /** + * Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param prioritizeThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null", "prioritizeThisLoc != null"}) + @Ensures("result != null") + public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) { + final List results = new ArrayList<>(); + + for ( final RodBinding rodBinding : rodBindings ) { + + // if there's a value at the prioritized location, take it + T value = getFirstValue(rodBinding, prioritizeThisLoc); + + // otherwise, grab any one + if ( value == null ) + value = getFirstValue(rodBinding); + + // add if not null + if ( value != null ) + results.add(value); + } + + return results; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java index cc8b3401e..59b95f2ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java @@ -26,8 +26,8 @@ package org.broadinstitute.sting.gatk.walkers.qc; import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -48,39 +48,53 @@ import java.io.PrintStream; import java.util.Arrays; /** + * Read clipping statistics for all reads. + * + * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length + * of the clipping to the output stream. + * + * Note: Ignores N's in the Cigar string. + * + *

Input

+ * One or more BAM files + * + *

Output

+ * A simple tabulated text file with read length and clipping statistics for every read (or every N reads if the "skip" + * option is used) + * * User: depristo * Date: May 5, 2010 * Time: 12:16:41 PM */ -/** - * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length - * of the clipping to the output stream. - */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) public class ReadClippingStats extends ReadWalker { @Output protected PrintStream out; - @Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+ - "on mapped reads only, while unmapped reads will be discarded", required=false) - protected boolean MAPPED_ONLY = true; + /** + * when this flag is set (default), statistics will be collected on unmapped reads as well. The default behavior + * is to ignore unmapped reads." + */ + @Argument(fullName="include_unmapped", shortName="u", doc="Include unmapped reads in the analysis", required=false) + protected boolean INCLUDE_UNMAPPED = false; - @Argument(fullName="skip", shortName="skip", doc="When provided, only every skip reads are analyzed", required=false) + /** + * print every read whose read number is divisible by SKIP. READ_NUMBER % SKIP == 0. First read in the file has read number = 1, + * second is 2, third is 3, ... A value of 1 means print every read. A value of 1000 means print every 1000th read. 
+ */ + @Advanced + @Argument(fullName="skip", shortName="skip", doc="Do not print all reads, skip some.", required=false) protected int SKIP = 1; -// public void initialize() { -// -// } - public class ReadClippingInfo { SAMReadGroupRecord rg; int readLength, nClippingEvents, nClippedBases; } public ReadClippingInfo map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - if ( AlignmentUtils.isReadUnmapped(read) && MAPPED_ONLY) + if ( AlignmentUtils.isReadUnmapped(read) && !INCLUDE_UNMAPPED) return null; ReadClippingInfo info = new ReadClippingInfo(); @@ -89,24 +103,21 @@ public class ReadClippingStats extends ReadWalker implements Tree @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; - @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false) - public boolean COMBINE_ANNOTATIONS = false; - private List priority = null; /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ @@ -260,12 +257,9 @@ public class CombineVariants extends RodWalker implements Tree // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); - Collection potentialRefVCs = tracker.getValues(variants); - potentialRefVCs.removeAll(vcs); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); - potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs); } if ( ASSUME_IDENTICAL_SAMPLES ) { @@ -307,17 +301,16 @@ public class CombineVariants extends RodWalker 
implements Tree // make sure that it is a variant or in case it is not, that we want to include the sites with no variants if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { if (VCsByType.containsKey(type)) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs, - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(), + filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } } } } else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs, - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType, + genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } else { logger.warn("Ignoring all records at site " + ref.getLocus()); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 836c16a7e..7d5823ae0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -362,21 +362,16 @@ public class ClippingOp { * @return a cloned version of read that has been properly trimmed down */ private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { - final int firstBaseAfterSoftClips = read.getAlignmentStart() - 
read.getSoftStart(); - final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart(); - - if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math). - return GATKSAMRecord.emptyRead(read); // If the read is unmapped there is no Cigar string and neither should we create a new cigar string - CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); + final CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); // the cigar may force a shift left or right (or both) in case we are left with insertions // starting or ending the read after applying the hard clip on start/stop. - int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; + final int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; + final byte[] newBases = new byte[newLength]; + final byte[] newQuals = new byte[newLength]; + final int copyStart = (start == 0) ? 
stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); @@ -396,8 +391,8 @@ public class ClippingOp { hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); if (read.hasBaseIndelQualities()) { - byte[] newBaseInsertionQuals = new byte[newLength]; - byte[] newBaseDeletionQuals = new byte[newLength]; + final byte[] newBaseInsertionQuals = new byte[newLength]; + final byte[] newBaseDeletionQuals = new byte[newLength]; System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); @@ -516,20 +511,20 @@ public class ClippingOp { } /** - * Checks if a hard clipped cigar left a read starting or ending with insertions/deletions + * Checks if a hard clipped cigar left a read starting or ending with deletions or gap (N) * and cleans it up accordingly. 
* * @param cigar the original cigar * @return an object with the shifts (see CigarShift class) */ - private CigarShift cleanHardClippedCigar(Cigar cigar) { - Cigar cleanCigar = new Cigar(); + private CigarShift cleanHardClippedCigar(final Cigar cigar) { + final Cigar cleanCigar = new Cigar(); int shiftFromStart = 0; int shiftFromEnd = 0; Stack cigarStack = new Stack(); - Stack inverseCigarStack = new Stack(); + final Stack inverseCigarStack = new Stack(); - for (CigarElement cigarElement : cigar.getCigarElements()) + for (final CigarElement cigarElement : cigar.getCigarElements()) cigarStack.push(cigarElement); for (int i = 1; i <= 2; i++) { @@ -542,8 +537,8 @@ public class ClippingOp { CigarElement cigarElement = cigarStack.pop(); if (!readHasStarted && -// cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION && + cigarElement.getOperator() != CigarOperator.SKIPPED_REGION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) readHasStarted = true; @@ -553,6 +548,9 @@ public class ClippingOp { else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) totalHardClip += cigarElement.getLength(); + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) + totalHardClip += cigarElement.getLength(); + if (readHasStarted) { if (i == 1) { if (!addedHardClips) { diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index eaefa3aba..1b72503fa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; +import 
org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -348,20 +349,38 @@ public class ReadClipper { public static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop ) { final int start = read.getAlignmentStart(); final int stop = read.getAlignmentEnd(); + return hardClipToRegion(read, refStart, refStop,start,stop); + } + /** + * Hard clip the read to the variable region (from refStart to refStop) processing also the clipped bases + * + * @param read the read to be clipped + * @param refStart the beginning of the variant region (inclusive) + * @param refStop the end of the variant region (inclusive) + * @return the read hard clipped to the variant region + */ + public static GATKSAMRecord hardClipToRegionIncludingClippedBases( final GATKSAMRecord read, final int refStart, final int refStop ) { + final int start = read.getOriginalAlignmentStart(); + final int stop = start + CigarUtils.countRefBasesBasedOnCigar(read,0,read.getCigarLength()) - 1; + return hardClipToRegion(read, refStart, refStop,start,stop); + } + + private static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop, final int alignmentStart, final int alignmentStop){ // check if the read is contained in region - if (start <= refStop && stop >= refStart) { - if (start < refStart && stop > refStop) + if (alignmentStart <= refStop && alignmentStop >= refStart) { + if (alignmentStart < refStart && alignmentStop > refStop) return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1); - else if (start < refStart) + else if (alignmentStart < refStart) return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1); - else if (stop > refStop) + else if (alignmentStop > refStop) return hardClipByReferenceCoordinatesRightTail(read, refStop + 1); return read; } else return 
GATKSAMRecord.emptyRead(read); } + public static List hardClipToRegion( final List reads, final int refStart, final int refStop ) { final List returnList = new ArrayList( reads.size() ); for( final GATKSAMRecord read : reads ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/CigarUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/CigarUtils.java new file mode 100644 index 000000000..a516ec11e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/CigarUtils.java @@ -0,0 +1,167 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Stack; + +/** + * Created with IntelliJ IDEA. + * User: ami + * Date: 11/26/13 + * Time: 11:33 AM + * To change this template use File | Settings | File Templates. + */ +public class CigarUtils { + + /** + * Combines equal adjacent elements of a Cigar object + * + * @param rawCigar the cigar object + * @return a combined cigar object + */ + public static Cigar combineAdjacentCigarElements(Cigar rawCigar) { + Cigar combinedCigar = new Cigar(); + CigarElement lastElement = null; + int lastElementLength = 0; + for (CigarElement cigarElement : rawCigar.getCigarElements()) { + if (lastElement != null && + ((lastElement.getOperator() == cigarElement.getOperator()) || + (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || + (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) + lastElementLength += cigarElement.getLength(); + else + { + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + lastElement = cigarElement; + lastElementLength = cigarElement.getLength(); + } + } + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + return combinedCigar; + } + + public static Cigar invertCigar (Cigar cigar) { + Stack cigarStack = new Stack(); + for (CigarElement cigarElement : cigar.getCigarElements()) + cigarStack.push(cigarElement); + + Cigar invertedCigar = new Cigar(); + while (!cigarStack.isEmpty()) + invertedCigar.add(cigarStack.pop()); + + return invertedCigar; + } + + /** + * Checks whether or not the read has any cigar element that is not H or S + * + * @param read the 
read + * @return true if it has any M, I or D, false otherwise + */ + public static boolean readHasNonClippedBases(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } + + public static Cigar cigarFromString(String cigarString) { + return TextCigarCodec.getSingleton().decode(cigarString); + } + + + /** + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * - No consecutive I/D elements + **/ + public static boolean isCigarValid(Cigar cigar) { + if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) + + Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator + CigarOperator startingOp = null; + CigarOperator endingOp = null; + + // check if it doesn't start with deletions + boolean readHasStarted = false; // search the list of elements for the starting operator + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (!readHasStarted) { + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + startingOp = cigarElement.getOperator(); + } + } + cigarElementStack.push(cigarElement); + } + + while (!cigarElementStack.empty()) { + CigarElement cigarElement = cigarElementStack.pop(); + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + endingOp = cigarElement.getOperator(); + break; + } + } + + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.SKIPPED_REGION && endingOp 
!= CigarOperator.SKIPPED_REGION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) + } + + return false; + } + + public static final int countRefBasesBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex){ + int result = 0; + for(int i = cigarStartIndex; i= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) + //TODO since we do not have contracts any more, should we check for the requirements in the method code? public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); } @@ -479,9 +480,11 @@ public class ReadUtils { public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { int readBases = 0; int refBases = 0; - boolean fallsInsideDeletion = false; + boolean fallsInsideDeletionOrSkippedRegion = false; + boolean endJustBeforeDeletionOrSkippedRegion = false; + boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false; - int goal = refCoord - alignmentStart; // The goal is to move this many reference bases + final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases if (goal < 0) { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); @@ -493,7 +496,7 @@ public class ReadUtils { Iterator cigarElementIterator = cigar.getCigarElements().iterator(); while (!goalReached && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); + final CigarElement cigarElement = cigarElementIterator.next(); int shift = 0; if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { @@ -511,7 +514,7 @@ public class ReadUtils { 
if (goalReached) { // Is this base's reference position within this cigar element? Or did we use it all? - boolean endsWithinCigar = shift < cigarElement.getLength(); + final boolean endsWithinCigar = shift < cigarElement.getLength(); // If it isn't, we need to check the next one. There should *ALWAYS* be a next one // since we checked if the goal coordinate is within the read length, so this is just a sanity check. @@ -523,13 +526,13 @@ public class ReadUtils { } } - CigarElement nextCigarElement; + CigarElement nextCigarElement = null; - // if we end inside the current cigar element, we just have to check if it is a deletion + // if we end inside the current cigar element, we just have to check if it is a deletion (or skipped region) if (endsWithinCigar) - fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + fallsInsideDeletionOrSkippedRegion = (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) ; - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + // if we end outside the current cigar element, we need to check if the next element is an insertion, deletion or skipped region. else { nextCigarElement = cigarElementIterator.next(); @@ -547,22 +550,27 @@ public class ReadUtils { nextCigarElement = cigarElementIterator.next(); } - // if it's a deletion, we will pass the information on to be handled downstream. - fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; + // if it's a deletion (or skipped region), we will pass the information on to be handled downstream. 
+ endJustBeforeDeletionOrSkippedRegion = (nextCigarElement.getOperator() == CigarOperator.DELETION || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION); } - // If we reached our goal outside a deletion, add the shift - if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) + fallsInsideOrJustBeforeDeletionOrSkippedRegion = endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion; + + // If we reached our goal outside a deletion (or skipped region), add the shift + if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) readBases += shift; - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // If we reached our goal just before a deletion (or skipped region) we need // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases()) + // base before the deletion (or skipped region) (see warning in function contracts) + else if (endJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) readBases += shift - 1; - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion - else if (fallsInsideDeletion && endsWithinCigar) + // If we reached our goal inside a deletion (or skipped region), or just between a deletion and a skipped region, + // then we must backtrack to the last base before the deletion (or skipped region) + else if (fallsInsideDeletionOrSkippedRegion || + (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.N)) || + (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.D))) readBases--; } } @@ -575,7 +583,7 @@ public class ReadUtils { } } - return new 
Pair(readBases, fallsInsideDeletion); + return new Pair(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 1ae34e268..c36d7f888 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -790,7 +790,6 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, @@ -801,10 +800,9 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC, - final boolean combineAnnotations ) { + final boolean mergeInfoWithMaxAC ) { int originalNumOfVCs = priorityListOfVCs == null ? 
0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, Collections.emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations); + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); } /** @@ -817,7 +815,6 @@ public class GATKVariantContextUtils { * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ * * @param unsortedVCs collection of unsorted VCs - * @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes @@ -826,11 +823,9 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. 
* @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, - final Collection potentialRefVCs, final List priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, @@ -839,9 +834,7 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC, - final boolean combineAnnotations ) { - + final boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -860,9 +853,6 @@ public class GATKVariantContextUtils { VCs.add(vc); } - // cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible - VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs); - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -950,38 +940,19 @@ public class GATKVariantContextUtils { for (final Map.Entry p : vc.getAttributes().entrySet()) { final String key = p.getKey(); final Object value = p.getValue(); - boolean badAnnotation = false; - if ( combineAnnotations ) { // add the annotation values to a list for combining later - List values = annotationMap.get(key); - if( values == null ) { - values = new ArrayList<>(); - annotationMap.put(key, values); - } - try { - final String stringValue = value.toString(); - // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. - if (stringValue.contains(".")) - values.add(Double.parseDouble(stringValue)); - else - values.add(Integer.parseInt(stringValue)); - } catch (NumberFormatException e) { - badAnnotation = true; - } - } - if ( ! combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC - // if we don't like the key already, don't go anywhere - if ( ! 
inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - attributes.put(key, value); - } + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); } } } @@ -1007,12 +978,6 @@ public class GATKVariantContextUtils { // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided - for ( final Map.Entry> p : annotationMap.entrySet() ) { - if ( ! 
p.getValue().isEmpty() ) { - attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); - } - } } // if at least one record was unfiltered and we want a union, clear all of the filters @@ -1058,11 +1023,6 @@ public class GATKVariantContextUtils { builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); } builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); - if( combineAnnotations ) { - // unfortunately some attributes are just too dangerous to try to combine together - builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); - builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - } // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); @@ -1070,69 +1030,202 @@ public class GATKVariantContextUtils { return merged; } - private static final Comparable combineAnnotationValues( final List array ) { + private static Comparable combineAnnotationValues( final List array ) { return MathUtils.median(array); // right now we take the median but other options could be explored } /** - * cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible - * @param VCs the list of VCs in which to fill in symbolic alleles - * @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles - * @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate allele + * Merges VariantContexts from gVCFs into a single hybrid. + * Assumes that none of the input records are filtered. + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. 
don't start at the location in loc); if null, we'll return null in this case + * @return new VariantContext representing the merge of all VCs or null if it not relevant */ - protected static final List fillInNonRefSymbolicAlleles( final List VCs, final Collection potentialRefVCs ) { - if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); } - if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); } + public static VariantContext referenceConfidenceMerge(final List VCs, final GenomeLoc loc, final Byte refBase) { - final List VCsToReturn = new ArrayList<>(VCs.size()); - boolean containsNonRefSymbolicAllele = false; - VariantContext nonRefVC = null; - for( final VariantContext vc : VCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - containsNonRefSymbolicAllele = true; - } else if ( nonRefVC == null ) { - nonRefVC = vc; - } - if( nonRefVC != null && containsNonRefSymbolicAllele == true ) { - break; // break out so that we don't run over the whole list unnecessarily - } + if ( VCs == null || VCs.size() == 0 ) throw new IllegalArgumentException("VCs cannot be null or empty"); + + // establish the baseline info (sometimes from the first VC) + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + + // ref allele + final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase); + if ( refAllele == null ) + return null; + + // alt alleles + final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc); + final List alleles = getAllelesListFromMapper(refAllele, alleleMapper); + + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + + VariantContext longestVC = first; + int depth = 0; + final Map> annotationMap = new LinkedHashMap<>(); + GenotypesContext genotypes = GenotypesContext.create(); + + // 
cycle through and add info from the other VCs + for ( final VariantContext vc : VCs ) { + + // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) + final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + final List remappedAlleles = isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles()); + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles); + + // special case DP (add it up) for all events + if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + + if ( isSpanningEvent ) + continue; + + // keep track of the longest location that starts here + if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) + longestVC = vc; + + // special case ID (just preserve it) + if ( vc.hasID() ) rsIDs.add(vc.getID()); + + // add attributes + addReferenceConfidenceAttributes(vc.getAttributes(), attributes, inconsistentAttributes, annotationMap); } - for( final VariantContext vc : potentialRefVCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - containsNonRefSymbolicAllele = true; - VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below + + // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! 
p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); } } - if( !containsNonRefSymbolicAllele ) { - return VCs; - } + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - for( final VariantContext vc : VCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site - if( nonRefVC != null ) { - final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size()); - int depth = 0; - for( final String sample : vc.getSampleNames() ) { - final Genotype gt = vc.getGenotype(sample); - final ArrayList refAlleles = new ArrayList<>(2); - refAlleles.add(nonRefVC.getReference()); - refAlleles.add(nonRefVC.getReference()); - final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now - depth += ( gt.hasDP() ? gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here - genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make()); - } - VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make()); - } - } else { - VCsToReturn.add(vc); - } - } + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - return VCsToReturn; + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles) + .loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()) + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later + + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(builder); + + return builder.make(); } - private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + /** + * Determines the ref allele given the provided reference base at this position + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning + * @return new Allele or null if no reference allele/base is available + */ + private static Allele determineReferenceAlleleGiveReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + final Allele refAllele = determineReferenceAllele(VCs, loc); + if ( refAllele == null ) + return ( refBase == null ? 
null : Allele.create(refBase, true) ); + return refAllele; + } + + /** + * Creates an alleles list given a reference allele and a mapper + * + * @param refAllele the reference allele + * @param alleleMapper the allele mapper + * @return a non-null, non-empty list of Alleles + */ + private static List getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) { + final List alleles = new ArrayList<>(); + alleles.add(refAllele); + alleles.addAll(alleleMapper.getUniqueMappedAlleles()); + return alleles; + } + + /** + * Remove the stale attributes from the merged VariantContext (builder) + * + * @param builder the VC builder + */ + private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) { + builder.rmAttributes(Arrays.asList(VCFConstants.ALLELE_COUNT_KEY, + VCFConstants.ALLELE_FREQUENCY_KEY, + VCFConstants.ALLELE_NUMBER_KEY, + VCFConstants.MLE_ALLELE_COUNT_KEY, + VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + } + + /** + * Adds attributes to the global map from the new context in a sophisticated manner + * + * @param myAttributes attributes to add from + * @param globalAttributes global set of attributes to add to + * @param inconsistentAttributes set of attributes that are inconsistent among samples + * @param annotationMap map of annotations for combining later + */ + private static void addReferenceConfidenceAttributes(final Map myAttributes, + final Map globalAttributes, + final Set inconsistentAttributes, + final Map> annotationMap) { + for ( final Map.Entry p : myAttributes.entrySet() ) { + final String key = p.getKey(); + final Object value = p.getValue(); + boolean badAnnotation = false; + + // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with 
the ? operator. + if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (final NumberFormatException e) { + badAnnotation = true; + } + + // only output annotations that have the same value in every input VC + if ( badAnnotation && ! inconsistentAttributes.contains(key) ) { + checkForConsistency(key, value, globalAttributes, inconsistentAttributes); + } + } + } + + /** + * Check attributes for consistency to others in the merge + * + * @param key the attribute key + * @param value the attribute value + * @param globalAttributes the global list of attributes being merged + * @param inconsistentAttributes the list of inconsistent attributes in the merge + */ + private static void checkForConsistency(final String key, + final Object value, + final Map globalAttributes, + final Set inconsistentAttributes) { + final boolean alreadyFound = globalAttributes.containsKey(key); + final Object boundValue = globalAttributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + globalAttributes.remove(key); + } else if ( ! 
alreadyFound || boundIsMissingValue ) { // no value + globalAttributes.put(key, value); + } + } + + private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); @@ -1262,65 +1355,130 @@ public class GATKVariantContextUtils { return builder.make(); } - static private Allele determineReferenceAllele(List VCs) { + static private Allele determineReferenceAllele(final List VCs) { + return determineReferenceAllele(VCs, null); + } + + /** + * Determines the common reference allele + * + * @param VCs the list of VariantContexts + * @param loc if not null, ignore records that do not begin at this start location + * @return possibly null Allele + */ + static private Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { Allele ref = null; for ( final VariantContext vc : VCs ) { - final Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + if ( contextMatchesLoc(vc, loc) ) { + final Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. 
%s", vc.getChr(), vc.getStart(), ref, myRef)); + } } return ref; } - static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { + static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { + return loc == null || loc.getStart() == vc.getStart(); + } + + /** + * Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts. + * + * @param VCs the list of VariantContexts + * @param refAllele the reference allele + * @param loc if not null, ignore records that do not begin at this start location + * @return non-null AlleleMapper + */ + static private AlleleMapper determineAlternateAlleleMapping(final List VCs, final Allele refAllele, final GenomeLoc loc) { + final Map map = new HashMap<>(); + + for ( final VariantContext vc : VCs ) { + if ( contextMatchesLoc(vc, loc) ) + addAllAlternateAllelesToMap(vc, refAllele, map); + } + + return new AlleleMapper(map); + } + + /** + * Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper) + * + * @param vc the VariantContext + * @param refAllele the reference allele + * @param map the allele mapping to populate + */ + static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map map) { + // if the ref allele matches, then just add the alts as is + if ( refAllele.equals(vc.getReference()) ) { + for ( final Allele altAllele : vc.getAlternateAlleles() ) { + // ignore symbolic alleles + if ( ! altAllele.isSymbolic() ) + map.put(altAllele, altAllele); + } + } + else { + map.putAll(createAlleleMapping(refAllele, vc, map.values())); + } + } + + static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { if ( refAllele.equals(vc.getReference()) ) return new AlleleMapper(vc); else { - // we really need to do some work. 
The refAllele is the longest reference allele seen at this - // start site. So imagine it is: - // - // refAllele: ACGTGA - // myRef: ACGT - // myAlt: A - // - // We need to remap all of the alleles in vc to include the extra GA so that - // myRef => refAllele and myAlt => AGA - // - - Allele myRef = vc.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - -// System.out.printf("Remapping allele at %s%n", vc); -// System.out.printf("ref %s%n", refAllele); -// System.out.printf("myref %s%n", myRef ); -// System.out.printf("extrabases %s%n", new String(extraBases)); - - Map map = new HashMap<>(); - for ( final Allele a : vc.getAlleles() ) { - if ( a.isReference() ) - map.put(a, refAllele); - else { - Allele extended = Allele.extend(a, extraBases); - for ( final Allele b : allAlleles ) - if ( extended.equals(b) ) - extended = b; -// System.out.printf(" Extending %s => %s%n", a, extended); - map.put(a, extended); - } - } - - // debugging -// System.out.printf("mapping %s%n", map); - + final Map map = createAlleleMapping(refAllele, vc, allAlleles); + map.put(vc.getReference(), refAllele); return new AlleleMapper(map); } } + /** + * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele + * + * The refAllele is the longest reference allele seen at this start site. 
+ * So imagine it is: + * refAllele: ACGTGA + * myRef: ACGT + * myAlt: A + * + * We need to remap all of the alleles in vc to include the extra GA so that + * myRef => refAllele and myAlt => AGA + * + * @param refAllele the new (extended) reference allele + * @param oneVC the Variant Context to extend + * @param currentAlleles the list of alleles already created + * @return a non-null mapping of original alleles to new (extended) ones + */ + private static Map createAlleleMapping(final Allele refAllele, + final VariantContext oneVC, + final Collection currentAlleles) { + final Allele myRef = oneVC.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + + final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + + final Map map = new HashMap<>(); + for ( final Allele a : oneVC.getAlternateAlleles() ) { + if ( isUsableAlternateAllele(a) ) { + Allele extended = Allele.extend(a, extraBases); + for ( final Allele b : currentAlleles ) + if ( extended.equals(b) ) + extended = b; + map.put(a, extended); + } + } + + return map; + } + + static private boolean isUsableAlternateAllele(final Allele allele) { + return ! 
(allele.isReference() || allele.isSymbolic() ); + } + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); @@ -1352,6 +1510,124 @@ public class GATKVariantContextUtils { } } + /** + * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele + * + * @param alleles list of alleles to replace + * @return non-null list of alleles + */ + private static List replaceWithNoCalls(final List alleles) { + if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); + + final List result = new ArrayList<>(alleles.size()); + for ( final Allele allele : alleles ) + result.add(allele.equals(NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL); + return result; + } + + /** + * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. + * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. 
+ * + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + */ + private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, + final VariantContext VC, + final List remappedAlleles, + final List targetAlleles) { + for ( final Genotype g : VC.getGenotypes() ) { + + // only add if the name is new + final String name = g.getSampleName(); + if ( !mergedGenotypes.containsSample(name) ) { + // we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways + final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles); + final int[] PLs = generatePLs(g, indexesOfRelevantAlleles); + // note that we set the alleles to null here (as we expect it to be re-genotyped) + final Genotype newG = new GenotypeBuilder(g).name(name).alleles(null).PL(PLs).noAD().noGQ().make(); + mergedGenotypes.add(newG); + } + } + } + + /** + * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * If the myAlleles set does not contain "" as an allele, it throws an exception. 
+ * + * @param remappedAlleles the list of alleles to evaluate + * @param targetAlleles the target list of alleles + * @return non-null array of ints representing indexes + */ + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles) { + + if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + + if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) ) + throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records"); + final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE); + + final int[] indexMapping = new int[targetAlleles.size()]; + + // the reference alleles always match up (even if they don't appear to) + indexMapping[0] = 0; + + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele; + } + + return indexMapping; + } + + /** + * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs. + * Throws an exception if the Genotype does not contain PLs. 
+ * + * @param genotype the genotype from which to grab PLs + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new PLs + */ + protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) { + if ( !genotype.hasPL() ) + throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs"); + + final int[] originalPLs = genotype.getPL(); + + // assume diploid + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(indexesOfRelevantAlleles.length, 2); + final int[] newPLs = new int[numLikelihoods]; + + for ( int i = 0; i < indexesOfRelevantAlleles.length; i++ ) { + for ( int j = i; j < indexesOfRelevantAlleles.length; j++ ) { + final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[i], indexesOfRelevantAlleles[j]); + if ( originalPLindex >= originalPLs.length ) + throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length); + + final int newPLindex = GenotypeLikelihoods.calculatePLindex(i, j); + newPLs[newPLindex] = originalPLs[originalPLindex]; + } + } + + return newPLs; + } + + /** + * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() + * + * @param originalIndex1 the index of the first allele + * @param originalIndex2 the index of the second allele + * @return the PL index + */ + protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { + // we need to make sure they are ordered correctly + return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); + } + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { return uniquify ? sampleName + "." 
+ trackName : sampleName; } @@ -1717,6 +1993,15 @@ public class GATKVariantContextUtils { } return newAs; } + + /** + * @return the list of unique values + */ + public List getUniqueMappedAlleles() { + if ( map == null ) + return Collections.emptyList(); + return new ArrayList<>(new HashSet<>(map.values())); + } } private static class CompareByPriority implements Comparator, Serializable { diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index cbbc8252b..6dd4d8104 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -31,6 +31,7 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -57,11 +58,15 @@ public class ReadClipperTestUtils { } public static GATKSAMRecord makeReadFromCigar(String cigarString) { - return makeReadFromCigar(cigarFromString(cigarString)); + return makeReadFromCigar(CigarUtils.cigarFromString(cigarString)); + } + + public static List generateCigarList(int maximumLength) { + return generateCigarList(maximumLength, cigarElements); } /** - * This function generates every valid permutation of cigar strings with a given length. + * This function generates every valid permutation of cigar strings (with a given set of cigarElement) with a given length. 
* * A valid cigar object obeys the following rules: * - No Hard/Soft clips in the middle of the read @@ -72,7 +77,7 @@ public class ReadClipperTestUtils { * @param maximumLength the maximum number of elements in the cigar * @return a list with all valid Cigar objects */ - public static List generateCigarList(int maximumLength) { + public static List generateCigarList(int maximumLength, CigarElement[] cigarElements) { int numCigarElements = cigarElements.length; LinkedList cigarList = new LinkedList(); byte [] cigarCombination = new byte[maximumLength]; @@ -80,9 +85,9 @@ public class ReadClipperTestUtils { Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array. int currentIndex = 0; while (true) { - Cigar cigar = createCigarFromCombination(cigarCombination); // create the cigar - cigar = combineAdjacentCigarElements(cigar); // combine adjacent elements - if (isCigarValid(cigar)) { // check if it's valid + Cigar cigar = createCigarFromCombination(cigarCombination, cigarElements); // create the cigar + cigar = CigarUtils.combineAdjacentCigarElements(cigar); // combine adjacent elements + if (CigarUtils.isCigarValid(cigar)) { // check if it's valid cigarList.add(cigar); // add it } @@ -107,41 +112,7 @@ public class ReadClipperTestUtils { return cigarList; } - private static boolean isCigarValid(Cigar cigar) { - if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) - - Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator - CigarOperator startingOp = null; - CigarOperator endingOp = null; - - // check if it doesn't start with deletions - boolean readHasStarted = false; // search the list of elements for the starting operator - for (CigarElement cigarElement : cigar.getCigarElements()) { - if (!readHasStarted) { - if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() 
!= CigarOperator.HARD_CLIP) { - readHasStarted = true; - startingOp = cigarElement.getOperator(); - } - } - cigarElementStack.push(cigarElement); - } - - while (!cigarElementStack.empty()) { - CigarElement cigarElement = cigarElementStack.pop(); - if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { - endingOp = cigarElement.getOperator(); - break; - } - } - - if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION) - return true; // we don't accept reads starting or ending in deletions (add any other constraint here) - } - - return false; - } - - private static Cigar createCigarFromCombination(byte[] cigarCombination) { + private static Cigar createCigarFromCombination(byte[] cigarCombination, CigarElement[] cigarElements) { Cigar cigar = new Cigar(); for (byte i : cigarCombination) { cigar.add(cigarElements[i]); @@ -149,38 +120,6 @@ public class ReadClipperTestUtils { return cigar; } - - /** - * Combines equal adjacent elements of a Cigar object - * - * @param rawCigar the cigar object - * @return a combined cigar object - */ - private static Cigar combineAdjacentCigarElements(Cigar rawCigar) { - Cigar combinedCigar = new Cigar(); - CigarElement lastElement = null; - int lastElementLength = 0; - for (CigarElement cigarElement : rawCigar.getCigarElements()) { - if (lastElement != null && - ((lastElement.getOperator() == cigarElement.getOperator()) || - (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || - (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) - lastElementLength += cigarElement.getLength(); - else - { - if (lastElement != null) - combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); - - lastElement = cigarElement; - lastElementLength = cigarElement.getLength(); - } - } - if (lastElement != null) - combinedCigar.add(new 
CigarElement(lastElementLength, lastElement.getOperator())); - - return combinedCigar; - } - public static GATKSAMRecord makeRead() { return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); } @@ -201,34 +140,5 @@ public class ReadClipperTestUtils { // Otherwise test if they're both empty else Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); - } - - public static Cigar invertCigar (Cigar cigar) { - Stack cigarStack = new Stack(); - for (CigarElement cigarElement : cigar.getCigarElements()) - cigarStack.push(cigarElement); - - Cigar invertedCigar = new Cigar(); - while (!cigarStack.isEmpty()) - invertedCigar.add(cigarStack.pop()); - - return invertedCigar; - } - - /** - * Checks whether or not the read has any cigar element that is not H or S - * - * @param read the read - * @return true if it has any M, I or D, false otherwise - */ - public static boolean readHasNonClippedBases(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) - if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) - return true; - return false; - } - - public static Cigar cigarFromString(String cigarString) { - return TextCigarCodec.getSingleton().decode(cigarString); - } + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index d6bd0d4d2..cc008404c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; 
import org.testng.annotations.BeforeClass; @@ -241,7 +242,7 @@ public class ReadClipperUnitTest extends BaseTest { int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) - expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); + expectedLength -= leadingCigarElementLength(CigarUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); if (!clippedRead.isEmpty()) { Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there @@ -256,7 +257,7 @@ public class ReadClipperUnitTest extends BaseTest { public void testRevertSoftClippedBases() { for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); - final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(CigarUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); @@ -278,7 +279,7 @@ public class ReadClipperUnitTest extends BaseTest { public void testRevertSoftClippedBasesWithThreshold() { for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); - final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(CigarUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord unclipped = 
ReadClipper.revertSoftClippedBases(read); @@ -348,7 +349,7 @@ public class ReadClipperUnitTest extends BaseTest { * @param clipped clipped read */ private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) { - if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) { + if (CigarUtils.readHasNonClippedBases(clipped)) { Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart()); Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd()); } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 7e085547f..5732e5746 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -220,7 +220,7 @@ public class ReadUtilsUnitTest extends BaseTest { } @Test (enabled = true) - public void testReadWithNs() throws FileNotFoundException { + public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException { final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); @@ -232,6 +232,22 @@ public class ReadUtilsUnitTest extends BaseTest { read.setCigarString("3M414N1D73M"); final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 2); + } + + @Test (enabled = true) + public void testReadWithNsRefAfterDeletion() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = 
ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9393, ReadUtils.ClippingTail.LEFT_TAIL); Assert.assertEquals(result, 3); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 30f112241..23a24e180 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -27,12 +27,9 @@ package org.broadinstitute.sting.utils.variant; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -182,7 +179,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -240,7 +237,7 @@ public class 
GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); Assert.assertEquals(merged.getID(), cfg.expected); } @@ -355,7 +352,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -482,7 +479,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = GATKVariantContextUtils.simpleMerge( cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -523,7 +520,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false); + 
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); // test genotypes Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); @@ -556,7 +553,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); if ( annotate ) Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); @@ -1036,26 +1033,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } - @Test(enabled = !DEBUG) - public void testFillInNonRefSymbolicAlleles() { - final int start = 10; - final String ref = "A"; - final String alt = "C"; - final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); - final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">")); - - List VCs = Arrays.asList(vcAlt, vcRef); - VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.emptyList()); - - // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele - for( final VariantContext vc : VCs ) { - Assert.assertTrue(vc.getAlternateAlleles().size() == 1); - Assert.assertTrue(vc.getAlternateAllele(0).isNonReference()); - Assert.assertTrue(!vc.getReference().isSymbolic()); - Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic()); - } - } - // -------------------------------------------------------------------------------- // // test allele remapping @@ -1428,4 +1405,225 @@ public class 
GATKVariantContextUtilsUnitTest extends BaseTest { assertGenotypesAreEqual(actualGT, expected); } } + + // -------------------------------------------------------------------------------- + // + // Test methods for merging reference confidence VCs + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "generatePLsData") + public Object[][] makeGeneratePLsData() { + final List tests = new ArrayList<>(); + + for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { + for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { + for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { + final int[] indexes = new int[originalAlleles]; + for ( int i = 0; i < originalAlleles; i++ ) + indexes[i] = i; + indexes[swapPosition1] = swapPosition2; + indexes[swapPosition2] = swapPosition1; + tests.add(new Object[]{originalAlleles, indexes}); + } + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "generatePLsData") + public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { + + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); + final int[] PLs = new int[numLikelihoods]; + for ( int i = 0; i < numLikelihoods; i++ ) + PLs[i] = i; + + final List alleles = new ArrayList<>(numOriginalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < numOriginalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make(); + + final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); + + Assert.assertEquals(newPLs.length, numLikelihoods); + + final int[] expectedPLs = new int[numLikelihoods]; + for ( int i = 0; i < numOriginalAlleles; i++ ) { + for ( int j = i; j < numOriginalAlleles; j++ ) { + final int index = 
GenotypeLikelihoods.calculatePLindex(i, j); + final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); + expectedPLs[index] = value; + } + } + + for ( int i = 0; i < numLikelihoods; i++ ) { + Assert.assertEquals(newPLs[i], expectedPLs[i]); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < 
allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).make(); + final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).make(); + final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, 
GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).make(); + final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + + // first test the case of a single record + tests.add(new Object[]{Arrays.asList(vcA_C_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, + new 
VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).make()).make()}); + + // combination of all + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not 
want ref-only contexts + tests.add(new Object[]{Arrays.asList(vcAA_ALT), + loc, false, + null}); + tests.add(new Object[]{Arrays.asList(vcAA_ALT), + loc, true, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { + final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte)'A' : null); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles()); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples()); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java index a1b75a3f1..bb794dca4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { GATKVariantContextUtils.simpleMerge(toMerge, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, 
GATKVariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true, false); + true, false, "set", false, true); } }; diff --git a/settings/repository/org.broadinstitute/variant-1.104.1634.jar b/settings/repository/org.broadinstitute/variant-1.105.1642.jar similarity index 89% rename from settings/repository/org.broadinstitute/variant-1.104.1634.jar rename to settings/repository/org.broadinstitute/variant-1.105.1642.jar index 6724e0588..c173d0fdd 100644 Binary files a/settings/repository/org.broadinstitute/variant-1.104.1634.jar and b/settings/repository/org.broadinstitute/variant-1.105.1642.jar differ diff --git a/settings/repository/org.broadinstitute/variant-1.104.1634.xml b/settings/repository/org.broadinstitute/variant-1.105.1642.xml similarity index 70% rename from settings/repository/org.broadinstitute/variant-1.104.1634.xml rename to settings/repository/org.broadinstitute/variant-1.105.1642.xml index 9e4b49241..d1846ce23 100644 --- a/settings/repository/org.broadinstitute/variant-1.104.1634.xml +++ b/settings/repository/org.broadinstitute/variant-1.105.1642.xml @@ -1,3 +1,3 @@ - +