Merge remote-tracking branch 'origin/master' into mf_new_RBP

This commit is contained in:
Menachem Fromer 2014-01-03 01:13:40 -05:00
commit d1275651ae
35 changed files with 1939 additions and 573 deletions

View File

@ -80,12 +80,9 @@ public class StrandBiasBySample extends GenotypeAnnotation implements Experiment
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
if ( ! isAppropriateInput(alleleLikelihoodMap, g) )
return;
if (alleleLikelihoodMap == null )
throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller");
final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc);
gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table));
@ -97,4 +94,7 @@ public class StrandBiasBySample extends GenotypeAnnotation implements Experiment
@Override
public List<VCFFormatHeaderLine> getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); }
private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) {
return ! (map == null || g == null || !g.isCalled());
}
}

View File

@ -286,7 +286,7 @@ public class ConsensusAlleleCounter {
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false);
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
return mergedVC.getAlleles();
}
}

View File

@ -183,7 +183,7 @@ public class GenotypingEngine {
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
// Merge the event to find a common reference representation
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false);
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {

View File

@ -262,7 +262,7 @@ public class PairHMMIndelErrorModel {
* @return true if the read needs to be clipped, false otherwise
*/
protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) {
return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() > refWindowStop );
return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() - 1 > refWindowStop );
}
/**
@ -316,7 +316,7 @@ public class PairHMMIndelErrorModel {
// if the read extends beyond the downstream (right) end of the reference window, clip it
if ( mustClipDownstream(read, refWindowStop) )
read = ReadClipper.hardClipByReadCoordinates(read, read.getSoftStart() + read.getReadLength() - refWindowStop + 1, read.getReadLength() - 1);
read = ReadClipper.hardClipByReadCoordinates(read, refWindowStop - read.getSoftStart() + 1, read.getReadLength() - 1);
// if the read extends beyond the upstream (left) end of the reference window, clip it
if ( mustClipUpstream(read, refWindowStart) )

View File

@ -0,0 +1,266 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.rnaseq;
import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.BAQMode;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.sam.CigarUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.*;
/**
*
* Splits reads that contain Ns in their cigar string (e.g. spanning splicing events).
*
* Identifies all N cigar elements and creates k+1 new reads (where k is the number of N cigar elements).
* The first read includes the bases that are to the left of the first N element, while the part of the read that is to the right of the N
* (including the Ns) is hard clipped and so on for the rest of the new reads.
*
*
* User: ami
* Date: 11/14/13
* Time: 11:52 AM
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
@Requires({DataSource.READS, DataSource.REFERENCE})
public class SplitNCigarReads extends ReadWalker<List<GATKSAMRecord>, SAMFileWriter> {
@Output(doc="Write output to this BAM filename instead of STDOUT")
StingSAMFileWriter out;
// Destination for the optional split-position report emitted in onTraversalDone; defaults to STDOUT.
@Argument(required = false)
PrintStream splitPositionsOutput = System.out;
@Argument(fullName="outputAsBED", shortName="bed", doc="Output as BED file", required=false)
boolean outputAsBED = false;
@Argument(fullName="printSplitPositions", shortName="splitPosition", doc="print the split positions", required=false)
boolean printSplitPositions = false;
public static final String PROGRAM_RECORD_NAME = "GATK SplitNCigarReads"; // The name that will go in the @PG tag
// public static SplitPositions splitPositions = null;
// Accumulates the split-position report text printed at traversal end.
// NOTE(review): nothing in this class currently appends to it — the appending code in
// splitReadBasedOnCigar is commented out — so the printed report is always empty. Confirm intended.
public static String results = "";
/**
* The initialize function.
* Sets up the output BAM writer (adding the @PG program record) and marks it as not presorted,
* since splitting can emit reads whose order differs from the input.
*/
public void initialize() {
final GenomeAnalysisEngine toolkit = getToolkit();
final boolean preSorted = false;
if (getToolkit() != null) {
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME);
}
// NOTE(review): this call is outside the null-check above, so it runs even when
// getToolkit() == null (e.g. in some test harnesses) — confirm 'out' is always injected there.
out.setPresorted(preSorted);
// splitPositions = new SplitPositions();
}
/**
* The reads map function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a GATKSAMRecord
*
* @return a list of split read if there are N's in the cigar string, or the read itself.
*/
public List<GATKSAMRecord> map(final ReferenceContext ref,final GATKSAMRecord read,final RefMetaDataTracker metaDataTracker) {
return splitNCigarRead(read);
}
/**
* Goes through the cigar string of the read and create new reads for each consecutive non-N elements (while hard clipping the rest of the read).
* For example: for a read with cigar '1H2M2D1M2N1M2I1N1M2S' 3 new reads will be created with cigar strings: 1H2M2D1M, 1M2I and 1M2S
*
* @param read the read to split, as a GATKSAMRecord
* @return a list of split read if there are N's in the cigar string, or the read itself.
*/
public static List<GATKSAMRecord> splitNCigarRead(final GATKSAMRecord read){
final List<GATKSAMRecord> splitReads = new ArrayList<>();
int firstCigarIndex = 0;
// Walk the cigar; every N element closes off the preceding non-N section as one new read.
// NOTE(review): a cigar that BEGINS with an N element would invoke splitReadBasedOnCigar with an
// empty range (cigarStartIndex == cigarEndIndex == 0), which indexes element -1 below — confirm
// such cigars are filtered out upstream.
for (int i = 0; i < read.getCigar().numCigarElements(); i ++){
final CigarElement cigarElement = read.getCigar().getCigarElement(i);
if(cigarElement.getOperator() == CigarOperator.N){
final boolean addToSplitPositions = true;
splitReads.add(splitReadBasedOnCigar(read,firstCigarIndex,i, addToSplitPositions));
firstCigarIndex = i+1;
}
}
//if there are no N's in the read
if (firstCigarIndex == 0)
splitReads.add(read);
//add the last section of the read: from the last N to the the end of the read
// (it will be done for all the usual cigar string that does not end with N)
else if(firstCigarIndex < read.getCigar().numCigarElements()){
final boolean addToSplitPositions = false;
splitReads.add(splitReadBasedOnCigar(read,firstCigarIndex,read.getCigar().numCigarElements(), addToSplitPositions));
}
return splitReads;
}
/**
* reduceInit is called once before any calls to the map function. We use it here to setup the splitPositionsOutput
* bam file, if it was specified on the command line
*
* @return SAMFileWriter, set to the BAM splitPositionsOutput file if the command line option was set, null otherwise
*/
public SAMFileWriter reduceInit() {
return out;
}
/**
* given a read and a splitPositionsOutput location, reduce by emitting the read
*
* @param reads the split reads itself
* @param output the splitPositionsOutput source
* @return the SAMFileWriter, so that the next reduce can emit to the same source
*/
public SAMFileWriter reduce(final List<GATKSAMRecord> reads,final SAMFileWriter output ) {
for (final GATKSAMRecord read: reads)
output.addAlignment(read);
return output;
}
/**
* Called once after the traversal finishes; optionally prints the accumulated split-position
* report (see the note on {@code results} above — it is currently never populated).
*/
public void onTraversalDone(SAMFileWriter readResult) {
super.onTraversalDone(readResult);
if(printSplitPositions)
splitPositionsOutput.println(results);
// splitPositionsOutput.println(splitPositions);
}
/**
* Clips the read down to the single non-N cigar section [cigarStartIndex, cigarEndIndex).
*
* @param read the read being split
* @param cigarStartIndex index of the first cigar element of the section (inclusive)
* @param cigarEndIndex index one past the last cigar element of the section (exclusive)
* @param addToSplitPositions whether this section ends at an N (i.e. a splice junction worth reporting)
* @return a new read hard-clipped to the reference span covered by the section
*/
private static GATKSAMRecord splitReadBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex, final boolean addToSplitPositions){
int cigarFirstIndex = cigarStartIndex;
int cigarSecondIndex = cigarEndIndex;
//in case a section of the read is end or start with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element
// it can be if, but it was kept as while to make sure the code can work with Cigar string that were not "cleaned"
while(read.getCigar().getCigarElement(cigarFirstIndex).getOperator().equals(CigarOperator.D))
cigarFirstIndex++;
while(read.getCigar().getCigarElement(cigarSecondIndex-1).getOperator().equals(CigarOperator.D))
cigarSecondIndex--;
if(cigarFirstIndex > cigarSecondIndex)
throw new UserException.BadInput("Cannot split this read (might be an empty section between Ns, for example 1N1D1N): "+read.getCigarString());
// we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex (inclusive).
// the other sections of the read are clipped:
final int startRefIndex = read.getOriginalAlignmentStart() + CigarUtils.countRefBasesBasedOnCigar(read,0,cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index.
final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index.
// NOTE(review): splitPosition is computed but unused — both consumers below are commented out,
// which is why the 'results' report stays empty. Either wire this up or drop the computation.
if(addToSplitPositions){
final int splitPosition = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end.
final String contig = read.getReferenceName();
// results += String.format("%s:%d-%d\n", contig, splitPosition, splitPosition );
// splitPositions.addSplitPosition(contig,splitPosition);
}
return ReadClipper.hardClipToRegionIncludingClippedBases(read, startRefIndex, stopRefIndex);
}
/**
* Immutable (contig, start, end) record for a single split position.
* NOTE(review): only referenced by the (also unused) SplitPositions container below.
*/
private class SplitPosition {
public final String contig;
public final int start;
public final int end;
public SplitPosition(final String c, final int position) {
contig = c;
start = position;
end = position;
}
}
/**
* Collects unique split positions and formats them as BED or contig:start-end lines.
* NOTE(review): currently dead code — all call sites are commented out. Also, SplitPosition does
* not override equals/hashCode, so the HashSet would not deduplicate identical positions.
*/
private class SplitPositions {
private final HashSet<SplitPosition> splitPositions;
public SplitPositions() {
splitPositions = new HashSet<>();
}
public void addSplitPosition(final String contig, final int position) {
final SplitPosition newSplitPosition = new SplitPosition(contig, position);
splitPositions.add(newSplitPosition);
}
public String toString() {
String result = ""; // = "Contig\tstart\tstop\n";
for (SplitPosition position: splitPositions) {
if (outputAsBED)
result += String.format("%s\t%d\t%d\n", position.contig, position.start-1, position.end );
else
result += String.format("%s:%d-%d\n", position.contig, position.start, position.end );
}
return result;
}
}
}

View File

@ -46,6 +46,7 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import it.unimi.dsi.fastutil.booleans.BooleanLists;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -71,7 +72,7 @@ import java.util.*;
*/
public class VariantDataManager {
private List<VariantDatum> data;
private List<VariantDatum> data = Collections.emptyList();
private double[] meanVector;
private double[] varianceVector; // this is really the standard deviation
public List<String> annotationKeys;
@ -80,7 +81,7 @@ public class VariantDataManager {
protected final List<TrainingSet> trainingSets;
public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
this.data = null;
this.data = Collections.emptyList();
this.annotationKeys = new ArrayList<>( annotationKeys );
this.VRAC = VRAC;
meanVector = new double[this.annotationKeys.size()];
@ -279,6 +280,19 @@ public class VariantDataManager {
return evaluationData;
}
/**
* Remove all VariantDatum's from the data list which are marked as aggregate data
*/
public void dropAggregateData() {
final Iterator<VariantDatum> iter = data.iterator();
while (iter.hasNext()) {
final VariantDatum datum = iter.next();
if( datum.isAggregate ) {
iter.remove();
}
}
}
public List<VariantDatum> getRandomDataForPlotting( final int numToAdd, final List<VariantDatum> trainingData, final List<VariantDatum> antiTrainingData, final List<VariantDatum> evaluationData ) {
final List<VariantDatum> returnData = new ExpandingArrayList<>();
Collections.shuffle(trainingData);

View File

@ -74,7 +74,8 @@ public class VariantDatum {
public int consensusCount;
public GenomeLoc loc;
public int worstAnnotation;
public MultivariateGaussian assignment; // used in K-means implementation
public MultivariateGaussian assignment; // used in K-means implementation
public boolean isAggregate; // this datum was provided to aid in modeling but isn't part of the input callset
public static class VariantDatumLODComparator implements Comparator<VariantDatum>, Serializable {
@Override

View File

@ -152,11 +152,17 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Inputs
/////////////////////////////
/**
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
* These calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
/**
* These additional calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling.
*/
@Input(fullName="aggregate", shortName = "aggregate", doc="Additional raw input variants to be used in building the model", required=false)
public List<RodBinding<VariantContext>> aggregate;
/**
* Any set of VCF files to use as lists of training, truth, or known sites.
* Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
@ -290,29 +296,53 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
return mapList;
}
for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) {
mapList.addAll( addOverlappingVariants(input, true, tracker, context) );
if( aggregate != null ) {
mapList.addAll( addOverlappingVariants(aggregate, false, tracker, context) );
}
return mapList;
}
/**
* Using the RefMetaDataTracker find overlapping variants and pull out the necessary information to create the VariantDatum
* @param rods the rods to search within
* @param isInput is this rod an -input rod?
* @param tracker the RefMetaDataTracker from the RODWalker map call
* @param context the AlignmentContext from the RODWalker map call
* @return a list of VariantDatums, can be empty
*/
private List<VariantDatum> addOverlappingVariants( final List<RodBinding<VariantContext>> rods, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context ) {
if( rods == null ) { throw new IllegalArgumentException("rods cannot be null."); }
if( tracker == null ) { throw new IllegalArgumentException("tracker cannot be null."); }
if( context == null ) { throw new IllegalArgumentException("context cannot be null."); }
final ExpandingArrayList<VariantDatum> variants = new ExpandingArrayList<>();
for( final VariantContext vc : tracker.getValues(rods, context.getLocation()) ) {
if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) {
if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) {
final VariantDatum datum = new VariantDatum();
// Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need.
dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
datum.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vc);
datum.loc = ( isInput ? getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null );
datum.originalQual = vc.getPhredScaledQual();
datum.isSNP = vc.isSNP() && vc.isBiallelic();
datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc);
datum.isAggregate = !isInput;
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
final double priorFactor = QualityUtils.qualToProb( datum.prior );
datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );
mapList.add( datum );
variants.add( datum );
}
}
}
return mapList;
return variants;
}
//---------------------------------------------------------------------------------------------------------------
@ -357,6 +387,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Generate the negative model using the worst performing data and evaluate each variant contrastively
final List<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants();
final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS));
dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory
engine.evaluateData( dataManager.getData(), badModel, true );
if( badModel.failedToConverge || goodModel.failedToConverge ) {

View File

@ -80,6 +80,9 @@ public class VariantRecalibratorEngine {
}
public GaussianMixtureModel generateModel( final List<VariantDatum> data, final int maxGaussians ) {
if( data == null || data.isEmpty() ) { throw new IllegalArgumentException("No data found."); }
if( maxGaussians <= 0 ) { throw new IllegalArgumentException("maxGaussians must be a positive integer but found: " + maxGaussians); }
final GaussianMixtureModel model = new GaussianMixtureModel( maxGaussians, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS );
variationalBayesExpectationMaximization( model, data );
return model;

View File

@ -0,0 +1,228 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.*;
import java.util.*;
/**
* Combines gVCF records that were produced by the Haplotype Caller from single sample sources.
*
* <p>
* CombineReferenceCalculationVariants combines gVCF records that were produced as part of the "single sample discovery"
* pipeline using the '-ERC GVCF' mode of the Haplotype Caller. This tool performs the multi-sample joint aggregation
* step and merges the records together in a sophisticated manner.
*
* At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods,
* re-genotype the newly merged record, and then re-annotate it.
*
*
* <h3>Input</h3>
* <p>
* One or more Haplotype Caller gVCFs to combine.
* </p>
*
* <h3>Output</h3>
* <p>
* A combined VCF.
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineReferenceCalculationVariants \
* --variant input1.vcf \
* --variant input2.vcf \
* -o output.vcf
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-10,stop=10))
public class CombineReferenceCalculationVariants extends RodWalker<VariantContext, VariantContextWriter> implements AnnotatorCompatible, TreeReducible<VariantContextWriter> {

    // TODO -- allow a file of VCF paths to be entered?
    /**
     * The VCF files to merge together
     */
    @Input(fullName="variant", shortName = "V", doc="One or more input VCF files", required=true)
    public List<RodBinding<VariantContext>> variants;

    @Output(doc="File to which variants should be written")
    protected VariantContextWriter vcfWriter = null;

    @Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false)
    public boolean INCLUDE_NON_VARIANTS = false;

    /**
     * Which annotations to recompute for the combined output VCF file.
     */
    @Advanced
    @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false)
    protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"}));

    /**
     * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
     * dbSNP is not used in any way for the calculations themselves.
     */
    @ArgumentCollection
    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

    @Override
    public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }

    // the genotyping engine used to re-genotype each combined record
    private UnifiedGenotyperEngine genotypingEngine;

    // the annotation engine used to re-annotate each combined record
    private VariantAnnotatorEngine annotationEngine;

    // AnnotatorCompatible contract: this walker uses no comp, SnpEff, or resource rod bindings
    @Override
    public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
    @Override
    public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
    @Override
    public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
    @Override
    public boolean alwaysAppendDbsnpId() { return false; }

    /**
     * Merges the input VCF headers, writes the combined header, and builds the
     * genotyping and annotation engines used during traversal.
     */
    @Override
    public void initialize() {
        // take care of the VCF headers
        final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
        final Set<String> samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
        final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
        headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));

        final VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
        vcfWriter.writeHeader(vcfHeader);

        // create the genotyping engine; genotypes are recomputed at the alleles present in the merged record
        final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
        UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH;
        UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
        UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
        genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);

        // create the annotation engine
        annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.<String>emptyList(), this, getToolkit());
    }

    /**
     * Combines all gVCF records spanning the current locus and re-genotypes the result.
     *
     * @param tracker the ref tracker (may be null -- RodWalkers can make funky map calls)
     * @param ref     the reference context at the current locus
     * @param context the alignment context (unused here)
     * @return the re-genotyped, re-annotated combined VariantContext, or null if there is nothing to emit
     */
    @Override
    public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return null;

        final GenomeLoc loc = ref.getLocus();
        final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null);
        if ( combinedVC == null )
            return null;
        return regenotypeVC(tracker, ref, combinedVC);
    }

    /**
     * Re-genotype (and re-annotate) a combined genomic VC
     *
     * @param tracker the ref tracker
     * @param ref the ref context
     * @param combinedVC the combined genomic VC
     * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites
     */
    protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext combinedVC) {
        if ( combinedVC == null ) throw new IllegalArgumentException("combinedVC cannot be null");

        VariantContext result = combinedVC;

        // only re-genotype polymorphic sites
        if ( combinedVC.isVariant() )
            result = genotypingEngine.calculateGenotypes(result);

        // if it turned monomorphic and we don't want such sites, quit
        if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() )
            return null;

        // re-annotate it
        return annotationEngine.annotateContext(tracker, ref, null, result);
    }

    @Override
    public VariantContextWriter reduceInit() {
        return vcfWriter;
    }

    /**
     * Writes the variant produced by map (if any) to the output writer.
     */
    @Override
    public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) {
        if ( vc != null )
            writer.add(vc);
        return writer;
    }

    @Override
    public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) {
        return lhs;
    }

    @Override
    public void onTraversalDone(final VariantContextWriter writer) {}
}

View File

@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("3d12bdb816d27bf7c9efb4c13dc2aec7"));
Arrays.asList("e10c49fcf9a128745c2b050a52798e58"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("a2c8e83f37cd1e114b42af4b873f57bc"));
Arrays.asList("903af514f70db9238064da311c4ea0de"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}

View File

@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("150b31ba05113ca1996b548be5170d6d"));
Arrays.asList("c4248f02103e37e89b0f22c0d9c98492"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -279,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("7d0ee85cd89f4addd84c5511daaaa5c5"));
Arrays.asList("96c7862d55e933b274cabe45c9c443d9"));
executeTest(String.format("test calling with BAQ"), spec);
}

View File

@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("ec0977e3fd3e2ac29c9821f0ca830455"));
Arrays.asList("710d379607129935b1b7b6960ca7b213"));
executeTest("test MultiSample Pilot1", spec);
}

View File

@ -130,4 +130,12 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest {
final boolean result = PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd);
Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && read.getSoftStart() + readLength > refWindowEnd);
}
@Test
public void clipDownstreamAtBorderTest() {
    // A 10M read starting at position 5 -- presumably spans reference positions 5-14 inclusive (TODO confirm
    // against ArtificialSAMUtils.createArtificialRead's start/length semantics).
    final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 5, 10);
    read.setCigarString("10M");
    // Use assertTrue/assertFalse instead of assertEquals(result, true/false) -- clearer intent, same check.
    // Read extends past a window ending at 13, so downstream clipping is required...
    Assert.assertTrue(PairHMMIndelErrorModel.mustClipDownstream(read, 13));
    // ...but is fully covered by a window ending at 14, so no clipping is needed.
    Assert.assertFalse(PairHMMIndelErrorModel.mustClipDownstream(read, 14));
}
}

View File

@ -0,0 +1,83 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.rnaseq;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;
import java.util.Arrays;
/**
* Integration tests for the SplitNCigarReads walker.
*
* User: ami
* Date: 12/5/13
* Time: 1:04 PM
*/
public class SplitNCigarReadsIntegrationTests extends WalkerTest {

@Test
// Input BAM contains reads without N's, with N's, and with both N's and I's (insertions) in their cigars.
// Expected output is pinned by the md5 of the split BAM.
public void testSplitWithInsertions() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T SplitNCigarReads -R " + BaseTest.b37KGReference + " -I " + BaseTest.privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withI.bam -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("037c72fe1572efb63cccbe0a8dda3cb1"));
executeTest("test split N cigar reads with insertions", spec);
}

@Test
// Input BAM contains reads without N's, with N's, and with both N's and D's (deletions),
// including cigars with more than one N element.
public void testSplitWithDeletions() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T SplitNCigarReads -R " + BaseTest.b37KGReference + " -I " + BaseTest.privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withD.bam -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("8472005c16353715025353d6d453faf4"));
executeTest("test split N cigar reads with deletions", spec);
}
}

View File

@ -44,189 +44,139 @@
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
package org.broadinstitute.sting.gatk.walkers.rnaseq;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.walkers.rnaseq.SplitNCigarReads;
import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.HashMap;
import java.util.Map;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
/**
* Given two BAMs with different read groups, it compares them based on ReduceReads metrics.
* <p>
* This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files.
* </p>
* <h3>Input</h3>
* <p>
* Two BAM files (using -I) with different read group IDs
* </p>
* <h3>Output</h3>
* <p>
* [Output description]
* </p>
* <h3>Examples</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author carneiro
* @since 10/30/11
* Tests all possible (and valid) cigar strings that might contain any cigar elements. It reuses code that was written to test the ReadClipper walker.
* For valid cigar strings of length 8 there are a few thousand options, with N in every possible position and with more than one N (for example 1M1N1M1N1M1N2M).
* The cigarElements array is used to provide all the possible cigar element that might be included.
*
* User: ami
* Date: 11/14/13
* Time: 6:49 PM
*/
public class SplitNCigarReadsUnitTest {
final static CigarElement[] cigarElements = {
new CigarElement(1, CigarOperator.HARD_CLIP),
new CigarElement(1, CigarOperator.SOFT_CLIP),
new CigarElement(1, CigarOperator.INSERTION),
new CigarElement(1, CigarOperator.DELETION),
new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH),
new CigarElement(1, CigarOperator.SKIPPED_REGION)
};
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public class CompareBAM extends LocusWalker<Map<CompareBAM.TestName, Boolean>, CompareBAM.TestResults> {
@Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID;
@Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false;
@Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false;
@Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20;
@Argument(required = false, shortName = "mmq", fullName = "min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20;
@Test(enabled = true)
public void splitReadAtN() {
final int cigarStringLength = 10;
final List<Cigar> cigarList = ReadClipperTestUtils.generateCigarList(cigarStringLength,cigarElements);
// For Debugging use those lines (instead of above cigarList) to create specific read:
//------------------------------------------------------------------------------------
// final GATKSAMRecord tmpRead = GATKSAMRecord.createRandomRead(6);
// tmpRead.setCigarString("1M1N1M");
// final List<Cigar> cigarList = new ArrayList<>();
// cigarList.add(tmpRead.getCigar());
for(Cigar cigar: cigarList){
// NOTE(review): this method body appears to be corrupted by a bad merge -- it
// references `cigar` (and derives `numOfSplits` from it), which belongs to
// SplitNCigarReadsUnitTest.splitReadAtN and is not in scope here, and the
// trailing `if` makes the return unconditional-looking. TODO: restore the
// original CompareBAM.map body before relying on this walker.
@Override
public Map<TestName, Boolean> map (RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
Map<TestName, Boolean> result = new HashMap<TestName, Boolean>();
final int numOfSplits = numOfNElements(cigar.getCigarElements());
if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context));
if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context));
if(numOfSplits != 0 && isCigarDoesNotHaveEmptyRegionsBetweenNs(cigar)){
return result;
}
@Override
public TestResults reduceInit () {
    // Start with an empty accumulator and register an outcome slot for each
    // enabled test so that reduce() can tally pass/fail counts into it.
    final TestResults initialResults = new TestResults();
    if (TEST_BASE_COUNTS) initialResults.createTest(TestName.BASE_COUNTS);
    if (TEST_EQUAL_BASES) initialResults.createTest(TestName.EQUAL_BASES);
    return initialResults;
}
@Override
public TestResults reduce (Map<TestName,Boolean> mapResult, TestResults sum) {
    // Tally each per-locus test outcome into the running totals. Iterating the
    // entry set avoids a second map lookup per test (vs. keySet() + get()).
    for (final Map.Entry<TestName, Boolean> entry : mapResult.entrySet()) {
        if (entry.getValue())
            sum.reportSuccess(entry.getKey());
        else
            sum.reportFailed(entry.getKey());
    }
    return sum;
}
// Prints the accumulated PASS/FAIL summary table once the traversal finishes.
public void onTraversalDone (TestResults finalResults) {
finalResults.report();
}
// NOTE(review): stub -- always reports success. The actual "equal bases" check
// (that bases marked '=' match the reference) is presumably not implemented yet; confirm.
private boolean testEqualBases (ReferenceContext ref, AlignmentContext context) {
return true;
}
// NOTE(review): stub -- always reports success. The actual base-count comparison
// between the reduced and full read groups is presumably not implemented yet; confirm.
private boolean testBaseCounts (ReferenceContext ref, AlignmentContext context) {
return true;
}
/**
 * Identifiers for the per-locus consistency tests this walker can run, each
 * carrying the human-readable name used in the final report.
 */
public enum TestName {
    EQUAL_BASES ("testEqualBases"),
    BASE_COUNTS ("testBaseCounts");

    // Display name printed in the summary table; immutable after construction.
    private final String testName;

    TestName(String testName) {
        this.testName = testName;
    }

    /** @return the human-readable name of this test */
    public String getTestName() {
        return testName;
    }
}
/**
 * Accumulates pass/fail counts per registered test and prints a summary
 * table at the end of the traversal.
 */
public class TestResults {
    // One outcome tally per registered test.
    private final Map<TestName, TestOutcome> testStats = new HashMap<TestName, TestOutcome>();

    /** Registers a test so results can subsequently be reported against it. */
    public void createTest (TestName test) {
        testStats.put(test, new TestOutcome());
    }

    /** Records one passing observation for the given (previously registered) test. */
    public void reportSuccess(TestName test) {
        outcomeFor(test).incPassed();
    }

    /** Records one failing observation for the given (previously registered) test. */
    public void reportFailed(TestName test) {
        outcomeFor(test).incFailed();
    }

    /** Prints a simple PASS/FAIL table, one row per registered test. */
    public void report() {
        System.out.println();
        System.out.println(String.format("%20s\tPASS\tFAIL", ""));
        for (TestName test : testStats.keySet())
            System.out.println(String.format("%20s\t%d\t%d", test.getTestName(), testStats.get(test).getPassed(), testStats.get(test).getFailed()));
        System.out.println();
    }

    // Looks up the tally for a test, failing loudly if it was never registered.
    private TestOutcome outcomeFor(TestName test) {
        final TestOutcome outcome = testStats.get(test);
        if (outcome == null)
            throw new ReviewedStingException("No such test: " + test);
        return outcome;
    }
}
/** Mutable pass/fail tallies for a single test. */
private class TestOutcome {
    private long passed;
    private long failed;

    /** Adds one passing observation. */
    public void incPassed() {
        passed += 1;
    }

    /** Adds one failing observation. */
    public void incFailed() {
        failed += 1;
    }

    /** @return number of passing observations recorded so far */
    public long getPassed() {
        return passed;
    }

    /** @return number of failing observations recorded so far */
    public long getFailed() {
        return failed;
    }
}
// Base counts restricted to bases meeting the configured quality thresholds.
private BaseCounts getFilteredBaseCounts(AlignmentContext context) {
return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL);
}
// Base counts with essentially no filtering (base qual >= 3, any mapping qual).
// NOTE(review): 3 and 0 are magic numbers -- presumably 3 excludes only the
// lowest-confidence base calls; confirm and consider naming these constants.
private BaseCounts getFullBaseCounts(AlignmentContext context) {
return getBaseCounts(context, 3, 0);
}
// NOTE(review): this method body appears to be corrupted by a bad merge -- the
// inner loop mixes CompareBAM's per-read-group base counting with a chunk of
// SplitNCigarReadsUnitTest.splitReadAtN (the GATKSAMRecord/splitReads section
// below references `cigar`, `numOfSplits`, `consecutiveNonNElements`, etc.,
// which belong to the unit test and are not in scope here).
// TODO: restore the original getBaseCounts body before relying on this walker.
private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) {
BaseCounts fullBaseCounts = new BaseCounts();
for (String rg : context.getBasePileup().getReadGroups()) {
if (!rg.equals(reducedReadGroupID)) {
BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts());
fullBaseCounts.add(b);
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
List<GATKSAMRecord> splitReads = SplitNCigarReads.splitNCigarRead(read);
final int expectedReads = numOfSplits+1;
Assert.assertEquals(splitReads.size(),expectedReads,"wrong number of reads after split read with cigar: "+cigar+" at Ns [expected]: "+expectedReads+" [actual value]: "+splitReads.size());
final List<Integer> readLengths = consecutiveNonNElements(read.getCigar().getCigarElements());
int index = 0;
int offsetFromStart = 0;
for(GATKSAMRecord splitRead: splitReads){
int expectedLength = readLengths.get(index);
Assert.assertTrue(splitRead.getReadLength() == expectedLength,
"the "+index+" (starting with 0) split read has a wrong length.\n" +
"cigar of original read: "+cigar+"\n"+
"expected length: "+expectedLength+"\n"+
"actual length: "+splitRead.getReadLength()+"\n");
assertBases(splitRead.getReadBases(), read.getReadBases(), offsetFromStart);
index++;
offsetFromStart += expectedLength;
}
}
}
return fullBaseCounts;
}
/**
 * Counts the N (skipped-region) elements among the given cigar elements.
 *
 * @param cigarElements cigar elements to scan
 * @return the number of elements whose operator is SKIPPED_REGION
 */
private int numOfNElements(final List<CigarElement> cigarElements){
    int count = 0;
    for (final CigarElement element : cigarElements) {
        if (element.getOperator() == CigarOperator.SKIPPED_REGION)
            count++;
    }
    return count;
}
/**
 * Checks that no N (skipped-region) element is "empty-adjacent": every N must be
 * preceded by at least one M or S element since the previous N (or the start),
 * and the cigar must contain an M or S after the final N (or at all).
 *
 * @param cigar the cigar to inspect
 * @return true iff no N abuts the start, another N, or the end without an
 *         intervening M/S element
 */
private static boolean isCigarDoesNotHaveEmptyRegionsBetweenNs(final Cigar cigar) {
    boolean seenMatchOrSoftClip = false;
    for (final CigarElement element : cigar.getCigarElements()) {
        final CigarOperator op = element.getOperator();
        if (op == CigarOperator.SKIPPED_REGION) {
            if (!seenMatchOrSoftClip)
                return false; // an N with no M/S since the last N (or the start)
            seenMatchOrSoftClip = false;
        } else if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.SOFT_CLIP) {
            seenMatchOrSoftClip = true;
        }
    }
    // the cigar must also end with an M/S after the final N (or contain one at all)
    return seenMatchOrSoftClip;
}
/**
 * Computes the lengths of the read segments between Ns: sums the lengths of
 * runs of read-consuming elements (M, S, I), emitting each accumulated sum
 * when an N (skipped region) terminates a non-empty run, plus the final run.
 *
 * @param cigarElements cigar elements to scan
 * @return segment lengths in order of appearance
 */
private List<Integer> consecutiveNonNElements(final List<CigarElement> cigarElements){
    final LinkedList<Integer> segmentLengths = new LinkedList<>();
    int runLength = 0;
    for (final CigarElement element : cigarElements) {
        switch (element.getOperator()) {
            case M:
            case S:
            case I:
                runLength += element.getLength();
                break;
            case N:
                if (runLength > 0) {
                    segmentLengths.addLast(runLength);
                    runLength = 0;
                }
                break;
            default:
                break; // D, H, P neither consume read bases nor end a segment
        }
    }
    if (runLength > 0)
        segmentLengths.addLast(runLength);
    return segmentLengths;
}
/**
 * Asserts that actualBase equals the slice of expectedBase beginning at startIndex.
 *
 * @param actualBase bases of a split read
 * @param expectedBase bases of the original (unsplit) read
 * @param startIndex offset into expectedBase where actualBase should begin
 */
private void assertBases(final byte[] actualBase, final byte[] expectedBase, final int startIndex) {
    for (int offset = 0; offset < actualBase.length; offset++) {
        Assert.assertEquals(actualBase[offset], expectedBase[startIndex + offset],
                "unmatched bases between: "+ Arrays.toString(actualBase)+"\nand:\n"+Arrays.toString(expectedBase)+"\nat position: "+offset);
    }
}
}

View File

@ -142,4 +142,39 @@ public class VariantDataManagerUnitTest extends BaseTest {
Assert.assertTrue( trainingData.size() == MAX_NUM_TRAINING_DATA );
}
@Test
public final void testDropAggregateData() {
    // Verifies that dropAggregateData() removes every aggregate datum while
    // leaving the (much larger) set of non-aggregate training data behind.
    final int MAX_NUM_TRAINING_DATA = 5000;
    final double passingQual = 400.0;

    final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
    VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA;
    final VariantDataManager vdm = new VariantDataManager(new ArrayList<String>(), VRAC);

    final List<VariantDatum> theData = new ArrayList<>();
    // 10x the training-data cap of non-aggregate, training-site data ...
    for (int i = 0; i < MAX_NUM_TRAINING_DATA * 10; i++)
        theData.add(makeDatum(true, false, passingQual));
    // ... plus 2x the cap of aggregate (non-training) data
    for (int i = 0; i < MAX_NUM_TRAINING_DATA * 2; i++)
        theData.add(makeDatum(false, true, passingQual));

    vdm.setData(theData);
    vdm.dropAggregateData();

    // no aggregate datum may survive the drop
    for (final VariantDatum datum : vdm.getData())
        Assert.assertFalse(datum.isAggregate);
}

// Builds a non-STD-failing datum with the given training/aggregate flags and quality.
private VariantDatum makeDatum(final boolean atTrainingSite, final boolean isAggregate, final double qual) {
    final VariantDatum datum = new VariantDatum();
    datum.atTrainingSite = atTrainingSite;
    datum.isAggregate = isAggregate;
    datum.failingSTDThreshold = false;
    datum.originalQual = qual;
    return datum;
}
}

View File

@ -62,9 +62,11 @@ import java.util.List;
public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
private static class VRTest {
String inVCF;
String aggregateVCF;
String tranchesMD5;
String recalMD5;
String cutVCFMD5;
public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) {
this.inVCF = inVCF;
this.tranchesMD5 = tranchesMD5;
@ -72,6 +74,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
this.cutVCFMD5 = cutVCFMD5;
}
public VRTest(String inVCF, String aggregateVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) {
this.inVCF = inVCF;
this.aggregateVCF = aggregateVCF;
this.tranchesMD5 = tranchesMD5;
this.recalMD5 = recalMD5;
this.cutVCFMD5 = cutVCFMD5;
}
@Override
public String toString() {
return "VRTest{inVCF='" + inVCF +"'}";
@ -83,10 +93,20 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
"73c7897441622c9b37376eb4f071c560", // recal file
"11a28df79b92229bd317ac49a3ed0fa1"); // cut VCF
VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf",
"ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches
"1b33c10be7d8bf8e9accd11113835262", // recal file
"4700d52a06f2ef3a5882719b86911e51"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
return new Object[][]{ {lowPass} };
//return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here
}
@DataProvider(name = "VRAggregateTest")
public Object[][] createData2() {
return new Object[][]{ {lowPassPlusExomes} };
}
@Test(dataProvider = "VRTest")
@ -125,6 +145,43 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
executeTest("testApplyRecalibration-"+params.inVCF, spec);
}
// Runs VariantRecalibrator with an -aggregate VCF (extra calls used only to
// build the recalibration model, not themselves recalibrated) and checks the
// recal/tranches output files against their expected MD5s.
@Test(dataProvider = "VRAggregateTest")
public void testVariantRecalibratorAggregate(VRTest params) {
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -aggregate " + params.aggregateVCF +
" -L 20:1,000,000-40,000,000" +
" --no_cmdline_in_header" +
" -an QD -an HaplotypeScore -an MQ" +
" --trustAllPolymorphic" + // for speed
" -recalFile %s" +
" -tranchesFile %s",
Arrays.asList(params.recalMD5, params.tranchesMD5));
executeTest("testVariantRecalibratorAggregate-"+params.inVCF, spec).getFirst();
}
// Applies the recalibration produced by testVariantRecalibratorAggregate (note
// the explicit dependsOnMethods ordering) and checks the cut VCF's MD5 over a
// narrower interval than the one used to build the model.
@Test(dataProvider = "VRAggregateTest",dependsOnMethods="testVariantRecalibratorAggregate")
public void testApplyRecalibrationAggregate(VRTest params) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -T ApplyRecalibration" +
" -L 20:12,000,000-30,000,000" +
" --no_cmdline_in_header" +
" -input " + params.inVCF +
" -U LENIENT_VCF_PROCESSING -o %s" +
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null),
Arrays.asList(params.cutVCFMD5));
spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles
executeTest("testApplyRecalibrationAggregate-"+params.inVCF, spec);
}
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"3ad7f55fb3b072f373cbce0b32b66df4", // tranches
"e747c08131d58d9a4800720f6ca80e0c", // recal file
@ -133,7 +190,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@DataProvider(name = "VRBCFTest")
public Object[][] createVRBCFTest() {
return new Object[][]{ {bcfTest} };
//return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here
}
@Test(dataProvider = "VRBCFTest")

View File

@ -0,0 +1,72 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
 * Integration tests for the CombineReferenceCalculationVariants walker, which
 * merges single-sample reference-model (gVCF-style) callsets.
 */
public class CombineReferenceCalculationVariantsIntegrationTest extends WalkerTest {
// Builds the common command line: walker, clean header, default interval, output, reference, plus extra args.
private static String baseTestString(String args, String ref) {
return "-T CombineReferenceCalculationVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args;
}
// TODO -- enable this test (and create others) once the Haplotype Caller produces appropriate gVCFs (with <NON-REF> for every record)
@Test(enabled = false)
public void combineSingleSamplePipelineGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-10,001,000", b37KGReference),
1,
Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
}

View File

@ -185,19 +185,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); }
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); }
@Test public void combineSingleSamplePipelineGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -multipleAllelesMergeType MIX_TYPES" +
" --excludeNonVariants -combineAnnotations -setKey null" +
" -L 20:10,000,000-10,001,000", b37KGReference),
1,
Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a"));
cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true);
}
@Test
public void combineDBSNPDuplicateSites() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -170,7 +170,36 @@ public class RefMetaDataTracker {
@Requires({"type != null", "onlyAtThisLoc != null"})
public <T extends Feature> T getFirstValue(final Class<T> type, final GenomeLoc onlyAtThisLoc) {
return safeGetFirst(getValues(type, onlyAtThisLoc));
}
/**
 * For each rod binding, fetches one value, preferring a record that starts at
 * {@code prioritizeThisLoc} when one is available and otherwise falling back
 * to any record from that binding (same lookup as
 * {@link #getFirstValue(RodBinding, GenomeLoc)} plus the fallback).
 *
 * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched
 * @param <T> The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
 * @param prioritizeThisLoc Features starting at this site are preferred; if a binding has none, any of its Features may be used
 * @return A freshly allocated list with at most one Feature per binding, or an empty list if none are bound.
 */
@Requires({"rodBindings != null", "prioritizeThisLoc != null"})
@Ensures("result != null")
public <T extends Feature> List<T> getPrioritizedValue(final Collection<RodBinding<T>> rodBindings, final GenomeLoc prioritizeThisLoc) {
final List<T> results = new ArrayList<>();
for ( final RodBinding<T> rodBinding : rodBindings ) {
// if there's a value at the prioritized location, take it
T value = getFirstValue(rodBinding, prioritizeThisLoc);
// otherwise, grab any one
if ( value == null )
value = getFirstValue(rodBinding);
// add if not null
if ( value != null )
results.add(value);
}
return results;
}
/**

View File

@ -26,8 +26,8 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@ -48,39 +48,53 @@ import java.io.PrintStream;
import java.util.Arrays;
/**
* Read clipping statistics for all reads.
*
* Walks over the input reads, printing out statistics about the read length, number of clipping events, and length
* of the clipping to the output stream.
*
* Note: Ignores N's in the Cigar string.
*
* <h3>Input</h3>
* One or more BAM files
*
* <h3>Output</h3>
* A simple tabulated text file with read length and clipping statistics for every read (or every N reads if the "skip"
* option is used)
*
* User: depristo
* Date: May 5, 2010
* Time: 12:16:41 PM
*/
/**
* Walks over the input reads, printing out statistics about the read length, number of clipping events, and length
* of the clipping to the output stream.
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@Requires({DataSource.READS})
public class ReadClippingStats extends ReadWalker<ReadClippingStats.ReadClippingInfo,Integer> {
@Output
protected PrintStream out;
@Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+
"on mapped reads only, while unmapped reads will be discarded", required=false)
protected boolean MAPPED_ONLY = true;
/**
* When this flag is set, statistics will be collected on unmapped reads as well. The default behavior
* is to ignore unmapped reads.
*/
@Argument(fullName="include_unmapped", shortName="u", doc="Include unmapped reads in the analysis", required=false)
protected boolean INCLUDE_UNMAPPED = false;
@Argument(fullName="skip", shortName="skip", doc="When provided, only every skip reads are analyzed", required=false)
/**
* print every read whose read number is divisible by SKIP. READ_NUMBER % SKIP == 0. First read in the file has read number = 1,
* second is 2, third is 3, ... A value of 1 means print every read. A value of 1000 means print every 1000th read.
*/
@Advanced
@Argument(fullName="skip", shortName="skip", doc="Do not print all reads, skip some.", required=false)
protected int SKIP = 1;
// public void initialize() {
//
// }
public class ReadClippingInfo {
SAMReadGroupRecord rg;
int readLength, nClippingEvents, nClippedBases;
}
public ReadClippingInfo map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
if ( AlignmentUtils.isReadUnmapped(read) && MAPPED_ONLY)
if ( AlignmentUtils.isReadUnmapped(read) && !INCLUDE_UNMAPPED)
return null;
ReadClippingInfo info = new ReadClippingInfo();
@ -89,24 +103,21 @@ public class ReadClippingStats extends ReadWalker<ReadClippingStats.ReadClipping
if ( info.rg == null ) throw new UserException.ReadMissingReadGroup(read);
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
if ( elt.getOperator() != CigarOperator.N )
switch ( elt.getOperator()) {
case H : // ignore hard clips
case S : // soft clip
info.nClippingEvents++;
info.nClippedBases += elt.getLength();
// note the fall through here
break;
case M :
case D : // deletion w.r.t. the reference
case P : // ignore pads
case I : // insertion w.r.t. the reference
info.readLength += elt.getLength(); // Unless we have a reference skip, the read gets longer
break;
case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
break;
default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + elt.getOperator());
}
info.readLength = read.getReadLength();
}
return info; //To change body of implemented methods use File | Settings | File Templates.
@ -137,13 +148,10 @@ public class ReadClippingStats extends ReadWalker<ReadClippingStats.ReadClipping
id, info.readLength, info.nClippingEvents, info.nClippedBases,
100.0 * MathUtils.ratio(info.nClippedBases, info.readLength));
}
return sum + 1; //To change body of implemented methods use File | Settings | File Templates.
return sum + 1;
} else {
return sum;
}
}
public void onTraversalDone(Integer result) {
}
}

View File

@ -191,9 +191,6 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
@Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false)
public boolean MERGE_INFO_WITH_MAX_AC = false;
@Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false)
public boolean COMBINE_ANNOTATIONS = false;
private List<String> priority = null;
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
@ -260,12 +257,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
// get all of the vcf rods at this locus
// Need to provide reference bases to simpleMerge starting at current locus
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
Collection<VariantContext> potentialRefVCs = tracker.getValues(variants);
potentialRefVCs.removeAll(vcs);
if ( sitesOnlyVCF ) {
vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs);
}
if ( ASSUME_IDENTICAL_SAMPLES ) {
@ -307,17 +301,16 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
// make sure that it is a variant or in case it is not, that we want to include the sites with no variants
if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) {
if (VCsByType.containsKey(type)) {
mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs,
priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS));
mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(),
filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
}
}
}
else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs,
priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS));
mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType,
genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
else {
logger.warn("Ignoring all records at site " + ref.getLocus());

View File

@ -362,21 +362,16 @@ public class ClippingOp {
* @return a cloned version of read that has been properly trimmed down
*/
private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) {
final int firstBaseAfterSoftClips = read.getAlignmentStart() - read.getSoftStart();
final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart();
if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math).
return GATKSAMRecord.emptyRead(read);
// If the read is unmapped there is no Cigar string and neither should we create a new cigar string
CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop);
final CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop);
// the cigar may force a shift left or right (or both) in case we are left with insertions
// starting or ending the read after applying the hard clip on start/stop.
int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd;
byte[] newBases = new byte[newLength];
byte[] newQuals = new byte[newLength];
int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart;
final int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd;
final byte[] newBases = new byte[newLength];
final byte[] newQuals = new byte[newLength];
final int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart;
System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength);
System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength);
@ -396,8 +391,8 @@ public class ClippingOp {
hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar));
if (read.hasBaseIndelQualities()) {
byte[] newBaseInsertionQuals = new byte[newLength];
byte[] newBaseDeletionQuals = new byte[newLength];
final byte[] newBaseInsertionQuals = new byte[newLength];
final byte[] newBaseDeletionQuals = new byte[newLength];
System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength);
System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength);
hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION);
@ -516,20 +511,20 @@ public class ClippingOp {
}
/**
* Checks if a hard clipped cigar left a read starting or ending with insertions/deletions
* Checks if a hard clipped cigar left a read starting or ending with deletions or gap (N)
* and cleans it up accordingly.
*
* @param cigar the original cigar
* @return an object with the shifts (see CigarShift class)
*/
private CigarShift cleanHardClippedCigar(Cigar cigar) {
Cigar cleanCigar = new Cigar();
private CigarShift cleanHardClippedCigar(final Cigar cigar) {
final Cigar cleanCigar = new Cigar();
int shiftFromStart = 0;
int shiftFromEnd = 0;
Stack<CigarElement> cigarStack = new Stack<CigarElement>();
Stack<CigarElement> inverseCigarStack = new Stack<CigarElement>();
final Stack<CigarElement> inverseCigarStack = new Stack<CigarElement>();
for (CigarElement cigarElement : cigar.getCigarElements())
for (final CigarElement cigarElement : cigar.getCigarElements())
cigarStack.push(cigarElement);
for (int i = 1; i <= 2; i++) {
@ -542,8 +537,8 @@ public class ClippingOp {
CigarElement cigarElement = cigarStack.pop();
if (!readHasStarted &&
// cigarElement.getOperator() != CigarOperator.INSERTION &&
cigarElement.getOperator() != CigarOperator.DELETION &&
cigarElement.getOperator() != CigarOperator.SKIPPED_REGION &&
cigarElement.getOperator() != CigarOperator.HARD_CLIP)
readHasStarted = true;
@ -553,6 +548,9 @@ public class ClippingOp {
else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
totalHardClip += cigarElement.getLength();
else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.SKIPPED_REGION)
totalHardClip += cigarElement.getLength();
if (readHasStarted) {
if (i == 1) {
if (!addedHardClips) {

View File

@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.CigarUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -348,20 +349,38 @@ public class ReadClipper {
/**
 * Hard clips the read down to the reference interval [refStart, refStop], based on the
 * read's regular (clipped) alignment coordinates.
 *
 * @param read the read to be clipped
 * @param refStart the beginning of the region (inclusive)
 * @param refStop the end of the region (inclusive)
 * @return the read hard clipped to the region (an empty read if it does not overlap the region)
 */
public static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop ) {
    // delegate to the shared implementation using the regular alignment span
    return hardClipToRegion(read, refStart, refStop, read.getAlignmentStart(), read.getAlignmentEnd());
}
/**
 * Hard clip the read to the variable region (from refStart to refStop) processing also the clipped bases
 *
 * @param read the read to be clipped
 * @param refStart the beginning of the variant region (inclusive)
 * @param refStop the end of the variant region (inclusive)
 * @return the read hard clipped to the variant region
 */
public static GATKSAMRecord hardClipToRegionIncludingClippedBases( final GATKSAMRecord read, final int refStart, final int refStop ) {
    // use the unclipped (original) alignment span so that soft/hard clipped bases count toward the region
    final int unclippedStart = read.getOriginalAlignmentStart();
    final int unclippedStop = unclippedStart + CigarUtils.countRefBasesBasedOnCigar(read, 0, read.getCigarLength()) - 1;
    return hardClipToRegion(read, refStart, refStop, unclippedStart, unclippedStop);
}
private static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop, final int alignmentStart, final int alignmentStop){
// check if the read is contained in region
if (start <= refStop && stop >= refStart) {
if (start < refStart && stop > refStop)
if (alignmentStart <= refStop && alignmentStop >= refStart) {
if (alignmentStart < refStart && alignmentStop > refStop)
return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1);
else if (start < refStart)
else if (alignmentStart < refStart)
return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1);
else if (stop > refStop)
else if (alignmentStop > refStop)
return hardClipByReferenceCoordinatesRightTail(read, refStop + 1);
return read;
} else
return GATKSAMRecord.emptyRead(read);
}
public static List<GATKSAMRecord> hardClipToRegion( final List<GATKSAMRecord> reads, final int refStart, final int refStop ) {
final List<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>( reads.size() );
for( final GATKSAMRecord read : reads ) {

View File

@ -0,0 +1,167 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Stack;
/**
 * Static utility methods for manipulating and validating {@link net.sf.samtools.Cigar} objects.
 */
public class CigarUtils {

    /**
     * Combines equal adjacent elements of a Cigar object (e.g. 1M2M becomes 3M).
     *
     * Note that adjacent insertion/deletion pairs (I followed by D, or D followed by I)
     * are also merged; the merged element keeps the operator of the FIRST element of the run.
     *
     * @param rawCigar the cigar object to combine
     * @return a new cigar object with equal (or adjacent I/D) elements merged
     */
    public static Cigar combineAdjacentCigarElements(final Cigar rawCigar) {
        final Cigar combinedCigar = new Cigar();
        CigarElement lastElement = null;
        int lastElementLength = 0;
        for (final CigarElement cigarElement : rawCigar.getCigarElements()) {
            if (lastElement != null &&
                    ((lastElement.getOperator() == cigarElement.getOperator()) ||
                     (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) ||
                     (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I)))
                // same operator (or an I/D pair): extend the current run
                lastElementLength += cigarElement.getLength();
            else
            {
                // flush the completed run, if any, and start a new one
                if (lastElement != null)
                    combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator()));

                lastElement = cigarElement;
                lastElementLength = cigarElement.getLength();
            }
        }
        // flush the trailing run
        if (lastElement != null)
            combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator()));

        return combinedCigar;
    }

    /**
     * Reverses the order of the elements of a Cigar object (e.g. 2M1I3M becomes 3M1I2M).
     *
     * @param cigar the cigar to invert
     * @return a new cigar with the elements in reverse order
     */
    public static Cigar invertCigar(final Cigar cigar) {
        final Stack<CigarElement> cigarStack = new Stack<CigarElement>();
        for (final CigarElement cigarElement : cigar.getCigarElements())
            cigarStack.push(cigarElement);

        final Cigar invertedCigar = new Cigar();
        while (!cigarStack.isEmpty())
            invertedCigar.add(cigarStack.pop());

        return invertedCigar;
    }

    /**
     * Checks whether or not the read has any cigar element that is not H or S
     *
     * @param read the read
     * @return true if the cigar contains any operator other than SOFT_CLIP or HARD_CLIP
     *         (e.g. M, I, D, N, =, or X), false otherwise
     */
    public static boolean readHasNonClippedBases(final GATKSAMRecord read) {
        for (final CigarElement cigarElement : read.getCigar().getCigarElements())
            if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
                return true;
        return false;
    }

    /**
     * Decodes a cigar string (e.g. "10M2I5M") into a Cigar object.
     *
     * @param cigarString the text representation of the cigar
     * @return the decoded Cigar
     */
    public static Cigar cigarFromString(final String cigarString) {
        return TextCigarCodec.getSingleton().decode(cigarString);
    }

    /**
     * A valid cigar object obeys the following rules:
     *  - No Hard/Soft clips in the middle of the read
     *  - No deletions in the beginning / end of the read
     *  - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
     *  - No consecutive I/D elements
     *
     * Most of the structural rules are delegated to picard's exhaustive Cigar.isValid();
     * on top of that this method rejects cigars that (ignoring clips) start or end with a
     * deletion or a skipped region.
     *
     * @param cigar the cigar to validate
     * @return true if the cigar is valid, false otherwise
     **/
    public static boolean isCigarValid(final Cigar cigar) {
        if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation)

            final Stack<CigarElement> cigarElementStack = new Stack<CigarElement>(); // Stack to invert cigar string to find ending operator
            CigarOperator startingOp = null;
            CigarOperator endingOp = null;

            // check if it doesn't start with deletions
            boolean readHasStarted = false; // search the list of elements for the starting operator
            for (final CigarElement cigarElement : cigar.getCigarElements()) {
                if (!readHasStarted) {
                    if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) {
                        readHasStarted = true;
                        startingOp = cigarElement.getOperator();
                    }
                }
                cigarElementStack.push(cigarElement);
            }

            // pop the stack to find the last non-clip operator
            while (!cigarElementStack.empty()) {
                final CigarElement cigarElement = cigarElementStack.pop();
                if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) {
                    endingOp = cigarElement.getOperator();
                    break;
                }
            }

            if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.SKIPPED_REGION && endingOp != CigarOperator.SKIPPED_REGION)
                return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
        }

        return false;
    }

    /**
     * Counts the number of reference bases spanned by the cigar elements whose index is in
     * the range [cigarStartIndex, cigarEndIndex).
     *
     * Note that soft and hard clips (S/H) are counted too, so the result is the span relative
     * to the unclipped (original) alignment start — see its use with getOriginalAlignmentStart().
     * Insertions consume no reference bases and are skipped.
     *
     * @param read the read whose cigar to inspect
     * @param cigarStartIndex first cigar element index to count (inclusive)
     * @param cigarEndIndex last cigar element index to count (exclusive)
     * @return the number of reference (plus clipped) bases spanned by the selected elements
     * @throws ReviewedStingException if an unsupported cigar operator (e.g. P) is encountered
     */
    public static int countRefBasesBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex) {
        int result = 0;
        final Cigar cigar = read.getCigar(); // hoisted out of the loop: the cigar does not change while counting
        for (int i = cigarStartIndex; i < cigarEndIndex; i++) {
            final CigarElement cigarElement = cigar.getCigarElement(i);
            switch (cigarElement.getOperator()) {
                case M:
                case S:
                case D:
                case N:
                case H:
                    result += cigarElement.getLength();
                    break;
                case I:
                    break; // insertions do not consume reference bases
                default:
                    throw new ReviewedStingException("Unsupported cigar operator: " + cigarElement.getOperator());
            }
        }
        return result;
    }
}

View File

@ -457,21 +457,22 @@ public class ReadUtils {
/**
* Returns the read coordinate corresponding to the requested reference coordinate.
*
* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function
* will return the last read base before the deletion. This function returns a
* Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with
* a deletion.
* WARNING: if the requested reference coordinate happens to fall inside or just before a deletion (or skipped region) in the read, this function
* will return the last read base before the deletion (or skipped region). This function returns a
* Pair(int readCoord, boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion) so you can choose which readCoordinate to use when faced with
* a deletion (or skipped region).
*
* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a
* pre-processed result according to normal clipping needs. Or you can use this function and tailor the
* behavior to your needs.
*
* @param read
* @param refCoord
* @param refCoord the requested reference coordinate
* @return the read coordinate corresponding to the requested reference coordinate. (see warning!)
*/
@Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"})
@Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"})
//TODO since we do not have contracts any more, should we check for the requirements in the method code?
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) {
return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false);
}
@ -479,9 +480,11 @@ public class ReadUtils {
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) {
int readBases = 0;
int refBases = 0;
boolean fallsInsideDeletion = false;
boolean fallsInsideDeletionOrSkippedRegion = false;
boolean endJustBeforeDeletionOrSkippedRegion = false;
boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false;
int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
if (goal < 0) {
if (allowGoalNotReached) {
return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
@ -493,7 +496,7 @@ public class ReadUtils {
Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
while (!goalReached && cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
final CigarElement cigarElement = cigarElementIterator.next();
int shift = 0;
if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
@ -511,7 +514,7 @@ public class ReadUtils {
if (goalReached) {
// Is this base's reference position within this cigar element? Or did we use it all?
boolean endsWithinCigar = shift < cigarElement.getLength();
final boolean endsWithinCigar = shift < cigarElement.getLength();
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
// since we checked if the goal coordinate is within the read length, so this is just a sanity check.
@ -523,13 +526,13 @@ public class ReadUtils {
}
}
CigarElement nextCigarElement;
CigarElement nextCigarElement = null;
// if we end inside the current cigar element, we just have to check if it is a deletion
// if we end inside the current cigar element, we just have to check if it is a deletion (or skipped region)
if (endsWithinCigar)
fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;
fallsInsideDeletionOrSkippedRegion = (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) ;
// if we end outside the current cigar element, we need to check if the next element is an insertion or deletion.
// if we end outside the current cigar element, we need to check if the next element is an insertion, deletion or skipped region.
else {
nextCigarElement = cigarElementIterator.next();
@ -547,22 +550,27 @@ public class ReadUtils {
nextCigarElement = cigarElementIterator.next();
}
// if it's a deletion, we will pass the information on to be handled downstream.
fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
// if it's a deletion (or skipped region), we will pass the information on to be handled downstream.
endJustBeforeDeletionOrSkippedRegion = (nextCigarElement.getOperator() == CigarOperator.DELETION || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION);
}
// If we reached our goal outside a deletion, add the shift
if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
fallsInsideOrJustBeforeDeletionOrSkippedRegion = endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion;
// If we reached our goal outside a deletion (or skipped region), add the shift
if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases())
readBases += shift;
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// If we reached our goal just before a deletion (or skipped region) we need
// to add the shift of the current cigar element but go back to it's last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases())
// base before the deletion (or skipped region) (see warning in function contracts)
else if (endJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases())
readBases += shift - 1;
// If we reached our goal inside a deletion then we must backtrack to the last base before the deletion
else if (fallsInsideDeletion && endsWithinCigar)
// If we reached our goal inside a deletion (or skipped region), or just between a deletion and a skipped region,
// then we must backtrack to the last base before the deletion (or skipped region)
else if (fallsInsideDeletionOrSkippedRegion ||
(endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.N)) ||
(endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.D)))
readBases--;
}
}
@ -575,7 +583,7 @@ public class ReadUtils {
}
}
return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
return new Pair<Integer, Boolean>(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion);
}
/**

View File

@ -790,7 +790,6 @@ public class GATKVariantContextUtils {
* @param setKey the key name of the set
* @param filteredAreUncalled are filtered records uncalled?
* @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
* @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d.
* @return new VariantContext representing the merge of unsortedVCs
*/
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
@ -801,10 +800,9 @@ public class GATKVariantContextUtils {
final boolean printMessages,
final String setKey,
final boolean filteredAreUncalled,
final boolean mergeInfoWithMaxAC,
final boolean combineAnnotations ) {
final boolean mergeInfoWithMaxAC ) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
return simpleMerge(unsortedVCs, Collections.<VariantContext>emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations);
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC);
}
/**
@ -817,7 +815,6 @@ public class GATKVariantContextUtils {
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
@ -826,11 +823,9 @@ public class GATKVariantContextUtils {
* @param setKey the key name of the set
* @param filteredAreUncalled are filtered records uncalled?
* @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
* @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d.
* @return new VariantContext representing the merge of unsortedVCs
*/
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
final Collection<VariantContext> potentialRefVCs,
final List<String> priorityListOfVCs,
final int originalNumOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
@ -839,9 +834,7 @@ public class GATKVariantContextUtils {
final boolean printMessages,
final String setKey,
final boolean filteredAreUncalled,
final boolean mergeInfoWithMaxAC,
final boolean combineAnnotations ) {
final boolean mergeInfoWithMaxAC ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
return null;
@ -860,9 +853,6 @@ public class GATKVariantContextUtils {
VCs.add(vc);
}
// cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible
VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs);
if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
return null;
@ -950,38 +940,19 @@ public class GATKVariantContextUtils {
for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
final String key = p.getKey();
final Object value = p.getValue();
boolean badAnnotation = false;
if ( combineAnnotations ) { // add the annotation values to a list for combining later
List<Comparable> values = annotationMap.get(key);
if( values == null ) {
values = new ArrayList<>();
annotationMap.put(key, values);
}
try {
final String stringValue = value.toString();
// Branch to avoid unintentional, implicit type conversions that occur with the ? operator.
if (stringValue.contains("."))
values.add(Double.parseDouble(stringValue));
else
values.add(Integer.parseInt(stringValue));
} catch (NumberFormatException e) {
badAnnotation = true;
}
}
if ( ! combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC
// if we don't like the key already, don't go anywhere
if ( ! inconsistentAttributes.contains(key) ) {
final boolean alreadyFound = attributes.containsKey(key);
final Object boundValue = attributes.get(key);
final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
// only output annotations that have the same value in every input VC
// if we don't like the key already, don't go anywhere
if ( ! inconsistentAttributes.contains(key) ) {
final boolean alreadyFound = attributes.containsKey(key);
final Object boundValue = attributes.get(key);
final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) {
// we found the value but we're inconsistent, put it in the exclude list
inconsistentAttributes.add(key);
attributes.remove(key);
} else if ( ! alreadyFound || boundIsMissingValue ) { // no value
attributes.put(key, value);
}
if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) {
// we found the value but we're inconsistent, put it in the exclude list
inconsistentAttributes.add(key);
attributes.remove(key);
} else if ( ! alreadyFound || boundIsMissingValue ) { // no value
attributes.put(key, value);
}
}
}
@ -1007,12 +978,6 @@ public class GATKVariantContextUtils {
// take the VC with the maxAC and pull the attributes into a modifiable map
if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) {
attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
} else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided
for ( final Map.Entry<String, List<Comparable>> p : annotationMap.entrySet() ) {
if ( ! p.getValue().isEmpty() ) {
attributes.put(p.getKey(), combineAnnotationValues(p.getValue()));
}
}
}
// if at least one record was unfiltered and we want a union, clear all of the filters
@ -1058,11 +1023,6 @@ public class GATKVariantContextUtils {
builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters));
}
builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes));
if( combineAnnotations ) {
// unfortunately some attributes are just too dangerous to try to combine together
builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY);
builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
}
// Trim the padded bases of all alleles if necessary
final VariantContext merged = builder.make();
@ -1070,69 +1030,202 @@ public class GATKVariantContextUtils {
return merged;
}
private static final Comparable combineAnnotationValues( final List<Comparable> array ) {
private static Comparable combineAnnotationValues( final List<Comparable> array ) {
return MathUtils.median(array); // right now we take the median but other options could be explored
}
/**
* cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible
* @param VCs the list of VCs in which to fill in symbolic alleles
* @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles
* @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate allele
* Merges VariantContexts from gVCFs into a single hybrid.
* Assumes that none of the input records are filtered.
*
* @param VCs collection of unsorted genomic VCs
* @param loc the current location
* @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case
* @return new VariantContext representing the merge of all VCs or null if it not relevant
*/
protected static final List<VariantContext> fillInNonRefSymbolicAlleles( final List<VariantContext> VCs, final Collection<VariantContext> potentialRefVCs ) {
if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); }
if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); }
public static VariantContext referenceConfidenceMerge(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase) {
final List<VariantContext> VCsToReturn = new ArrayList<>(VCs.size());
boolean containsNonRefSymbolicAllele = false;
VariantContext nonRefVC = null;
for( final VariantContext vc : VCs ) {
if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) {
containsNonRefSymbolicAllele = true;
} else if ( nonRefVC == null ) {
nonRefVC = vc;
}
if( nonRefVC != null && containsNonRefSymbolicAllele == true ) {
break; // break out so that we don't run over the whole list unnecessarily
}
if ( VCs == null || VCs.size() == 0 ) throw new IllegalArgumentException("VCs cannot be null or empty");
// establish the baseline info (sometimes from the first VC)
final VariantContext first = VCs.get(0);
final String name = first.getSource();
// ref allele
final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase);
if ( refAllele == null )
return null;
// alt alleles
final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc);
final List<Allele> alleles = getAllelesListFromMapper(refAllele, alleleMapper);
final Map<String, Object> attributes = new LinkedHashMap<>();
final Set<String> inconsistentAttributes = new HashSet<>();
final Set<String> rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
VariantContext longestVC = first;
int depth = 0;
final Map<String, List<Comparable>> annotationMap = new LinkedHashMap<>();
GenotypesContext genotypes = GenotypesContext.create();
// cycle through and add info from the other VCs
for ( final VariantContext vc : VCs ) {
// if this context doesn't start at the current location then it must be a spanning event (deletion or ref block)
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
final List<Allele> remappedAlleles = isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles());
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) )
depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
if ( isSpanningEvent )
continue;
// keep track of the longest location that starts here
if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) )
longestVC = vc;
// special case ID (just preserve it)
if ( vc.hasID() ) rsIDs.add(vc.getID());
// add attributes
addReferenceConfidenceAttributes(vc.getAttributes(), attributes, inconsistentAttributes, annotationMap);
}
for( final VariantContext vc : potentialRefVCs ) {
if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) {
containsNonRefSymbolicAllele = true;
VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below
// when combining annotations use the median value from all input VCs which had annotations provided
for ( final Map.Entry<String, List<Comparable>> p : annotationMap.entrySet() ) {
if ( ! p.getValue().isEmpty() ) {
attributes.put(p.getKey(), combineAnnotationValues(p.getValue()));
}
}
if( !containsNonRefSymbolicAllele ) {
return VCs;
}
if ( depth > 0 )
attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));
for( final VariantContext vc : VCs ) {
if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site
if( nonRefVC != null ) {
final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size());
int depth = 0;
for( final String sample : vc.getSampleNames() ) {
final Genotype gt = vc.getGenotype(sample);
final ArrayList<Allele> refAlleles = new ArrayList<>(2);
refAlleles.add(nonRefVC.getReference());
refAlleles.add(nonRefVC.getReference());
final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now
depth += ( gt.hasDP() ? gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here
genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make());
}
VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make());
}
} else {
VCsToReturn.add(vc);
}
}
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
return VCsToReturn;
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles)
.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later
// remove stale AC and AF based attributes
removeStaleAttributesAfterMerge(builder);
return builder.make();
}
private static final boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
/**
 * Determines the ref allele given the provided reference base at this position
 *
 * @param VCs collection of unsorted genomic VCs
 * @param loc the current location
 * @param refBase the reference base to fall back on when no context provides a reference allele; may be null
 * @return the reference Allele, or null if none could be determined
 */
private static Allele determineReferenceAlleleGiveReferenceBase(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase) {
    final Allele candidate = determineReferenceAllele(VCs, loc);
    if ( candidate != null )
        return candidate;
    // no context supplied a reference allele; fall back on the provided base if we have one
    return refBase == null ? null : Allele.create(refBase, true);
}
/**
 * Creates an alleles list given a reference allele and a mapper.
 *
 * The reference allele is always placed first, followed by every distinct mapped allele.
 *
 * @param refAllele    the reference allele
 * @param alleleMapper the allele mapper
 * @return a non-null, non-empty list of Alleles
 */
private static List<Allele> getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) {
    final List<Allele> result = new ArrayList<>();
    result.add(refAllele);
    for ( final Allele mapped : alleleMapper.getUniqueMappedAlleles() )
        result.add(mapped);
    return result;
}
/**
 * Remove the stale attributes from the merged VariantContext (builder).
 *
 * The allele-count/frequency/number keys (and their MLE variants) are no longer valid
 * after a merge and must be recomputed, so they are dropped here.
 *
 * @param builder the VC builder
 */
private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) {
    final List<String> staleKeys = Arrays.asList(
            VCFConstants.ALLELE_COUNT_KEY,
            VCFConstants.ALLELE_FREQUENCY_KEY,
            VCFConstants.ALLELE_NUMBER_KEY,
            VCFConstants.MLE_ALLELE_COUNT_KEY,
            VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
    builder.rmAttributes(staleKeys);
}
/**
 * Adds attributes to the global map from the new context in a sophisticated manner.
 *
 * Numeric attribute values are parsed and accumulated per key in annotationMap for
 * combining later; non-numeric values are only retained globally when every input VC
 * agrees on the value (otherwise the key is blacklisted via inconsistentAttributes).
 *
 * @param myAttributes           attributes to add from
 * @param globalAttributes       global set of attributes to add to
 * @param inconsistentAttributes set of attributes that are inconsistent among samples
 * @param annotationMap          map of annotations for combining later
 */
private static void addReferenceConfidenceAttributes(final Map<String, Object> myAttributes,
                                                     final Map<String, Object> globalAttributes,
                                                     final Set<String> inconsistentAttributes,
                                                     final Map<String, List<Comparable>> annotationMap) {
    for ( final Map.Entry<String, Object> entry : myAttributes.entrySet() ) {
        final String key = entry.getKey();
        final Object value = entry.getValue();

        // fetch (or lazily create) the per-key list used to combine annotation values later
        List<Comparable> parsedValues = annotationMap.get(key);
        if ( parsedValues == null ) {
            parsedValues = new ArrayList<>();
            annotationMap.put(key, parsedValues);
        }

        boolean isNumeric = true;
        try {
            final String stringValue = value.toString();
            // branch explicitly so we never rely on the implicit numeric promotion of the ? operator
            if ( stringValue.contains(".") )
                parsedValues.add(Double.parseDouble(stringValue));
            else
                parsedValues.add(Integer.parseInt(stringValue));
        } catch ( final NumberFormatException e ) {
            isNumeric = false;
        }

        // non-numeric annotations must have the same value in every input VC to be output
        if ( !isNumeric && !inconsistentAttributes.contains(key) )
            checkForConsistency(key, value, globalAttributes, inconsistentAttributes);
    }
}
/**
 * Check attributes for consistency to others in the merge.
 *
 * A key seen for the first time (or previously bound to the VCF missing value) is
 * recorded; a key bound to a different concrete value is blacklisted and removed.
 *
 * @param key                    the attribute key
 * @param value                  the attribute value
 * @param globalAttributes       the global list of attributes being merged
 * @param inconsistentAttributes the list of inconsistent attributes in the merge
 */
private static void checkForConsistency(final String key,
                                        final Object value,
                                        final Map<String, Object> globalAttributes,
                                        final Set<String> inconsistentAttributes) {
    if ( !globalAttributes.containsKey(key) ) {
        // first occurrence of this key: just record its value
        globalAttributes.put(key, value);
        return;
    }

    final Object boundValue = globalAttributes.get(key);
    if ( boundValue.equals(VCFConstants.MISSING_VALUE_v4) ) {
        // a missing value never conflicts; replace it with the concrete one
        globalAttributes.put(key, value);
    } else if ( !boundValue.equals(value) ) {
        // conflicting concrete values: blacklist the key and drop it from the merge
        inconsistentAttributes.add(key);
        globalAttributes.remove(key);
    }
}
private static boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
final Iterator<Allele> it1 = alleleSet1.iterator();
final Iterator<Allele> it2 = alleleSet2.iterator();
@ -1262,65 +1355,130 @@ public class GATKVariantContextUtils {
return builder.make();
}
static private Allele determineReferenceAllele(List<VariantContext> VCs) {
static private Allele determineReferenceAllele(final List<VariantContext> VCs) {
return determineReferenceAllele(VCs, null);
}
/**
 * Determines the common reference allele
 *
 * @param VCs the list of VariantContexts
 * @param loc if not null, ignore records that do not begin at this start location
 * @return possibly null Allele
 */
static private Allele determineReferenceAllele(final List<VariantContext> VCs, final GenomeLoc loc) {
    Allele ref = null;

    for ( final VariantContext vc : VCs ) {
        if ( contextMatchesLoc(vc, loc) ) {
            final Allele myRef = vc.getReference();
            if ( ref == null || ref.length() < myRef.length() )
                ref = myRef;   // keep the longest reference allele seen so far
            else if ( ref.length() == myRef.length() && ! ref.equals(myRef) )
                throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef));
        }
    }

    return ref;
}
static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) {
static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) {
return loc == null || loc.getStart() == vc.getStart();
}
/**
 * Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts.
 *
 * @param VCs       the list of VariantContexts
 * @param refAllele the reference allele
 * @param loc       if not null, ignore records that do not begin at this start location
 * @return non-null AlleleMapper
 */
static private AlleleMapper determineAlternateAlleleMapping(final List<VariantContext> VCs, final Allele refAllele, final GenomeLoc loc) {
    final Map<Allele, Allele> alleleMap = new HashMap<>();
    for ( final VariantContext context : VCs ) {
        if ( !contextMatchesLoc(context, loc) )
            continue;   // skip records that do not start at the requested location
        addAllAlternateAllelesToMap(context, refAllele, alleleMap);
    }
    return new AlleleMapper(alleleMap);
}
/**
 * Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper).
 *
 * @param vc        the VariantContext
 * @param refAllele the reference allele
 * @param map       the allele mapping to populate
 */
static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map<Allele, Allele> map) {
    if ( ! refAllele.equals(vc.getReference()) ) {
        // the references differ, so each alt must be extended to match the longer reference
        map.putAll(createAlleleMapping(refAllele, vc, map.values()));
        return;
    }

    // references match: alts map to themselves; symbolic alleles are ignored
    for ( final Allele altAllele : vc.getAlternateAlleles() ) {
        if ( altAllele.isSymbolic() )
            continue;
        map.put(altAllele, altAllele);
    }
}
/**
 * Resolves this context's alleles against the longest reference allele seen at this start site.
 *
 * If the context already uses refAllele its alleles map to themselves; otherwise the
 * reference and each alternate allele are extended (via createAlleleMapping) so that
 * myRef => refAllele.
 *
 * @param refAllele  the longest reference allele seen at this start site
 * @param vc         the context whose alleles need remapping
 * @param allAlleles the alleles already created, used to canonicalize equal extended alleles
 * @return a non-null AlleleMapper
 */
static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set<Allele> allAlleles) {
    if ( refAllele.equals(vc.getReference()) )
        return new AlleleMapper(vc);
    else {
        final Map<Allele, Allele> map = createAlleleMapping(refAllele, vc, allAlleles);
        map.put(vc.getReference(), refAllele);
        return new AlleleMapper(map);
    }
}
/**
 * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele
 *
 * The refAllele is the longest reference allele seen at this start site.
 * So imagine it is:
 *   refAllele: ACGTGA
 *   myRef:     ACGT
 *   myAlt:     A
 *
 * We need to remap all of the alleles in vc to include the extra GA so that
 * myRef => refAllele and myAlt => AGA
 *
 * @param refAllele      the new (extended) reference allele
 * @param oneVC          the Variant Context to extend
 * @param currentAlleles the list of alleles already created
 * @return a non-null mapping of original alleles to new (extended) ones
 */
private static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
                                                       final VariantContext oneVC,
                                                       final Collection<Allele> currentAlleles) {
    final Allele myRef = oneVC.getReference();
    if ( refAllele.length() <= myRef.length() )
        throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele);

    // the bases that refAllele carries beyond the end of this context's reference
    final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());

    final Map<Allele, Allele> map = new HashMap<>();
    for ( final Allele alt : oneVC.getAlternateAlleles() ) {
        if ( ! isUsableAlternateAllele(alt) )
            continue;   // symbolic and reference alleles are not remapped here

        Allele extended = Allele.extend(alt, extraBases);
        // reuse an already-created equal allele so identical alleles share one object
        for ( final Allele existing : currentAlleles ) {
            if ( extended.equals(existing) )
                extended = existing;
        }
        map.put(alt, extended);
    }
    return map;
}
/** @return true iff the allele is a concrete alternate allele (neither reference nor symbolic). */
static private boolean isUsableAlternateAllele(final Allele allele) {
    return !allele.isReference() && !allele.isSymbolic();
}
public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption ) {
if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null )
throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list");
@ -1352,6 +1510,124 @@ public class GATKVariantContextUtils {
}
}
/**
 * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele
 *
 * @param alleles list of alleles to replace
 * @return non-null list of alleles
 */
private static List<Allele> replaceWithNoCalls(final List<Allele> alleles) {
    if ( alleles == null )
        throw new IllegalArgumentException("list of alleles cannot be null");

    final List<Allele> noCalled = new ArrayList<>(alleles.size());
    for ( final Allele allele : alleles ) {
        if ( allele.equals(NON_REF_SYMBOLIC_ALLELE) )
            noCalled.add(allele);           // the generic <ALT> allele survives untouched
        else
            noCalled.add(Allele.NO_CALL);
    }
    return noCalled;
}
/**
 * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles.
 * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set.
 *
 * @param mergedGenotypes the genotypes context to add to
 * @param VC              the Variant Context for the sample
 * @param remappedAlleles the list of remapped alleles for the sample
 * @param targetAlleles   the list of target alleles
 */
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
                                                final VariantContext VC,
                                                final List<Allele> remappedAlleles,
                                                final List<Allele> targetAlleles) {
    for ( final Genotype genotype : VC.getGenotypes() ) {
        final String sample = genotype.getSampleName();
        if ( mergedGenotypes.containsSample(sample) )
            continue;   // only add genotypes for samples not already present

        // recompute PLs even when all alleles are already present, because the <ALT> PLs must be purged
        final int[] alleleIndexes = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles);
        final int[] newPLs = generatePLs(genotype, alleleIndexes);

        // alleles are set to null here because the merged record is expected to be re-genotyped later
        mergedGenotypes.add(new GenotypeBuilder(genotype).name(sample).alleles(null).PL(newPLs).noAD().noGQ().make());
    }
}
/**
 * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "&lt;ALT&gt;" as appropriate.
 * If the myAlleles set does not contain "&lt;ALT&gt;" as an allele, it throws an exception.
 *
 * @param remappedAlleles the list of alleles to evaluate
 * @param targetAlleles   the target list of alleles
 * @return non-null array of ints representing indexes
 */
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles) {
    if ( remappedAlleles == null || remappedAlleles.size() == 0 )
        throw new IllegalArgumentException("The list of input alleles must not be null or empty");
    if ( targetAlleles == null || targetAlleles.size() == 0 )
        throw new IllegalArgumentException("The list of target alleles must not be null or empty");

    final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE);
    if ( indexOfGenericAlt == -1 )
        throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records");

    final int[] indexMapping = new int[targetAlleles.size()];

    // the reference alleles always match up (even if they don't appear to)
    indexMapping[0] = 0;

    // map each target allele back to its input index, falling back on the <ALT> index when absent
    for ( int i = 1; i < targetAlleles.size(); i++ ) {
        final int found = remappedAlleles.indexOf(targetAlleles.get(i));
        indexMapping[i] = ( found == -1 ) ? indexOfGenericAlt : found;
    }

    return indexMapping;
}
/**
 * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs.
 * Throws an exception if the Genotype does not contain PLs.
 *
 * Note that this code assumes diploid genotypes when sizing the new likelihoods array
 * (see the numLikelihoods(..., 2) call below).
 *
 * @param genotype the genotype from which to grab PLs
 * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles
 * @return non-null array of new PLs
 */
protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) {
    if ( !genotype.hasPL() )
        throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs");
    final int[] originalPLs = genotype.getPL();
    // assume diploid
    final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(indexesOfRelevantAlleles.length, 2);
    final int[] newPLs = new int[numLikelihoods];
    // for every unordered pair (i, j) of new allele indexes, copy the PL of the
    // corresponding pair of original allele indexes into the new PL slot
    for ( int i = 0; i < indexesOfRelevantAlleles.length; i++ ) {
        for ( int j = i; j < indexesOfRelevantAlleles.length; j++ ) {
            // the original indexes may arrive in either order, hence the unordered helper
            final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[i], indexesOfRelevantAlleles[j]);
            if ( originalPLindex >= originalPLs.length )
                throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length);
            final int newPLindex = GenotypeLikelihoods.calculatePLindex(i, j);
            newPLs[newPLindex] = originalPLs[originalPLindex];
        }
    }
    return newPLs;
}
/**
 * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex()
 *
 * @param originalIndex1 the index of the first allele
 * @param originalIndex2 the index of the second allele
 * @return the PL index
 */
protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) {
    // calculatePLindex() requires its arguments in ascending order
    final int lower = Math.min(originalIndex1, originalIndex2);
    final int upper = Math.max(originalIndex1, originalIndex2);
    return GenotypeLikelihoods.calculatePLindex(lower, upper);
}
/**
 * Builds the sample name to use in a merged call set.
 *
 * @param trackName  the name of the input track the sample came from
 * @param sampleName the sample's own name
 * @param uniquify   if true, append "." + trackName to disambiguate identical sample names
 * @return the (possibly uniquified) sample name
 */
public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) {
    if ( ! uniquify )
        return sampleName;
    return sampleName + "." + trackName;
}
@ -1717,6 +1993,15 @@ public class GATKVariantContextUtils {
}
return newAs;
}
/**
 * @return a non-null list of the distinct mapped alleles (empty when no map is present)
 */
public List<Allele> getUniqueMappedAlleles() {
    if ( map == null )
        return Collections.emptyList();
    // a HashSet drops duplicate target alleles before building the result list
    final Set<Allele> unique = new HashSet<>(map.values());
    return new ArrayList<>(unique);
}
}
private static class CompareByPriority implements Comparator<VariantContext>, Serializable {

View File

@ -31,6 +31,7 @@ import net.sf.samtools.CigarOperator;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.CigarUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
@ -57,11 +58,15 @@ public class ReadClipperTestUtils {
}
/**
 * Builds an artificial read whose cigar is parsed from the given string.
 *
 * @param cigarString the cigar in text form, e.g. "10M2D8M"
 * @return a read carrying that cigar
 */
public static GATKSAMRecord makeReadFromCigar(String cigarString) {
    return makeReadFromCigar(CigarUtils.cigarFromString(cigarString));
}
/** Generates every valid cigar permutation up to maximumLength using the class's default cigarElements alphabet. */
public static List<Cigar> generateCigarList(int maximumLength) {
    return generateCigarList(maximumLength, cigarElements);
}
/**
* This function generates every valid permutation of cigar strings with a given length.
* This function generates every valid permutation of cigar strings (with a given set of cigarElement) with a given length.
*
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read
@ -72,7 +77,7 @@ public class ReadClipperTestUtils {
* @param maximumLength the maximum number of elements in the cigar
* @return a list with all valid Cigar objects
*/
public static List<Cigar> generateCigarList(int maximumLength) {
public static List<Cigar> generateCigarList(int maximumLength, CigarElement[] cigarElements) {
int numCigarElements = cigarElements.length;
LinkedList<Cigar> cigarList = new LinkedList<Cigar>();
byte [] cigarCombination = new byte[maximumLength];
@ -80,9 +85,9 @@ public class ReadClipperTestUtils {
Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array.
int currentIndex = 0;
while (true) {
Cigar cigar = createCigarFromCombination(cigarCombination); // create the cigar
cigar = combineAdjacentCigarElements(cigar); // combine adjacent elements
if (isCigarValid(cigar)) { // check if it's valid
Cigar cigar = createCigarFromCombination(cigarCombination, cigarElements); // create the cigar
cigar = CigarUtils.combineAdjacentCigarElements(cigar); // combine adjacent elements
if (CigarUtils.isCigarValid(cigar)) { // check if it's valid
cigarList.add(cigar); // add it
}
@ -107,41 +112,7 @@ public class ReadClipperTestUtils {
return cigarList;
}
/**
 * Checks whether a cigar is acceptable for test-read generation.
 *
 * A cigar is accepted when (a) picard's own exhaustive isValid() check passes, and
 * (b) the first and last non-clipping operators are not deletions.
 *
 * @return true iff the cigar passes both checks
 */
private static boolean isCigarValid(Cigar cigar) {
    if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation)
        Stack<CigarElement> cigarElementStack = new Stack<CigarElement>(); // Stack to invert cigar string to find ending operator
        CigarOperator startingOp = null;
        CigarOperator endingOp = null;
        // check if it doesn't start with deletions
        boolean readHasStarted = false; // search the list of elements for the starting operator
        // first pass: find the first non-clipping operator while pushing every element for later inversion
        for (CigarElement cigarElement : cigar.getCigarElements()) {
            if (!readHasStarted) {
                if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) {
                    readHasStarted = true;
                    startingOp = cigarElement.getOperator();
                }
            }
            cigarElementStack.push(cigarElement);
        }
        // second pass (reversed): find the last non-clipping operator
        while (!cigarElementStack.empty()) {
            CigarElement cigarElement = cigarElementStack.pop();
            if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) {
                endingOp = cigarElement.getOperator();
                break;
            }
        }
        if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION)
            return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
    }
    return false;
}
private static Cigar createCigarFromCombination(byte[] cigarCombination) {
private static Cigar createCigarFromCombination(byte[] cigarCombination, CigarElement[] cigarElements) {
Cigar cigar = new Cigar();
for (byte i : cigarCombination) {
cigar.add(cigarElements[i]);
@ -149,38 +120,6 @@ public class ReadClipperTestUtils {
return cigar;
}
/**
 * Combines equal adjacent elements of a Cigar object.
 *
 * Note that adjacent insertion/deletion pairs (I next to D, or D next to I) are also
 * merged into a single element carrying the first pair member's operator.
 *
 * @param rawCigar the cigar object
 * @return a combined cigar object
 */
private static Cigar combineAdjacentCigarElements(Cigar rawCigar) {
    Cigar combinedCigar = new Cigar();
    CigarElement lastElement = null;     // the element currently being accumulated
    int lastElementLength = 0;           // its running length
    for (CigarElement cigarElement : rawCigar.getCigarElements()) {
        // merge when operators match, or when the pair is I/D or D/I
        if (lastElement != null &&
                ((lastElement.getOperator() == cigarElement.getOperator()) ||
                (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) ||
                (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I)))
            lastElementLength += cigarElement.getLength();
        else
        {
            // flush the accumulated element and start a new run
            if (lastElement != null)
                combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator()));
            lastElement = cigarElement;
            lastElementLength = cigarElement.getLength();
        }
    }
    // flush the final accumulated element, if any
    if (lastElement != null)
        combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator()));
    return combinedCigar;
}
/** Builds the default artificial read from the class's BASES, QUALS and CIGAR constants. */
public static GATKSAMRecord makeRead() {
    return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR);
}
@ -201,34 +140,5 @@ public class ReadClipperTestUtils {
// Otherwise test if they're both empty
else
Assert.assertEquals(actual.isEmpty(), expected.isEmpty());
}
/**
 * Returns a new Cigar whose elements appear in reverse order.
 *
 * @param cigar the cigar to invert
 * @return a new, reversed Cigar
 */
public static Cigar invertCigar (Cigar cigar) {
    // push every element onto a stack, then pop to reverse the order
    final Stack<CigarElement> reversed = new Stack<CigarElement>();
    for (final CigarElement element : cigar.getCigarElements())
        reversed.push(element);

    final Cigar result = new Cigar();
    while (!reversed.isEmpty())
        result.add(reversed.pop());
    return result;
}
/**
 * Checks whether or not the read has any cigar element that is not H or S
 *
 * @param read the read
 * @return true if it has any M, I or D, false otherwise
 */
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
    for (final CigarElement element : read.getCigar().getCigarElements()) {
        final CigarOperator op = element.getOperator();
        if (op != CigarOperator.SOFT_CLIP && op != CigarOperator.HARD_CLIP)
            return true;
    }
    return false;
}
/** Parses a text-form cigar string (e.g. "10M2D8M") into a Cigar via picard's codec singleton. */
public static Cigar cigarFromString(String cigarString) {
    return TextCigarCodec.getSingleton().decode(cigarString);
}
}
}

View File

@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.CigarUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
@ -241,7 +242,7 @@ public class ReadClipperUnitTest extends BaseTest {
int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION);
if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar()))
expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION);
expectedLength -= leadingCigarElementLength(CigarUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION);
if (!clippedRead.isEmpty()) {
Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there
@ -256,7 +257,7 @@ public class ReadClipperUnitTest extends BaseTest {
public void testRevertSoftClippedBases() {
for (Cigar cigar : cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(CigarUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);
final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read);
@ -278,7 +279,7 @@ public class ReadClipperUnitTest extends BaseTest {
public void testRevertSoftClippedBasesWithThreshold() {
for (Cigar cigar : cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(CigarUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);
final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read);
@ -348,7 +349,7 @@ public class ReadClipperUnitTest extends BaseTest {
* @param clipped clipped read
*/
private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) {
if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) {
if (CigarUtils.readHasNonClippedBases(clipped)) {
Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart());
Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd());
}

View File

@ -220,7 +220,7 @@ public class ReadUtilsUnitTest extends BaseTest {
}
@Test (enabled = true)
public void testReadWithNs() throws FileNotFoundException {
public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
@ -232,6 +232,22 @@ public class ReadUtilsUnitTest extends BaseTest {
read.setCigarString("3M414N1D73M");
final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL);
Assert.assertEquals(result, 2);
}
// Verifies getReadCoordinateForReferenceCoordinateUpToEndOfRead for a reference position
// that lies just past the 1D deletion in a 3M414N1D73M read starting at 8975:
// position 9393 should map to read offset 3 (the first base of the 73M block).
@Test (enabled = true)
public void testReadWithNsRefAfterDeletion() throws FileNotFoundException {
    // a real reference is needed so the header carries a valid sequence dictionary
    final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
    final int readLength = 76;
    final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength);
    read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
    read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
    read.setCigarString("3M414N1D73M");
    final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9393, ReadUtils.ClippingTail.LEFT_TAIL);
    Assert.assertEquals(result, 3);
}

View File

@ -27,12 +27,9 @@ package org.broadinstitute.sting.utils.variant;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@ -182,7 +179,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
inputs, priority,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false);
Assert.assertEquals(merged.getAlleles(), cfg.expected);
}
@ -240,7 +237,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
inputs, null,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false);
Assert.assertEquals(merged.getID(), cfg.expected);
}
@ -355,7 +352,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
public void testMergeFiltered(MergeFilteredTest cfg) {
final List<String> priority = vcs2priority(cfg.inputs);
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false);
cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
// test alleles are equal
Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles());
@ -482,7 +479,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
public void testMergeGenotypes(MergeGenotypesTest cfg) {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
// test alleles are equal
Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles());
@ -523,7 +520,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false);
// test genotypes
Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2")));
@ -556,7 +553,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false, false);
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false);
if ( annotate )
Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION);
@ -1036,26 +1033,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
}
}
// Verifies that fillInNonRefSymbolicAlleles replaces a record's generic <NON_REF> symbolic
// alternate allele with the concrete alternate allele from a sibling record at the same site.
@Test(enabled = !DEBUG)
public void testFillInNonRefSymbolicAlleles() {
    final int start = 10;
    final String ref = "A";
    final String alt = "C";
    // one record with a concrete A->C alt, one with only the symbolic <NON_REF> alt
    final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt));
    final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">"));
    List<VariantContext> VCs = Arrays.asList(vcAlt, vcRef);
    VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.<VariantContext>emptyList());
    // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele
    for( final VariantContext vc : VCs ) {
        Assert.assertTrue(vc.getAlternateAlleles().size() == 1);
        Assert.assertTrue(vc.getAlternateAllele(0).isNonReference());
        Assert.assertTrue(!vc.getReference().isSymbolic());
        Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic());
    }
}
// --------------------------------------------------------------------------------
//
// test allele remapping
@ -1428,4 +1405,225 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
assertGenotypesAreEqual(actualGT, expected);
}
}
// --------------------------------------------------------------------------------
//
// Test methods for merging reference confidence VCs
//
// --------------------------------------------------------------------------------
@DataProvider(name = "generatePLsData")
public Object[][] makeGeneratePLsData() {
// Each case pairs an allele count with an index permutation that swaps
// exactly two positions (all distinct two-position swaps are generated).
final List<Object[]> cases = new ArrayList<>();
for ( int nAlleles = 2; nAlleles <= 5; nAlleles++ ) {
    for ( int firstPos = 0; firstPos < nAlleles; firstPos++ ) {
        for ( int secondPos = firstPos + 1; secondPos < nAlleles; secondPos++ ) {
            // start from the identity mapping ...
            final int[] permutation = new int[nAlleles];
            for ( int k = 0; k < nAlleles; k++ )
                permutation[k] = k;
            // ... then swap the two chosen positions
            permutation[firstPos] = secondPos;
            permutation[secondPos] = firstPos;
            cases.add(new Object[]{nAlleles, permutation});
        }
    }
}
return cases.toArray(new Object[][]{});
}
@Test(dataProvider = "generatePLsData")
public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) {
final int nLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2);

// seed PLs with their own indexes so the reordering is trivially traceable
final int[] originalPLs = new int[nLikelihoods];
for ( int idx = 0; idx < nLikelihoods; idx++ )
    originalPLs[idx] = idx;

// a genotype over numOriginalAlleles alleles: ref "A" plus poly-A alternates (AA, AAA, ...)
final List<Allele> alleleList = new ArrayList<>(numOriginalAlleles);
alleleList.add(Allele.create("A", true));
for ( int idx = 1; idx < numOriginalAlleles; idx++ )
    alleleList.add(Allele.create(Utils.dupString('A', idx + 1), false));
final Genotype gt = new GenotypeBuilder("foo", alleleList).PL(originalPLs).make();

final int[] newPLs = GATKVariantContextUtils.generatePLs(gt, indexOrdering);
Assert.assertEquals(newPLs.length, nLikelihoods);

// compute the expected PL vector directly from the index mapping: the PL at the
// (a1,a2) slot must be the one that lived at the remapped genotype's slot
final int[] expectedPLs = new int[nLikelihoods];
for ( int a1 = 0; a1 < numOriginalAlleles; a1++ ) {
    for ( int a2 = a1; a2 < numOriginalAlleles; a2++ ) {
        final int plIndex = GenotypeLikelihoods.calculatePLindex(a1, a2);
        expectedPLs[plIndex] = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[a1], indexOrdering[a2]);
    }
}

for ( int idx = 0; idx < nLikelihoods; idx++ )
    Assert.assertEquals(newPLs[idx], expectedPLs[idx]);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testGetIndexesOfRelevantAllelesWithNoALT() {
// Neither allele list contains the <ALT> (NON_REF) symbolic allele, so the
// utility must reject the request with an IllegalArgumentException.
final List<Allele> alleles1 = new ArrayList<>(1);
alleles1.add(Allele.create("A", true));
final List<Allele> alleles2 = new ArrayList<>(1);
alleles2.add(Allele.create("A", true));
GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2);
// No explicit Assert.fail() needed here: TestNG's expectedExceptions already
// fails the test if the call above returns without throwing.
}
@DataProvider(name = "getIndexesOfRelevantAllelesData")
public Object[][] makeGetIndexesOfRelevantAllelesData() {
final int totalAlleles = 5;

// allele universe: ref "A" plus increasingly long poly-A alternates (AA, AAA, ...)
final List<Allele> alleles = new ArrayList<>(totalAlleles);
alleles.add(Allele.create("A", true));
for ( int len = 1; len < totalAlleles; len++ )
    alleles.add(Allele.create(Utils.dupString('A', len + 1), false));

// one test case per candidate allele index (0 = ref-only case)
final List<Object[]> cases = new ArrayList<>();
for ( int idx = 0; idx < totalAlleles; idx++ )
    cases.add(new Object[]{idx, alleles});
return cases.toArray(new Object[][]{});
}
@Test(dataProvider = "getIndexesOfRelevantAllelesData")
public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List<Allele> allAlleles) {
// build "my" alleles: ref + <ALT>, optionally plus one concrete alternate
final List<Allele> myAlleles = new ArrayList<>(3);
myAlleles.add(allAlleles.get(0));
myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
if ( allelesIndex > 0 )
    myAlleles.add(allAlleles.get(allelesIndex));

final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles);
Assert.assertEquals(indexes.length, allAlleles.size());

for ( int pos = 0; pos < allAlleles.size(); pos++ ) {
    // ref maps to itself (0); the explicitly-added alternate maps to slot 2;
    // every other allele folds into the <ALT> slot (1)
    final int expected = (pos == 0) ? 0
                       : (pos == allelesIndex) ? 2
                       : 1;
    Assert.assertEquals(indexes[pos], expected);
}
}
@DataProvider(name = "referenceConfidenceMergeData")
// Builds the fixtures for testReferenceConfidenceMerge: a set of per-sample VCs
// (each carrying the <ALT>/NON_REF symbolic allele) plus the expected merged VC.
// Each case is {VCs to merge, target locus, emit-even-if-monomorphic flag, expected result}.
public Object[][] makeReferenceConfidenceMergeData() {
final List<Object[]> tests = new ArrayList<>();
final int start = 10;
final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start);
// base contexts: one starting at the target locus, one starting one base earlier (spanning records)
final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make();
// PLs for a biallelic site + <ALT>; the "reordered" variant is what the same PLs
// look like after the sample's alternate is moved to a later allele slot
final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73};
final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10};
// ref-only record: A + <ALT>
final List<Allele> A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).make();
final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();
// spanning ref-only record: AA ref starting one base before the target locus
final Allele AAref = Allele.create("AA", true);
final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).make();
final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();
// biallelic SNP A->C, with and without the trailing <ALT>
final List<Allele> A_C = Arrays.asList(Aref, C);
final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).make();
final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).make();
final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();
// biallelic SNP A->G + <ALT>
final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).make();
final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();
// triallelic SNP A->C,G, with and without the trailing <ALT>
final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).make();
final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).make();
final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();
// insertion A->ATC + <ALT>
final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).make();
final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();
// deletion AA->A + <ALT>, spanning the target locus
final Allele A = Allele.create("A", false);
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).make();
final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
// first test the case of a single record
tests.add(new Object[]{Arrays.asList(vcA_C_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()});
// now, test pairs:
// a SNP with another SNP
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).make()).make()});
// a SNP with an indel
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).make()).make()});
// a SNP with 2 SNPs
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()});
// a SNP with a ref record
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()});
// spanning records:
// a SNP with a spanning ref record
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()});
// a SNP with a spanning deletion
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).make()).make()});
// combination of all
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()});
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[]{Arrays.asList(vcAA_ALT),
loc, false,
null});
tests.add(new Object[]{Arrays.asList(vcAA_ALT),
loc, true,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final List<VariantContext> toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) {
// a non-null ref base asks the merger to emit a record even at monomorphic sites
final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte)'A' : null);
if ( result == null ) {
// assertNull (not assertTrue on ==) so failures report the unexpected value
Assert.assertNull(expectedResult, "merge unexpectedly produced no variant context");
return;
}
// fail with a clear message instead of an NPE when a result was not expected at all
Assert.assertNotNull(expectedResult, "merge produced an unexpected variant context: " + result);
Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles());
Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples());
for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) {
Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName());
// use string comparisons to test equality for now
Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString());
}
}
}

View File

@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark {
GATKVariantContextUtils.simpleMerge(toMerge, null,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNSORTED,
true, false, "set", false, true, false);
true, false, "set", false, true);
}
};

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broadinstitute" module="variant" revision="1.104.1634" status="integration" />
<info organisation="org.broadinstitute" module="variant" revision="1.105.1642" status="integration" />
</ivy-module>