*/
- private final double[] log10;
+ private final double[] ploidyLog10;
/**
* Buffer field use as a temporal container for sorted allele counts when calculating the likelihood of a
@@ -202,24 +202,22 @@ public class GenotypeLikelihoodCalculator {
* Creates a new calculator providing its ploidy and number of genotyping alleles.
*/
protected GenotypeLikelihoodCalculator(final int ploidy, final int alleleCount,
- final int[][] alleleFirstGenotypeOffsetByPloidy,
- final GenotypeAlleleCounts[][] genotypeTableByPloidy,
- final double[] ploidyLog10) {
+ final int[][] alleleFirstGenotypeOffsetByPloidy,
+ final GenotypeAlleleCounts[][] genotypeTableByPloidy,
+ final double[] ploidyLog10) {
+
this.alleleFirstGenotypeOffsetByPloidy = alleleFirstGenotypeOffsetByPloidy;
genotypeAlleleCounts = genotypeTableByPloidy[ploidy];
this.alleleCount = alleleCount;
this.ploidy = ploidy;
genotypeCount = this.alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
- if (genotypeCount == GenotypeLikelihoodCalculators.GENOTYPE_COUNT_OVERFLOW)
- throw new IllegalArgumentException(
- String.format("the combination of ploidy (%s) and number of alleles (%s) results in a very large number of genotypes (> %s). You need to limit ploidy or the number of alternative alleles to analyze this locus",
- ploidy,alleleCount,Integer.MAX_VALUE));
+
alleleHeap = new IntMaxHeap(ploidy);
readLikelihoodsByGenotypeIndex = new double[genotypeCount][];
- log10 = ploidyLog10;
+ this.ploidyLog10 = ploidyLog10;
// The number of possible components is limited by distinct allele count and ploidy.
maximumDistinctAllelesInGenotype = Math.min(ploidy, alleleCount);
- genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype << 1];
+ genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype*2];
}
/**
@@ -349,7 +347,7 @@ public class GenotypeLikelihoodCalculator {
*/
private double[] genotypeLikelihoods(final double[][] readLikelihoodsByGenotypeIndex, final int readCount) {
final double[] result = new double[genotypeCount];
- final double denominator = readCount * log10[ploidy]; // instead of dividing each read likelihood by ploidy
+ final double denominator = readCount * ploidyLog10[ploidy]; // instead of dividing each read likelihood by ploidy
// ( so subtract log10(ploidy) ) we multiply them all and the divide by ploidy^readCount (so substract readCount * log10(ploidy) )
for (int g = 0; g < genotypeCount; g++) {
final double[] likelihoodsByRead = readLikelihoodsByGenotypeIndex[g];
@@ -464,7 +462,9 @@ public class GenotypeLikelihoodCalculator {
* exactly one allele present in the genotype.
*/
private void singleComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
- final double[] likelihoodByRead, final double[] readLikelihoodComponentsByAlleleCount, final int readCount) {
+ final double[] likelihoodByRead,
+ final double[] readLikelihoodComponentsByAlleleCount,
+ final int readCount) {
final int allele = genotypeAlleleCounts.alleleIndexAt(0);
// the count of the only component must be = ploidy.
int offset = (allele * (ploidy + 1) + ploidy) * readCount;
@@ -493,7 +493,7 @@ public class GenotypeLikelihoodCalculator {
// p = 2 because the frequency == 1 we already have it.
for (int frequency = 2, destinationOffset = frequency1Offset + readCount; frequency <= ploidy; frequency++) {
- final double log10frequency = log10[frequency];
+ final double log10frequency = ploidyLog10[frequency];
for (int r = 0, sourceOffset = frequency1Offset; r < readCount; r++)
readAlleleLikelihoodByAlleleCount[destinationOffset++] =
readAlleleLikelihoodByAlleleCount[sourceOffset++] + log10frequency;
@@ -620,7 +620,11 @@ public class GenotypeLikelihoodCalculator {
* @param destination where to store the new genotype index mapping to old.
* @param sortedAlleleCountsBuffer a buffer to re-use to get the genotype-allele-count's sorted allele counts.
*/
- private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex, final GenotypeAlleleCounts alleleCounts, final int[] oldToNewAlleleIndexMap, final int[] destination, final int[] sortedAlleleCountsBuffer) {
+ private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex,
+ final GenotypeAlleleCounts alleleCounts,
+ final int[] oldToNewAlleleIndexMap,
+ final int[] destination,
+ final int[] sortedAlleleCountsBuffer) {
final int distinctAlleleCount = alleleCounts.distinctAlleleCount();
alleleCounts.copyAlleleCounts(sortedAlleleCountsBuffer,0);
for (int j = 0, jj = 0; j < distinctAlleleCount; j++) {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java
index 83b280bff..da0ff0975 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java
@@ -51,7 +51,11 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
+import org.broadinstitute.gatk.utils.MathUtils;
+import org.broadinstitute.gatk.utils.exceptions.GATKException;
+
import java.util.Arrays;
+import java.util.stream.IntStream;
/**
* Genotype likelihood calculator utility.
@@ -116,7 +120,10 @@ public class GenotypeLikelihoodCalculators {
private volatile static GenotypeAlleleCounts[][] genotypeTableByPloidy =
buildGenotypeAlleleCountsTable(maximumPloidy,maximumAllele,alleleFirstGenotypeOffsetByPloidy);
-
+ /**
+ * Cached log10 values for the first integer up to the maximum ploidy requested thus far.
+ */
+ private volatile static double[] ploidyLog10 = IntStream.range(0, maximumPloidy + 1).mapToDouble(Math::log10).toArray();
/**
* Build the table with the genotype offsets based on ploidy and the maximum allele index with representation
@@ -291,40 +298,29 @@ public class GenotypeLikelihoodCalculators {
return result;
}
- /**
- * Cached log10 values for the first integer up to the maximum ploidy requested thus far.
- */
- private volatile static double[] ploidyLog10;
-
- // Initialize {@link #ploidyLog10}.
- static {
- ploidyLog10 = new double[maximumPloidy + 1];
- for (int i = 0; i <= maximumPloidy; i++)
- ploidyLog10[i] = Math.log10(i);
- }
-
/**
* Returns an instance given its ploidy and the number of alleles.
*
* @param alleleCount the required allele-count.
* @param ploidy the required ploidy-count.
*
- * @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is {@code null}, or
- * the resulting number of genotypes is too large.
+ * @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is negative, or the resulting number of genotypes is too large.
*
* @return never {@code null}.
*/
- public static GenotypeLikelihoodCalculator getInstance(final int ploidy,
- final int alleleCount) {
+ public static GenotypeLikelihoodCalculator getInstance(final int ploidy, final int alleleCount) {
checkPloidyAndMaximumAllele(ploidy, alleleCount);
// Non-thread safe (fast) check on tables capacities,
- // if not enough capacity we expand the tables in a thread-safe manner:
- if (alleleCount > maximumAllele || ploidy > maximumPloidy)
- ensureCapacity(alleleCount, ploidy);
+ // if not enough capacity we expand the tables in a thread-safe manner
+ // also checks if the requested ploidy and allele count result in a genotype count too large to deal with
+ if(calculateGenotypeCountUsingTables(ploidy, alleleCount) == GENOTYPE_COUNT_OVERFLOW){
+ final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
+ throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
+ }
// At this point the tables must have at least the requested capacity, likely to be much more.
- return new GenotypeLikelihoodCalculator(ploidy,alleleCount,alleleFirstGenotypeOffsetByPloidy,genotypeTableByPloidy,ploidyLog10);
+ return new GenotypeLikelihoodCalculator(ploidy, alleleCount, alleleFirstGenotypeOffsetByPloidy, genotypeTableByPloidy, ploidyLog10);
}
/**
@@ -413,14 +409,59 @@ public class GenotypeLikelihoodCalculators {
* @param ploidy the requested ploidy.
* @param alleleCount the requested number of alleles.
*
- * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative.
+ * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative or
+ * the number of genotypes is too large (more than {@link Integer#MAX_VALUE}).
*
- * @return 0 or greater.
+ * @return the number of genotypes given ploidy and allele count (0 or greater).
*/
public final static int genotypeCount(final int ploidy, final int alleleCount) {
+
+ final int result = calculateGenotypeCountUsingTables(ploidy, alleleCount);
+ if (result == GENOTYPE_COUNT_OVERFLOW) {
+ final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
+ throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
+ }
+ return result;
+ }
+
+ /**
+ * Compute the maximally acceptable allele count (ref allele included) given the maximally acceptable genotype count.
+ * @param ploidy sample ploidy
+ * @param maxGenotypeCount the maximum acceptable number of genotypes, used to calculate the upper bound on the number of alleles given ploidy
+ * @throws IllegalArgumentException if {@code ploidy} is negative.
+ * @return the maximally acceptable allele count given ploidy and maximum number of genotypes acceptable
+ */
+ public static int computeMaxAcceptableAlleleCount(final int ploidy, final int maxGenotypeCount){
+
+ checkPloidyAndMaximumAllele(ploidy, ploidy); // a hack to check ploidy makes sense (could duplicate code but choice must be made)
+
+ final double log10MaxGenotypeCount = Math.log10(maxGenotypeCount);
+
+ // Math explanation: genotype count is determined by ${P+A-1 \choose A-1}$, this leads to constraint
+ // $\log(\frac{(P+A-1)!}{(A-1)!}) \le \log(P!G)$,
+ // where $P$ is ploidy, $A$ is allele count, and $G$ is maxGenotypeCount
+ // The upper and lower bounds of the left hand side of the constraint are $P \log(A-1+P)$ and $P \log(A)$
+ // which require $A$ to be searched in interval $[10^{\log(P!G)/P} - (P-1), 10^{\log(P!G)/P}]$
+ // Denote $10^{\log(P!G)/P}$ as $x$ in the code.
+
+ final double x = Math.pow(10, (MathUtils.log10Factorial(ploidy) + log10MaxGenotypeCount)/ploidy );
+ final int lower = (int)Math.floor(x) - ploidy - 1;
+ final int upper = (int)Math.ceil(x);
+ for(int a=upper; a>=lower; --a){// check one by one
+
+ final double log10GTCnt = MathUtils.log10BinomialCoefficient(ploidy+a-1, a-1);
+ if(log10MaxGenotypeCount >= log10GTCnt) {
+ return a;
+ }
+ }
+ throw new GATKException("Code should never reach here.");
+ }
+
+ private static int calculateGenotypeCountUsingTables(int ploidy, int alleleCount) {
checkPloidyAndMaximumAllele(ploidy, alleleCount);
- if (ploidy > maximumPloidy || alleleCount > maximumAllele)
- ensureCapacity(alleleCount,ploidy);
+ if (ploidy > maximumPloidy || alleleCount > maximumAllele) {
+ ensureCapacity(alleleCount, ploidy);
+ }
return alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
}
}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
index 30e7efe22..ad0f2b773 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
@@ -56,6 +56,8 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.math3.stat.StatUtils;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
@@ -73,6 +75,7 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
+import java.util.stream.Collectors;
/**
* {@link HaplotypeCaller}'s genotyping strategy implementation.
@@ -84,6 +87,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine practicaAlleleCountForPloidy = new HashMap<>();
+
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
protected final boolean doPhysicalPhasing;
@@ -135,7 +141,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine calls;
@@ -189,16 +195,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
- CalledHaplotypes assignGenotypeLikelihoods( final List haplotypes,
- final ReadLikelihoods readLikelihoods,
- final Map> perSampleFilteredReadList,
- final byte[] ref,
- final GenomeLoc refLoc,
- final GenomeLoc activeRegionWindow,
- final GenomeLocParser genomeLocParser,
- final RefMetaDataTracker tracker,
- final List activeAllelesToGenotype,
- final boolean emitReferenceConfidence) {
+ CalledHaplotypes assignGenotypeLikelihoods(final List haplotypes,
+ final ReadLikelihoods readLikelihoods,
+ final Map> perSampleFilteredReadList,
+ final byte[] ref,
+ final GenomeLoc refLoc,
+ final GenomeLoc activeRegionWindow,
+ final GenomeLocParser genomeLocParser,
+ final RefMetaDataTracker tracker,
+ final List activeAllelesToGenotype,
+ final boolean emitReferenceConfidence) {
// sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
@@ -232,15 +238,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
@@ -248,13 +256,37 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
-
if( configuration.DEBUG && logger != null ) {
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
- final ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
+ final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
+
+ // if the number of alleles is so high that enumerating all possible genotypes is impractical,
+ // as determined by MAX_GENOTYPE_COUNT_TO_ENUMERATE,
+ // trim alleles that are not well supported by good-scored haplotypes,
+ // otherwise simply alias the original mapper, i.e. no trimming if the genotype count is small
+ final int originalAlleleCount = alleleMapper.size();
+ Integer practicalAlleleCount = practicaAlleleCountForPloidy.get(ploidy);
+ if(practicalAlleleCount==null) { // maximum allele count given this ploidy and MAX_GENOTYPE_COUNT_TO_ENUMERATE hasn't been computed
+ practicalAlleleCount = GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, MAX_GENOTYPE_COUNT_TO_ENUMERATE);
+ }
+
+ Map> practicalAlleleMapper = null;
+ if (practicalAlleleCount < originalAlleleCount) {
+ practicalAlleleMapper = reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(alleleMapper, practicalAlleleCount);
+ if( configuration.DEBUG && logger != null ) {
+ logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d",
+ ploidy, originalAlleleCount, practicalAlleleCount));
+ logger.warn(String.format("Alleles kept are:%s", practicalAlleleMapper.keySet()));
+ }
+ }else{
+ practicalAlleleMapper = alleleMapper;
+ }
+
+ final ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(practicalAlleleMapper,
+ genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC),
+ ALLELE_EXTENSION));
if (configuration.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination());
@@ -269,10 +301,23 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(final Map> alleleMapper, final int desiredNumOfAlleles) {
+
+ final PriorityQueue altAlleleMaxPriorityQ = new PriorityQueue<>((sa1, sa2) -> - sa2.compareTo(sa1)); // -1 to turn it into max priority q
+
+ final Set allelesToRetain = new HashSet<>();
+ // populate allelePriorityQ with the relevant information
+ for(final Allele allele : alleleMapper.keySet()){
+
+ if(allele.isReference()){ // collect scores information only on alt alleles; ref allele is never trimmed by this function
+ allelesToRetain.add(allele);
+ continue;
+ }
+
+ final List hapScores = alleleMapper.get(allele).stream().map(hap -> hap.getScore()).collect(Collectors.toList());
+ Collections.sort(hapScores);
+ final Double highestScore = hapScores.get(hapScores.size()-1);
+ final Double secondHighestScore = hapScores.size()>1 ? hapScores.get(hapScores.size()-2) : Double.NEGATIVE_INFINITY;
+
+ altAlleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore));
+ }
+
+ while(allelesToRetain.size() allelesToRetain.contains(p.getKey()))
+ .collect(Collectors.toMap(p->p.getKey(), p->p.getValue()));
+ }
+
+ /**
+ * A utility class that provides ordering information, given best and second best haplotype scores.
+ * If there's a tie between the two alleles when comparing their best haplotype score, the second best haplotype score
+ * is used for breaking the tie. In the case that one allele doesn't have a second best haplotype, i.e. it has only one
+ * supportive haplotype, its second best score is set to negative infinity, and is always considered "worse" than another allele
+ * that has the same best haplotype score, but also has a second best haplotype score.
+ * TODO: in the extremely unlikely case that two alleles, having the same best haplotype score, neither have a second
+ * best haplotype score, the case is undecided.
+ */
+ private static final class AlleleScoredByHaplotypeScores {
+ private final Allele allele;
+ private final Double bestHaplotypeScore;
+ private final Double secondBestHaplotypeScore;
+
+ public AlleleScoredByHaplotypeScores(final Allele allele, final Double bestHaplotypeScore, final Double secondBestHaplotypeScore){
+ this.allele = allele;
+ this.bestHaplotypeScore = bestHaplotypeScore;
+ this.secondBestHaplotypeScore = secondBestHaplotypeScore;
+ }
+
+ public int compareTo(final AlleleScoredByHaplotypeScores other) {
+ if(bestHaplotypeScore > other.bestHaplotypeScore) {
+ return 1;
+ } else if (bestHaplotypeScore < other.bestHaplotypeScore) {
+ return -1;
+ } else {
+ return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? 1 : -1;
+ }
+ }
+
+ public Allele getAllele(){
+ return allele;
+ }
+ }
+
/**
* Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value.
*
@@ -462,7 +579,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> constructHaplotypeMapping(final List originalCalls,
- final Set calledHaplotypes) {
+ final Set calledHaplotypes) {
final Map> haplotypeMap = new HashMap<>(originalCalls.size());
for ( final VariantContext call : originalCalls ) {
// don't try to phase if there is not exactly 1 alternate allele
@@ -674,14 +791,13 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine prepareReadAlleleLikelihoodsForAnnotation(
- final ReadLikelihoods readHaplotypeLikelihoods,
- final Map> perSampleFilteredReadList,
- final GenomeLocParser genomeLocParser,
- final boolean emitReferenceConfidence,
- final Map> alleleMapper,
- final ReadLikelihoods readAlleleLikelihoodsForGenotyping,
- final VariantContext call) {
+ protected ReadLikelihoods prepareReadAlleleLikelihoodsForAnnotation(final ReadLikelihoods readHaplotypeLikelihoods,
+ final Map> perSampleFilteredReadList,
+ final GenomeLocParser genomeLocParser,
+ final boolean emitReferenceConfidence,
+ final Map> alleleMapper,
+ final ReadLikelihoods readAlleleLikelihoodsForGenotyping,
+ final VariantContext call) {
final ReadLikelihoods readAlleleLikelihoodsForAnnotations;
final GenomeLoc loc = genomeLocParser.createGenomeLoc(call);
@@ -744,10 +860,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine decomposeHaplotypesIntoVariantContexts(final List haplotypes,
- final ReadLikelihoods readLikelihoods,
- final byte[] ref,
- final GenomeLoc refLoc,
- final List activeAllelesToGenotype) {
+ final ReadLikelihoods readLikelihoods,
+ final byte[] ref,
+ final GenomeLoc refLoc,
+ final List activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
@@ -782,8 +898,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine getVCsAtThisLocation(final List haplotypes,
- final int loc,
- final List activeAllelesToGenotype) {
+ final int loc,
+ final List activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
final List eventsAtThisLoc = new ArrayList<>();
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java
index e7a9c9467..27c7ac04e 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java
@@ -170,6 +170,15 @@ public class GenotypeLikelihoodCalculatorUnitTest {
}
+ @Test
+ public void testComputeMaxAcceptableAlleleCount(){
+ Assert.assertEquals(1024, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(1, 1024));
+ Assert.assertEquals(44, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(2, 1024));
+ Assert.assertEquals(17, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(3, 1024));
+ Assert.assertEquals(5, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(10, 1024));
+ Assert.assertEquals(3, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(20, 1024));
+ Assert.assertEquals(2, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(100, 1024));
+ }
// Simple inefficient calculation of the genotype count given the ploidy.
private int calculateGenotypeCount(final int ploidy, final int alleleCount) {
if (ploidy == 0)
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java
index 28df151c9..59459b027 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java
@@ -130,6 +130,7 @@ public class Haplotype extends Allele {
final Haplotype ret = new Haplotype(newBases, isReference());
ret.setCigar(newCigar);
ret.setGenomeLocation(loc);
+ ret.setScore(score);
ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef());
return ret;
}
From 45607d1b30c15013ddaa13a40ebc8bddd38c3f98 Mon Sep 17 00:00:00 2001
From: Valentin Ruano Rubio
Date: Thu, 30 Jun 2016 11:41:03 -0400
Subject: [PATCH 17/68] RCM Variant sites merger won't output PL when there are
too many alleles in order to avoid memory issues with large cohort runs.
Small additional "cosmetic" changes to the code Addresses issue #1419.
---
...ferenceConfidenceVariantContextMerger.java | 41 ++++++---
.../VariantContextMergerUnitTest.java | 86 ++++++++++++++++++-
2 files changed, 111 insertions(+), 16 deletions(-)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java
index 5ad0c581c..05c0a930d 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java
@@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.collections.Pair;
-import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
@@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger {
final List allelesList = new ArrayList<>(finalAlleleSet);
+ //TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419
+ //TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will
+ //TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs.
+ //TODO However we need to change this with a more permanent solution.
+ //TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs
+ //TODO in order to avoid to add yet another limit .
+ final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;
+ if (!shouldComputePLs) {
+ logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED));
+ }
+
for ( final Pair> pair : vcAndNewAllelePairs ) {
final VariantContext vc = pair.getFirst();
final List remappedAlleles = pair.getSecond();
- mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified);
+ mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) {
@@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger {
* @param remappedAlleles the list of remapped alleles for the sample
* @param targetAlleles the list of target alleles
* @param samplesAreUniquified true if sample names have been uniquified
+ * @param shouldComputePLs true if the PL can be computed in this merge.
*/
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
final VariantContext vc,
final List remappedAlleles,
final List targetAlleles,
- final boolean samplesAreUniquified) {
+ final boolean samplesAreUniquified,
+ final boolean shouldComputePLs) {
final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY);
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
- int[] perSampleIndexesOfRelevantAlleles;
+
for (final Genotype g : vc.getGenotypes()) {
final String name;
@@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger {
else
name = g.getSampleName();
final int ploidy = g.getPloidy();
- final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
+ final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()))
+ .noPL();
genotypeBuilder.name(name);
- final boolean hasPL = g.hasPL();
+
+ final boolean doPLs = shouldComputePLs && g.hasPL();
+ final boolean hasAD = g.hasAD();
final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY);
- if (hasPL || hasSAC) {
- perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
- if (g.hasPL()) {
+ if (doPLs || hasSAC || hasAD) {
+ final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
+ if (doPLs) {
// lazy initialization of the genotype index map by ploidy.
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
: genotypeIndexMapsByPloidy[ploidy];
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
- final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
- genotypeBuilder.PL(PLs).AD(AD);
+ genotypeBuilder.PL(PLs);
}
- if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
+ if (hasAD) {
+ genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
+ }
+ if (hasSAC) {
final List sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles);
final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse);
genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs);
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java
index dabfa27d5..1705ebe49 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java
@@ -51,13 +51,16 @@
package org.broadinstitute.gatk.tools.walkers.variantutils;
+import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@@ -65,10 +68,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
/**
* Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}.
@@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest {
}
}
+ @Test
+ public void testMergeTooManyAlleles() {
+ final int ALLELE_COUNT = 100;
+ final int SAMPLE_COUNT = 200;
+ final double MNP_PROB = 0.1;
+ final double MNP_MUT_RATE = 0.1;
+ final double NO_PL_PROB = 0.1;
+ final Random rdn = new Random(13);
+ final RandomDNA randomDNA = new RandomDNA(rdn);
+ final Allele refAllele = Allele.create(randomDNA.nextBases(30), true);
+ final Set<Allele> alleles = new LinkedHashSet<>(ALLELE_COUNT);
+ alleles.add(refAllele);
+ alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
+ while (alleles.size() < ALLELE_COUNT) {
+ if (rdn.nextDouble() <= MNP_PROB) {
+ final byte[] bytes = refAllele.getBases().clone();
+ for (int i = 0; i < bytes.length; i++) {
+ bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i];
+ }
+ if (!Arrays.equals(bytes, refAllele.getBases())) {
+ alleles.add(Allele.create(bytes, false));
+ }
+ } else {
+ final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1;
+ if (newLength < refAllele.getBases().length) {
+ alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false));
+ } else if (newLength > refAllele.getBases().length) {
+ final byte[] bases = randomDNA.nextBases(newLength);
+ System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length);
+ } // else same length... we skip and try again.
+ }
+ }
+ final List<Allele> alleleList = new ArrayList<>(alleles);
+
+ final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
+ final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
+ final List<VariantContext> variantContexts = new ArrayList<>(SAMPLE_COUNT);
+ for (int i = 0; i < SAMPLE_COUNT; i++) {
+ final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i+1));
+ genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2));
+ final List<Allele> sampleAlleles = new ArrayList<>();
+ sampleAlleles.add(refAllele);
+ sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
+ for (int j = 2; j < alleles.size(); j++) {
+ if (rdn.nextDouble() <= 0.01) {
+ sampleAlleles.add(alleleList.get(j));
+ }
+ }
+ if (rdn.nextDouble() > NO_PL_PROB) {
+ final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1];
+ for (int j = 0; j < PLs.length; j++) {
+ PLs[j] = rdn.nextInt(1000);
+ }
+ genotypeBuilder.PL(PLs);
+ }
+ final int[] AD = new int[sampleAlleles.size()];
+ for (int j = 0; j < AD.length; j++) {
+ AD[j] = rdn.nextInt(100);
+ }
+ genotypeBuilder.AD(AD);
+
+ variantContexts.add(new VariantContextBuilder()
+ .loc("chr1", 10000, 10000 + refAllele.getBases().length - 1)
+ .alleles(sampleAlleles)
+ .genotypes(genotypeBuilder.make())
+ .make());
+ }
+ final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts,
+ genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null);
+ Assert.assertNotNull(result);
+ Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT);
+ Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED,
+ "Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold");
+ System.err.println("Alleles: " + result.getNAlleles());
+ for (int i = 0; i < SAMPLE_COUNT; i++) {
+ Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i);
+ }
+ }
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {
From 9b32cf5291e1e46bb0e29682958fef7204476efb Mon Sep 17 00:00:00 2001
From: Samuel Lee
Date: Tue, 21 Jun 2016 21:44:27 -0400
Subject: [PATCH 18/68] Fixed merging of GVCF blocks by fixing rounding of GQ
values in ReferenceConfidenceModel.
---
.../ReferenceConfidenceModel.java | 9 ++--
.../gatk/utils/gvcf/GVCFWriter.java | 22 +---------
.../HaplotypeCallerGVCFIntegrationTest.java | 44 ++++++++++++-------
.../variant/GATKVariantContextUtils.java | 26 +++++++++++
.../GATKVariantContextUtilsUnitTest.java | 29 ++++++++++++
5 files changed, 89 insertions(+), 41 deletions(-)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
index 397179f35..9da43df3f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
@@ -63,7 +63,9 @@ import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
-import org.broadinstitute.gatk.utils.genotyper.*;
+import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
+import org.broadinstitute.gatk.utils.genotyper.SampleList;
+import org.broadinstitute.gatk.utils.genotyper.SampleListUtils;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
@@ -250,8 +252,9 @@ public class ReferenceConfidenceModel {
// as our GLs for the site.
final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs);
- gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF)));
- gb.PL(leastConfidenceGLs.getAsPLs());
+ final int[] leastConfidenceGLsAsPLs = leastConfidenceGLs.getAsPLs();
+ gb.GQ(GATKVariantContextUtils.calculateGQFromPLs(leastConfidenceGLsAsPLs));
+ gb.PL(leastConfidenceGLsAsPLs);
//gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
vcb.genotypes(gb.make());
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
index 18379f990..f0e4ca9eb 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
@@ -246,7 +246,7 @@ public class GVCFWriter implements VariantContextWriter {
final int[] minPLs = block.getMinPLs();
gb.PL(minPLs);
- final int gq = genotypeQualityFromPLs(minPLs);
+ final int gq = GATKVariantContextUtils.calculateGQFromPLs(minPLs);
gb.GQ(gq);
gb.DP(block.getMedianDP());
gb.attribute(GATKVCFConstants.MIN_DP_FORMAT_KEY, block.getMinDP());
@@ -257,26 +257,6 @@ public class GVCFWriter implements VariantContextWriter {
return vcb.genotypes(gb.make()).make();
}
-
- private int genotypeQualityFromPLs(final int[] minPLs) {
- int first = minPLs[0];
- int second = minPLs[1];
- if (first > second) {
- second = first;
- first = minPLs[1];
- }
- for (int i = 3; i < minPLs.length; i++) {
- final int candidate = minPLs[i];
- if (candidate >= second) continue;
- if (candidate <= first) {
- second = first;
- first = candidate;
- } else
- second = candidate;
- }
- return second - first;
- }
-
/**
* Helper function to create a new HomRefBlock from a variant context and current genotype
*
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
index fd15a2834..bb8373b20 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
@@ -86,7 +86,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
//TODO the following test is commented out for the record
//tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "119a30fac57a0e5cf1b8164c1059b22c"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "cf5545094ebb264fa8eb879fd848d9ef"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a6bbc30b82e7864baf64163d55f5aee5"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"});
@@ -105,8 +105,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "822856b75c792be81693019bee672c09"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "6ef5ce3fbc943f15c077a0f12ff5bc2e"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "8bb824886fb0e77d0e8317d69f9d1b62"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ca87b62a070801e4954d72169b88fb9c"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"});
@@ -128,10 +128,10 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "2f1534d30b51fd8a7861d73091be2336"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "2307bcb9f9e3468375a389720036b7da"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "450906ce3c11860c25b90cf0a56bb1a0"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3c0346d41a7e57b45b85a920cc04f51f"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"});
- tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a0be095ed902a8acdb80fb56ca4e8fb4"});
+ tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "50e628de2a79cd6887af020b713ca3b8"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"});
return tests.toArray(new Object[][]{});
@@ -146,11 +146,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7dc7cfd463ecb7ac535c6ba925c46ef0"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "689d4b9cdc21be370c82251e1f7a3c4f"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0bc1ca3bff07381a344685b048e76ee4"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9d1724150feccb0a09b6fad522605bb1"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"});
- tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "228e1d2ec2e729a5f79c37f3f2557708"});
- tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "2fc7020457dde4439b4133c098d9ab9b"});
+ tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8a094080fb25bbcd39325dcdd62bcf65"});
+ tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "f35192d245babba9764128abad669019"});
return tests.toArray(new Object[][]{});
}
@@ -293,7 +293,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testNoCallGVCFMissingPLsBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("d55ccf214fd5095e6d586c1547cb1a7a"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9fc3c68f46e747b730615c0be98cb013"));
spec.disableShadowBCF();
executeTest("testNoCallGVCFMissingPLsBugFix", spec);
}
@@ -326,7 +326,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1733d15e960ed473f58a2bfc7f686a2e"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("d986868d83057c0ecdf7ba177b8282f3"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@@ -335,7 +335,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("e6e09a82cade24f8121c81c1d43b5d03"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("87723bd4442c7ec25f65a77d6434957a"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@@ -344,7 +344,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("33db0c7e64fc963c160f8bb59d983375"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("a63d6912b2f2fab7debee9488fbbd0b0"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}
@@ -362,7 +362,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMaxNumPLValues() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1176028faca6cd397f581f9e60c474a8"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("446604d4398d4c1bad41b9506624ab91"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValues", spec);
}
@@ -379,7 +379,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("19f5398e4013c06b52c0085fe0b3469e"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
@@ -404,7 +404,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("19f5398e4013c06b52c0085fe0b3469e"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
@@ -414,4 +414,14 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// Set the log level back
logger.setLevel(level);
}
+
+ //Regression test for https://github.com/broadinstitute/gsa-unstable/issues/1345
+ @Test
+ public void testHaplotypeCallerGVCFBlocks() {
+ final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
+ b37KGReference, privateTestDir + "gvcf_blocks_test.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("802c53621bd2004d9052a8e81d91df3e"));
+ spec.disableShadowBCF();
+ executeTest("testHaplotypeCallerGVCFBlocks", spec);
+ }
}
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
index 407a49570..d67dd6c01 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
@@ -2314,5 +2314,31 @@ public class GATKVariantContextUtils {
builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFrequency.toArray());
}
}
+
+ /**
+ * @param plValues array of PL values
+ * @return the genotype quality corresponding to the PL values
+ */
+ public static int calculateGQFromPLs(final int[] plValues) {
+ if ( plValues == null ) throw new IllegalArgumentException("Array of PL values cannot be null.");
+ if ( plValues.length < 2 ) throw new IllegalArgumentException("Array of PL values must contain at least two elements.");
+
+ int first = plValues[0];
+ int second = plValues[1];
+ if (first > second) {
+ second = first;
+ first = plValues[1];
+ }
+ for (int i = 2; i < plValues.length; i++) {
+ final int candidate = plValues[i];
+ if (candidate >= second) continue;
+ if (candidate <= first) {
+ second = first;
+ first = candidate;
+ } else
+ second = candidate;
+ }
+ return second - first;
+ }
}
diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
index f875d2e8f..20ec7da58 100644
--- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
+++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
@@ -1872,5 +1872,34 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final Map calledAltAlleles = new LinkedHashMap<>();
GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, null);
}
+
+ @DataProvider(name="gqFromPLsData")
+ public Object[][] gqFromPLsData() {
+ return new Object[][]{
+ {new int[]{0, 15}, 15},
+ {new int[]{15, 0}, 15},
+ {new int[]{0, 10, 20}, 10},
+ {new int[]{20, 10, 0}, 10},
+ {new int[]{0, 10, 20, 30, 40}, 10},
+ {new int[]{30, 40, 20, 10, 0}, 10},
+ {new int[]{-10, 20, 35}, 30},
+ {new int[]{35, 40, -10, 15, 20}, 25},
+ {new int[]{0, 10, 20, 30, 40, 50, 5}, 5},
+ {new int[]{15, 15, 0, 5}, 5},
+ {new int[]{15, 15, 0, 25}, 15},
+ {new int[]{0, 15, 0, 25}, 0}
+ };
+ }
+
+ @Test(dataProvider = "gqFromPLsData")
+ public void testCalculateGQFromPLs(final int[] plValues, final int expectedGQ) {
+ Assert.assertEquals(GATKVariantContextUtils.calculateGQFromPLs(plValues), expectedGQ);
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testCalculateGQFromShortPLArray() {
+ final int[] plValues = new int[]{0};
+ GATKVariantContextUtils.calculateGQFromPLs(plValues);
+ }
}
From d6d0678b50a6a4a36a6be5c22ed8c8926c7e4a2d Mon Sep 17 00:00:00 2001
From: Takuto Sato
Date: Wed, 15 Jun 2016 15:32:17 -0400
Subject: [PATCH 19/68] Build on Laura's code and finish porting MuTect1
clustered read position filter.
---
.../walkers/cancer/ClusteredReadPosition.java | 350 ++++++++++++++++++
...MuTectStats.java => MedianStatistics.java} | 71 ++--
.../cancer/m2/ClusteredEventsAnnotator.java | 191 ----------
.../cancer/m2/M2ArgumentCollection.java | 3 +
.../gatk/tools/walkers/cancer/m2/MuTect2.java | 28 +-
.../cancer/m2/SomaticGenotypingEngine.java | 29 +-
.../cancer/m2/MuTect2IntegrationTest.java | 33 +-
.../gatk/utils/sam/ReadUtils.java | 4 +-
.../gatk/utils/variant/GATKVCFConstants.java | 6 +
.../utils/variant/GATKVCFHeaderLines.java | 10 +-
.../broadinstitute/gatk/utils/BaseTest.java | 1 +
11 files changed, 456 insertions(+), 270 deletions(-)
create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java
rename protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/{m2/MuTectStats.java => MedianStatistics.java} (85%)
delete mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java
new file mode 100644
index 000000000..01fb11c69
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java
@@ -0,0 +1,350 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2016 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.cancer;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.apache.commons.math3.stat.descriptive.rank.Median;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.gatk.tools.walkers.cancer.m2.MuTect2;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.collections.Pair;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
+import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.sam.ReadUtils;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Detect clustering of variants near the ends of reads
+ *
+ * <p>
+ * This annotation detects clustering of evidence for a somatic variant near the ends of reads. To turn on the annotation and the accompanying filter (clustered_read_position), add --enable_clustered_read_position_filter flag in the commandline.
+ * </p>
+ *
+ *
+ * <h3>Statistical notes</h3>
+ * <p>
+ * ClusteredReadPosition produces four INFO field annotations. At a given somatic variant site, MEDIAN_LEFT_OFFSET is the median of the number of bases from the left end of the tumor read to the variant. MEDIAN_RIGHT_OFFSET is similar, but counts from the right end of the read. MAD_LEFT_OFFSET and MAD_RIGHT_OFFSET measure the median absolute deviations. The median gives us the offset of a representative read, while the median absolute deviation captures the spread. We filter a variant if MEDIAN_LEFT_OFFSET <= 10 and MAD_LEFT_OFFSET <= 3, or if MEDIAN_RIGHT_OFFSET <= 10 and MAD_RIGHT_OFFSET <= 3.
+ * </p>
+ *
+ *
+ * <h3>Caveat</h3>
+ * <p>
+ * ClusteredReadPosition is available with MuTect2 only
+ * </p>
+ *
+ * <h3>Related Annotation</h3>
+ * <p>
+ * ReadPosRankSum is a similar annotation designed for germline variants.
+ * </p>
+ */
+public class ClusteredReadPosition extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
+ private final static Logger logger = Logger.getLogger(ClusteredReadPosition.class);
+ private String tumorSampleName = null;
+
+ @Override
+ public List<String> getKeyNames() { return Arrays.asList(
+ GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY,
+ GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY,
+ GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY,
+ GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
+ }
+
+ @Override
+ public List<VCFInfoHeaderLine> getDescriptions() {
+ List<VCFInfoHeaderLine> descriptions = new ArrayList<>();
+ for (final String infoFieldKey : getKeyNames()){
+ descriptions.add(GATKVCFHeaderLines.getInfoLine(infoFieldKey));
+ }
+ return descriptions;
+
+ // the following causes a cryptic class not found error, similar to the one in computeReadPositionStats
+ // return getKeyNames().stream().map(GATKVCFHeaderLines::getInfoLine).collect(Collectors.toList());
+ }
+
+ @Override
+ public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final Map<String, AlignmentContext> stratifiedContexts,
+ final VariantContext vc,
+ final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
+ // TODO: might make sense to move this code to SomaticGenotypingEngine.
+ // FIXME: checking walker is mutect2 is not ideal...moving this annotation to SomaticGenotypingEngine will solve it
+
+ // populate tumorSampleName the first time we call this method. skip afterwards.
+ if (tumorSampleName == null){
+ if (walker instanceof MuTect2) {
+ tumorSampleName = ((MuTect2) walker).getTumorSampleName();
+ } else {
+ throw new IllegalStateException("ClusteredReadPosition: walker is not MuTect2");
+ }
+ }
+
+ // we skip multi-allelic sites
+ if (vc.getAlternateAlleles().size() > 1){
+ return null;
+ }
+
+ final Map<String, Object> result = new HashMap<>();
+
+ if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
+ final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName);
+ if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
+ final Optional<MedianStatistics> readPositionStatsOption = computeReadPositionStats(vc, likelihoodMap);
+ if (readPositionStatsOption.isPresent()){
+ MedianStatistics readPositionStats = readPositionStatsOption.get();
+ result.put(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMedian());
+ result.put(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMedian());
+ result.put(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMAD());
+ result.put(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMAD());
+ } else {
+ return null;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ /**
+ *
+ * @param vc
+ * @param pralm
+ * @return median of left and right offsets and their median absolute deviations. does not return null.
+ */
+ private Optional<MedianStatistics> computeReadPositionStats(final VariantContext vc,
+ final PerReadAlleleLikelihoodMap pralm) {
+ final int variantStartPosition = vc.getStart();
+ final List<Integer> tumorLeftOffsets = new ArrayList<>();
+ final List<Integer> tumorRightOffsets = new ArrayList<>();
+ for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihood : pralm.getLikelihoodReadMap().entrySet() ) {
+ final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(readAlleleLikelihood.getValue());
+ final GATKSAMRecord read = readAlleleLikelihood.getKey();
+ if ( mostLikelyAllele.getMostLikelyAllele().isReference() || ! mostLikelyAllele.isInformative() || ! isUsableRead(read)) {
+ continue;
+ }
+
+ final Pair<OptionalInt, OptionalInt> offsetPair = getVariantPositionInRead(read, variantStartPosition);
+ final OptionalInt variantPositionInReadFromLeft = offsetPair.getFirst();
+ final OptionalInt variantPositionInReadFromRight = offsetPair.getSecond();
+
+ // suffices to check only the left offset because the right offset depends on it
+ if ( variantPositionInReadFromLeft.isPresent() ) {
+ tumorLeftOffsets.add(variantPositionInReadFromLeft.getAsInt());
+ tumorRightOffsets.add(variantPositionInReadFromRight.getAsInt());
+ }
+ }
+
+ if (tumorLeftOffsets.isEmpty() || tumorRightOffsets.isEmpty()) {
+ // This condition seems to arise when the reads as aligned in the bam (as represented by PRALM) do not contain the alt read found by HaplotypeCaller
+ logger.warn("At Position " + vc.getContig() + ": " + vc.getStart() + " , the left or right offset list is empty");
+ return Optional.empty();
+ }
+
+ // The following (mapToDouble() in particular) causes ClusteredReadPosition to be not added to ClassMap
+ // leftMedian = median.evaluate(tumorLeftOffsets.stream().mapToDouble( x -> x ).toArray());
+ // rightMedian = median.evaluate(tumorRightOffsets.stream().mapToDouble( x -> x).toArray());
+
+ // until we understand why mapToDouble() causes the above error, have to compute medians in two steps
+ // first use a for loop to manually cast integer to doubles, then call median :: evaluate
+ double[] tumorLeftOffsetsDouble = new double[tumorLeftOffsets.size()];
+ double[] tumorRightOffsetsDouble = new double[tumorRightOffsets.size()];
+ for (int i = 0; i < tumorLeftOffsets.size(); i++){
+ tumorLeftOffsetsDouble[i] = (double) tumorLeftOffsets.get(i);
+ tumorRightOffsetsDouble[i] = (double) tumorRightOffsets.get(i);
+ }
+
+ Median median = new Median();
+ double leftMedian = median.evaluate(tumorLeftOffsetsDouble);
+ double rightMedian = median.evaluate(tumorRightOffsetsDouble);
+ double leftMAD = calculateMAD(tumorLeftOffsets, leftMedian);
+ double rightMAD = calculateMAD(tumorRightOffsets, rightMedian);
+
+ return( Optional.of(new MedianStatistics(leftMedian, rightMedian, leftMAD, rightMAD) ) );
+ }
+
+ private static class MedianStatistics {
+ private double leftMedian;
+ private double rightMedian;
+ private double leftMAD;
+ private double rightMAD;
+
+ public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) {
+ this.leftMedian = leftMedian;
+ this.rightMedian = rightMedian;
+ this.leftMAD = leftMAD;
+ this.rightMAD = rightMAD;
+ }
+
+ public double getLeftMedian() {
+ return leftMedian;
+ }
+
+ public double getRightMedian() {
+ return rightMedian;
+ }
+
+ public double getLeftMAD() {
+ return leftMAD;
+ }
+
+ public double getRightMAD() {
+ return rightMAD;
+ }
+ }
+
+
+ /**
+ Examples below show how we compute the position of the variant with respect to the left and right end of the reads.
+ Note that a variant may be SNP, deletion, or insertion, and we are counting the number of bases from the left/right end of the read to that variant.
+ We first compute the left offset. Then, right offset = read length - left offset.
+ This means that if there is an insertion between the either end of a read and the variant, we count the inserted bases. Conversely, we do not count the deleted bases between the end of a read and a variant.
+ We count soft-clipped bases.
+
+ example 1 : SNP
+
+ right offset: 9 8 7 6 5 4 3 2 1 0
+ ref: _ _ _ _ _ _ _ _ _ _
+ read: _ _ _ _ x _ _ _ _ _
+ left offset: 0 1 2 3 4 5 6 7 8 9
+
+ left-offset = 4. right offset = 5.
+ read.getReadLength() = 10. numReadBasesToVariant = 5.
+
+ example 2: deletion
+
+ We count from the left end of the read to the last non-deleted base i.e. the first deleted base is not counted.
+ From the right end, we count bases to the *end* of the deletion.
+
+ right offset: 9 8 7 6 5 4 3 2 1 0
+ ref: _ _ _ _ _ _ _ _ _ _
+ read: _ _ _ _|- - - -|_ _
+ left offset: 0 1 2 3 4 5 6 7 8 9
+
+ left-offset = 3. right-offset = 2.
+ read.getReadLength() = 6. numReadBasesToVariant = 4
+
+ example 3: insertion
+
+ For insertions, we count from the left to the first inserted base. From the right, we count all the way to the first inserted base.
+ In the future, we may modify this; it might be desirable to count from the right to the *last* inserted base.
+
+ right offset: 9 8 7 6 5 4 3 2 1 0
+ ref: _ _ _ _ _ _ _ _
+ read: _ _ _ I I I _ _ _ _
+ left offset: 0 1 2 3 4 5 6 7 8 9
+
+ left-offset = 3. right offset = 6
+ read.getReadLength() = 10. numReadBasesToVariant = 4.
+
+ */
+
+ /**
+ * The function assumes that read contains the variant allele.
+ *
+ * @param read
+ * @param variantStartPosition the location of the variant in the reference
+ * @return
+ */
+
+ protected Pair<OptionalInt, OptionalInt> getVariantPositionInRead(final GATKSAMRecord read, final int variantStartPosition) {
+ final Pair<Integer, Boolean> refPositionAndDeletionFlag = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), variantStartPosition, true);
+ // the +1 is needed there because getReadCoordinateForReferenceCoordinate() returns the number of read bases from the left end to the variant - 1
+ int numReadBasesFromLeftEndToVariant = refPositionAndDeletionFlag.getFirst() + 1;
+
+ // we don't take advantage of fallsInsideOrJustBeforeDeletionOrSkippedRegion flag now, but we might want to, so I will leave it here in comments.
+ // boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = refPositionAndDeletionFlag.getSecond();
+
+ if ( numReadBasesFromLeftEndToVariant == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ return new Pair<>(OptionalInt.empty(), OptionalInt.empty());
+ } else {
+ int leftOffset = numReadBasesFromLeftEndToVariant - 1;
+ int rightOffset = read.getReadLength() - numReadBasesFromLeftEndToVariant;
+ return new Pair<>(OptionalInt.of(leftOffset), OptionalInt.of(rightOffset));
+ }
+ }
+
+ /**
+ * Can the read be used in comparative tests between ref / alt bases?
+ *
+ * @param read the read to consider
+ * @return false if MQ is either 0 or unavailable. true otherwise.
+ */
+ private boolean isUsableRead(final GATKSAMRecord read) {
+ return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
+ }
+
+ /**
+ *
+ * @param offsets a list of integers
+ * @param median median of the list offsets.
+ * @return median absolute deviation (median of the list of deviations from the median)
+ */
+ private double calculateMAD(final List<Integer> offsets, final double median) {
+ // This code is concise but somehow leads to ClusteredReadPosition class being removed from ClassMap.
+ // mapToDouble() seems to be the trigger
+ // return new Median().evaluate(offsets.stream().mapToDouble(x -> Math.abs(x - median)).toArray());
+
+ double[] medianAbsoluteDeviations = new double[offsets.size()];
+ for (int i = 0; i < offsets.size(); i++){
+ medianAbsoluteDeviations[i] = Math.abs(offsets.get(i) - median);
+ }
+
+ return new Median().evaluate(medianAbsoluteDeviations);
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java
similarity index 85%
rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java
rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java
index bb02db4ad..0a88c28de 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java
@@ -49,60 +49,37 @@
* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
-package org.broadinstitute.gatk.tools.walkers.cancer.m2;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
+package org.broadinstitute.gatk.tools.walkers.cancer;
/**
- * Collection of Statistical methods and tests used by MuTect
+ * Created by tsato on 6/27/16.
*/
-public class MuTectStats {
-
- public static double calculateMAD(ArrayList<Double> xs, double median) {
- ArrayList<Double> deviations = new ArrayList<>(xs.size());
-
- for(double x : xs) {
- deviations.add(Math.abs(x - median));
- }
-
- return getMedian(deviations);
+public class MedianStatistics {
+ private double leftMedian;
+ private double rightMedian;
+ private double leftMAD;
+ private double rightMAD;
+ public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) {
+ this.leftMedian = leftMedian;
+ this.rightMedian = rightMedian;
+ this.leftMAD = leftMAD;
+ this.rightMAD = rightMAD;
}
- public static double getMedian(ArrayList<Double> data) {
- Collections.sort(data);
- Double result;
-
- if (data.size() % 2 == 1) {
- // If the number of entries in the list is not even.
-
- // Get the middle value.
- // You must floor the result of the division to drop the
- // remainder.
- result = data.get((int) Math.floor(data.size()/2) );
-
- } else {
- // If the number of entries in the list are even.
-
- // Get the middle two values and average them.
- Double lowerMiddle = data.get(data.size()/2 );
- Double upperMiddle = data.get(data.size()/2 - 1 );
- result = (lowerMiddle + upperMiddle) / 2;
- }
-
- return result;
+ public double getLeftMedian() {
+ return leftMedian;
}
- public static double[] convertIntegersToDoubles(List<Integer> integers)
- {
- double[] ret = new double[integers.size()];
- for (int i=0; i < ret.length; i++)
- {
- ret[i] = integers.get(i);
- }
- return ret;
+ public double getRightMedian() {
+ return rightMedian;
+ }
+
+ public double getLeftMAD() {
+ return leftMAD;
+ }
+
+ public double getRightMAD() {
+ return rightMAD;
}
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java
deleted file mode 100644
index db47d34e8..000000000
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE
-* SOFTWARE LICENSE AGREEMENT
-* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE").
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. PHONE-HOME FEATURE
-* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
-*
-* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012-2016 Broad Institute, Inc.
-* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 5. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 6. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 7. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 8. MISCELLANEOUS
-* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
-*/
-
-package org.broadinstitute.gatk.tools.walkers.cancer.m2;
-
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.vcf.VCFHeaderLineType;
-import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
-import org.broadinstitute.gatk.utils.QualityUtils;
-import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
-import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
-import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
-import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
-import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
-import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
-import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
-import org.broadinstitute.gatk.utils.sam.ReadUtils;
-
-import java.util.*;
-
-/**
- * Created by gauthier on 7/27/15.
- */
-public class ClusteredEventsAnnotator extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
-
- private String tumorSampleName = null;
-
- @Override
- public List getKeyNames() { return Arrays.asList("tumorForwardOffsetMedian","tumorReverseOffsetMedian","tumorForwardOffsetMAD","tumorReverseOffsetMAD"); }
-
- @Override
- public List getDescriptions() {
- //TODO: this needs a lot of re-phrasing
- return Arrays.asList(new VCFInfoHeaderLine("TUMOR_FWD_POS_MEDIAN", 1, VCFHeaderLineType.Integer, "Median offset of tumor variant position from positive read end"),
- new VCFInfoHeaderLine("TUMOR_FWD_POS_MAD", 1, VCFHeaderLineType.Integer, "Median absolute deviation from the median for tumor forward read positions"),
- new VCFInfoHeaderLine("TUMOR_REV_POS_MEDIAN", 1, VCFHeaderLineType.Integer, "Median offset of tumor variant position from negative read end"),
- new VCFInfoHeaderLine("TUMOR_REV_POS_MAD", 1, VCFHeaderLineType.Integer, "Median absolute deviation from the median for tumor reverse read positions"));
- }
-
- @Override
- public Map annotate(final RefMetaDataTracker tracker,
- final AnnotatorCompatible walker,
- final ReferenceContext ref,
- final Map stratifiedContexts,
- final VariantContext vc,
- final Map stratifiedPerReadAlleleLikelihoodMap) {
-
- if (tumorSampleName == null){
- if (walker instanceof MuTect2 ) {
- tumorSampleName = ((MuTect2) walker).tumorSampleName;
- } else {
- // ts: log error and exit
- throw new IllegalStateException("ClusteredEventsAnnotator: walker is not MuTect2");
- }
- }
-
- final Map map = new HashMap<>();
-
-
- if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
- final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName);
- MuTect2.logReadInfo("HAVCYADXX150109:2:2209:19034:53394", likelihoodMap.getLikelihoodReadMap().keySet(), "Present inside ClusteredEventsAnnotator:annotate");
- if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
- double[] list = fillQualsFromLikelihoodMap(vc.getStart(), likelihoodMap); // [fwdMedian, revMedian, fwdMAD, revMAD]
- final int FWDMEDIAN = 0, REVMEDIAN = 1, FWDMAD = 2, REVMAD = 3; // ts: make a class to contain these values
- map.put("TUMOR_FWD_POS_MEDIAN", list[FWDMEDIAN]);
- map.put("TUMOR_REV_POS_MEDIAN", list[REVMEDIAN]);
- map.put("TUMOR_FWD_POS_MAD", list[FWDMAD]);
- map.put("TUMOR_REV_POS_MAD", list[REVMAD]);
- }
- }
-
- return map;
-
- }
-
- private double[] fillQualsFromLikelihoodMap(final int refLoc,
- final PerReadAlleleLikelihoodMap likelihoodMap) {
- final ArrayList tumorFwdOffset = new ArrayList<>();
- final ArrayList tumorRevOffset = new ArrayList<>();
- for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
- final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
- if ( ! a.isInformative() )
- continue; // read is non-informative
-
- final GATKSAMRecord read = el.getKey();
- if ( isUsableRead(read, refLoc) ) {
- if ( a.getMostLikelyAllele().isReference() )
- continue;
- final Double valueRight = getElementForRead(read, refLoc, ReadUtils.ClippingTail.RIGHT_TAIL);
- if ( valueRight == null )
- continue;
- tumorFwdOffset.add(valueRight);
- final Double valueLeft = getElementForRead(read, refLoc, ReadUtils.ClippingTail.LEFT_TAIL);
- if ( valueLeft == null )
- continue;
- tumorRevOffset.add(valueLeft);
- }
- }
-
- double fwdMedian = 0.0;
- double revMedian = 0.0;
- double fwdMAD = 0.0;
- double revMAD = 0.0;
-
- if (!tumorFwdOffset.isEmpty() && !tumorRevOffset.isEmpty()) {
- fwdMedian = MuTectStats.getMedian(tumorFwdOffset);
- revMedian = MuTectStats.getMedian(tumorRevOffset);
- fwdMAD = MuTectStats.calculateMAD(tumorFwdOffset, fwdMedian);
- revMAD = MuTectStats.calculateMAD(tumorRevOffset, revMedian);
- }
-
- return( new double[] {fwdMedian, revMedian, fwdMAD, revMAD} ); // TODO: make an object container instead of array
- }
-
- protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final ReadUtils.ClippingTail tail) {
- final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refLoc, tail, true);
- if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) // offset is the number of bases in the read, including inserted bases, from start of read to the variant
- return null;
-
- int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0); // readpos is the number of REF bases from start to variant. I would name it as such...
- final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
- if (readPos > numAlignedBases / 2)
- readPos = numAlignedBases - (readPos + 1);
- return (double)readPos;
- }
-
- /**
- * Can the read be used in comparative tests between ref / alt bases?
- *
- * @param read the read to consider
- * @param refLoc the reference location
- * @return true if this read is meaningful for comparison, false otherwise
- */
- protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
- return !( read.getMappingQuality() == 0 ||
- read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE );
- }
-}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java
index 339603567..8ad89a848 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java
@@ -138,6 +138,9 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
@Argument(fullName = "enable_strand_artifact_filter", required = false, doc = "turn on strand artifact filter")
public boolean ENABLE_STRAND_ARTIFACT_FILTER = false;
+ @Argument(fullName = "enable_clustered_read_position_filter", required = false, doc = "turn on clustered read position filter")
+ public boolean ENABLE_CLUSTERED_READ_POSITION_FILTER = false;
+
/**
* This argument is used for the M1-style read position filter
*/
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java
index 08ea5c243..b3c7d950f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java
@@ -205,8 +205,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i
private byte MIN_TAIL_QUALITY;
private double log10GlobalReadMismappingRate;
-
-
@ArgumentCollection
protected M2ArgumentCollection MTAC = new M2ArgumentCollection();
@@ -364,6 +362,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i
private VariantAnnotatorEngine initializeVCFOutput() {
// initialize the output VCF header
+ if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER) {
+ annotationsToUse.add("ClusteredReadPosition");
+ }
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
Set headerInfo = new HashSet<>();
@@ -418,6 +419,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i
headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME));
headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME));
headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME));
+ headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME));
if ( ! doNotRunPhysicalPhasing ) {
@@ -835,10 +837,21 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i
filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME);
}
- // TODO: Add clustered read position filter here
- // TODO: Move strand bias filter here
- return filters;
+ // clustered read position filter
+ if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
+ Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
+ Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY);
+ Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY);
+ Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
+ //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter
+ if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) ||
+ (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD))
+ filters.add(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME);
+ }
+ // TODO: Move strand bias filter here
+
+ return filters;
}
@@ -1313,6 +1326,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
}
+
+ public String getTumorSampleName(){
+ return tumorSampleName;
+ }
+
// KCIBUL: new stuff -- read up on this!!
/**
* As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior.
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java
index 536283bc3..dbacfbe41 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java
@@ -209,7 +209,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
// TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!)
PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
- MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads");
+ MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
// extend to multiple samples
// compute tumor LOD for each alternate allele
@@ -249,12 +249,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
if (hasNormal) {
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
- MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads");
+ MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present in Normal PRALM after filtering for overlapping reads");
GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
Collection cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc);
Collection dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc);
-
// remove the effect of cosmic from dbSNP
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
@@ -320,6 +319,10 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
+ MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
+ MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
+ MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
+
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
@@ -328,6 +331,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
+ if( configuration.DEBUG && logger != null ) {
+ StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = [");
+ StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = [");
+
+ for (Allele altAllele : altAlleleFractions.getAltAlleles()){
+ forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", ");
+ reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", ");
+ }
+
+ forwardMessage.append("]");
+ reverseMessage.append("]");
+
+ logger.info(forwardMessage.toString());
+ logger.info(reverseMessage.toString());
+ }
+
double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
@@ -500,6 +519,10 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
int altCount = alleleCounts.getAlt(altAllele);
double alleleFraction = (double) altCount / (refCount + altCount);
+ // weird case, but I've seen it happen in one strand cases
+ if (refCount == 0 && altCount == refCount ) {
+ alleleFraction = 0;
+ }
alleleFractions.setAlt(altAllele, alleleFraction);
// logger.info("Counted " + refCount + " ref and " + altCount + " alt " );
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java
index 70b401333..0b93b9fc1 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java
@@ -57,8 +57,6 @@ import org.testng.annotations.Test;
import java.util.*;
public class MuTect2IntegrationTest extends WalkerTest {
- final static String REF = hg19Reference;
-
final static String CCLE_MICRO_TUMOR_BAM = privateTestDir + "HCC1143.cghub.ccle.micro.bam";
final static String CCLE_MICRO_NORMAL_BAM = privateTestDir + "HCC1143_BL.cghub.ccle.micro.bam";
final static String CCLE_MICRO_INTERVALS_FILE = privateTestDir + "HCC1143.cghub.ccle.micro.intervals";
@@ -72,8 +70,6 @@ public class MuTect2IntegrationTest extends WalkerTest {
final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals";
final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals";
- final static String MULTIALLELIC_TUMOR_BAM = privateTestDir + "m2-multiallelic-tumor.bam";
-
final String commandLine =
@@ -82,7 +78,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
private void M2Test(String tumorBam, String normalBam, String intervals, String args, String md5) {
final String base = String.format(
commandLine,
- REF, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
+ hg19Reference, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
" -o %s ";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
@@ -97,7 +93,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
private void m2TumorOnlyTest(String tumorBam, String intervals, String args, String md5) {
final String base = String.format(
"-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -L %s",
- REF, DBSNP, COSMIC, PON, tumorBam, intervals) +
+ hg19Reference, DBSNP, COSMIC, PON, tumorBam, intervals) +
" -o %s ";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
@@ -109,7 +105,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
private void M2TestWithDroppedReads(String tumorBam, String normalBam, String intervals, String args, String md5Variants, String md5Bamout) {
final String base = String.format(
commandLine,
- REF, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
+ hg19Reference, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) +
" -o %s " +
"-bamout %s --emitDroppedReads";
@@ -124,7 +120,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
@Test
public void testMicroRegression() {
- M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "", "dc6d742e85a59b237f5541109a6d343e");
+ M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "", "dd3bb9526c85c0aed39545c4639ff138");
}
/**
@@ -134,7 +130,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testTruePositivesDream3() {
- M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_TP_INTERVALS_FILE, "", "7faeb329798cca63a42867404111847c");
+ M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_TP_INTERVALS_FILE, "", "5bd540d238916a2b91e827aed3592e59");
}
/**
@@ -143,7 +139,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
@Test
public void testTruePositivesDream3TrackedDropped() {
M2TestWithDroppedReads(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, "21:10935369", "",
- "a2e6cc12a21219d510b6719ee86c676e",
+ "48a446d47bb10434cb7f0ee726d15721",
"b536e76870326b4be01b8d6b83c1cf1c");
}
@@ -153,7 +149,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testFalsePositivesDream3() {
- M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "fe3adcf8ac45e8ec9a9feb26908f67a9"); // e2413f4166b6ed20be6cdee6616ba43d
+ M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "c23f794866797f9bbcb3ed04451758be"); // e2413f4166b6ed20be6cdee6616ba43d
}
/**
@@ -161,7 +157,7 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testContaminationCorrection() {
- M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "4ffcef4c72ac72b9b8738efdcf3e04e9");
+ M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b");
}
/**
@@ -169,19 +165,18 @@ public class MuTect2IntegrationTest extends WalkerTest {
*/
@Test
public void testTumorOnly(){
- m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "6044780242414820090c5b4b1d4b8ac0");
+ m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0");
}
@Test
public void testStrandArtifactFilter(){
- M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "b988ba4b5f3af4674e28b3501bd3b124");
+ M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "75c9349ff9f8dc84291396ac50871f64");
}
-// @Test
-// public void testMultiAllelicSite(){
-// // TODO need b38 reference
-// m2TumorOnlyTest(MULTIALLELIC_TUMOR_BAM, "1:23558000-23560000", "", "5c7182623391c1faec3f7c05c0506781")
-// }
+ @Test
+ public void testClusteredReadPositionFilter() {
+ M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "c333f7dc11e39e0713147ad9af2bf4db");
+ }
}
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java
index 75617e87d..afa486cac 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java
@@ -501,7 +501,7 @@ public class ReadUtils {
if (refBases + cigarElement.getLength() < goal)
shift = cigarElement.getLength();
else
- shift = goal - refBases;
+ shift = goal - refBases; // get to the goal
refBases += shift;
}
@@ -515,7 +515,7 @@ public class ReadUtils {
final boolean endsWithinCigar = shift < cigarElement.getLength();
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
- // since we checked if the goal coordinate is within the read length, so this is just a sanity check.
+ // since we checked if the goal coordinate is within the read length, this is just a sanity check.
if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
if (allowGoalNotReached) {
return new Pair(CLIPPING_GOAL_NOT_REACHED, false);
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java
index 06c626496..1a8381ade 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java
@@ -138,6 +138,10 @@ public final class GATKVCFConstants {
public static final String TLOD_REV_KEY = "TLOD_REV";
public static final String TUMOR_SB_POWER_FWD_KEY = "TUMOR_SB_POWER_FWD";
public static final String TUMOR_SB_POWER_REV_KEY = "TUMOR_SB_POWER_REV";
+ public static final String MEDIAN_LEFT_OFFSET_KEY = "MEDIAN_LEFT_OFFSET";
+ public static final String MEDIAN_RIGHT_OFFSET_KEY = "MEDIAN_RIGHT_OFFSET";
+ public static final String MAD_MEDIAN_LEFT_OFFSET_KEY = "MAD_LEFT_OFFSET";
+ public static final String MAD_MEDIAN_RIGHT_OFFSET_KEY = "MAD_RIGHT_OFFSET";
//FORMAT keys
@@ -179,6 +183,8 @@ public final class GATKVCFConstants {
public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2
public static final String TRIALLELIC_SITE_FILTER_NAME = "triallelic_site"; //M2
public static final String STRAND_ARTIFACT_FILTER_NAME = "strand_artifact"; // M2
+ public static final String CLUSTERED_READ_POSITION_FILTER_NAME = "clustered_read_position"; // M2
+
// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java
index a64eccf92..0c683cf20 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java
@@ -72,8 +72,8 @@ public class GATKVCFHeaderLines {
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short tandem repeat region"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME, "Site filtered because more than two alt alleles pass tumor LOD"));
- addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, "Strand bias detected: evidence for alt allele comes from one read direction only"));
- // addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME, "Variant appears in similar read positions"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, "Evidence for alt allele comes from one read direction only"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME, "Evidence for somatic variant clusters near the ends of reads"));
@@ -197,7 +197,7 @@ public class GATKVCFHeaderLines {
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site"));
- // M2-related info lines
+ // More M2-related info lines
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, 1, VCFHeaderLineType.Integer, "Maximum distance between events in this active region"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region"));
@@ -209,6 +209,10 @@ public class GATKVCFHeaderLines {
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TLOD_REV_KEY,1,VCFHeaderLineType.Float,"TLOD from reverse reads only"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY,1,VCFHeaderLineType.Float,"Strand bias power for forward reads"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY,1,VCFHeaderLineType.Float,"Stand bias power for reverse reads"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median of the number of bases between the left end of the tumor read and the variant"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median of the number of bases between the variant and the right end of the tumor read"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median absolute deviation of medians of the number of bases between the left end of the tumor read and the variant"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median absolute deviation of medians of the number of bases between the variant and the right end of the tumor read"));
}
}
diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java
index 4d11a43cd..a9ffd21e7 100644
--- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java
+++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java
@@ -97,6 +97,7 @@ public abstract class BaseTest {
public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta";
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
+ public static final String b38Reference = "/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta";
public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta";
public static final String hg19ReferenceWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta";
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
From aace73e884ffb98477b92e5ad2cac1c0c657214e Mon Sep 17 00:00:00 2001
From: Ron Levine
Date: Fri, 8 Jul 2016 12:46:50 -0400
Subject: [PATCH 20/68] Enable control of reporting periodicity
---
.../arguments/GATKArgumentCollection.java | 7 +++++
.../gatk/engine/executive/MicroScheduler.java | 3 +-
.../utils/progressmeter/ProgressMeter.java | 13 ++-------
.../progressmeter/ProgressMeterDaemon.java | 29 ++++++-------------
4 files changed, 21 insertions(+), 31 deletions(-)
diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
index 737a46ba1..6f527b121 100644
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
@@ -52,6 +52,9 @@ public class GATKArgumentCollection {
// the default value of the stop of the expanded window
public static final int DEFAULT_REFERENCE_WINDOW_STOP = 0;
+ // the default time in seconds between progress meter calls
+ public final static long DEFAULT_SECONDS_BETWEEN_PROGRESS_UPDATES = 10;
+
/** the constructor */
public GATKArgumentCollection() {
}
@@ -354,6 +357,10 @@ public class GATKArgumentCollection {
@Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false)
public double globalQScorePrior = -1.0;
+ @Advanced
+ @Argument(fullName="secondsBetweenProgressUpdates", shortName = "secondsBetweenProgressUpdates", doc = "Time interval for process meter information output (in seconds)", required=false)
+ public long secondsBetweenProgressUpdates = DEFAULT_SECONDS_BETWEEN_PROGRESS_UPDATES;
+
// --------------------------------------------------------------------------------------------------------------
//
diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java
index e40edb760..5da39245e 100644
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java
@@ -204,7 +204,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
// Create the progress meter, and register it with the analysis engine
engine.registerProgressMeter(new ProgressMeter(progressLogFile,
availableTraversalEngines.peek().getTraversalUnits(),
- engine.getRegionsOfGenomeBeingProcessed()));
+ engine.getRegionsOfGenomeBeingProcessed(),
+ engine.getArguments().secondsBetweenProgressUpdates));
// Now that we have a progress meter, go through and initialize the traversal engines
for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines )
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java
index b68a5b9e0..b2fdbb8bc 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java
@@ -170,22 +170,15 @@ public class ProgressMeter {
/**
* Create a new ProgressMeter
*
- * Note that progress meter isn't started until the client calls start()
- *
* @param performanceLogFile an optional performance log file where a table of performance logs will be written
* @param processingUnitName the name of the unit type being processed, suitable for saying X seconds per processingUnitName
* @param processingIntervals the intervals being processed
+ * @param secondsBetweenProgressUpdates how frequently (in seconds) to print progress
*/
public ProgressMeter(final File performanceLogFile,
- final String processingUnitName,
- final GenomeLocSortedSet processingIntervals) {
- this(performanceLogFile, processingUnitName, processingIntervals, ProgressMeterDaemon.DEFAULT_POLL_FREQUENCY_MILLISECONDS);
- }
-
- protected ProgressMeter(final File performanceLogFile,
final String processingUnitName,
final GenomeLocSortedSet processingIntervals,
- final long pollingFrequency) {
+ final long secondsBetweenProgressUpdates) {
if ( processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null");
if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null");
@@ -212,7 +205,7 @@ public class ProgressMeter {
targetSizeInBP = processingIntervals.coveredSize();
// start up the timer
- progressMeterDaemon = new ProgressMeterDaemon(this, pollingFrequency);
+ progressMeterDaemon = new ProgressMeterDaemon(this, secondsBetweenProgressUpdates);
}
public ProgressMeterDaemon getProgressMeterDaemon() {
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java
index eb18cb16c..6eb4d0954 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java
@@ -25,6 +25,8 @@
package org.broadinstitute.gatk.utils.progressmeter;
+import java.util.concurrent.TimeUnit;
+
/**
* Daemon thread that periodically prints the progress of the progress meter
*
@@ -33,20 +35,10 @@ package org.broadinstitute.gatk.utils.progressmeter;
* Time: 9:16 PM
*/
public final class ProgressMeterDaemon extends Thread {
- public final static long DEFAULT_POLL_FREQUENCY_MILLISECONDS = 10 * 1000;
-
/**
* How frequently should we poll and print progress?
*/
- private final long pollFrequencyMilliseconds;
-
- /**
- * How long are we waiting between print progress calls are issued?
- * @return the time in milliseconds between progress meter calls
- */
- private long getPollFrequencyMilliseconds() {
- return pollFrequencyMilliseconds;
- }
+ private final long secondsBetweenProgressUpdates;
/**
* Are we to continue periodically printing status, or should we shut down?
@@ -60,22 +52,19 @@ public final class ProgressMeterDaemon extends Thread {
/**
* Create a new ProgressMeterDaemon printing progress for meter
- * @param meter the progress meter to print progress of
+ * @param meter the progress meter to print progress
+ * @param secondsBetweenProgressUpdates how frequently (in seconds) to print progress
*/
- public ProgressMeterDaemon(final ProgressMeter meter, final long pollFrequencyMilliseconds) {
+ public ProgressMeterDaemon(final ProgressMeter meter, final long secondsBetweenProgressUpdates) {
if ( meter == null ) throw new IllegalArgumentException("meter cannot be null");
- if ( pollFrequencyMilliseconds <= 0 ) throw new IllegalArgumentException("pollFrequencyMilliseconds must be greater than 0 but got " + pollFrequencyMilliseconds);
+ if ( secondsBetweenProgressUpdates <= 0 ) throw new IllegalArgumentException("secondsBetweenProgressUpdates must be greater than 0 but got " + secondsBetweenProgressUpdates);
this.meter = meter;
- this.pollFrequencyMilliseconds = pollFrequencyMilliseconds;
+ this.secondsBetweenProgressUpdates = secondsBetweenProgressUpdates;
setDaemon(true);
setName("ProgressMeterDaemon");
}
- public ProgressMeterDaemon(final ProgressMeter meter) {
- this(meter, DEFAULT_POLL_FREQUENCY_MILLISECONDS);
- }
-
/**
* Tells this daemon thread to shutdown at the next opportunity, as the progress
* metering is complete.
@@ -102,7 +91,7 @@ public final class ProgressMeterDaemon extends Thread {
meter.printProgress(false);
meter.updateElapsedTimeInNanoseconds();
try {
- Thread.sleep(getPollFrequencyMilliseconds());
+ Thread.sleep(TimeUnit.SECONDS.toMillis(secondsBetweenProgressUpdates));
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
From 641382eb8bbe1af0208021263829fe23ba626c62 Mon Sep 17 00:00:00 2001
From: Laura Gauthier
Date: Wed, 1 Jun 2016 12:29:14 -0400
Subject: [PATCH 21/68] Fix BetaTestingAnnotation group Add test
---
.../HaplotypeCallerGVCFIntegrationTest.java | 9 +++++++++
.../annotator/interfaces/BetaTestingAnnotation.java | 2 +-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
index bb8373b20..bfd444274 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
@@ -340,6 +340,15 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
executeTest(" testASMQMateRankSumAnnotation", spec);
}
+ @Test
+ public void testBetaTestingAnnotationGroup() {
+ final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering",
+ HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("df746da577c1d8a340f93b9d5df4df80"));
+ spec.disableShadowBCF();
+ executeTest("testBetaTestingAnnotationGroup", spec);
+ }
+
@Test
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java
index 2a7887021..fa7bc5fde 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java
@@ -28,5 +28,5 @@ package org.broadinstitute.gatk.tools.walkers.annotator.interfaces;
/**
* Annotations implementing this interface are not guaranteed to persist between GATK versions
*/
-public interface BetaTestingAnnotation {
+public interface BetaTestingAnnotation extends AnnotationType {
}
From 4f2e3128050db69150290e5982c78dbaa27aff18 Mon Sep 17 00:00:00 2001
From: Ron Levine
Date: Thu, 7 Jul 2016 10:24:20 -0400
Subject: [PATCH 22/68] Throw an exception for invalid Picard intervals
---
.../gatk/utils/interval/IntervalUtils.java | 13 +++++++------
.../gatk/utils/GenomeLocParserUnitTest.java | 17 +++++++++--------
.../utils/interval/IntervalUtilsUnitTest.java | 15 ++-------------
3 files changed, 18 insertions(+), 27 deletions(-)
diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java
index b2ff1708e..9f6e352bb 100644
--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java
@@ -144,16 +144,17 @@ public class IntervalUtils {
IntervalList il = IntervalList.fromFile(inputFile);
isPicardInterval = true;
- int nInvalidIntervals = 0;
for (Interval interval : il.getIntervals()) {
- if ( glParser.isValidGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true))
- ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true));
+ if (interval.getStart() - interval.getEnd() == 1 ) { // remove once a corrected version of the exome interval list is released.
+ logger.warn("Possible incorrectly converted length 1 interval : " + interval);
+ }
+ else if ( glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
+ ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
+ }
else {
- nInvalidIntervals++;
+ throw new UserException(inputFile.toString() + " has an invalid genome location : " + interval) ;
}
}
- if ( nInvalidIntervals > 0 )
- logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile);
}
// if that didn't work, try parsing file as a GATK interval file
diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java
index 5432e236f..3f58e2d77 100644
--- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java
+++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java
@@ -51,6 +51,7 @@ import java.util.*;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertFalse;
/**
* @author aaron
@@ -316,14 +317,14 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testValidationOfGenomeLocs() {
assertTrue(genomeLocParser.isValidGenomeLoc("chr1",1,1));
- assertTrue(!genomeLocParser.isValidGenomeLoc("chr2",1,1)); // shouldn't have an entry
- assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig
- assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start
- assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
- assertTrue( genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop
- assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
- assertTrue( genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end
- assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start
+ assertFalse(genomeLocParser.isValidGenomeLoc("chr2",1,1)); // shouldn't have an entry
+ assertFalse(genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig
+ assertFalse(genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start
+ assertFalse(genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
+ assertTrue(genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop
+ assertFalse(genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
+ assertTrue(genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end
+ assertFalse(genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start
}
@Test(expectedExceptions = ReviewedGATKException.class)
diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java
index 07b49d480..b0db3800c 100644
--- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java
+++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java
@@ -1046,23 +1046,12 @@ public class IntervalUtilsUnitTest extends BaseTest {
};
}
- /*
- * This test is disabled because its assumption that we will not throw an error
- * upon parsing invalid Picard intervals is no longer true, as htsjdk has added
- * extra protection against invalid intervals to IntervalList.add().
- *
- * We should reconsider our decision in IntervalUtils.intervalFileToList() to
- * silently ignore invalid intervals when parsing Picard interval files, as it's
- * inconsistent with the way we handle invalid intervals for GATK interval files
- * (throw a UserException, covered by testInvalidGATKFileIntervalHandling() below),
- * and update this test accordingly.
- */
- @Test(dataProvider="invalidIntervalTestData", enabled = false)
+ @Test(dataProvider="invalidIntervalTestData", expectedExceptions=UserException.class, enabled = true)
public void testInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser,
String contig, int intervalStart, int intervalEnd ) throws Exception {
SAMFileHeader picardFileHeader = new SAMFileHeader();
- picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1"));
+ picardFileHeader.addSequence(genomeLocParser.getContigInfo(contig));
IntervalList picardIntervals = new IntervalList(picardFileHeader);
picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname"));
From 7392c4d1b068113f87139ad05ffe5cc4ddab7045 Mon Sep 17 00:00:00 2001
From: Ron Levine
Date: Thu, 23 Jun 2016 17:14:06 -0400
Subject: [PATCH 23/68] Removed spanning deletions if the deletion was removed
---
.../walkers/genotyper/GenotypingEngine.java | 84 +++++++++++++++----
.../GenotypeGVCFsIntegrationTest.java | 9 ++
2 files changed, 76 insertions(+), 17 deletions(-)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
index 83c7ed533..cdcb2c195 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
@@ -104,8 +104,7 @@ public abstract class GenotypingEngine upstreamDeletionsLoc = new LinkedList<>();
/**
* Construct a new genotyper engine, on a specific subset of samples.
@@ -233,7 +232,7 @@ public abstract class GenotypingEngine alleles = afcr.getAllelesUsedInGenotyping();
final int alternativeAlleleCount = alleles.size() - 1;
@@ -355,23 +354,74 @@ public abstract class GenotypingEngine 0) {
+ final GenomeLoc genomeLoc = genomeLocParser.createGenomeLocOnContig(vc.getContig(), vc.getStart(), vc.getStart() + deletionSize);
+ upstreamDeletionsLoc.add(genomeLoc);
+ }
+ }
+
+ /**
+ * Is the variant context covered by an upstream deletion?
+ *
+ * @param vc variant context
+ * @return true if the location is covered by an upstream deletion, false otherwise
+ */
+ private boolean coveredByDeletion(final VariantContext vc) {
+ for (Iterator<GenomeLoc> it = upstreamDeletionsLoc.iterator(); it.hasNext(); ) {
+ final GenomeLoc loc = it.next();
+ if (!loc.getContig().equals(vc.getContig())) { // past contig deletion.
+ it.remove();
+ } else if (loc.getStop() < vc.getStart()) { // past position in current contig deletion.
+ it.remove();
+ } else if (loc.getStart() == vc.getStart()) {
+ // ignore this deletion, the symbolic one does not make reference to it.
+ } else { // deletion covers.
+ return true;
+ }
+ }
+
+ return false;
+ }
+
/**
* Checks whether even if the allele is not well supported by the data, we should keep it for genotyping.
*
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java
index 56d506ce3..04153ed2d 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java
@@ -661,4 +661,13 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
spec.disableShadowBCF();
executeTest("testGenotypingSpanningDeletionWithAllSites", spec);
}
+
+ @Test
+ public void testGenotypingSpanningDeletionAcrossLines() {
+ final WalkerTestSpec spec = new WalkerTestSpec(
+ baseTestString(" -V " + privateTestDir + "input-1_2256566.vcf", b37KGReference),
+ Collections.singletonList("1f914189326cdd17d0a8753f13cb221f"));
+ spec.disableShadowBCF();
+ executeTest("testGenotypingSpanningDeletionAcrossLines", spec);
+ }
}
\ No newline at end of file
From 3daed9e5a19097ac57cfe46b25ea4601b09eb1a3 Mon Sep 17 00:00:00 2001
From: Samuel Lee
Date: Tue, 19 Jul 2016 14:44:49 -0400
Subject: [PATCH 24/68] Added exception for GQB values greater than
MAX_GENOTYPE_QUAL and tests.
---
.../haplotypecaller/HaplotypeCaller.java | 8 +--
.../gatk/utils/gvcf/GVCFWriter.java | 34 ++++++-----
.../HaplotypeCallerGVCFIntegrationTest.java | 58 +++++++++++++------
.../GenotypeGVCFsIntegrationTest.java | 2 +-
.../gatk/utils/gvcf/GVCFWriterUnitTest.java | 30 ++++++----
5 files changed, 83 insertions(+), 49 deletions(-)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
index c7cc4cecc..b284b56a6 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
@@ -367,10 +367,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
*
* This argument allows you to set the GQ boundaries. HC expects a list of multiple GQ threshold values. To pass
* multiple values, you provide them one by one with the argument, as in `-GQB 10 -GQB 20 -GQB 30` and so on. Note
- * that GQ values are capped at 99 in the GATK.
+ * that GQ values are capped at 99 in the GATK, so values must be integers in [1, 99].
*/
@Advanced
- @Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands", required = false)
+ @Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands (must be in [1, 99] and specified in increasing order)", required = false)
protected List<Integer> GVCFGQBands = new ArrayList<Integer>(70) {{
for (int i=1; i<=60; ++i) add(i);
add(70); add(80); add(90); add(99);
@@ -754,8 +754,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
try {
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands, HCAC.genotypeArgs.samplePloidy);
- } catch ( IllegalArgumentException e ) {
- throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
+ } catch ( final IllegalArgumentException e ) {
+ throw new UserException.BadArgumentValue("GVCFGQBands", e.getMessage());
}
}
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
index f0e4ca9eb..23d6dbb2d 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
@@ -74,13 +74,15 @@ import java.util.List;
*/
public class GVCFWriter implements VariantContextWriter {
+ private static final int MAX_GENOTYPE_QUAL = VCFConstants.MAX_GENOTYPE_QUAL;
+
//
// Final fields initialized in constructor
//
/** Where we'll ultimately write our VCF records */
- final private VariantContextWriter underlyingWriter;
+ private final VariantContextWriter underlyingWriter;
- final private List<HomRefBlock> GQPartitions;
+ private final List<HomRefBlock> GQPartitions;
/** fields updated on the fly during GVCFWriter operation */
int nextAvailableStart = -1;
@@ -90,26 +92,28 @@ public class GVCFWriter implements VariantContextWriter {
private final int defaultPloidy;
/**
- * Is the proposed GQ partitions well-formed?
+ * Are the proposed GQ partitions well-formed?
*
* @param GQPartitions proposed GQ partitions
* @return a non-null string if something is wrong (string explains issue)
*/
protected static List<HomRefBlock> parsePartitions(final List<Integer> GQPartitions, final int defaultPloidy) {
- if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
- if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");
+ if ( GQPartitions == null ) throw new IllegalArgumentException("The list of GQ partitions cannot be null.");
+ if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("The list of GQ partitions cannot be empty.");
final List<HomRefBlock> result = new LinkedList<>();
int lastThreshold = 0;
for ( final Integer value : GQPartitions ) {
- if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
- if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value);
- if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
- result.add(new HomRefBlock(lastThreshold, value,defaultPloidy));
+ if ( value == null || value <= 0 ) throw new IllegalArgumentException("The list of GQ partitions contains a null or non-positive integer.");
+ if ( value < lastThreshold ) throw new IllegalArgumentException(String.format("The list of GQ partitions is out of order. Previous value is %d but the next is %d.", lastThreshold, value));
+ if ( value == lastThreshold ) throw new IllegalArgumentException(String.format("The value %d appears more than once in the list of GQ partitions.", value));
+ if ( value > MAX_GENOTYPE_QUAL ) throw new IllegalArgumentException(String.format("The value %d in the list of GQ partitions is greater than VCFConstants.MAX_GENOTYPE_QUAL = %d.", value, VCFConstants.MAX_GENOTYPE_QUAL));
+ result.add(new HomRefBlock(lastThreshold, value, defaultPloidy));
lastThreshold = value;
}
- result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE,defaultPloidy));
-
+ if (lastThreshold <= MAX_GENOTYPE_QUAL ) {
+ result.add(new HomRefBlock(lastThreshold, MAX_GENOTYPE_QUAL + 1, defaultPloidy));
+ }
return result;
}
@@ -209,10 +213,14 @@ public class GVCFWriter implements VariantContextWriter {
}
private boolean genotypeCanBeMergedInCurrentBlock(final Genotype g) {
- return currentBlock != null && currentBlock.withinBounds(g.getGQ()) && currentBlock.getPloidy() == g.getPloidy()
+ return currentBlock != null && currentBlock.withinBounds(capToMaxGQ(g.getGQ())) && currentBlock.getPloidy() == g.getPloidy()
&& (currentBlock.getMinPLs() == null || !g.hasPL() || (currentBlock.getMinPLs().length == g.getPL().length));
}
+ private int capToMaxGQ(final int gq) {
+ return Math.min(gq, MAX_GENOTYPE_QUAL);
+ }
+
/**
* Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null
*/
@@ -268,7 +276,7 @@ public class GVCFWriter implements VariantContextWriter {
// figure out the GQ limits to use based on the GQ of g
HomRefBlock partition = null;
for ( final HomRefBlock maybePartition : GQPartitions ) {
- if ( maybePartition.withinBounds(g.getGQ()) ) {
+ if ( maybePartition.withinBounds(capToMaxGQ(g.getGQ())) ) {
partition = maybePartition;
break;
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
index bb8373b20..80139438a 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
@@ -51,6 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
+import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
@@ -67,6 +68,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.stream.Collectors;
public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@@ -87,10 +89,10 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
//tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "cf5545094ebb264fa8eb879fd848d9ef"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a6bbc30b82e7864baf64163d55f5aee5"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0086cc735cf792a9f236ec057c73b750"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"});
- tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "63fa5841a21e2c13f1e1a8e2d4ea3380"});
+ tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "861fa31b135d200f765914126b422cf4"});
return tests.toArray(new Object[][]{});
}
@@ -106,11 +108,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "8bb824886fb0e77d0e8317d69f9d1b62"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ca87b62a070801e4954d72169b88fb9c"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "1f19c2b2b528dff502bc1a47701edde7"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"});
- final String NA12878bandedResolutionMD5 = "8d4a51af32cd13ba4b3e33dd00c58398";
+ final String NA12878bandedResolutionMD5 = "7240907ec3dc2ed49b55c9956546ba13";
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5});
tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878",
ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5});
@@ -129,10 +131,10 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "450906ce3c11860c25b90cf0a56bb1a0"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3c0346d41a7e57b45b85a920cc04f51f"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "49f41972e19f6897659e497d32730dde"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "50e628de2a79cd6887af020b713ca3b8"});
- tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"});
+ tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "e48bbcf453e63a6ea5eeda05f6865f94"});
return tests.toArray(new Object[][]{});
}
@@ -147,15 +149,14 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0bc1ca3bff07381a344685b048e76ee4"});
- tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9d1724150feccb0a09b6fad522605bb1"});
+ tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3ff7e3cd9f6b1949d19f52fab53bdb5e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8a094080fb25bbcd39325dcdd62bcf65"});
- tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "f35192d245babba9764128abad669019"});
+ tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "685025831ac783784d7838e568e35f46"});
return tests.toArray(new Object[][]{});
}
-
/**
* Test HaplotypeCaller, using MyDataProvider
*/
@@ -276,7 +277,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testWrongGVCFNonVariantRecordOrderBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("6facd3d2cf9f52877182d627cef1c872"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("f70b7052dfeb065ee8c7d796f1a1f84a"));
spec.disableShadowBCF();
executeTest("testMissingGVCFIndexingStrategyException", spec);
}
@@ -293,7 +294,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testNoCallGVCFMissingPLsBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9fc3c68f46e747b730615c0be98cb013"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("66f242cf3f1f1776c743505b84505f94"));
spec.disableShadowBCF();
executeTest("testNoCallGVCFMissingPLsBugFix", spec);
}
@@ -326,7 +327,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("d986868d83057c0ecdf7ba177b8282f3"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5877ccbc99bbaffbcd5fe3aaa3d7e7f7"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@@ -335,7 +336,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("87723bd4442c7ec25f65a77d6434957a"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("0381fec3b0d21508b28fa62c2a61ccfc"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@@ -344,7 +345,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("a63d6912b2f2fab7debee9488fbbd0b0"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("4599a591427c188c117f09ac40cc866f"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}
@@ -353,7 +354,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMultiAllelicNonRef() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample",
b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("182aa78f42235d2b4dabb87cc6c8a433"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("19fc2c5218d907fcdcd36de2afbef19c"));
spec.disableShadowBCF();
executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec);
}
@@ -362,7 +363,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMaxNumPLValues() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("446604d4398d4c1bad41b9506624ab91"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("b2adc744d9dff2f488149bcc96d6bb6d"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValues", spec);
}
@@ -379,7 +380,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbd37b492f77c50d2da744d5e00c6f90"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
@@ -404,7 +405,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbd37b492f77c50d2da744d5e00c6f90"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
@@ -420,8 +421,27 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerGVCFBlocks() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
b37KGReference, privateTestDir + "gvcf_blocks_test.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
- final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("802c53621bd2004d9052a8e81d91df3e"));
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("0cdf4d6d0a45def15fb11ea30c78e470"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerGVCFBlocks", spec);
}
+
+ @DataProvider(name = "dataBadGQBValues")
+ public Object[][] dataBadGQBValues() {
+ return new Object[][]{
+ {Arrays.asList(-1, 10, 20)},
+ {Arrays.asList(10, 20, 1)},
+ {Arrays.asList(10, 10, 20)},
+ {Arrays.asList(10, 20, VCFConstants.MAX_GENOTYPE_QUAL + 1)}
+ };
+ }
+ @Test(dataProvider = "dataBadGQBValues")
+ public void testBadGQBValues(final List<Integer> inputGQBValues) {
+ final String inputGQBValuesString = inputGQBValues.stream().map(gqb -> "-GQB " + gqb).collect(Collectors.joining(" "));
+ final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
+ b37KGReference, privateTestDir + "gvcf_blocks_test.bam", inputGQBValuesString, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
+ final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.BadArgumentValue.class);
+ spec.disableShadowBCF();
+ executeTest("testBadGQBValues", spec);
+ }
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java
index 56d506ce3..ae77125d5 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java
@@ -282,7 +282,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference),
1,
- Collections.singletonList("34d76dc8dabc6a97e6d8f5365d7531e5"));
+ Collections.singletonList("5d8fff160ec6eedb8e02c9207e256073"));
spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC
executeTest("testStrandAlleleCountsBySample", spec);
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java
index eb421ba3f..7067fd89f 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java
@@ -69,6 +69,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.stream.Collectors;
public class GVCFWriterUnitTest extends BaseTest {
private static class MockWriter implements VariantContextWriter {
@@ -378,25 +379,30 @@ public class GVCFWriterUnitTest extends BaseTest {
public Object[][] makeBandPartitionData() {
List