diff --git a/build.xml b/build.xml
index ca5d22a5a..f681ddafa 100644
--- a/build.xml
+++ b/build.xml
@@ -76,7 +76,8 @@
-
+
+
@@ -92,12 +93,10 @@
-
+
-
-
@@ -208,19 +207,19 @@
-
-
-
+
+
+
-
+
-
+
-
+
-
+
@@ -430,9 +429,9 @@
-
-
-
+
+
+
@@ -680,9 +679,9 @@
-
-
-
+
+
+
diff --git a/ivy.xml b/ivy.xml
index b197d0714..0761cb411 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -52,7 +52,7 @@
-
+
@@ -87,7 +87,7 @@
-
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java
index 9eca81852..d714ca185 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java
@@ -25,10 +25,14 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* OTHER DEALINGS IN THE SOFTWARE.
*/
+import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.recalibration.EventType;
+import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
+import org.broadinstitute.sting.utils.recalibration.RecalDatum;
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
index cea596a7d..6134101d9 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
@@ -5,7 +5,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
-import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
+import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
index f91e535b0..26ff4db24 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
@@ -1,6 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires;
+import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@@ -9,6 +13,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.LinkedHashMap;
/**
* Created by IntelliJ IDEA.
@@ -30,24 +35,26 @@ public class ErrorModel {
private static final boolean compressRange = false;
private static final double log10MinusE = Math.log10(Math.exp(1.0));
-
+ private static final boolean DEBUG = false;
/**
* Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
*
- * @param minQualityScore Minimum site quality score to evaluate
- * @param maxQualityScore Maximum site quality score to evaluate
- * @param phredScaledPrior Prior for site quality
+ * @param UAC Argument Collection
* @param refSamplePileup Reference sample pileup
* @param refSampleVC VC with True alleles in reference sample pileup
- * @param minPower Minimum power
*/
- public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
- ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
- this.maxQualityScore = maxQualityScore;
- this.minQualityScore = minQualityScore;
- this.phredScaledPrior = phredScaledPrior;
- log10minPower = Math.log10(minPower);
+ public ErrorModel (final UnifiedArgumentCollection UAC,
+ final ReadBackedPileup refSamplePileup,
+ VariantContext refSampleVC, final ReferenceContext refContext) {
+ this.maxQualityScore = UAC.maxQualityScore;
+ this.minQualityScore = UAC.minQualityScore;
+ this.phredScaledPrior = UAC.phredScaledPrior;
+ log10minPower = Math.log10(UAC.minPower);
+ PairHMMIndelErrorModel pairModel = null;
+ LinkedHashMap haplotypeMap = null;
+ HashMap> indelLikelihoodMap = null;
+ double[][] perReadLikelihoods = null;
double[] model = new double[maxQualityScore+1];
Arrays.fill(model,Double.NEGATIVE_INFINITY);
@@ -61,11 +68,17 @@ public class ErrorModel {
break;
}
}
-
-
+ haplotypeMap = new LinkedHashMap();
+ if (refSampleVC.isIndel()) {
+ pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
+ UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
+ indelLikelihoodMap = new HashMap>();
+ IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
+ }
}
+
+ double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
- double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
// maximum uncertainty if there's no ref data at site
model[q] = p;
@@ -75,23 +88,48 @@ public class ErrorModel {
else {
hasData = true;
int matches = 0;
- int coverage = refSamplePileup.getNumberOfElements();
+ int coverage = 0;
Allele refAllele = refSampleVC.getReference();
+ if (refSampleVC.isIndel()) {
+ final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
+ //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
+ final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
+ if (!haplotypeMap.isEmpty())
+ perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
+ }
+ int idx = 0;
for (PileupElement refPileupElement : refSamplePileup) {
+ if (DEBUG)
+ System.out.println(refPileupElement.toString());
boolean isMatch = false;
- for (Allele allele : refSampleVC.getAlleles())
- isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);
+ for (Allele allele : refSampleVC.getAlleles()) {
+ boolean m = pileupElementMatches(refPileupElement, allele, refAllele, refContext.getBase());
+ if (DEBUG) System.out.println(m);
+ isMatch |= m;
+ }
+ if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) {
+ // ignore match/mismatch if reads, as determined by their likelihood, are not informative
+ double[] perAlleleLikelihoods = perReadLikelihoods[idx++];
+ if (!isInformativeElement(perAlleleLikelihoods))
+ matches++;
+ else
+ matches += (isMatch?1:0);
- matches += (isMatch?1:0);
- // System.out.format("MATCH:%b\n",isMatch);
+ } else {
+ matches += (isMatch?1:0);
+ }
+ coverage++;
}
int mismatches = coverage - matches;
//System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches);
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
- model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
+ if (coverage==0)
+ model[q] = p;
+ else
+ model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
}
this.refDepth = coverage;
}
@@ -101,6 +139,17 @@ public class ErrorModel {
}
+ @Requires("likelihoods.length>0")
+ private boolean isInformativeElement(double[] likelihoods) {
+ // if likelihoods are the same, they're not informative
+ final double thresh = 0.1;
+ int maxIdx = MathUtils.maxElementIndex(likelihoods);
+ int minIdx = MathUtils.minElementIndex(likelihoods);
+ if (likelihoods[maxIdx]-likelihoods[minIdx]< thresh)
+ return false;
+ else
+ return true;
+ }
/**
* Simple constructor that just takes a given log-probability vector as error model.
* Only intended for unit testing, not general usage.
@@ -115,23 +164,27 @@ public class ErrorModel {
}
- public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
- /* System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
+ public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele, byte refBase) {
+ if (DEBUG)
+ System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString());
- */
+ //pileupElement.
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
if (allele.isReference()) {
// for a ref allele, any base mismatch or new indel is a mismatch.
- if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case
- return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]);
+ if(allele.getBases().length>0)
+ // todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location
+ // could clean up/simplify this when unpadding is removed
+ return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
else
// either null allele to compare, or ref/alt lengths are different (indel by definition).
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
}
+ // for non-ref alleles to compare:
if (refAllele.getBases().length == allele.getBases().length)
// alleles have the same length (eg snp or mnp)
return pileupElement.getBase() == allele.getBases()[0];
@@ -209,18 +262,19 @@ public class ErrorModel {
}
public String toString() {
- String result = "(";
+ StringBuilder result = new StringBuilder("(");
boolean skipComma = true;
for (double v : probabilityVector.getProbabilityVector()) {
if (skipComma) {
skipComma = false;
}
else {
- result += ",";
+ result.append(",");
}
- result += String.format("%.4f", v);
+ result.append(String.format("%.4f", v));
}
- return result + ")";
+ result.append(")");
+ return result.toString();
}
public static int getTotalReferenceDepth(HashMap perLaneErrorModels) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java
similarity index 95%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java
index b8be24cad..78ab11eb1 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java
@@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
-public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
+public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
@@ -42,7 +42,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
- protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
+ protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
ploidy = UAC.samplePloidy;
this.UAC = UAC;
@@ -140,7 +140,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
- final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL);
+ final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL);
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
for (int k=1; k < acCount.length;k++) {
if (acCount[k] > 0)
@@ -238,7 +238,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
return newPool;
}
- // todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods
+ // todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods
/**
*
* @param set ExactACset holding conformation to be computed
@@ -301,7 +301,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
continue;
- PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
+ GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
}
@@ -341,14 +341,14 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
// Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// and L2(K) = Pr(D|AC2=K) * choose(m2,K)
- PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
+ GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
final double[] x = originalPool.getLikelihoodsAsVector(true);
while(firstIterator.hasNext()) {
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
firstIterator.next();
}
- PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
+ GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
final double[] y = yy.clone();
while(secondIterator.hasNext()) {
y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
@@ -357,7 +357,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
// initialize output to -log10(choose(m1+m2,[k1 k2...])
final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
- final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
+ final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
// Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
@@ -419,7 +419,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
// for current conformation, get all possible ways to break vector K into two components G1 and G2
- final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
+ final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
while (innerIterator.hasNext()) {
// check if breaking current conformation into g1 and g2 is feasible.
@@ -617,7 +617,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( numOriginalAltAlleles == numNewAltAlleles) {
newLikelihoods = originalLikelihoods;
} else {
- newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse);
+ newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
@@ -668,7 +668,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
// find the genotype with maximum likelihoods
final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
- final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
+ final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
final ArrayList alleleFreqs = new ArrayList();
final ArrayList alleleCounts = new ArrayList();
@@ -681,8 +681,8 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
}
// per-pool logging of AC and AF
- gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
- gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
+ gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
+ gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
// remove PLs if necessary
if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
similarity index 97%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
index 438acbacd..6b0831323 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
@@ -37,7 +37,7 @@ import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import java.util.*;
-public abstract class PoolGenotypeLikelihoods {
+public abstract class GeneralPloidyGenotypeLikelihoods {
protected final int numChromosomes;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
@@ -67,8 +67,8 @@ public abstract class PoolGenotypeLikelihoods {
private static final boolean FAST_GL_COMPUTATION = true;
// constructor with given logPL elements
- public PoolGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy,
- final HashMap perLaneErrorModels, final boolean ignoreLaneInformation) {
+ public GeneralPloidyGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy,
+ final HashMap perLaneErrorModels, final boolean ignoreLaneInformation) {
this.alleles = alleles;
this.nAlleles = alleles.size();
numChromosomes = ploidy;
@@ -101,7 +101,7 @@ public abstract class PoolGenotypeLikelihoods {
Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD);
} else {
if (logLikelihoods.length != likelihoodDim)
- throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object");
+ throw new ReviewedStingException("BUG: inconsistent parameters when creating GeneralPloidyGenotypeLikelihoods object");
log10Likelihoods = logLikelihoods; //.clone(); // is clone needed?
}
@@ -174,7 +174,7 @@ public abstract class PoolGenotypeLikelihoods {
final int numAlleles = currentState.length;
final int ploidy = restrictSumTo;
- linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
+ linearIndex = GeneralPloidyGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
}
else
throw new ReviewedStingException("BUG: Not supported");
@@ -308,7 +308,7 @@ public abstract class PoolGenotypeLikelihoods {
public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes,
final List originalAlleles, final List allelesToSubset) {
- int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
+ int newPLSize = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
double[] newPLs = new double[newPLSize];
@@ -357,7 +357,7 @@ public abstract class PoolGenotypeLikelihoods {
newCount[idx] = pVec[permutationKey[idx]];
// get corresponding index from new count
- int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
+ int outputIdx = GeneralPloidyGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
newPLs[outputIdx] = pl;
if (VERBOSE) {
System.out.println("Old Key:"+Arrays.toString(pVec));
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
similarity index 93%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
index 37b676601..f6ce818be 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
@@ -39,7 +39,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
-public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
+public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
//protected Set laneIDs;
public enum Model {
@@ -52,7 +52,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
final protected UnifiedArgumentCollection UAC;
- protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
+ protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC,logger);
this.UAC = UAC;
@@ -90,7 +90,6 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(),
referenceSampleVC.getAlleles())
- .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel())
.genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make())
.make();
}
@@ -138,11 +137,11 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
protected static class PoolGenotypeData {
public final String name;
- public final PoolGenotypeLikelihoods GL;
+ public final GeneralPloidyGenotypeLikelihoods GL;
public final int depth;
public final List alleles;
- public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List alleles) {
+ public PoolGenotypeData(final String name, final GeneralPloidyGenotypeLikelihoods GL, final int depth, final List alleles) {
this.name = name;
this.GL = GL;
this.depth = depth;
@@ -237,7 +236,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
// create the GenotypeLikelihoods object
- final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
+ final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
// actually compute likelihoods
final int nGoodBases = GL.add(pileup, UAC);
if ( nGoodBases > 0 )
@@ -247,7 +246,8 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
// find the alternate allele(s) that we should be using
final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
-
+ if (alleles == null || alleles.isEmpty())
+ return null;
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final int endLoc = getEndLocation(tracker, ref, alleles);
@@ -268,7 +268,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
for ( PoolGenotypeData sampleData : GLs ) {
// extract from multidimensional array
- final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes,
+ final double[] myLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(), sampleData.GL.numChromosomes,
allAlleles, alleles);
// normalize in log space so that max element is zero.
@@ -313,7 +313,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
refLanePileup = refPileup.getPileupForLane(laneID);
//ReferenceSample referenceSample = new ReferenceSample(UAC.referenceSampleName, refLanePileup, trueReferenceAlleles);
- perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower));
+ perLaneErrorModels.put(laneID, new ErrorModel(UAC, refLanePileup, refVC, ref));
}
return perLaneErrorModels;
@@ -327,7 +327,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
Abstract methods - must be implemented in derived classes
*/
- protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
+ protected abstract GeneralPloidyGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
final double[] logLikelihoods,
final int ploidy,
final HashMap perLaneErrorModels,
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
similarity index 89%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoods.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
index dae7bd43d..34267b9a8 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
@@ -18,26 +18,30 @@ import java.util.*;
* Time: 10:06 AM
* To change this template use File | Settings | File Templates.
*/
-public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
+public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods {
final PairHMMIndelErrorModel pairModel;
final LinkedHashMap haplotypeMap;
final ReferenceContext refContext;
final int eventLength;
double[][] readHaplotypeLikelihoods;
- public PoolIndelGenotypeLikelihoods(final List alleles,
- final double[] logLikelihoods,
- final int ploidy,
- final HashMap perLaneErrorModels,
- final boolean ignoreLaneInformation,
- final PairHMMIndelErrorModel pairModel,
- final LinkedHashMap haplotypeMap,
- final ReferenceContext referenceContext) {
+ final byte refBase;
+
+ public GeneralPloidyIndelGenotypeLikelihoods(final List alleles,
+ final double[] logLikelihoods,
+ final int ploidy,
+ final HashMap perLaneErrorModels,
+ final boolean ignoreLaneInformation,
+ final PairHMMIndelErrorModel pairModel,
+ final LinkedHashMap haplotypeMap,
+ final ReferenceContext referenceContext) {
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
this.pairModel = pairModel;
this.haplotypeMap = haplotypeMap;
this.refContext = referenceContext;
this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
+ // todo - not needed if indel alleles have base at current position
+ this.refBase = referenceContext.getBase();
}
// -------------------------------------------------------------------------------------
@@ -138,10 +142,8 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
List numSeenBases = new ArrayList(this.alleles.size());
if (!hasReferenceSampleData) {
- final int numHaplotypes = haplotypeMap.size();
-
final int readCounts[] = new int[pileup.getNumberOfElements()];
- readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
+ readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
n = readHaplotypeLikelihoods.length;
} else {
Allele refAllele = null;
@@ -161,7 +163,7 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
int idx =0;
for (Allele allele : alleles) {
int cnt = numSeenBases.get(idx);
- numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
+ numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele, refBase)?1:0));
}
n++;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
similarity index 81%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
index c87ed04a0..f6559f666 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
@@ -32,17 +32,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.collections.Pair;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
-public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
+public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends GeneralPloidyGenotypeLikelihoodsCalculationModel {
private static final int MAX_NUM_ALLELES_TO_GENOTYPE = 4;
private PairHMMIndelErrorModel pairModel;
- private boolean allelesArePadded = false;
/*
private static ThreadLocal>> indelLikelihoodMap =
new ThreadLocal>>() {
@@ -60,7 +57,7 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi
}
*/
- protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
+ protected GeneralPloidyIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
super(UAC, logger);
@@ -70,20 +67,14 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi
}
- public static HashMap> getIndelLikelihoodMap() {
- return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
- }
-
-
-
- protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
+ protected GeneralPloidyGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
final double[] logLikelihoods,
final int ploidy,
final HashMap perLaneErrorModels,
final boolean useBQAedPileup,
final ReferenceContext ref,
final boolean ignoreLaneInformation){
- return new PoolIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
+ return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
}
protected List getInitialAllelesToUse(final RefMetaDataTracker tracker,
@@ -94,12 +85,10 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi
final List allAllelesToUse){
- final Pair,Boolean> pair = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
- List alleles = pair.first;
+ List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
- allelesArePadded = pair.second;
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
haplotypeMap.clear();
@@ -131,6 +120,6 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi
protected int getEndLocation(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final List allelesToUse) {
- return IndelGenotypeLikelihoodsCalculationModel.computeEndLocation(allelesToUse, ref.getLocus(), allelesArePadded);
+ return ref.getLocus().getStart() + allelesToUse.get(0).length() - 1;
}
}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
similarity index 94%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
index 1e445270f..944372907 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
@@ -5,6 +5,7 @@ import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@@ -22,7 +23,7 @@ import static java.lang.Math.pow;
* and posteriors given a pile of bases and quality scores
*
*/
-public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ {
+public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods/* implements Cloneable*/ {
final List myAlleles;
final int[] alleleIndices;
@@ -41,14 +42,19 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem
* @param useBQAedPileup Use BAQed pileup
* @param ignoreLaneInformation If true, lane info is ignored
*/
- public PoolSNPGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy,
- final HashMap perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) {
+ public GeneralPloidySNPGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy,
+ final HashMap perLaneErrorModels, final boolean useBQAedPileup, final boolean ignoreLaneInformation) {
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
this.useBAQedPileup = useBQAedPileup;
myAlleles = new ArrayList(alleles);
- refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!
+ Allele refAllele = alleles.get(0);
+ //sanity check: by construction, first allele should ALWAYS be the reference alleles
+ if (!refAllele.isReference())
+ throw new ReviewedStingException("BUG: First allele in list passed to GeneralPloidySNPGenotypeLikelihoods should be reference!");
+
+ refByte = refAllele.getBases()[0]; // by construction, first allele in list is always ref!
if (myAlleles.size() < BaseUtils.BASES.length) {
// likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
similarity index 91%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
index 61f505445..30d614455 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
@@ -35,22 +35,22 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
-public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
+public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends GeneralPloidyGenotypeLikelihoodsCalculationModel {
- protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) {
+ protected GeneralPloidySNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
}
- protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
+ protected GeneralPloidyGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
final double[] logLikelihoods,
final int ploidy,
final HashMap perLaneErrorModels,
final boolean useBQAedPileup,
final ReferenceContext ref,
final boolean ignoreLaneInformation) {
- return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
+ return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
}
protected List getInitialAllelesToUse(final RefMetaDataTracker tracker,
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index e2445e926..c56cf5bf2 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -33,7 +33,6 @@ import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
@@ -57,8 +56,12 @@ public class GenotypingEngine {
// This function is the streamlined approach, currently not being used
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
- public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, final byte[] ref, final GenomeLoc refLoc,
- final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) {
+ public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
+ final ArrayList haplotypes,
+ final byte[] ref,
+ final GenomeLoc refLoc,
+ final GenomeLoc activeRegionWindow,
+ final GenomeLocParser genomeLocParser ) {
// Prepare the list of haplotype indices to genotype
final ArrayList allelesToGenotype = new ArrayList();
@@ -184,8 +187,13 @@ public class GenotypingEngine {
}
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
- public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, final byte[] ref, final GenomeLoc refLoc,
- final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, final ArrayList activeAllelesToGenotype ) {
+ public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
+ final ArrayList haplotypes,
+ final byte[] ref,
+ final GenomeLoc refLoc,
+ final GenomeLoc activeRegionWindow,
+ final GenomeLocParser genomeLocParser,
+ final ArrayList activeAllelesToGenotype ) {
final ArrayList>>> returnCalls = new ArrayList>>>();
@@ -220,7 +228,6 @@ public class GenotypingEngine {
}
}
-
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
@@ -317,7 +324,7 @@ public class GenotypingEngine {
}
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
- final int MAX_SIZE_TO_COMBINE = 10;
+ final int MAX_SIZE_TO_COMBINE = 15;
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
if( startPosKeySet.size() <= 1 ) { return; }
@@ -334,10 +341,10 @@ public class GenotypingEngine {
boolean isBiallelic = true;
VariantContext thisVC = null;
VariantContext nextVC = null;
- int x11 = 0;
- int x12 = 0;
- int x21 = 0;
- int x22 = 0;
+ double x11 = Double.NEGATIVE_INFINITY;
+ double x12 = Double.NEGATIVE_INFINITY;
+ double x21 = Double.NEGATIVE_INFINITY;
+ double x22 = Double.NEGATIVE_INFINITY;
for( final Haplotype h : haplotypes ) {
// only make complex substitutions out of consecutive biallelic sites
@@ -360,21 +367,24 @@ public class GenotypingEngine {
}
}
// count up the co-occurrences of the events for the R^2 calculation
- // BUGBUG: use haplotype likelihoods per-sample to make this more accurate
- if( thisHapVC == null ) {
- if( nextHapVC == null ) { x11++; }
- else { x12++; }
- } else {
- if( nextHapVC == null ) { x21++; }
- else { x22++; }
+ final ArrayList haplotypeList = new ArrayList();
+ haplotypeList.add(h);
+ for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
+ final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0];
+ if( thisHapVC == null ) {
+ if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
+ else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
+ } else {
+ if( nextHapVC == null ) { x21 = MathUtils.approximateLog10SumLog10(x21, haplotypeLikelihood); }
+ else { x22 = MathUtils.approximateLog10SumLog10(x22, haplotypeLikelihood); }
+ }
}
}
if( thisVC == null || nextVC == null ) {
continue;
- //throw new ReviewedStingException("StartPos TreeSet has an entry for an event that is found on no haplotype. start pos = " + thisStart + ", next pos = " + nextStart);
}
if( isBiallelic ) {
- final double R2 = calculateR2LD( x11, x12, x21, x22 );
+ final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) );
if( DEBUG ) {
System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
System.out.println("-- " + thisVC);
@@ -419,24 +429,21 @@ public class GenotypingEngine {
protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) {
final int thisStart = thisVC.getStart();
final int nextStart = nextVC.getStart();
- byte[] refBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} );
- byte[] altBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} );
+ byte[] refBases = new byte[]{};
+ byte[] altBases = new byte[]{};
refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases());
altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases());
- for( int locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
+ int locus;
+ for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
final byte refByte = ref[locus - refLoc.getStart()];
refBases = ArrayUtils.add(refBases, refByte);
altBases = ArrayUtils.add(altBases, refByte);
}
- if( nextVC.hasReferenceBaseForIndel() ) {
- refBases = ArrayUtils.add(refBases, nextVC.getReferenceBaseForIndel());
- altBases = ArrayUtils.add(altBases, nextVC.getReferenceBaseForIndel());
- }
- refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases());
+ refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel
altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
int iii = 0;
- if( refBases.length == altBases.length && VCFAlleleClipper.needsPadding(thisVC) ) { // special case of insertion + deletion of same length creates an MNP --> trim padding bases off the allele
+ if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele
while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; }
}
final ArrayList mergedAlleles = new ArrayList();
@@ -445,12 +452,11 @@ public class GenotypingEngine {
return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make();
}
- @Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"})
- protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) {
- final int total = x11 + x12 + x21 + x22;
- final double pa1b1 = ((double) x11) / ((double) total);
- final double pa1b2 = ((double) x12) / ((double) total);
- final double pa2b1 = ((double) x21) / ((double) total);
+ protected static double calculateR2LD( final double x11, final double x12, final double x21, final double x22 ) {
+ final double total = x11 + x12 + x21 + x22;
+ final double pa1b1 = x11 / total;
+ final double pa1b2 = x12 / total;
+ final double pa2b1 = x21 / total;
final double pa1 = pa1b1 + pa1b2;
final double pb1 = pa1b1 + pa2b1;
return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) );
@@ -530,35 +536,37 @@ public class GenotypingEngine {
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
- final byte[] insertionBases = Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength );
- boolean allN = true;
- for( final byte b : insertionBases ) {
- if( b != (byte) 'N' ) {
- allN = false;
- break;
- }
+ {
+ final ArrayList insertionAlleles = new ArrayList();
+ final int insertionStart = refLoc.getStart() + refPos - 1;
+ final byte refByte = ref[refPos-1];
+ if( BaseUtils.isRegularBase(refByte) ) {
+ insertionAlleles.add( Allele.create(refByte, true) );
}
- if( !allN ) {
- final ArrayList insertionAlleles = new ArrayList();
- final int insertionStart = refLoc.getStart() + refPos - 1;
- if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
- insertionAlleles.add( Allele.create(ref[refPos-1], true) );
- insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
- vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
- } else {
- insertionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, true) );
+ if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
+ insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
+ } else {
+ byte[] insertionBases = new byte[]{};
+ insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base
+ insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ));
+ if( BaseUtils.isAllRegularBases(insertionBases) ) {
insertionAlleles.add( Allele.create(insertionBases, false) );
- vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
}
-
+ }
+ if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
+ vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
}
alignmentPos += elementLength;
break;
+ }
case S:
+ {
alignmentPos += elementLength;
break;
+ }
case D:
- final byte[] deletionBases = Arrays.copyOfRange( ref, refPos, refPos + elementLength );
+ {
+ final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
final ArrayList deletionAlleles = new ArrayList();
final int deletionStart = refLoc.getStart() + refPos - 1;
// BUGBUG: how often does this symbolic deletion allele case happen?
@@ -568,13 +576,20 @@ public class GenotypingEngine {
// deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
// vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make());
//} else {
+ final byte refByte = ref[refPos-1];
+ if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
deletionAlleles.add( Allele.create(deletionBases, true) );
- deletionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, false) );
- vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
+ deletionAlleles.add( Allele.create(refByte, false) );
+ vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
+ }
//}
refPos += elementLength;
break;
+ }
case M:
+ case EQ:
+ case X:
+ {
int numSinceMismatch = -1;
int stopOfMismatch = -1;
int startOfMismatch = -1;
@@ -597,11 +612,13 @@ public class GenotypingEngine {
if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
- final ArrayList snpAlleles = new ArrayList();
- snpAlleles.add( Allele.create( refBases, true ) );
- snpAlleles.add( Allele.create( mismatchBases, false ) );
- final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
- vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
+ if( BaseUtils.isAllRegularBases(refBases) && BaseUtils.isAllRegularBases(mismatchBases) ) {
+ final ArrayList snpAlleles = new ArrayList();
+ snpAlleles.add( Allele.create( refBases, true ) );
+ snpAlleles.add( Allele.create( mismatchBases, false ) );
+ final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
+ vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
+ }
numSinceMismatch = -1;
stopOfMismatch = -1;
startOfMismatch = -1;
@@ -611,6 +628,7 @@ public class GenotypingEngine {
alignmentPos++;
}
break;
+ }
case N:
case H:
case P:
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 8ab36050b..4f434bba6 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile;
+import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
+import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@@ -56,6 +58,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
@@ -103,7 +106,7 @@ import java.util.*;
@DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.LOCUS)
-@ActiveRegionExtension(extension=65, maxRegion=275)
+@ActiveRegionExtension(extension=65, maxRegion=300)
public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible {
/**
@@ -126,9 +129,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1;
+ @Advanced
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
+ @Advanced
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
@@ -139,9 +144,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false)
protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000;
- @Argument(fullName="useExpandedTriggerSet", shortName="expandedTriggers", doc = "If specified, use additional, experimental triggers designed to capture larger indels but which may lead to an increase in the false positive rate", required=false)
- protected boolean USE_EXPANDED_TRIGGER_SET = false;
-
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
protected boolean USE_ALLELES_TRIGGER = false;
@@ -188,7 +190,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
protected String[] annotationClassesToUse = { "Standard" };
@ArgumentCollection
- private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
+ private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
@@ -239,12 +241,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
samplesList.addAll( samples );
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
- UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument
+ final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
- UAC.STANDARD_CONFIDENCE_FOR_CALLING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
- UAC.STANDARD_CONFIDENCE_FOR_EMITTING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
+ UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
+ UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
@@ -308,8 +310,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
public boolean wantsNonPrimaryReads() { return true; }
@Override
- @Ensures({"result >= 0.0", "result <= 1.0"})
- public double isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
+ @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"})
+ public ActivityProfileResult isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
@@ -318,56 +320,52 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
}
}
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
- return 1.0;
+ return new ActivityProfileResult(1.0);
}
}
if( USE_ALLELES_TRIGGER ) {
- return ( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
+ return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
}
- if( context == null ) { return 0.0; }
+ if( context == null ) { return new ActivityProfileResult(0.0); }
final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
- for( final String sample : splitContexts.keySet() ) {
+ final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
+ for( final Map.Entry sample : splitContexts.entrySet() ) {
final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
Arrays.fill(genotypeLikelihoods, 0.0);
- for( final PileupElement p : splitContexts.get(sample).getBasePileup() ) {
- final byte qual = ( USE_EXPANDED_TRIGGER_SET ?
- ( p.isNextToSoftClip() || p.isBeforeInsertion() || p.isAfterInsertion() ? ( p.getQual() > QualityUtils.MIN_USABLE_Q_SCORE ? p.getQual() : (byte) 20 ) : p.getQual() )
- : p.getQual() );
- if( p.isDeletion() || qual > (USE_EXPANDED_TRIGGER_SET ? QualityUtils.MIN_USABLE_Q_SCORE : (byte) 18) ) {
+ for( final PileupElement p : sample.getValue().getBasePileup() ) {
+ final byte qual = p.getQual();
+ if( p.isDeletion() || qual > (byte) 18) {
int AA = 0; final int AB = 1; int BB = 2;
- if( USE_EXPANDED_TRIGGER_SET ) {
- if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ||
- (!p.getRead().getNGSPlatform().equals(NGSPlatform.SOLID) && ((p.getRead().getReadPairedFlag() && p.getRead().getMateUnmappedFlag()) || BadMateFilter.hasBadMate(p.getRead()))) ) {
- AA = 2;
- BB = 0;
- }
- } else {
- if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
- AA = 2;
- BB = 0;
+ if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
+ AA = 2;
+ BB = 0;
+ if( p.isNextToSoftClip() ) {
+ averageHQSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
}
}
- genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
- genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
- genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
+ genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
+ genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
+ genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
}
}
- genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
+ genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final ArrayList alleles = new ArrayList();
alleles.add( FAKE_REF_ALLELE );
alleles.add( FAKE_ALT_ALLELE );
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
- return ( vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ) );
+ final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
+
+ return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
}
//---------------------------------------------------------------------------------------------------------------
@@ -416,45 +414,48 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
for( final Pair>> callResult :
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
- ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() )
+ ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() )
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
final Map>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
-
- // add some custom annotations to the calls
final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes());
- // Calculate the number of variants on the haplotype
- int maxNumVar = 0;
- for( final Allele allele : callResult.getFirst().getAlleles() ) {
- if( !allele.isReference() ) {
- for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
- final int numVar = haplotype.getEventMap().size();
- if( numVar > maxNumVar ) { maxNumVar = numVar; }
+
+ if( !GENOTYPE_FULL_ACTIVE_REGION ) {
+ // add some custom annotations to the calls
+
+ // Calculate the number of variants on the haplotype
+ int maxNumVar = 0;
+ for( final Allele allele : callResult.getFirst().getAlleles() ) {
+ if( !allele.isReference() ) {
+ for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
+ final int numVar = haplotype.getEventMap().size();
+ if( numVar > maxNumVar ) { maxNumVar = numVar; }
+ }
}
}
- }
- // Calculate the event length
- int maxLength = 0;
- for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
- final int length = a.length() - annotatedCall.getReference().length();
- if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
- }
+ // Calculate the event length
+ int maxLength = 0;
+ for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
+ final int length = a.length() - annotatedCall.getReference().length();
+ if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
+ }
- myAttributes.put("NVH", maxNumVar);
- myAttributes.put("NumHapEval", bestHaplotypes.size());
- myAttributes.put("NumHapAssembly", haplotypes.size());
- myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
- myAttributes.put("EVENTLENGTH", maxLength);
- myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
- myAttributes.put("extType", annotatedCall.getType().toString() );
+ myAttributes.put("NVH", maxNumVar);
+ myAttributes.put("NumHapEval", bestHaplotypes.size());
+ myAttributes.put("NumHapAssembly", haplotypes.size());
+ myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
+ myAttributes.put("EVENTLENGTH", maxLength);
+ myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
+ myAttributes.put("extType", annotatedCall.getType().toString() );
- //if( likelihoodCalculationEngine.haplotypeScore != null ) {
- // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
- //}
- if( annotatedCall.hasAttribute("QD") ) {
- myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
+ //if( likelihoodCalculationEngine.haplotypeScore != null ) {
+ // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
+ //}
+ if( annotatedCall.hasAttribute("QD") ) {
+ myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
+ }
}
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
@@ -493,7 +494,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
//---------------------------------------------------------------------------------------------------------------
private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
- if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getExtendedLoc() + " with " + activeRegion.size() + " reads:"); }
+ if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
final ArrayList finalizedReadList = new ArrayList();
final FragmentCollection fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) );
activeRegion.clearReads();
@@ -522,7 +523,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
final ArrayList readsToRemove = new ArrayList();
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
- if( rec.getReadLength() < 24 || rec.getMappingQuality() <= 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
+ if( rec.getReadLength() < 24 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
readsToRemove.add(rec);
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
index 45deb9b2a..0ef1a13a4 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
@@ -82,11 +82,11 @@ public class KBestPaths {
}
}
- protected static class PathComparatorLowestEdge implements Comparator {
- public int compare(final Path path1, final Path path2) {
- return path2.lowestEdge - path1.lowestEdge;
- }
- }
+ //protected static class PathComparatorLowestEdge implements Comparator {
+ // public int compare(final Path path1, final Path path2) {
+ // return path2.lowestEdge - path1.lowestEdge;
+ // }
+ //}
public static List getKBestPaths( final DefaultDirectedGraph graph, final int k ) {
if( k > MAX_PATHS_TO_HOLD/2 ) { throw new ReviewedStingException("Asked for more paths than MAX_PATHS_TO_HOLD!"); }
@@ -99,7 +99,7 @@ public class KBestPaths {
}
}
- Collections.sort(bestPaths, new PathComparatorLowestEdge() );
+ Collections.sort(bestPaths, new PathComparatorTotalScore() );
Collections.reverse(bestPaths);
return bestPaths.subList(0, Math.min(k, bestPaths.size()));
}
@@ -114,8 +114,8 @@ public class KBestPaths {
if ( allOutgoingEdgesHaveBeenVisited(graph, path) ) {
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
// clean out some low scoring paths
- Collections.sort(bestPaths, new PathComparatorLowestEdge() );
- for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); }
+ Collections.sort(bestPaths, new PathComparatorTotalScore() );
+ for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20
}
bestPaths.add(path);
} else if( n.val > 10000) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
index 535508d09..b5ce4b4bc 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@@ -30,6 +30,7 @@ import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@@ -50,18 +51,17 @@ public class LikelihoodCalculationEngine {
}
public void computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) {
- final int numHaplotypes = haplotypes.size();
int X_METRIC_LENGTH = 0;
- for( final String sample : perSampleReadList.keySet() ) {
- for( final GATKSAMRecord read : perSampleReadList.get(sample) ) {
+ for( final Map.Entry> sample : perSampleReadList.entrySet() ) {
+ for( final GATKSAMRecord read : sample.getValue() ) {
final int readLength = read.getReadLength();
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
}
}
int Y_METRIC_LENGTH = 0;
- for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
- final int haplotypeLength = haplotypes.get(jjj).getBases().length;
+ for( final Haplotype h : haplotypes ) {
+ final int haplotypeLength = h.getBases().length;
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
}
@@ -90,8 +90,10 @@ public class LikelihoodCalculationEngine {
final int numHaplotypes = haplotypes.size();
final int numReads = reads.size();
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
+ final int[][] readCounts = new int[numHaplotypes][numReads];
for( int iii = 0; iii < numReads; iii++ ) {
final GATKSAMRecord read = reads.get(iii);
+ final int readCount = ReadUtils.getMeanRepresentativeReadCount(read);
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
@@ -103,7 +105,7 @@ public class LikelihoodCalculationEngine {
readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
- readQuals[kkk] = ( readQuals[kkk] < (byte) 17 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
+ readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
@@ -114,10 +116,11 @@ public class LikelihoodCalculationEngine {
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP,
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
+ readCounts[jjj][iii] = readCount;
}
}
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
- haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] );
+ haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] );
}
}
@@ -142,10 +145,20 @@ public class LikelihoodCalculationEngine {
}
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
}
-
+
+ // This function takes just a single sample and a haplotypeMapping
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) {
+ final TreeSet sampleSet = new TreeSet();
+ sampleSet.add(sample);
+ return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping);
+ }
+
+ // This function takes a set of samples to pool over and a haplotypeMapping
+ @Requires({"haplotypeMapping.size() > 0"})
+ @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
+ public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final ArrayList> haplotypeMapping ) {
final int numHaplotypes = haplotypeMapping.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
@@ -154,17 +167,21 @@ public class LikelihoodCalculationEngine {
}
// compute the diploid haplotype likelihoods
+ // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
- final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
- final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
double haplotypeLikelihood = 0.0;
- for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
- // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
- // First term is approximated by Jacobian log with table lookup.
- haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
+ for( final String sample : samples ) {
+ final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
+ final int[] readCounts_iii = iii_mapped.getReadCounts(sample);
+ final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
+ for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
+ // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
+ // First term is approximated by Jacobian log with table lookup.
+ haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
+ }
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
}
@@ -176,48 +193,6 @@ public class LikelihoodCalculationEngine {
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
}
- @Requires({"haplotypes.size() > 0"})
- @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
- public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final Set samples ) {
- // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
- final ArrayList> haplotypeMapping = new ArrayList>();
- for( final Haplotype h : haplotypes ) {
- final ArrayList list = new ArrayList();
- list.add(h);
- haplotypeMapping.add(list);
- }
-
- final int numHaplotypes = haplotypeMapping.size();
- final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
- for( int iii = 0; iii < numHaplotypes; iii++ ) {
- Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
- }
-
- // compute the diploid haplotype likelihoods
- for( int iii = 0; iii < numHaplotypes; iii++ ) {
- for( int jjj = 0; jjj <= iii; jjj++ ) {
- for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
- for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
- double haplotypeLikelihood = 0.0;
- for( final String sample : samples ) {
- final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
- final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
- for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
- // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
- // First term is approximated by Jacobian log with table lookup.
- haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
- }
- }
- haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
- }
- }
- }
- }
-
- // normalize the diploid likelihoods matrix
- return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
- }
-
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
@@ -306,12 +281,19 @@ public class LikelihoodCalculationEngine {
final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList bestHaplotypesIndexList = new ArrayList();
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
- final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( haplotypes, sampleKeySet ); // all samples pooled together
+ // set up the default 1-to-1 haplotype mapping object
+ final ArrayList> haplotypeMapping = new ArrayList>();
+ for( final Haplotype h : haplotypes ) {
+ final ArrayList list = new ArrayList();
+ list.add(h);
+ haplotypeMapping.add(list);
+ }
+ final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together
int hap1 = 0;
int hap2 = 0;
//double bestElement = Double.NEGATIVE_INFINITY;
- final int maxChosenHaplotypes = Math.min( 8, sampleKeySet.size() * 2 + 1 );
+ final int maxChosenHaplotypes = Math.min( 13, sampleKeySet.size() * 2 + 1 );
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
double maxElement = Double.NEGATIVE_INFINITY;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
@@ -343,9 +325,9 @@ public class LikelihoodCalculationEngine {
public static Map>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) {
final Map>> returnMap = new HashMap>>();
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
- for( final String sample : perSampleReadList.keySet() ) {
+ for( final Map.Entry> sample : perSampleReadList.entrySet() ) {
final Map> alleleReadMap = new HashMap>();
- final ArrayList readsForThisSample = perSampleReadList.get(sample);
+ final ArrayList readsForThisSample = sample.getValue();
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
@@ -355,7 +337,7 @@ public class LikelihoodCalculationEngine {
for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
double maxLikelihood = Double.NEGATIVE_INFINITY;
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
- final double likelihood = h.getReadLikelihoods(sample)[iii];
+ final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
if( likelihood > maxLikelihood ) {
maxLikelihood = likelihood;
}
@@ -390,13 +372,13 @@ public class LikelihoodCalculationEngine {
readList = new ArrayList();
alleleReadMap.put(Allele.NO_CALL, readList);
}
- for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample) ) {
+ for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
readList.add(read);
}
}
- returnMap.put(sample, alleleReadMap);
+ returnMap.put(sample.getKey(), alleleReadMap);
}
return returnMap;
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
index e2bc7a10f..56cb6c3d4 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
@@ -4,6 +4,7 @@ import com.google.java.contract.Ensures;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -68,7 +69,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
}
- private void createDeBruijnGraphs( final ArrayList reads, final Haplotype refHaplotype ) {
+ protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) {
graphs.clear();
// create the graph
@@ -161,7 +162,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
}
}
- private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final ArrayList reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
+ private static boolean createGraphFromSequences( final DefaultDirectedGraph graph, final Collection reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
final byte[] refSequence = refHaplotype.getBases();
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
@@ -183,6 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities();
+ final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not readuced
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
@@ -194,6 +196,14 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
break;
}
}
+ int countNumber = 1;
+ if (read.isReducedRead()) {
+ // compute mean number of reduced read counts in current kmer span
+ final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1);
+ // precise rounding can make a difference with low consensus counts
+ countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
+ }
+
if( !badKmer ) {
// get the kmers
final byte[] kmer1 = new byte[KMER_LENGTH];
@@ -201,7 +211,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
final byte[] kmer2 = new byte[KMER_LENGTH];
System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);
- addKmersToGraph(graph, kmer1, kmer2, false);
+ for (int k=0; k < countNumber; k++)
+ addKmersToGraph(graph, kmer1, kmer2, false);
}
}
}
@@ -230,7 +241,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
return true;
}
- private void printGraphs() {
+ protected void printGraphs() {
int count = 0;
for( final DefaultDirectedGraph graph : graphs ) {
GRAPH_WRITER.println("digraph kmer" + count++ +" {");
@@ -281,7 +292,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
- final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place
+ final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
@@ -311,7 +322,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
}
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop ) {
- //final int sizeOfActiveRegion = activeRegionStop - activeRegionStart;
+ if( haplotype == null ) { return false; }
+
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0) );
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
index cf6d1cd77..580667ee2 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
@@ -50,20 +50,21 @@ public class BQSRIntegrationTest extends WalkerTest {
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
String HiSeqInterval = "chr1:10,000,000-10,100,000";
return new Object[][]{
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "239ce3387b4540faf44ec000d844ccd1")},
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "d69127341938910c38166dd18449598d")},
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "b77e621bed1b0dc57970399a35efd0da")},
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "2697f38d467a7856c40abce0f778456a")},
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "a55018b1643ca3964dbb50783db9f3e4")},
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "54fe8d1f5573845e6a2aa9688f6dd950")},
- {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "6b518ad3c56d66c6f5ea812d058f5c4d")},
- {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "3ddb9730f00ee3a612b42209ed9f7e03")},
- {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4cd4fb754e1ef142ad691cb35c74dc4c")},
- {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "364eab693e5e4c7d18a77726b6460f3f")},
- {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "c449cfca61d605b534f0dce35581339d")},
- {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "5268cb5a4b69335568751d5e5ab80d43")},
- {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "3ddb9730f00ee3a612b42209ed9f7e03")},
- {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "4a786ba42e38e7fd101947c34a6883ed")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")},
+ {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")},
+ {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "90d70542076715a8605a8d4002614b34")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
+ {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")},
};
}
@@ -74,10 +75,11 @@ public class BQSRIntegrationTest extends WalkerTest {
Arrays.asList(params.md5));
executeTest("testBQSR-"+params.args, spec).getFirst();
- WalkerTestSpec specNT2 = new WalkerTestSpec(
- params.getCommandLine() + " -nt 2",
- Arrays.asList(params.md5));
- executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
+ // TODO -- re-enable once parallelization is fixed in BaseRecalibrator
+ //WalkerTestSpec specNT2 = new WalkerTestSpec(
+ // params.getCommandLine() + " -nt 2",
+ // Arrays.asList(params.md5));
+ //executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
}
@Test
@@ -94,6 +96,20 @@ public class BQSRIntegrationTest extends WalkerTest {
executeTest("testBQSRFailWithoutDBSNP", spec);
}
+ @Test
+ public void testBQSRFailWithSolidNoCall() {
+ WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+ " -T BaseRecalibrator" +
+ " -R " + b36KGReference +
+ " -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" +
+ " -L 1:50,000-80,000" +
+ " --no_plots" +
+ " -o %s",
+ 1, // just one output file
+ UserException.class);
+ executeTest("testBQSRFailWithSolidNoCall", spec);
+ }
+
private static class PRTest {
final String args;
final String md5;
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java
similarity index 96%
rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java
rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java
index 5a6f7df0f..983f562d2 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java
@@ -19,7 +19,7 @@ import java.util.Arrays;
* Time: 7:44 AM
* To change this template use File | Settings | File Templates.
*/
-public class PoolAFCalculationModelUnitTest extends BaseTest {
+public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
@@ -138,10 +138,10 @@ public class PoolAFCalculationModelUnitTest extends BaseTest {
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
- final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size());
+ final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
- PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result);
+ GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java
similarity index 87%
rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java
rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java
index 35a1bb043..f95ba66b2 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java
@@ -41,7 +41,7 @@ import java.io.PrintStream;
import java.util.*;
-public class PoolGenotypeLikelihoodsUnitTest {
+public class GeneralPloidyGenotypeLikelihoodsUnitTest {
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
final Logger logger = Logger.getLogger(Walker.class);
@@ -49,11 +49,18 @@ public class PoolGenotypeLikelihoodsUnitTest {
private static final boolean SIMULATE_NOISY_PILEUP = false;
private static final int NUM_SIMULATED_OBS = 10;
+ void PoolGenotypeLikelihoodsUnitTest() {
+ UAC.minQualityScore = 5;
+ UAC.maxQualityScore = 40;
+ UAC.phredScaledPrior = (byte)20;
+ UAC.minPower = 0.0;
+
+ }
@Test
public void testStoringLikelihoodElements() {
- // basic test storing a given PL vector in a PoolGenotypeLikelihoods object and then retrieving it back
+ // basic test storing a given PL vector in a GeneralPloidyGenotypeLikelihoods object and then retrieving it back
int ploidy = 20;
int numAlleles = 4;
@@ -71,7 +78,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
for (int k=0; k < gls.length; k++)
gls[k]= (double)k;
- PoolGenotypeLikelihoods gl = new PoolSNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true);
+ GeneralPloidyGenotypeLikelihoods gl = new GeneralPloidySNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true);
double[] glnew = gl.getLikelihoods();
Assert.assertEquals(gls, glnew);
@@ -83,7 +90,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
for (int ploidy = 2; ploidy < 10; ploidy++) {
for (int nAlleles = 2; nAlleles < 10; nAlleles++)
- Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles,ploidy),
+ Assert.assertEquals(GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(nAlleles, ploidy),
GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy));
}
@@ -95,7 +102,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// create iterator, compare linear index given by iterator with closed form function
int numAlleles = 4;
int ploidy = 2;
- PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy);
+ GeneralPloidyGenotypeLikelihoods.SumIterator iterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles, ploidy);
while(iterator.hasNext()) {
System.out.format("\n%d:",iterator.getLinearIndex());
@@ -104,7 +111,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
System.out.format("%d ",aa);
- int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy);
+ int computedIdx = GeneralPloidyGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy);
System.out.format("Computed idx = %d\n",computedIdx);
iterator.next();
}
@@ -133,7 +140,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
allelesToSubset.add(Allele.create("A",false));
allelesToSubset.add(Allele.create("C",false));
- double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy,
+ double[] newGLs = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy,
originalAlleles, allelesToSubset);
@@ -163,7 +170,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
@Test
public void testIndexIterator() {
int[] seed = new int[]{1,2,3,4};
- PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1);
+ GeneralPloidyGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1);
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
@@ -221,12 +228,12 @@ public class PoolGenotypeLikelihoodsUnitTest {
}
- private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) {
- PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo);
+ private GeneralPloidyGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) {
+ GeneralPloidyGenotypeLikelihoods.SumIterator iterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(seed, restrictSumTo);
while(iterator.hasNext()) {
int[] a = iterator.getCurrentVector();
- int idx = PoolGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo);
+ int idx = GeneralPloidyGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo);
if (VERBOSE) {
System.out.format("%d:",iterator.getLinearIndex());
for (int i=0; i < seed.length; i++)
@@ -251,8 +258,6 @@ public class PoolGenotypeLikelihoodsUnitTest {
@Test
public void testErrorModel() {
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
- final byte minQ = 5;
- final byte maxQ = 40;
final byte refByte = refPileupTestProvider.getRefByte();
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
@@ -270,7 +275,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// get artificial alignment context for ref sample - no noise
Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30);
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
- final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0);
+ final ErrorModel emodel = new ErrorModel(UAC, refPileup, refVC, refPileupTestProvider.getReferenceContext());
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
@@ -287,19 +292,17 @@ public class PoolGenotypeLikelihoodsUnitTest {
@Test
public void testIndelErrorModel() {
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
- final byte minQ = 5;
- final byte maxQ = 40;
final byte refByte = refPileupTestProvider.getRefByte();
final String altBases = "TCA";
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
final List trueAlleles = new ArrayList();
- trueAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, true));
- trueAlleles.add(Allele.create("TC", false));
+ trueAlleles.add(Allele.create(refByte, true));
+ trueAlleles.add(Allele.create((char)refByte + "TC", false));
final String fw = new String(refPileupTestProvider.getReferenceContext().getForwardBases());
final VariantContext refInsertionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
refPileupTestProvider.getReferenceContext().getLocus().getStart(), trueAlleles).
- genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).referenceBaseForIndel(refByte).make();
+ genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).make();
final int[] matchArray = {95, 995, 9995, 10000};
@@ -311,9 +314,9 @@ public class PoolGenotypeLikelihoodsUnitTest {
// get artificial alignment context for ref sample - no noise
// CASE 1: Test HET insertion
// Ref sample has TC insertion but pileup will have TCA inserted instead to test mismatches
- Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{matches, mismatches}, false, 30);
+ Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(1+altBases.length(), altBases, new int[]{matches, mismatches}, false, 30);
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
- final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refInsertionVC, 0.0);
+ final ErrorModel emodel = new ErrorModel(UAC, refPileup, refInsertionVC, refPileupTestProvider.getReferenceContext());
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
@@ -329,12 +332,12 @@ public class PoolGenotypeLikelihoodsUnitTest {
// create deletion VC
final int delLength = 4;
final List delAlleles = new ArrayList();
- delAlleles.add(Allele.create(fw.substring(1,delLength+1), true));
- delAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, false));
+ delAlleles.add(Allele.create(fw.substring(0,delLength+1), true));
+ delAlleles.add(Allele.create(refByte, false));
final VariantContext refDeletionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
refPileupTestProvider.getReferenceContext().getLocus().getStart()+delLength, delAlleles).
- genotypes(GenotypeBuilder.create(refSampleName, delAlleles)).referenceBaseForIndel(refByte).make();
+ genotypes(GenotypeBuilder.create(refSampleName, delAlleles)).make();
for (int matches: matchArray) {
for (int mismatches: mismatchArray) {
@@ -343,7 +346,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// Ref sample has 4bp deletion but pileup will have 3 bp deletion instead to test mismatches
Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(-delLength+1, altBases, new int[]{matches, mismatches}, false, 30);
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
- final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refDeletionVC, 0.0);
+ final ErrorModel emodel = new ErrorModel(UAC, refPileup, refDeletionVC, refPileupTestProvider.getReferenceContext());
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
@@ -388,8 +391,6 @@ public class PoolGenotypeLikelihoodsUnitTest {
final byte refByte = readPileupTestProvider.getRefByte();
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
- final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
- final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
final List allAlleles = new ArrayList(); // this contains only ref Allele up to now
final Set laneIDs = new TreeSet();
@@ -407,17 +408,28 @@ public class PoolGenotypeLikelihoodsUnitTest {
for (String laneID : laneIDs)
noisyErrorModels.put(laneID, Q30ErrorModel);
+ // all first ref allele
+ allAlleles.add(Allele.create(refByte,true));
for (byte b: BaseUtils.BASES) {
- if (refByte == b)
- allAlleles.add(Allele.create(b,true));
- else
+ if (refByte != b)
allAlleles.add(Allele.create(b, false));
}
+ final int refIdx = 0;
+ int altIdx = -1;
+
+ for (int k=0; k < allAlleles.size(); k++)
+ if (altByte == allAlleles.get(k).getBases()[0]) {
+ altIdx = k;
+ break;
+ }
+
+
+
PrintStream out = null;
if (SIMULATE_NOISY_PILEUP) {
try {
- out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
+ out = new PrintStream(new File("GLUnitTest.table"));
// out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table"));
}
catch (Exception e) {}
@@ -441,7 +453,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// get now likelihoods for this
- final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true);
+ final GeneralPloidySNPGenotypeLikelihoods GL = new GeneralPloidySNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true);
final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
if (VERBOSE) {
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth,ac,altDepth, nSamplesPerPool);
@@ -470,7 +482,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// get now likelihoods for this
- final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true);
+ final GeneralPloidySNPGenotypeLikelihoods noisyGL = new GeneralPloidySNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true);
noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
mlPair = noisyGL.getMostLikelyACCount();
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
similarity index 51%
rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java
rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
index 903465538..6ae34f190 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
@@ -12,50 +12,66 @@ import org.testng.annotations.Test;
* Time: 11:28 AM
* To change this template use File | Settings | File Templates.
*/
-public class PoolCallerIntegrationTest extends WalkerTest {
+public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
final static String REF = b37KGReference;
final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
final String REFSAMPLE_NAME = "NA12878";
- final String MTINTERVALS = "MT";
+ final String MTINTERVALS = "MT:1-3000";
final String LSVINTERVALS = "20:40,000,000-41,000,000";
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
+
private void PC_MT_Test(String bam, String args, String name, String md5) {
- final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
+ final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -ignoreLane ",
REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testPoolCaller:"+name+" args=" + args, spec);
}
private void PC_LSV_Test(String args, String name, String model, String md5) {
- final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
+ final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ",
REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testPoolCaller:"+name+" args=" + args, spec);
}
- @Test
+ private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
+ final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane",
+ REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
+ final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
+ executeTest("testPoolCaller:"+name+" args=" + args, spec);
+ }
+
+ @Test(enabled = true)
public void testBOTH_GGA_Pools() {
- PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4");
+ PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93");
}
- @Test
+ @Test(enabled = true)
public void testINDEL_GGA_Pools() {
- PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
+ PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b");
}
- @Test
+ @Test(enabled = true)
+ public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
+ PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d");
+ }
+
+ @Test(enabled = true)
+ public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
+ PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4");
+ }
+
+ @Test(enabled = true)
public void testMT_SNP_DISCOVERY_sp4() {
- PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fa5ee7c957c473a80f3a7f3c35dc80b5");
+ PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459");
}
- @Test
+ @Test(enabled = true)
public void testMT_SNP_GGA_sp10() {
-
- PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "6907c8617d49bb57b33f8704ce7f0323");
+ PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7");
}
-
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
index 04bb3a753..539190fe9 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
@@ -262,8 +262,6 @@ public class GenotypingEngineUnitTest extends BaseTest {
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// SNP + ref + SNP = MNP with ref base gap
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
@@ -274,11 +272,9 @@ public class GenotypingEngineUnitTest extends BaseTest {
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + SNP
- thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make();
+ thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
@@ -286,23 +282,19 @@ public class GenotypingEngineUnitTest extends BaseTest {
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// SNP + insertion
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
- nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","AAAAA").referenceBaseForIndel("C").make();
+ nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// deletion + SNP
- thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("C","-").referenceBaseForIndel("T").make();
+ thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make();
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
@@ -310,68 +302,66 @@ public class GenotypingEngineUnitTest extends BaseTest {
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// SNP + deletion
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
- nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
+ nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + deletion = MNP
- thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make();
- nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
+ thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make();
+ nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + deletion
- thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make();
- nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
+ thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make();
+ nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// insertion + insertion
- thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make();
- nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","A").referenceBaseForIndel("C").make();
+ thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make();
+ nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make();
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
// deletion + deletion
- thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("T","-").referenceBaseForIndel("A").make();
- nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
+ thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make();
+ nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make();
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
logger.warn(truthVC + " == " + mergedVC);
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
+
+ // deletion + insertion (abutting)
+ thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make();
+ nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make();
+ truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make();
+ mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
+ logger.warn(truthVC + " == " + mergedVC);
+ Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
+ Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
+ Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
// complex + complex
thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make();
@@ -382,8 +372,6 @@ public class GenotypingEngineUnitTest extends BaseTest {
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
- Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
- Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
}
/**
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
index 6fdfb83e4..c766f363c 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@@ -20,17 +20,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
- HCTest(CEUTRIO_BAM, "", "882ece8391cfec30ab658970a487d078");
+ HCTest(CEUTRIO_BAM, "", "6b30c7e1b6bbe80d180d9d67441cec12");
}
@Test
public void testHaplotypeCallerSingleSample() {
- HCTest(NA12878_BAM, "", "6f458e04251bc4c09b5b544d46f19d68");
+ HCTest(NA12878_BAM, "", "4cdfbfeadef00725974828310558d7d4");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() {
- HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "c172791007f867bf3b975d4194564d9e");
+ HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "6183fb6e374976d7087150009685e043");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@@ -41,8 +41,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
- HCTestComplexVariants(CEUTRIO_BAM, "", "94186811016b332a58c150df556278f8");
+ HCTestComplexVariants(CEUTRIO_BAM, "", "ab7593a7a60a2e9a66053572f1718df1");
}
-
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java
index 185641140..e82946690 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java
@@ -95,9 +95,10 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
ArrayList haplotypes = new ArrayList();
for( int iii = 1; iii <= 3; iii++) {
Double readLikelihood = ( iii == 1 ? readLikelihoodForHaplotype1 : ( iii == 2 ? readLikelihoodForHaplotype2 : readLikelihoodForHaplotype3) );
+ int readCount = 1;
if( readLikelihood != null ) {
Haplotype haplotype = new Haplotype( (iii == 1 ? "AAAA" : (iii == 2 ? "CCCC" : "TTTT")).getBytes() );
- haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood});
+ haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood}, new int[]{readCount});
haplotypes.add(haplotype);
}
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
index 4f42d5bc8..5652b118d 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
@@ -7,6 +7,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
*/
import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@@ -18,6 +20,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
+import java.io.PrintStream;
import java.util.*;
public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
@@ -143,6 +146,44 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
Assert.assertTrue(graphEquals(graph, expectedGraph));
}
+ @Test(enabled=false)
+// not ready yet
+ public void testBasicGraphCreation() {
+ final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
+ final byte refBase = refPileupTestProvider.getReferenceContext().getBase();
+ final String altBase = (refBase==(byte)'A'?"C":"A");
+ final int matches = 50;
+ final int mismatches = 50;
+ Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, altBase, new int[]{matches, mismatches}, false, 30);
+ PrintStream graphWriter = null;
+
+ try{
+ graphWriter = new PrintStream("du.txt");
+ } catch (Exception e) {}
+
+
+ SimpleDeBruijnAssembler assembler = new SimpleDeBruijnAssembler(true,graphWriter);
+ final Haplotype refHaplotype = new Haplotype(refPileupTestProvider.getReferenceContext().getBases());
+ refHaplotype.setIsReference(true);
+ assembler.createDeBruijnGraphs(refContext.get(refPileupTestProvider.getSampleNames().get(0)).getBasePileup().getReads(), refHaplotype);
+
+/* // clean up the graphs by pruning and merging
+ for( final DefaultDirectedGraph graph : graphs ) {
+ SimpleDeBruijnAssembler.pruneGraph( graph, PRUNE_FACTOR );
+ //eliminateNonRefPaths( graph );
+ SimpleDeBruijnAssembler.mergeNodes( graph );
+ }
+ */
+ if( graphWriter != null ) {
+ assembler.printGraphs();
+ }
+
+ int k=2;
+
+ // find the best paths in the graphs
+ // return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
+
+ }
@Test(enabled = true)
public void testEliminateNonRefPaths() {
DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class);
diff --git a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R b/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R
similarity index 74%
rename from public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R
rename to public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R
index 6c4dace1d..8a9eecf48 100644
--- a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R
+++ b/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R
@@ -1,8 +1,18 @@
library("ggplot2")
+library(gplots)
+library("reshape")
+library("grid")
library("tools") #For compactPDF in R 2.13+
+library(gsalib)
-args <- commandArgs(TRUE)
+
+if ( interactive() ) {
+ args <- c("NA12878.6.1.dedup.realign.recal.bqsr.grp.csv", "NA12878.6.1.dedup.realign.recal.bqsr.grp", NA)
+} else {
+ args <- commandArgs(TRUE)
+}
data <- read.csv(args[1])
+gsa.report <- gsa.read.gatkreport(args[2])
data <- within(data, EventType <- factor(EventType, levels = rev(levels(EventType))))
numRG = length(unique(data$ReadGroup))
@@ -82,20 +92,45 @@ for(cov in levels(data$CovariateName)) { # for each covariate in turn
p <- ggplot(d, aes(x=CovariateValue)) +
xlab(paste(cov,"Covariate")) +
- ylab("Number of Observations") +
+ ylab("No. of Observations (area normalized)") +
blankTheme
- d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations),alpha=0.6,binwidth=1,position="identity") + scale_fill_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
- scale_y_continuous(formatter="comma")
-
+ d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations,y=..ndensity..),alpha=0.6,binwidth=1,position="identity")
+ d <- d + scale_fill_manual(values=c("maroon1","blue"))
+ d <- d + facet_grid(.~EventType)
+# d <- d + scale_y_continuous(formatter="comma")
}
}
-pdf(args[2],height=9,width=15)
+if ( ! is.na(args[3]) )
+ pdf(args[3],height=9,width=15)
+
+#frame()
+textplot(gsa.report$Arguments, show.rownames=F)
+title(
+ main="GATK BaseRecalibration report",
+ sub=date())
+
distributeGraphRows(list(a,b,c), c(1,1,1))
distributeGraphRows(list(d,e,f), c(1,1,1))
-dev.off()
+# format the overall information
+rt0 <- data.frame(
+ ReadGroup = gsa.report$RecalTable0$ReadGroup,
+ EventType = gsa.report$RecalTable0$EventType,
+ EmpiricalQuality = sprintf("%.1f", gsa.report$RecalTable0$EmpiricalQuality),
+ EstimatedQReported = sprintf("%.1f", gsa.report$RecalTable0$EstimatedQReported),
+ Observations = sprintf("%.2e", gsa.report$RecalTable0$Observations),
+ Errors = sprintf("%.2e", gsa.report$RecalTable0$Errors))
+textplot(t(rt0), show.colnames=F)
+title("Overall error rates by event type")
-if (exists('compactPDF')) {
- compactPDF(args[2])
+# plot per quality score recalibration table
+textplot(gsa.report$RecalTable1, show.rownames=F)
+title("Error rates by event type and initial quality score")
+
+if ( ! is.na(args[3]) ) {
+ dev.off()
+ if (exists('compactPDF')) {
+ compactPDF(args[2])
+ }
}
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
index 19567e7e6..45dacd835 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
@@ -207,7 +207,7 @@ plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
if ( requestedStrat == "Sample" ) {
perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
- perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
+ perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)")
} else { # by AlleleCount
perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")
diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
index 6c8fe1834..10326ef2e 100644
--- a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
+++ b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
@@ -208,6 +208,7 @@ public class FastaSequenceIndexBuilder {
break;
}
}
+ in.close();
return sequenceIndex;
}
catch (IOException e) {
diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java
index bc0a5b63d..7cd85cfd8 100644
--- a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java
+++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java
@@ -12,10 +12,10 @@ import java.util.*;
*/
public class Bases implements Iterable
{
- public static byte A = 'A';
- public static byte C = 'C';
- public static byte G = 'G';
- public static byte T = 'T';
+ public static final byte A = 'A';
+ public static final byte C = 'C';
+ public static final byte G = 'G';
+ public static final byte T = 'T';
public static final Bases instance = new Bases();
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java
index a5647ec0f..618120217 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java
@@ -26,6 +26,7 @@
package org.broadinstitute.sting.commandline;
import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.lang.annotation.Annotation;
import java.util.List;
@@ -147,6 +148,9 @@ public class ArgumentDefinition {
this.exclusiveOf = exclusiveOf;
this.validation = validation;
this.validOptions = validOptions;
+
+ validateName(shortName);
+ validateName(fullName);
}
/**
@@ -192,6 +196,9 @@ public class ArgumentDefinition {
else
shortName = null;
+ validateName(shortName);
+ validateName(fullName);
+
this.ioType = ioType;
this.argumentType = argumentType;
this.fullName = fullName;
@@ -277,4 +284,14 @@ public class ArgumentDefinition {
String validation = (String)CommandLineUtils.getValue(annotation, "validation");
return validation.trim().length() > 0 ? validation.trim() : null;
}
+
+ /**
+ * Make sure the argument's name is valid
+ *
+ * @param name
+ */
+ private void validateName(final String name) {
+ if ( name != null && name.startsWith("-") )
+ throw new ReviewedStingException("Invalid argument definition: " + name + " begins with a -");
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java
index b47677b08..474225e2a 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java
@@ -55,10 +55,8 @@ public class ArgumentDefinitionGroup implements Iterable {
* Does the name of this argument group match the name of another?
*/
public boolean groupNameMatches( ArgumentDefinitionGroup other ) {
- if( this.groupName == null && other.groupName == null )
- return true;
- if( this.groupName == null && other.groupName != null )
- return false;
+ if( this.groupName == null )
+ return other.groupName == null;
return this.groupName.equals(other.groupName);
}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index c201e95f0..dd4a151bf 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -53,7 +53,7 @@ public abstract class ArgumentTypeDescriptor {
/**
* our log, which we want to capture anything from org.broadinstitute.sting
*/
- protected static Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class);
+ protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class);
/**
* Fetch the given descriptor from the descriptor repository.
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java
index 26af49e12..376b6f210 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java
@@ -120,8 +120,8 @@ public abstract class ParsingMethod {
*/
private static final String TAG_TEXT = "[\\w\\-\\.\\=]*";
- public static ParsingMethod FullNameParsingMethod = new ParsingMethod(Pattern.compile(String.format("\\s*--(%1$s)(?:\\:(%2$s(?:,%2$s)*))?\\s*",ARGUMENT_TEXT,TAG_TEXT)),
+ public static final ParsingMethod FullNameParsingMethod = new ParsingMethod(Pattern.compile(String.format("\\s*--(%1$s)(?:\\:(%2$s(?:,%2$s)*))?\\s*",ARGUMENT_TEXT,TAG_TEXT)),
ArgumentDefinitions.FullNameDefinitionMatcher) {};
- public static ParsingMethod ShortNameParsingMethod = new ParsingMethod(Pattern.compile(String.format("\\s*-(%1$s)(?:\\:(%2$s(?:,%2$s)*))?\\s*",ARGUMENT_TEXT,TAG_TEXT)),
+ public static final ParsingMethod ShortNameParsingMethod = new ParsingMethod(Pattern.compile(String.format("\\s*-(%1$s)(?:\\:(%2$s(?:,%2$s)*))?\\s*",ARGUMENT_TEXT,TAG_TEXT)),
ArgumentDefinitions.ShortNameDefinitionMatcher) {};
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index c6bb4a27c..0286cdc52 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -130,8 +130,8 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) {
if ( getArgumentCollection().gatkKeyFile == null ) {
throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " +
- "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " +
- "for more information and instructions on how to obtain a key.");
+ "Please see " + GATKRunReport.PHONE_HOME_DOCS_URL +
+ " for more information and instructions on how to obtain a key.");
}
else {
PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index b1ad19e69..312d31727 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -130,6 +130,12 @@ public class CommandLineGATK extends CommandLineExecutable {
// can't close tribble index when writing
if ( message.indexOf("Unable to close index for") != -1 )
+ exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
+
+ // disk is full
+ if ( message.indexOf("No space left on device") != -1 )
+ exitSystemWithUserError(new UserException(t.getMessage()));
+ if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 )
exitSystemWithUserError(new UserException(t.getCause().getMessage()));
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 5d6fb75ed..e76cde43a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -233,10 +233,6 @@ public class GenomeAnalysisEngine {
if (args.nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
- // TODO -- REMOVE ME WHEN WE STOP BCF testing
- if ( args.USE_SLOW_GENOTYPES )
- GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
-
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
if (args.BQSR_RECAL_FILE != null)
setBaseRecalibration(args.BQSR_RECAL_FILE, args.quantizationLevels, args.disableIndelQuals, args.PRESERVE_QSCORES_LESS_THAN, args.emitOriginalQuals);
@@ -797,6 +793,14 @@ public class GenomeAnalysisEngine {
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
+ if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
+ throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
+
+ boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class);
+
+ if (argCollection.keepProgramRecords)
+ removeProgramRecords = false;
+
return new SAMDataSource(
samReaderIDs,
threadAllocation,
@@ -813,7 +817,8 @@ public class GenomeAnalysisEngine {
getWalkerBAQQualityMode(),
refReader,
getBaseRecalibration(),
- argCollection.defaultBaseQualities);
+ argCollection.defaultBaseQualities,
+ removeProgramRecords);
}
/**
@@ -840,20 +845,9 @@ public class GenomeAnalysisEngine {
SAMSequenceDictionary sequenceDictionary,
GenomeLocParser genomeLocParser,
ValidationExclusion.TYPE validationExclusionType) {
- VCFHeader header = null;
- if ( getArguments().repairVCFHeader != null ) {
- try {
- final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(getArguments().repairVCFHeader));
- header = (VCFHeader)new VCFCodec().readHeader(pbs).getHeaderValue();
- pbs.close();
- } catch ( IOException e ) {
- throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e);
- }
- }
+ final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType);
- RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType);
-
- List dataSources = new ArrayList();
+ final List dataSources = new ArrayList();
for (RMDTriplet fileDescriptor : referenceMetaDataFiles)
dataSources.add(new ReferenceOrderedDataSource(fileDescriptor,
builder,
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 3fd3857c5..bbbd96cf1 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -57,8 +57,6 @@ public class GATKArgumentCollection {
public GATKArgumentCollection() {
}
- public Map walkerArgs = new HashMap();
-
// parameters and their defaults
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
public List samFiles = new ArrayList();
@@ -66,10 +64,10 @@ public class GATKArgumentCollection {
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
- @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
+ @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
- @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
+ @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
public File gatkKeyFile = null;
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
@@ -249,6 +247,12 @@ public class GATKArgumentCollection {
@Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false)
public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT;
+ @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we override the Walker's default and remove program records from the SAM header", required = false)
+ public boolean removeProgramRecords = false;
+
+ @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false)
+ public boolean keepProgramRecords = false;
+
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
@@ -375,19 +379,5 @@ public class GATKArgumentCollection {
@Hidden
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
-
- @Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
- @Hidden
- public boolean USE_SLOW_GENOTYPES = false;
- // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
-
- /**
- * The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
- * and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other
- * VCF file that GATK reads in. This allows us to have in effect a master set of header records and use these
- * to fill in any missing ones in input VCF files.
- */
- @Argument(fullName="repairVCFHeader", shortName = "repairVCFHeader", doc="If provided, whenever we read a VCF file we will use the header in this file to repair the header of the input VCF files", required=false)
- public File repairVCFHeader = null;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
new file mode 100644
index 000000000..f30fc0316
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@@ -0,0 +1,62 @@
+package org.broadinstitute.sting.gatk.arguments;
+
+import org.broadinstitute.sting.commandline.Advanced;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 8/20/12
+ * A collection of arguments that are common to the various callers.
+ * This is pulled out so that every caller isn't exposed to the arguments from every other caller.
+ */
+
+public class StandardCallerArgumentCollection {
+ /**
+ * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
+ * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
+ */
+ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
+ public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
+
+ @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
+ public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
+
+ @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
+ public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
+
+ /**
+ * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
+ * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
+ * is the default).
+ */
+ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
+ public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
+
+ /**
+ * This argument allows you to emit low quality calls as filtered records.
+ */
+ @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
+ public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
+
+ /**
+ * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
+ */
+ @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
+ public RodBinding alleles;
+
+ /**
+ * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
+ * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
+ * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
+ * that you not play around with this parameter.
+ */
+ @Advanced
+ @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
+ public int MAX_ALTERNATE_ALLELES = 3;
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java
index 142c8a178..01e24df67 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java
@@ -118,7 +118,7 @@ class WindowedData {
rec.getAlignmentStart(),
stop);
states = new ArrayList();
- if (provider != null && provider.getReferenceOrderedData() != null)
+ if (provider.getReferenceOrderedData() != null)
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(range)));
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index 0fa4234b3..7f0a0c4c0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -89,6 +89,11 @@ public class SAMDataSource {
*/
private final SAMFileReader.ValidationStringency validationStringency;
+ /**
+ * Do we want to remove the program records from this data source?
+ */
+ private final boolean removeProgramRecords;
+
/**
* Store BAM indices for each reader present.
*/
@@ -200,7 +205,8 @@ public class SAMDataSource {
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
null, // no BQSR
- (byte) -1);
+ (byte) -1,
+ false);
}
/**
@@ -233,7 +239,8 @@ public class SAMDataSource {
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
- byte defaultBaseQualities) {
+ byte defaultBaseQualities,
+ boolean removeProgramRecords) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
@@ -249,6 +256,7 @@ public class SAMDataSource {
dispatcher = null;
validationStringency = strictness;
+ this.removeProgramRecords = removeProgramRecords;
if(readBufferSize != null)
ReadShard.setReadBufferSize(readBufferSize);
else {
@@ -748,7 +756,7 @@ public class SAMDataSource {
private synchronized void createNewResource() {
if(allResources.size() > maxEntries)
throw new ReviewedStingException("Cannot create a new resource pool. All resources are in use.");
- SAMReaders readers = new SAMReaders(readerIDs, validationStringency);
+ SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords);
allResources.add(readers);
availableResources.add(readers);
}
@@ -777,9 +785,11 @@ public class SAMDataSource {
/**
* Derive a new set of readers from the Reads metadata.
* @param readerIDs reads to load.
+ * TODO: validationStringency is not used here
* @param validationStringency validation stringency.
+ * @param removeProgramRecords indicate whether to clear program records from the readers
*/
- public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) {
+ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency, boolean removeProgramRecords) {
final int totalNumberOfFiles = readerIDs.size();
int readerNumber = 1;
final SimpleTimer timer = new SimpleTimer().start();
@@ -790,6 +800,9 @@ public class SAMDataSource {
long lastTick = timer.currentTime();
for(final SAMReaderID readerID: readerIDs) {
final ReaderInitializer init = new ReaderInitializer(readerID).call();
+ if (removeProgramRecords) {
+ init.reader.getFileHeader().setProgramRecords(new ArrayList());
+ }
if (threadAllocation.getNumIOThreads() > 0) {
inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java
index 4ecfe472d..c02ae7d99 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java
@@ -45,7 +45,6 @@ import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.LinkedList;
import java.util.List;
/**
@@ -56,30 +55,31 @@ public class ReferenceDataSource {
private IndexedFastaSequenceFile reference;
/** our log, which we want to capture anything from this class */
- protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
+ protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
/**
* Create reference data source from fasta file
* @param fastaFile Fasta file to be used as reference
*/
public ReferenceDataSource(File fastaFile) {
-
// does the fasta file exist? check that first...
if (!fastaFile.exists())
throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist.");
- File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
- File dictFile;
- if (fastaFile.getAbsolutePath().endsWith("fa")) {
- dictFile = new File(fastaFile.getAbsolutePath().replace(".fa", ".dict"));
- }
- else
- dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict"));
+ final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz");
+
+ final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
+
+ // determine the name for the dict file
+ final String fastaExt = (fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta" ) + (isGzipped ? ".gz" : "");
+ final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict"));
/*
- if index file does not exist, create it manually
- */
+ * if index file does not exist, create it manually
+ */
if (!indexFile.exists()) {
+ if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile);
+
logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath()));
FSLockWithShared indexLock = new FSLockWithShared(indexFile,true);
try {
@@ -96,7 +96,7 @@ public class ReferenceDataSource {
}
catch(UserException e) {
// Rethrow all user exceptions as-is; there should be more details in the UserException itself.
- throw e;
+ throw e;
}
catch (Exception e) {
// If lock creation succeeded, the failure must have been generating the index.
@@ -115,6 +115,8 @@ public class ReferenceDataSource {
* This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary
*/
if (!dictFile.exists()) {
+ if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile);
+
logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath()));
/*
@@ -219,9 +221,9 @@ public class ReferenceDataSource {
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
shards.add(new LocusShard(parser,
- readsDataSource,
- Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
- null));
+ readsDataSource,
+ Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
+ null));
}
}
return shards;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index 508099708..95e39b7c6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -58,7 +58,7 @@ import java.util.Collection;
/** Shards and schedules data in manageable chunks. */
public abstract class MicroScheduler implements MicroSchedulerMBean {
- protected static Logger logger = Logger.getLogger(MicroScheduler.class);
+ protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
/**
* Counts the number of instances of the class that are currently alive.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java
index 632638f64..390da0cce 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java
@@ -66,13 +66,13 @@ public class TreeReducer implements Callable {
* @return Result of the reduce.
*/
public Object call() {
- Object result = null;
+ Object result;
final long startTime = System.currentTimeMillis();
try {
if( lhs == null )
- result = lhs.get();
+ result = null;
// todo -- what the hell is this above line? Shouldn't it be the two below?
// if( lhs == null )
// throw new IllegalStateException(String.format("Insufficient data on which to reduce; lhs = %s, rhs = %s", lhs, rhs) );
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java
index 9a1455859..cda7392ae 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java
@@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
+import java.util.Iterator;
+
/**
* Filter out reads with wonky cigar strings.
*
+ * - No reads with Hard/Soft clips in the middle of the cigar
+ * - No reads starting with deletions (with or without preceding clips)
+ * - No reads ending in deletions (with or without follow-up clips)
+ * - No reads that are fully hard or soft clipped
+ * - No reads that have consecutive indels in the cigar (II, DD, ID or DI)
+ *
+ * ps: apparently an empty cigar is okay...
+ *
* @author ebanks
* @version 0.1
*/
@@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) {
final Cigar c = rec.getCigar();
- if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad
- boolean previousElementWasIndel = false;
- CigarOperator lastOp = c.getCigarElement(0).getOperator();
-
- if (lastOp == CigarOperator.D) // filter out reads starting with deletion
- return true;
-
- for (CigarElement ce : c.getCigarElements()) {
- CigarOperator op = ce.getOperator();
- if (op == CigarOperator.D || op == CigarOperator.I) {
- if (previousElementWasIndel)
- return true; // filter out reads with adjacent I/D
-
- previousElementWasIndel = true;
- }
- else // this is a regular base (match/mismatch/hard or soft clip)
- previousElementWasIndel = false; // reset the previous element
-
- lastOp = op;
+ // if there is no Cigar then it can't be bad
+ if( c.isEmpty() ) {
+ return false;
}
- return lastOp == CigarOperator.D;
+ Iterator elementIterator = c.getCigarElements().iterator();
+
+ CigarOperator firstOp = CigarOperator.H;
+ while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) {
+ CigarOperator op = elementIterator.next().getOperator();
+
+ // No reads with Hard/Soft clips in the middle of the cigar
+ if (firstOp != CigarOperator.H && op == CigarOperator.H) {
+ return true;
+ }
+ firstOp = op;
+ }
+
+ // No reads starting with deletions (with or without preceding clips)
+ if (firstOp == CigarOperator.D) {
+ return true;
+ }
+
+ boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S);
+ boolean previousElementWasIndel = firstOp == CigarOperator.I;
+ CigarOperator lastOp = firstOp;
+ CigarOperator previousOp = firstOp;
+
+ while (elementIterator.hasNext()) {
+ CigarOperator op = elementIterator.next().getOperator();
+
+ if (op != CigarOperator.S && op != CigarOperator.H) {
+
+ // No reads with Hard/Soft clips in the middle of the cigar
+ if (previousOp == CigarOperator.S || previousOp == CigarOperator.H)
+ return true;
+
+ lastOp = op;
+
+ if (!hasMeaningfulElements && op.consumesReadBases()) {
+ hasMeaningfulElements = true;
+ }
+
+ if (op == CigarOperator.I || op == CigarOperator.D) {
+
+ // No reads that have consecutive indels in the cigar (II, DD, ID or DI)
+ if (previousElementWasIndel) {
+ return true;
+ }
+ previousElementWasIndel = true;
+ }
+ else {
+ previousElementWasIndel = false;
+ }
+ }
+ // No reads with Hard/Soft clips in the middle of the cigar
+ else if (op == CigarOperator.S && previousOp == CigarOperator.H) {
+ return true;
+ }
+
+ previousOp = op;
+ }
+
+ // No reads ending in deletions (with or without follow-up clips)
+ // No reads that are fully hard or soft clipped
+ return lastOp == CigarOperator.D || !hasMeaningfulElements;
}
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java
index 999deddd1..636787c69 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java
@@ -119,7 +119,7 @@ public class ThreadLocalOutputTracker extends OutputTracker {
try {
tempFile = File.createTempFile( stub.getClass().getName(), null );
- tempFile.deleteOnExit();
+ //tempFile.deleteOnExit();
}
catch( IOException ex ) {
throw new UserException.BadTmpDir("Unable to create temporary file for stub: " + stub.getClass().getName() );
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java
index cb8786be1..300e801e6 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java
@@ -62,6 +62,7 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false);
+ if ( ! closed )
+ throw new ReviewedStingException("Writer not closed, but we are merging into the file!");
+ final String targetFilePath = target.file != null ? target.file.getAbsolutePath() : "/dev/stdin";
+ logger.debug(String.format("Merging %s into %s",file.getAbsolutePath(),targetFilePath));
+
+ // use the feature manager to determine the right codec for the tmp file
+ // that way we don't assume it's a specific type
+ final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file);
+ if ( fd == null )
+ throw new ReviewedStingException("Unexpectedly couldn't find valid codec for temporary output file " + file);
+
+ final FeatureCodec codec = fd.getCodec();
+ final AbstractFeatureReader source =
+ AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false);
- for ( VariantContext vc : source.iterator() ) {
+ for ( final VariantContext vc : source.iterator() ) {
target.writer.add(vc);
}
source.close();
+ file.delete(); // this should be last to aid in debugging when the process fails
+ } catch (UserException e) {
+ throw new ReviewedStingException("BUG: intermediate file " + file + " is malformed, got error while reading", e);
} catch (IOException e) {
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
index 09766f127..5e1132d45 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
@@ -47,6 +47,7 @@ import java.util.List;
public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public static final String NO_HEADER_ARG_NAME = "no_cmdline_in_header";
public static final String SITES_ONLY_ARG_NAME = "sites_only";
+ public static final String FORCE_BCF = "bcf";
public static final HashSet SUPPORTED_ZIPPED_SUFFIXES = new HashSet();
//
@@ -96,7 +97,11 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public List createArgumentDefinitions( ArgumentSource source ) {
- return Arrays.asList( createDefaultArgumentDefinition(source), createNoCommandLineHeaderArgumentDefinition(),createSitesOnlyArgumentDefinition());
+ return Arrays.asList(
+ createDefaultArgumentDefinition(source),
+ createNoCommandLineHeaderArgumentDefinition(),
+ createSitesOnlyArgumentDefinition(),
+ createBCFArgumentDefinition() );
}
/**
@@ -117,7 +122,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) {
if(!source.isRequired())
throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default.");
- VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, false, argumentSources, false, false);
+ VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
engine.addOutput(stub);
return stub;
}
@@ -141,15 +146,15 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
if(writerFile == null && !source.isRequired())
throw new MissingArgumentValueException(defaultArgumentDefinition);
- // Should we compress the output stream?
- boolean compress = isCompressed(writerFileName);
-
- boolean skipWritingCmdLineHeader = argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches);
- boolean doNotWriteGenotypes = argumentIsPresent(createSitesOnlyArgumentDefinition(),matches);
-
// Create a stub for the given object.
- VariantContextWriterStub stub = (writerFile != null) ? new VariantContextWriterStub(engine, writerFile, compress, argumentSources, skipWritingCmdLineHeader, doNotWriteGenotypes)
- : new VariantContextWriterStub(engine, defaultOutputStream, compress, argumentSources, skipWritingCmdLineHeader, doNotWriteGenotypes);
+ final VariantContextWriterStub stub = (writerFile != null)
+ ? new VariantContextWriterStub(engine, writerFile, argumentSources)
+ : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
+
+ stub.setCompressed(isCompressed(writerFileName));
+ stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches));
+ stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches));
+ stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches));
// WARNING: Side effects required by engine!
parsingEngine.addTags(stub,getArgumentTags(matches));
@@ -159,8 +164,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
/**
- * Creates the optional compression level argument for the BAM file.
- * @return Argument definition for the BAM file itself. Will not be null.
+ * Creates the optional no_header argument for the VCF file.
+ * @return Argument definition for the VCF file itself. Will not be null.
*/
private ArgumentDefinition createNoCommandLineHeaderArgumentDefinition() {
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
@@ -179,8 +184,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
/**
- * Creates the optional compression level argument for the BAM file.
- * @return Argument definition for the BAM file itself. Will not be null.
+ * Creates the optional sites_only argument definition
+ * @return Argument definition for the VCF file itself. Will not be null.
*/
private ArgumentDefinition createSitesOnlyArgumentDefinition() {
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
@@ -198,6 +203,26 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
null );
}
+ /**
+ * Creates the optional bcf argument definition
+ * @return Argument definition for the VCF file itself. Will not be null.
+ */
+ private ArgumentDefinition createBCFArgumentDefinition() {
+ return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
+ boolean.class,
+ FORCE_BCF,
+ FORCE_BCF,
+ "force BCF output, regardless of the file's extension",
+ false,
+ true,
+ false,
+ true,
+ null,
+ null,
+ null,
+ null );
+ }
+
/**
* Returns true if the file will be compressed.
* @param writerFileName Name of the file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
index 6ed889eb6..260a7efda 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
@@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
import java.io.OutputStream;
@@ -78,7 +79,7 @@ public class VariantContextWriterStub implements Stub, Var
/**
* Should we emit a compressed output stream?
*/
- private final boolean isCompressed;
+ private boolean isCompressed = false;
/**
* A hack: push the argument sources into the VCF header so that the VCF header
@@ -89,12 +90,17 @@ public class VariantContextWriterStub implements Stub, Var
/**
* Should the header be written out? A hidden argument.
*/
- private final boolean skipWritingCommandLineHeader;
+ private boolean skipWritingCommandLineHeader = false;
/**
* Should we not write genotypes even when provided?
*/
- private final boolean doNotWriteGenotypes;
+ private boolean doNotWriteGenotypes = false;
+
+ /**
+ * Should we force BCF writing regardless of the file extension?
+ */
+ private boolean forceBCF = false;
/**
* Connects this stub with an external stream capable of serving the
@@ -107,19 +113,13 @@ public class VariantContextWriterStub implements Stub, Var
*
* @param engine engine.
* @param genotypeFile file to (ultimately) create.
- * @param isCompressed should we compress the output stream?
* @param argumentSources sources.
- * @param skipWritingCommandLineHeader skip writing header.
- * @param doNotWriteGenotypes do not write genotypes.
*/
- public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection