From d3f4a5a9017b914894a73134abb8c7dbd5751ee6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 10:37:38 -0500 Subject: [PATCH 01/18] Fail gracefully when encountering malformed VCFs without enough data columns --- .../org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java | 2 ++ .../src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index aaa2e63a7..b3329c708 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -120,6 +120,8 @@ public class VCF3Codec extends AbstractVCFCodec { genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); ArrayList genotypes = new ArrayList(nParts); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4c1bb1d9e..453155be7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -147,6 +147,8 @@ public class VCFCodec extends AbstractVCFCodec { genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); ArrayList genotypes = new ArrayList(nParts); From 09a5a9eac08ec4b26983bff3c73a0373a91ca688 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 10:43:52 -0500 Subject: [PATCH 02/18] Don't update lineNo for decodeLoc - only for decode (otherwise they get double-counted). Even still, because of the way the GATK currently utilizes Tribble we can parse the same line multiple times, which knocks the line counter out of sync. For now, I've added a TODO in the code to remind us and the error messages note that it's an approximate line number. --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 3 ++- .../broadinstitute/sting/utils/exceptions/UserException.java | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 3009c236b..b902f220f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -184,7 +184,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - lineNo++; // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -279,6 +278,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { builder.source(getName()); // increment the line count + // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when + // TODO -- the first record is far along the contig) and the line counter can get out of sync lineNo++; // parse out the required fields diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index c599d4759..a2816b58f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -184,11 +184,11 @@ public class UserException extends ReviewedStingException { public static class MalformedVCF extends UserException { public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + super(String.format("The provided VCF file is malformed at approximately line %s: %s", line, message)); } public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message)); + super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); } } From 9497e9492cb6904fac16349ae5cdaadb8a4350d8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 11:21:28 -0500 Subject: [PATCH 03/18] Bug fix for complex records: do not ever reverse clip out a complete allele. --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index b902f220f..e44c10f1f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -595,6 +595,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { if ( a.isSymbolic() ) continue; + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - 1; + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) stillClipping = false; else if ( ref.length() == clipping ) From 76485217184bc6f6891f7a3a76380b38a3fceb6f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 11:26:43 -0500 Subject: [PATCH 04/18] Add check for mixed genotype so that we don't exception out for a valid record --- .../gatk/walkers/varianteval/evaluators/CountVariants.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index c740eb78c..e5e8dfaf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -182,6 +182,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nHomDerived++; } + break; + case MIXED: break; default: throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); From 106bf13056969d13e01582a7088d168142d73519 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 12:05:50 -0500 Subject: [PATCH 05/18] Use a thread local result object to collect the results of the exact calculation instead of passing in multiple pre-allocated arrays. --- .../AlleleFrequencyCalculationModel.java | 7 +-- .../genotyper/ExactAFCalculationModel.java | 22 +++---- .../genotyper/UnifiedGenotyperEngine.java | 58 +++++++++---------- 3 files changed, 39 insertions(+), 48 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 7d3e7047d..681cc1fa6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.List; @@ -65,11 +64,9 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @param GLs genotype likelihoods * @param Alleles Alleles corresponding to GLs * @param log10AlleleFrequencyPriors priors - * @param log10AlleleFrequencyLikelihoods array (pre-allocated) to store likelihoods results - * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store posteriors results + * @param result (pre-allocated) object to store likelihoods results */ protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, double[][] log10AlleleFrequencyPriors, - double[][] log10AlleleFrequencyLikelihoods, - double[][] log10AlleleFrequencyPosteriors); + AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index ed86897f2..33634ce28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -47,12 +47,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public void getLog10PNonRef(final GenotypesContext GLs, final List alleles, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors) { + final AlleleFrequencyCalculationResult result) { final int numAlleles = alleles.size(); //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); } private static final ArrayList getGLs(GenotypesContext GLs) { @@ -196,8 +195,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors, + final AlleleFrequencyCalculationResult result, final boolean preserveData) { final ArrayList genotypeLikelihoods = getGLs(GLs); @@ -221,7 +219,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); @@ -236,14 +234,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final Queue ACqueue, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors) { + final AlleleFrequencyCalculationResult result) { if ( DEBUG ) System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result); // clean up memory if ( !preserveData ) { @@ -349,8 +346,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final ArrayList genotypeLikelihoods, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors) { + final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -410,11 +406,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { int AC = set.ACcounts.getCounts()[i]; - log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyLikelihoods[i][AC], log10LofK); + result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); // for k=0 we still want to use theta final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 2308e1759..5d271cdb1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -75,14 +75,13 @@ public class UnifiedGenotyperEngine { // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); + // the allele frequency likelihoods (allocated once as an optimization) + private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[][] log10AlleleFrequencyPriorsSNPs; private final double[][] log10AlleleFrequencyPriorsIndels; - // the allele frequency likelihoods (allocated once as an optimization) - private ThreadLocal log10AlleleFrequencyLikelihoods = new ThreadLocal(); - private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); - // the priors object private final GenotypePriors genotypePriorsSNPs; private final GenotypePriors genotypePriorsIndels; @@ -264,9 +263,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]); - log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); } // don't try to genotype too many alternate alleles @@ -285,9 +283,9 @@ public class UnifiedGenotyperEngine { } // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; @@ -299,7 +297,7 @@ public class UnifiedGenotyperEngine { // determine which alternate alleles have AF>0 boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]); + int indexOfBestAC = MathUtils.maxElementIndex(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]); // if the most likely AC is not 0, then this is a good alternate allele to use if ( indexOfBestAC != 0 ) { @@ -320,7 +318,7 @@ public class UnifiedGenotyperEngine { // calculate p(f>0) // TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]); + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[indexOfHighestAlt]); double sum = 0.0; for (int i = 1; i <= N; i++) sum += normalizedPosteriors[i]; @@ -330,15 +328,15 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0]; + phredScaledConfidence = -10.0 * alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { sum = 0.0; for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) break; - sum += log10AlleleFrequencyPosteriors.get()[0][i]; + sum += alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i]; } phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } @@ -396,31 +394,31 @@ public class UnifiedGenotyperEngine { // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); + double overallLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0]; - double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); + double forwardLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; + double forwardLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0]; - double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); + double reverseLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; + double reverseLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -587,10 +585,10 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) + if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) AFline.append("0.00000000\t"); else - AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); verboseWriter.println(AFline.toString()); } From 988d60091f79b1a58117f423e649e9d7ed4b5813 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 13:37:15 -0500 Subject: [PATCH 07/18] Forgot to add in the new result class --- .../AlleleFrequencyCalculationResult.java | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java new file mode 100644 index 000000000..66106f658 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Dec 14, 2011 + * + * Useful helper class to communicate the results of the allele frequency calculation + */ +public class AlleleFrequencyCalculationResult { + + final double[][] log10AlleleFrequencyLikelihoods; + final double[][] log10AlleleFrequencyPosteriors; + + AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { + log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; + log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + } +} \ No newline at end of file From 1e90d602a4c1628a0dde9764466f2c41c8aa8f9b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 13:38:20 -0500 Subject: [PATCH 08/18] Optimization: cache up front the PL index to the pair of alleles it represents for all possible numbers of alternate alleles. --- .../genotyper/ExactAFCalculationModel.java | 26 +++++++----------- .../genotyper/UnifiedGenotyperEngine.java | 27 +++++++++++++++++-- .../ExactAFCalculationModelUnitTest.java | 11 ++++---- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 33634ce28..bc27916dd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -55,7 +55,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } private static final ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(); // TODO -- initialize with size of GLs + ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { @@ -198,6 +198,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final AlleleFrequencyCalculationResult result, final boolean preserveData) { + // make sure the PL cache has been initialized + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) + UnifiedGenotyperEngine.calculatePLcache(5); + final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -415,10 +419,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - // todo -- arent' there a small number of fixed values that this function can adopt? - // todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance - // todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient - // todo -- this can be computed once at the start of the all operations // the closed form representation generalized for multiple alleles is as follows: // AA: (2j - totalK) * (2j - totalK - 1) @@ -434,25 +434,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( PLindex <= numAltAlleles ) return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; - int subtractor = numAltAlleles+1; - int subtractions = 0; - do { - PLindex -= subtractor; - subtractor--; - subtractions++; - } - while ( PLindex >= subtractor ); + // find the 2 alternate alleles that are represented by this PL index + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex]; - final int k_i = ACcounts[subtractions-1]; + final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele // the hom var case (e.g. BB, CC, DD) final double coeff; - if ( PLindex == 0 ) { + if ( alleles[0] == alleles[1] ) { coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; } // the het non-ref case (e.g. BC, BD, CD) else { - final int k_j = ACcounts[subtractions+PLindex-1]; + final int k_j = ACcounts[alleles[1]-1]; coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5d271cdb1..221d00410 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -102,6 +102,8 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; + // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + protected static int[][][] PLIndexToAlleleIndex; // --------------------------------------------------------------------------------------------------------- @@ -135,6 +137,27 @@ public class UnifiedGenotyperEngine { genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); filter.add(LOW_QUAL_FILTER_NAME); + calculatePLcache(UAC.MAX_ALTERNATE_ALLELES); + } + + protected static void calculatePLcache(int maxAltAlleles) { + PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; + PLIndexToAlleleIndex[0] = null; + int numLikelihoods = 1; + + // for each count of alternate alleles + for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) { + numLikelihoods += altAlleles + 1; + PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][]; + int PLindex = 0; + + // for all possible combinations of the 2 alt alleles + for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { + for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { + PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 }; + } + } + } } /** @@ -751,8 +774,8 @@ public class UnifiedGenotyperEngine { * * @return genotypes */ - public GenotypesContext assignGenotypes(VariantContext vc, - boolean[] allelesToUse) { + public GenotypesContext assignGenotypes(final VariantContext vc, + final boolean[] allelesToUse) { final GenotypesContext GLs = vc.getGenotypes(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 9640a8963..dec6ecb79 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -83,21 +83,20 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1]; - final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1]; + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); for ( int i = 0; i < 2; i++ ) { for ( int j = 0; j < 2*numSamples+1; j++ ) { - log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; } } - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } From 35fc2e13c3677a2ff241447bf74e53a077d2b12d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 15:31:09 -0500 Subject: [PATCH 09/18] Using the new PL cache, fix a bug: when only a subset of the genotyped alleles are used for assigning genotypes (because the exact model determined that they weren't all real) the PLs need to be adjusted to reflect this. While fixing this I discovered that the integration tests are busted because ref calls (ALT=.) were getting annotated with PLs, which makes no sense at all. --- ...NPGenotypeLikelihoodsCalculationModel.java | 14 +-- .../genotyper/UnifiedGenotyperEngine.java | 107 +++++++++--------- .../org/broadinstitute/sting/utils/Utils.java | 9 ++ 3 files changed, 66 insertions(+), 64 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 4087443f8..57cc5594a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -95,7 +96,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); // how many alternate alleles are we using? - int alleleCounter = countSetBits(basesToUse); + int alleleCounter = Utils.countSetBits(basesToUse); // if there are no non-ref alleles... if ( alleleCounter == 0 ) { @@ -109,7 +110,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC } // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) - final int numAltAlleles = countSetBits(basesToUse); + final int numAltAlleles = Utils.countSetBits(basesToUse); final int[] alleleOrdering = new int[numAltAlleles + 1]; alleleOrdering[0] = indexOfRefBase; int alleleOrderingIndex = 1; @@ -161,15 +162,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return builder.genotypes(genotypes).make(); } - private int countSetBits(boolean[] array) { - int counter = 0; - for ( int i = 0; i < array.length; i++ ) { - if ( array[i] ) - counter++; - } - return counter; - } - // fills in the allelesToUse array protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { int[] qualCounts = new int[4]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 221d00410..f185af0c3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -103,6 +103,7 @@ public class UnifiedGenotyperEngine { private final boolean BAQEnabledOnCMDLine; // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + // the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)] protected static int[][][] PLIndexToAlleleIndex; @@ -142,7 +143,7 @@ public class UnifiedGenotyperEngine { protected static void calculatePLcache(int maxAltAlleles) { PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; - PLIndexToAlleleIndex[0] = null; + PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} }; int numLikelihoods = 1; // for each count of alternate alleles @@ -399,7 +400,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); + GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse, myAlleles); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) @@ -771,82 +772,82 @@ public class UnifiedGenotyperEngine { /** * @param vc variant context with genotype likelihoods * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use + * @param newAlleles a list of the final new alleles to use * * @return genotypes */ public GenotypesContext assignGenotypes(final VariantContext vc, - final boolean[] allelesToUse) { + final boolean[] allelesToUse, + final List newAlleles) { + // the no-called genotypes final GenotypesContext GLs = vc.getGenotypes(); + // samples final List sampleIndices = GLs.getSampleNamesOrderedByName(); + // the new called genotypes to create final GenotypesContext calls = GenotypesContext.create(); + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = allelesToUse.length; + final int numNewAltAlleles = newAlleles.size() - 1; + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + + for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { + int[] alleles = PLcache[PLindex]; + // consider this entry only if both of the alleles are good + if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } + + // create the new genotypes for ( int k = GLs.size() - 1; k >= 0; k-- ) { final String sample = sampleIndices.get(k); final Genotype g = GLs.get(sample); if ( !g.hasLikelihoods() ) continue; - final double[] likelihoods = g.getLikelihoods().getAsVector(); + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + final double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + } - // if there is no mass on the likelihoods, then just no-call the sample - if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) { + // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); continue; } - // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. - // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. - - final int numAltAlleles = allelesToUse.length; - - // start with the assumption that the ideal genotype is homozygous reference - Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference(); - double maxLikelihoodSeen = likelihoods[0]; - int indexOfMax = 0; - - // keep track of some state - Allele firstAllele = vc.getReference(); - int subtractor = numAltAlleles + 1; - int subtractionsMade = 0; - - for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) { - if ( PLindex == subtractor ) { - firstAllele = vc.getAlternateAllele(subtractionsMade); - PLindex -= subtractor; - subtractor--; - subtractionsMade++; - - // we can skip this allele if it's not usable - if ( !allelesToUse[subtractionsMade-1] ) { - i += subtractor - 1; - PLindex += subtractor - 1; - continue; - } - } - - // we don't care about the entry if we've already seen better - if ( likelihoods[i] <= maxLikelihoodSeen ) - continue; - - // if it's usable then update the alleles - int alleleIndex = subtractionsMade + PLindex - 1; - if ( allelesToUse[alleleIndex] ) { - maxAllele1 = firstAllele; - maxAllele2 = vc.getAlternateAllele(alleleIndex); - maxLikelihoodSeen = likelihoods[i]; - indexOfMax = i; - } - } + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; ArrayList myAlleles = new ArrayList(); - myAlleles.add(maxAllele1); - myAlleles.add(maxAllele2); + myAlleles.add(newAlleles.get(alleles[0])); + myAlleles.add(newAlleles.get(alleles[1])); - final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods); - calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, newLikelihoods); + calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); } return calls; diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f0eb5d399..b79770eb5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -388,6 +388,15 @@ public class Utils { return reallocate(pos, z); } + public static int countSetBits(boolean[] array) { + int counter = 0; + for ( int i = 0; i < array.length; i++ ) { + if ( array[i] ) + counter++; + } + return counter; + } + /** * Returns new (reallocated) integer array of the specified size, with content * of the original array orig copied into it. If newSize is From 71b4bb12b7903e1542334b80a96ea3345a486967 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Dec 2011 09:58:23 -0500 Subject: [PATCH 11/18] Bug fix for incorrect logic in subsetSamples -- Now properly handles the case where a sample isn't present (no longer adds a null to the genotypes list) -- Fix for logic failure where if the number of requested samples equals the number of known genotypes then all of the records were returned, which isn't correct when there are missing samples. -- Unit tests added to handle these cases --- .../utils/variantcontext/GenotypesContext.java | 15 ++++++++++----- .../variantcontext/VariantContextUnitTest.java | 3 +++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index 845c65c9c..85f7cc078 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -410,6 +410,12 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + /** + * Gets sample associated with this sampleName, or null if none is found + * + * @param sampleName + * @return + */ public Genotype get(final String sampleName) { Integer offset = getSampleI(sampleName); return offset == null ? null : getGenotypes().get(offset); @@ -648,16 +654,15 @@ public class GenotypesContext implements List { @Ensures("result != null") public GenotypesContext subsetToSamples( final Set samples ) { final int nSamples = samples.size(); - final int nGenotypes = size(); - if ( nSamples == nGenotypes ) - return this; - else if ( nSamples == 0 ) + if ( nSamples == 0 ) return NO_GENOTYPES; else { // nGenotypes < nSamples final GenotypesContext subset = create(samples.size()); for ( final String sample : samples ) { - subset.add(get(sample)); + final Genotype g = get(sample); + if ( g != null ) + subset.add(g); } return subset; } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index fca7440e4..0e75eee14 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -704,11 +704,14 @@ public class VariantContextUnitTest extends BaseTest { public Object[][] MakeSubContextTest() { for ( boolean updateAlleles : Arrays.asList(true, false)) { new SubContextTest(Collections.emptySet(), updateAlleles); + new SubContextTest(Collections.singleton("MISSING"), updateAlleles); new SubContextTest(Collections.singleton("AA"), updateAlleles); new SubContextTest(Collections.singleton("AT"), updateAlleles); new SubContextTest(Collections.singleton("TT"), updateAlleles); new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles); new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles); + new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles); + new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles); } return SubContextTest.getTests(SubContextTest.class); From 01e547eed3b70f8c1c5fac507136264fe9f08200 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Dec 2011 10:00:21 -0500 Subject: [PATCH 12/18] Parallel SAMDataSource initialization -- Uses 8 threads to load BAM files and indices in parallel, decreasing costs to read thousands of BAM files by a significant amount -- Added logger.info message noting progress and cost of reading low-level BAM data. --- .../gatk/datasources/reads/SAMDataSource.java | 131 +++++++++++++----- 1 file changed, 97 insertions(+), 34 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d70c63bd2..aacdf9b95 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -51,6 +52,7 @@ import java.io.File; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; +import java.util.concurrent.*; /** * User: aaron @@ -199,7 +201,7 @@ public class SAMDataSource { BAQ.QualityMode.DONT_MODIFY, null, // no BAQ (byte) -1); - } + } /** * Create a new SAM data source given the supplied read metadata. @@ -253,17 +255,15 @@ public class SAMDataSource { if(readBufferSize != null) ReadShard.setReadBufferSize(readBufferSize); - for (SAMReaderID readerID : samFiles) { - if (!readerID.samFile.canRead()) - throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + - "Please check that the file is present and readable and try again."); - } - resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); // Determine the sort order. for(SAMReaderID readerID: readerIDs) { + if (! readerID.samFile.canRead() ) + throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + + "Please check that the file is present and readable and try again."); + // Get the sort order, forcing it to coordinate if unsorted. SAMFileReader reader = readers.getReader(readerID); SAMFileHeader header = reader.getFileHeader(); @@ -711,29 +711,68 @@ public class SAMDataSource { * @param validationStringency validation stringency. */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { + final int N_THREADS = 8; int totalNumberOfFiles = readerIDs.size(); int readerNumber = 1; - for(SAMReaderID readerID: readerIDs) { - File indexFile = findIndexFile(readerID.samFile); - SAMFileReader reader = null; - - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(blockInputStream,indexFile,false); - inputStreams.put(readerID,blockInputStream); - } - else - reader = new SAMFileReader(readerID.samFile,indexFile,false); - reader.setSAMRecordFactory(factory); - - reader.enableFileSource(true); - reader.setValidationStringency(validationStringency); - - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); - - readers.put(readerID,reader); + ExecutorService executor = Executors.newFixedThreadPool(N_THREADS); + final List inits = new ArrayList(totalNumberOfFiles); + Queue> futures = new LinkedList>(); + for (SAMReaderID readerID: readerIDs) { + logger.debug("Enqueuing for initialization: " + readerID.samFile); + final ReaderInitializer init = new ReaderInitializer(readerID); + inits.add(init); + futures.add(executor.submit(init)); } + + final SimpleTimer timer = new SimpleTimer(); + try { + final int MAX_WAIT = 30 * 1000; + final int MIN_WAIT = 1 * 1000; + + timer.start(); + while ( ! futures.isEmpty() ) { + final int prevSize = futures.size(); + final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file + final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT)); + Thread.sleep(waitTimeInMS); + + Queue> pending = new LinkedList>(); + for ( final Future initFuture : futures ) { + if ( initFuture.isDone() ) { + final ReaderInitializer init = initFuture.get(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID)); + readers.put(init.readerID, init.reader); + } else { + pending.add(initFuture); + } + } + + final int pendingSize = pending.size(); + final int nExecutedInTick = prevSize - pendingSize; + final int nExecutedTotal = totalNumberOfFiles - pendingSize; + final double totalTimeInSeconds = timer.getElapsedTime(); + final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); + final int nRemaining = pendingSize; + final double estTimeToComplete = pendingSize / nTasksPerSecond; + logger.info(String.format("Init %d BAMs in last %d s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", + nExecutedInTick, (int)(waitTimeInMS / 1000.0), + nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, + nRemaining, estTimeToComplete, estTimeToComplete / 60)); + + futures = pending; + } + } catch ( InterruptedException e ) { + throw new ReviewedStingException("Interrupted SAMReader initialization", e); + } catch ( ExecutionException e ) { + throw new ReviewedStingException("Execution exception during SAMReader initialization", e); + } + + logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); + executor.shutdown(); } /** @@ -806,6 +845,30 @@ public class SAMDataSource { } } + class ReaderInitializer implements Callable { + final SAMReaderID readerID; + BlockInputStream blockInputStream = null; + SAMFileReader reader; + + public ReaderInitializer(final SAMReaderID readerID) { + this.readerID = readerID; + } + + public ReaderInitializer call() { + final File indexFile = findIndexFile(readerID.samFile); + if (threadAllocation.getNumIOThreads() > 0) { + blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); + reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); + reader.setValidationStringency(validationStringency); + return this; + } + } + private class ReleasingIterator implements StingSAMIterator { /** * The resource acting as the source of the data. @@ -988,12 +1051,12 @@ public class SAMDataSource { return // Read ends on a later contig, or... read.getReferenceIndex() > intervalContigIndices[currentBound] || - // Read ends of this contig... - (read.getReferenceIndex() == intervalContigIndices[currentBound] && - // either after this location, or... - (read.getAlignmentEnd() >= intervalStarts[currentBound] || - // read is unmapped but positioned and alignment start is on or after this start point. - (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + // Read ends of this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or... + (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); } /** @@ -1005,8 +1068,8 @@ public class SAMDataSource { return // Read starts on a prior contig, or... read.getReferenceIndex() < intervalContigIndices[currentBound] || - // Read starts on this contig and the alignment start is registered before this end point. - (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + // Read starts on this contig and the alignment start is registered before this end point. + (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); } } From 4fddac9f22335388c13ce4c1719a6c99f56354b3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 16:24:43 -0500 Subject: [PATCH 14/18] Updating busted integration tests --- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 7 +++++-- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f185af0c3..7e5f388b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -817,7 +817,7 @@ public class UnifiedGenotyperEngine { // create the new likelihoods array from the alleles we are allowed to use final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - final double[] newLikelihoods; + double[] newLikelihoods; if ( likelihoodIndexesToUse == null ) { newLikelihoods = originalLikelihoods; } else { @@ -825,6 +825,9 @@ public class UnifiedGenotyperEngine { int newIndex = 0; for ( int oldIndex : likelihoodIndexesToUse ) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); } // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample @@ -846,7 +849,7 @@ public class UnifiedGenotyperEngine { if ( numNewAltAlleles == 0 ) attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); else - attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, newLikelihoods); + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4ae00431c..6041f80b8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "94e53320f14c5ff29d62f68d36b46fcd" ); - e.put( "--output_mode EMIT_ALL_SITES", "73ad1cc41786b12c5f0e6f3e9ec2b728" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "274bd9d1b9c7857690fa5f0228ffc6d7" ); + e.put( "--output_mode EMIT_ALL_SITES", "594c6d3c48bbc73289de7727d768644d" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( From c85100ce9cee310c013b8564a308dcd608f96098 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 15:34:11 -0500 Subject: [PATCH 15/18] Fix ClippingOp bug when performing multiple hardclip ops bug: When performing multiple hard clip operations in a read that has indels, if the N+1 hardclip requests to clip inside an indel that has been removed by one of the (1..N) previous hardclips, the hard clipper would go out of bounds. fix: dynamically adjust the boundaries according to the new hardclipped read length. (this maintains the current contract that hardclipping will never return a read starting or ending in indels). --- .../sting/utils/clipreads/ReadClipper.java | 9 ++++++++- .../sting/utils/clipreads/ClipReadsTestUtils.java | 2 +- .../sting/utils/clipreads/ReadClipperUnitTest.java | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 8c1061494..c3684034c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -168,7 +168,14 @@ public class ReadClipper { try { GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); for (ClippingOp op : getOps()) { - clippedRead = op.apply(algorithm, clippedRead); + //check if the clipped read can still be clipped in the range requested + if (op.start < clippedRead.getReadLength()) { + ClippingOp fixedOperation = op; + if (op.stop > clippedRead.getReadLength()) + fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); + + clippedRead = fixedOperation.apply(algorithm, clippedRead); + } } wasClipped = true; ops.clear(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java index a5524e6f1..4a5b9107b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java @@ -48,7 +48,7 @@ public class ClipReadsTestUtils { Assert.assertTrue(read.isEmpty()); } - private static byte[] subtractToArray(byte[] array, int n) { + public static byte[] subtractToArray(byte[] array, int n) { if (array == null) return null; diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index ff33e3184..264b73663 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -231,7 +231,7 @@ public class ReadClipperUnitTest extends BaseTest { @Test(enabled = true) public void testHardClipLowQualEnds() { // Needs a thorough redesign - logger.warn("Executing testHardClipByReferenceCoordinates"); + logger.warn("Executing testHardClipLowQualEnds"); //Clip whole read Assert.assertEquals(readClipper.hardClipLowQualEnds((byte) 64), new GATKSAMRecord(readClipper.read.getHeader())); From 128bdf9c091cf4bb48ee34b2bc5c7f1d454314fe Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 16:51:55 -0500 Subject: [PATCH 16/18] Create artificial reads with "default" parameters * added functions to create synthetic reads for unit testing with reasonable default parameters * added more functions to create synthetic reads based on cigar string + bases and quals. --- .../sting/utils/sam/ArtificialSAMUtils.java | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 26fabade2..cedd56bdf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -200,6 +200,48 @@ public class ArtificialSAMUtils { return rec; } + /** + * Create an artificial read based on the parameters + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar ) { + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); + rec.setCigarString(cigar); + return rec; + } + + /** + * Create an artificial read with the following default parameters : + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 + * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); + } + + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); From 50dee86d7f76db5b57bd2b5c674f3f6b64854661 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 16:52:43 -0500 Subject: [PATCH 17/18] Added unit test to catch Ryan's exception Unit test to catch the special case that broke the clipping op, fixed in the previous commit. --- .../utils/clipreads/ClipReadsTestUtils.java | 12 +++++ .../utils/clipreads/ReadClipperUnitTest.java | 44 ++++++------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java index 4a5b9107b..de9d8fb50 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java @@ -24,6 +24,18 @@ public class ClipReadsTestUtils { final static String BASES = "ACTG"; final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { + // If they're both not empty, test their contents + if(!actual.isEmpty() && !expected.isEmpty()) { + Assert.assertEquals(actual.getReadBases(), expected.getReadBases()); + Assert.assertEquals(actual.getBaseQualities(), expected.getBaseQualities()); + Assert.assertEquals(actual.getCigarString(), expected.getCigarString()); + } + // Otherwise test if they're both empty + else + Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); + } + public static void testBaseQualCigar(GATKSAMRecord read, byte[] readBases, byte[] baseQuals, String cigar) { // Because quals to char start at 33 for visibility baseQuals = subtractToArray(baseQuals, 33); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index 264b73663..650d3f26e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; @@ -230,42 +231,25 @@ public class ReadClipperUnitTest extends BaseTest { @Test(enabled = true) public void testHardClipLowQualEnds() { - // Needs a thorough redesign logger.warn("Executing testHardClipLowQualEnds"); - //Clip whole read - Assert.assertEquals(readClipper.hardClipLowQualEnds((byte) 64), new GATKSAMRecord(readClipper.read.getHeader())); + // Testing clipping that ends inside an insertion + final byte[] BASES = {'A','C','G','T','A','C','G','T'}; + final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; + final String CIGAR = "1S1M5I1S"; - List testList = new LinkedList(); - testList.add(new TestParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(11, -1, 2, 4, "2H2M"));//clip 2 bases at start + final byte[] CLIPPED_BASES = {}; + final byte[] CLIPPED_QUALS = {}; + final String CLIPPED_CIGAR = ""; - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipLowQualEnds((byte) p.inputStart), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - /* todo find a better way to test lowqual tail clipping on both sides - // Reverse Quals sequence - readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - testList = new LinkedList(); - testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end - testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR); + + ReadClipper lowQualClipper = new ReadClipper(read); + ClipReadsTestUtils.assertEqualReads(lowQualClipper.hardClipLowQualEnds((byte) 2), expected); + - for ( testParameter p : testList ) { - init(); - readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); - testBaseQualCigar( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), - BASES.substring(p.substringStart,p.substringStop).getBytes(), - QUALS.substring(p.substringStart,p.substringStop), - p.cigar ); - } - */ } @Test(enabled = false)