From 079932ba2a2c0c26965640c79247d3cf1b6faec5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 13 Dec 2011 23:36:10 -0500 Subject: [PATCH 01/29] The log10cache needs to be larger if we want to handle 10K samples in the UG. --- .../java/src/org/broadinstitute/sting/utils/MathUtils.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index f92d4be78..759e1649d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1033,14 +1033,15 @@ public class MathUtils { public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0/JACOBIAN_LOG_TABLE_STEP; public static final double MAX_JACOBIAN_TOLERANCE = 10.0; - private static final int MAXN = 10000; + private static final int MAXN = 11000; + private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients static { - log10Cache = new double[2*MAXN]; + log10Cache = new double[LOG10_CACHE_SIZE]; jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < 2*MAXN; k++) + for (int k=1; k < LOG10_CACHE_SIZE; k++) log10Cache[k] = Math.log10(k); for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { From d3f4a5a9017b914894a73134abb8c7dbd5751ee6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 10:37:38 -0500 Subject: [PATCH 03/29] Fail gracefully when encountering malformed VCFs without enough data columns --- .../org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java | 2 ++ .../src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index aaa2e63a7..b3329c708 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -120,6 +120,8 @@ public class VCF3Codec extends AbstractVCFCodec { genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); ArrayList genotypes = new ArrayList(nParts); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4c1bb1d9e..453155be7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -147,6 +147,8 @@ public class VCFCodec extends AbstractVCFCodec { genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); ArrayList genotypes = new ArrayList(nParts); From 09a5a9eac08ec4b26983bff3c73a0373a91ca688 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 10:43:52 -0500 Subject: [PATCH 04/29] Don't update lineNo for decodeLoc - only for decode (otherwise they get double-counted). Even still, because of the way the GATK currently utilizes Tribble we can parse the same line multiple times, which knocks the line counter out of sync. For now, I've added a TODO in the code to remind us and the error messages note that it's an approximate line number. --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 3 ++- .../broadinstitute/sting/utils/exceptions/UserException.java | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 3009c236b..b902f220f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -184,7 +184,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - lineNo++; // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -279,6 +278,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { builder.source(getName()); // increment the line count + // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when + // TODO -- the first record is far along the contig) and the line counter can get out of sync lineNo++; // parse out the required fields diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index c599d4759..a2816b58f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -184,11 +184,11 @@ public class UserException extends ReviewedStingException { public static class MalformedVCF extends UserException { public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + super(String.format("The provided VCF file is malformed at approximately line %s: %s", line, message)); } public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message)); + super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); } } From 9497e9492cb6904fac16349ae5cdaadb8a4350d8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 11:21:28 -0500 Subject: [PATCH 05/29] Bug fix for complex records: do not ever reverse clip out a complete allele. --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index b902f220f..e44c10f1f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -595,6 +595,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { if ( a.isSymbolic() ) continue; + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - 1; + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) stillClipping = false; else if ( ref.length() == clipping ) From 76485217184bc6f6891f7a3a76380b38a3fceb6f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 11:26:43 -0500 Subject: [PATCH 06/29] Add check for mixed genotype so that we don't exception out for a valid record --- .../gatk/walkers/varianteval/evaluators/CountVariants.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index c740eb78c..e5e8dfaf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -182,6 +182,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nHomDerived++; } + break; + case MIXED: break; default: throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); From 106bf13056969d13e01582a7088d168142d73519 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 12:05:50 -0500 Subject: [PATCH 07/29] Use a thread local result object to collect the results of the exact calculation instead of passing in multiple pre-allocated arrays. --- .../AlleleFrequencyCalculationModel.java | 7 +-- .../genotyper/ExactAFCalculationModel.java | 22 +++---- .../genotyper/UnifiedGenotyperEngine.java | 58 +++++++++---------- 3 files changed, 39 insertions(+), 48 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 7d3e7047d..681cc1fa6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.List; @@ -65,11 +64,9 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @param GLs genotype likelihoods * @param Alleles Alleles corresponding to GLs * @param log10AlleleFrequencyPriors priors - * @param log10AlleleFrequencyLikelihoods array (pre-allocated) to store likelihoods results - * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store posteriors results + * @param result (pre-allocated) object to store likelihoods results */ protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, double[][] log10AlleleFrequencyPriors, - double[][] log10AlleleFrequencyLikelihoods, - double[][] log10AlleleFrequencyPosteriors); + AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index ed86897f2..33634ce28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -47,12 +47,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public void getLog10PNonRef(final GenotypesContext GLs, final List alleles, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors) { + final AlleleFrequencyCalculationResult result) { final int numAlleles = alleles.size(); //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); } private static final ArrayList getGLs(GenotypesContext GLs) { @@ -196,8 +195,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors, + final AlleleFrequencyCalculationResult result, final boolean preserveData) { final ArrayList genotypeLikelihoods = getGLs(GLs); @@ -221,7 +219,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); @@ -236,14 +234,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final Queue ACqueue, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors) { + final AlleleFrequencyCalculationResult result) { if ( DEBUG ) System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result); // clean up memory if ( !preserveData ) { @@ -349,8 +346,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final ArrayList genotypeLikelihoods, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final double[][] log10AlleleFrequencyLikelihoods, - final double[][] log10AlleleFrequencyPosteriors) { + final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -410,11 +406,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { int AC = set.ACcounts.getCounts()[i]; - log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyLikelihoods[i][AC], log10LofK); + result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); // for k=0 we still want to use theta final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 2308e1759..5d271cdb1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -75,14 +75,13 @@ public class UnifiedGenotyperEngine { // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); + // the allele frequency likelihoods (allocated once as an optimization) + private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[][] log10AlleleFrequencyPriorsSNPs; private final double[][] log10AlleleFrequencyPriorsIndels; - // the allele frequency likelihoods (allocated once as an optimization) - private ThreadLocal log10AlleleFrequencyLikelihoods = new ThreadLocal(); - private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); - // the priors object private final GenotypePriors genotypePriorsSNPs; private final GenotypePriors genotypePriorsIndels; @@ -264,9 +263,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]); - log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); } // don't try to genotype too many alternate alleles @@ -285,9 +283,9 @@ public class UnifiedGenotyperEngine { } // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; @@ -299,7 +297,7 @@ public class UnifiedGenotyperEngine { // determine which alternate alleles have AF>0 boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]); + int indexOfBestAC = MathUtils.maxElementIndex(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]); // if the most likely AC is not 0, then this is a good alternate allele to use if ( indexOfBestAC != 0 ) { @@ -320,7 +318,7 @@ public class UnifiedGenotyperEngine { // calculate p(f>0) // TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]); + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[indexOfHighestAlt]); double sum = 0.0; for (int i = 1; i <= N; i++) sum += normalizedPosteriors[i]; @@ -330,15 +328,15 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0]; + phredScaledConfidence = -10.0 * alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { sum = 0.0; for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) break; - sum += log10AlleleFrequencyPosteriors.get()[0][i]; + sum += alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i]; } phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } @@ -396,31 +394,31 @@ public class UnifiedGenotyperEngine { // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); + double overallLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0]; - double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); + double forwardLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; + double forwardLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyLikelihoods.get()); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods); + clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get()); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0]; - double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); + double reverseLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0]; + double reverseLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -587,10 +585,10 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) + if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) AFline.append("0.00000000\t"); else - AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); verboseWriter.println(AFline.toString()); } From 988d60091f79b1a58117f423e649e9d7ed4b5813 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 13:37:15 -0500 Subject: [PATCH 09/29] Forgot to add in the new result class --- .../AlleleFrequencyCalculationResult.java | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java new file mode 100644 index 000000000..66106f658 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Dec 14, 2011 + * + * Useful helper class to communicate the results of the allele frequency calculation + */ +public class AlleleFrequencyCalculationResult { + + final double[][] log10AlleleFrequencyLikelihoods; + final double[][] log10AlleleFrequencyPosteriors; + + AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { + log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; + log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + } +} \ No newline at end of file From 1e90d602a4c1628a0dde9764466f2c41c8aa8f9b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 13:38:20 -0500 Subject: [PATCH 10/29] Optimization: cache up front the PL index to the pair of alleles it represents for all possible numbers of alternate alleles. --- .../genotyper/ExactAFCalculationModel.java | 26 +++++++----------- .../genotyper/UnifiedGenotyperEngine.java | 27 +++++++++++++++++-- .../ExactAFCalculationModelUnitTest.java | 11 ++++---- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 33634ce28..bc27916dd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -55,7 +55,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } private static final ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(); // TODO -- initialize with size of GLs + ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { @@ -198,6 +198,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final AlleleFrequencyCalculationResult result, final boolean preserveData) { + // make sure the PL cache has been initialized + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) + UnifiedGenotyperEngine.calculatePLcache(5); + final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -415,10 +419,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - // todo -- arent' there a small number of fixed values that this function can adopt? - // todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance - // todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient - // todo -- this can be computed once at the start of the all operations // the closed form representation generalized for multiple alleles is as follows: // AA: (2j - totalK) * (2j - totalK - 1) @@ -434,25 +434,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( PLindex <= numAltAlleles ) return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; - int subtractor = numAltAlleles+1; - int subtractions = 0; - do { - PLindex -= subtractor; - subtractor--; - subtractions++; - } - while ( PLindex >= subtractor ); + // find the 2 alternate alleles that are represented by this PL index + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex]; - final int k_i = ACcounts[subtractions-1]; + final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele // the hom var case (e.g. BB, CC, DD) final double coeff; - if ( PLindex == 0 ) { + if ( alleles[0] == alleles[1] ) { coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; } // the het non-ref case (e.g. BC, BD, CD) else { - final int k_j = ACcounts[subtractions+PLindex-1]; + final int k_j = ACcounts[alleles[1]-1]; coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5d271cdb1..221d00410 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -102,6 +102,8 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; + // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + protected static int[][][] PLIndexToAlleleIndex; // --------------------------------------------------------------------------------------------------------- @@ -135,6 +137,27 @@ public class UnifiedGenotyperEngine { genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); filter.add(LOW_QUAL_FILTER_NAME); + calculatePLcache(UAC.MAX_ALTERNATE_ALLELES); + } + + protected static void calculatePLcache(int maxAltAlleles) { + PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; + PLIndexToAlleleIndex[0] = null; + int numLikelihoods = 1; + + // for each count of alternate alleles + for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) { + numLikelihoods += altAlleles + 1; + PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][]; + int PLindex = 0; + + // for all possible combinations of the 2 alt alleles + for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { + for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { + PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 }; + } + } + } } /** @@ -751,8 +774,8 @@ public class UnifiedGenotyperEngine { * * @return genotypes */ - public GenotypesContext assignGenotypes(VariantContext vc, - boolean[] allelesToUse) { + public GenotypesContext assignGenotypes(final VariantContext vc, + final boolean[] allelesToUse) { final GenotypesContext GLs = vc.getGenotypes(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 9640a8963..dec6ecb79 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -83,21 +83,20 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1]; - final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1]; + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); for ( int i = 0; i < 2; i++ ) { for ( int j = 0; j < 2*numSamples+1; j++ ) { - log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; } } - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } From 35fc2e13c3677a2ff241447bf74e53a077d2b12d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 15:31:09 -0500 Subject: [PATCH 11/29] Using the new PL cache, fix a bug: when only a subset of the genotyped alleles are used for assigning genotypes (because the exact model determined that they weren't all real) the PLs need to be adjusted to reflect this. While fixing this I discovered that the integration tests are busted because ref calls (ALT=.) were getting annotated with PLs, which makes no sense at all. --- ...NPGenotypeLikelihoodsCalculationModel.java | 14 +-- .../genotyper/UnifiedGenotyperEngine.java | 107 +++++++++--------- .../org/broadinstitute/sting/utils/Utils.java | 9 ++ 3 files changed, 66 insertions(+), 64 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 4087443f8..57cc5594a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -95,7 +96,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); // how many alternate alleles are we using? - int alleleCounter = countSetBits(basesToUse); + int alleleCounter = Utils.countSetBits(basesToUse); // if there are no non-ref alleles... if ( alleleCounter == 0 ) { @@ -109,7 +110,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC } // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) - final int numAltAlleles = countSetBits(basesToUse); + final int numAltAlleles = Utils.countSetBits(basesToUse); final int[] alleleOrdering = new int[numAltAlleles + 1]; alleleOrdering[0] = indexOfRefBase; int alleleOrderingIndex = 1; @@ -161,15 +162,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return builder.genotypes(genotypes).make(); } - private int countSetBits(boolean[] array) { - int counter = 0; - for ( int i = 0; i < array.length; i++ ) { - if ( array[i] ) - counter++; - } - return counter; - } - // fills in the allelesToUse array protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { int[] qualCounts = new int[4]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 221d00410..f185af0c3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -103,6 +103,7 @@ public class UnifiedGenotyperEngine { private final boolean BAQEnabledOnCMDLine; // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + // the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)] protected static int[][][] PLIndexToAlleleIndex; @@ -142,7 +143,7 @@ public class UnifiedGenotyperEngine { protected static void calculatePLcache(int maxAltAlleles) { PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; - PLIndexToAlleleIndex[0] = null; + PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} }; int numLikelihoods = 1; // for each count of alternate alleles @@ -399,7 +400,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); + GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse, myAlleles); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) @@ -771,82 +772,82 @@ public class UnifiedGenotyperEngine { /** * @param vc variant context with genotype likelihoods * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use + * @param newAlleles a list of the final new alleles to use * * @return genotypes */ public GenotypesContext assignGenotypes(final VariantContext vc, - final boolean[] allelesToUse) { + final boolean[] allelesToUse, + final List newAlleles) { + // the no-called genotypes final GenotypesContext GLs = vc.getGenotypes(); + // samples final List sampleIndices = GLs.getSampleNamesOrderedByName(); + // the new called genotypes to create final GenotypesContext calls = GenotypesContext.create(); + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = allelesToUse.length; + final int numNewAltAlleles = newAlleles.size() - 1; + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + + for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { + int[] alleles = PLcache[PLindex]; + // consider this entry only if both of the alleles are good + if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } + + // create the new genotypes for ( int k = GLs.size() - 1; k >= 0; k-- ) { final String sample = sampleIndices.get(k); final Genotype g = GLs.get(sample); if ( !g.hasLikelihoods() ) continue; - final double[] likelihoods = g.getLikelihoods().getAsVector(); + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + final double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + } - // if there is no mass on the likelihoods, then just no-call the sample - if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) { + // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); continue; } - // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. - // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. - - final int numAltAlleles = allelesToUse.length; - - // start with the assumption that the ideal genotype is homozygous reference - Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference(); - double maxLikelihoodSeen = likelihoods[0]; - int indexOfMax = 0; - - // keep track of some state - Allele firstAllele = vc.getReference(); - int subtractor = numAltAlleles + 1; - int subtractionsMade = 0; - - for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) { - if ( PLindex == subtractor ) { - firstAllele = vc.getAlternateAllele(subtractionsMade); - PLindex -= subtractor; - subtractor--; - subtractionsMade++; - - // we can skip this allele if it's not usable - if ( !allelesToUse[subtractionsMade-1] ) { - i += subtractor - 1; - PLindex += subtractor - 1; - continue; - } - } - - // we don't care about the entry if we've already seen better - if ( likelihoods[i] <= maxLikelihoodSeen ) - continue; - - // if it's usable then update the alleles - int alleleIndex = subtractionsMade + PLindex - 1; - if ( allelesToUse[alleleIndex] ) { - maxAllele1 = firstAllele; - maxAllele2 = vc.getAlternateAllele(alleleIndex); - maxLikelihoodSeen = likelihoods[i]; - indexOfMax = i; - } - } + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; ArrayList myAlleles = new ArrayList(); - myAlleles.add(maxAllele1); - myAlleles.add(maxAllele2); + myAlleles.add(newAlleles.get(alleles[0])); + myAlleles.add(newAlleles.get(alleles[1])); - final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods); - calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, newLikelihoods); + calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); } return calls; diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f0eb5d399..b79770eb5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -388,6 +388,15 @@ public class Utils { return reallocate(pos, z); } + public static int countSetBits(boolean[] array) { + int counter = 0; + for ( int i = 0; i < array.length; i++ ) { + if ( array[i] ) + counter++; + } + return counter; + } + /** * Returns new (reallocated) integer array of the specified size, with content * of the original array orig copied into it. If newSize is From 71b4bb12b7903e1542334b80a96ea3345a486967 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Dec 2011 09:58:23 -0500 Subject: [PATCH 13/29] Bug fix for incorrect logic in subsetSamples -- Now properly handles the case where a sample isn't present (no longer adds a null to the genotypes list) -- Fix for logic failure where if the number of requested samples equals the number of known genotypes then all of the records were returned, which isn't correct when there are missing samples. -- Unit tests added to handle these cases --- .../utils/variantcontext/GenotypesContext.java | 15 ++++++++++----- .../variantcontext/VariantContextUnitTest.java | 3 +++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index 845c65c9c..85f7cc078 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -410,6 +410,12 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + /** + * Gets sample associated with this sampleName, or null if none is found + * + * @param sampleName + * @return + */ public Genotype get(final String sampleName) { Integer offset = getSampleI(sampleName); return offset == null ? null : getGenotypes().get(offset); @@ -648,16 +654,15 @@ public class GenotypesContext implements List { @Ensures("result != null") public GenotypesContext subsetToSamples( final Set samples ) { final int nSamples = samples.size(); - final int nGenotypes = size(); - if ( nSamples == nGenotypes ) - return this; - else if ( nSamples == 0 ) + if ( nSamples == 0 ) return NO_GENOTYPES; else { // nGenotypes < nSamples final GenotypesContext subset = create(samples.size()); for ( final String sample : samples ) { - subset.add(get(sample)); + final Genotype g = get(sample); + if ( g != null ) + subset.add(g); } return subset; } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index fca7440e4..0e75eee14 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -704,11 +704,14 @@ public class VariantContextUnitTest extends BaseTest { public Object[][] MakeSubContextTest() { for ( boolean updateAlleles : Arrays.asList(true, false)) { new SubContextTest(Collections.emptySet(), updateAlleles); + new SubContextTest(Collections.singleton("MISSING"), updateAlleles); new SubContextTest(Collections.singleton("AA"), updateAlleles); new SubContextTest(Collections.singleton("AT"), updateAlleles); new SubContextTest(Collections.singleton("TT"), updateAlleles); new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles); new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles); + new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles); + new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles); } return SubContextTest.getTests(SubContextTest.class); From 01e547eed3b70f8c1c5fac507136264fe9f08200 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Dec 2011 10:00:21 -0500 Subject: [PATCH 14/29] Parallel SAMDataSource initialization -- Uses 8 threads to load BAM files and indices in parallel, decreasing costs to read thousands of BAM files by a significant amount -- Added logger.info message noting progress and cost of reading low-level BAM data. --- .../gatk/datasources/reads/SAMDataSource.java | 131 +++++++++++++----- 1 file changed, 97 insertions(+), 34 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d70c63bd2..aacdf9b95 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -51,6 +52,7 @@ import java.io.File; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; +import java.util.concurrent.*; /** * User: aaron @@ -199,7 +201,7 @@ public class SAMDataSource { BAQ.QualityMode.DONT_MODIFY, null, // no BAQ (byte) -1); - } + } /** * Create a new SAM data source given the supplied read metadata. @@ -253,17 +255,15 @@ public class SAMDataSource { if(readBufferSize != null) ReadShard.setReadBufferSize(readBufferSize); - for (SAMReaderID readerID : samFiles) { - if (!readerID.samFile.canRead()) - throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + - "Please check that the file is present and readable and try again."); - } - resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); // Determine the sort order. for(SAMReaderID readerID: readerIDs) { + if (! readerID.samFile.canRead() ) + throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + + "Please check that the file is present and readable and try again."); + // Get the sort order, forcing it to coordinate if unsorted. SAMFileReader reader = readers.getReader(readerID); SAMFileHeader header = reader.getFileHeader(); @@ -711,29 +711,68 @@ public class SAMDataSource { * @param validationStringency validation stringency. */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { + final int N_THREADS = 8; int totalNumberOfFiles = readerIDs.size(); int readerNumber = 1; - for(SAMReaderID readerID: readerIDs) { - File indexFile = findIndexFile(readerID.samFile); - SAMFileReader reader = null; - - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(blockInputStream,indexFile,false); - inputStreams.put(readerID,blockInputStream); - } - else - reader = new SAMFileReader(readerID.samFile,indexFile,false); - reader.setSAMRecordFactory(factory); - - reader.enableFileSource(true); - reader.setValidationStringency(validationStringency); - - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); - - readers.put(readerID,reader); + ExecutorService executor = Executors.newFixedThreadPool(N_THREADS); + final List inits = new ArrayList(totalNumberOfFiles); + Queue> futures = new LinkedList>(); + for (SAMReaderID readerID: readerIDs) { + logger.debug("Enqueuing for initialization: " + readerID.samFile); + final ReaderInitializer init = new ReaderInitializer(readerID); + inits.add(init); + futures.add(executor.submit(init)); } + + final SimpleTimer timer = new SimpleTimer(); + try { + final int MAX_WAIT = 30 * 1000; + final int MIN_WAIT = 1 * 1000; + + timer.start(); + while ( ! futures.isEmpty() ) { + final int prevSize = futures.size(); + final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file + final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT)); + Thread.sleep(waitTimeInMS); + + Queue> pending = new LinkedList>(); + for ( final Future initFuture : futures ) { + if ( initFuture.isDone() ) { + final ReaderInitializer init = initFuture.get(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID)); + readers.put(init.readerID, init.reader); + } else { + pending.add(initFuture); + } + } + + final int pendingSize = pending.size(); + final int nExecutedInTick = prevSize - pendingSize; + final int nExecutedTotal = totalNumberOfFiles - pendingSize; + final double totalTimeInSeconds = timer.getElapsedTime(); + final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); + final int nRemaining = pendingSize; + final double estTimeToComplete = pendingSize / nTasksPerSecond; + logger.info(String.format("Init %d BAMs in last %d s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", + nExecutedInTick, (int)(waitTimeInMS / 1000.0), + nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, + nRemaining, estTimeToComplete, estTimeToComplete / 60)); + + futures = pending; + } + } catch ( InterruptedException e ) { + throw new ReviewedStingException("Interrupted SAMReader initialization", e); + } catch ( ExecutionException e ) { + throw new ReviewedStingException("Execution exception during SAMReader initialization", e); + } + + logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); + executor.shutdown(); } /** @@ -806,6 +845,30 @@ public class SAMDataSource { } } + class ReaderInitializer implements Callable { + final SAMReaderID readerID; + BlockInputStream blockInputStream = null; + SAMFileReader reader; + + public ReaderInitializer(final SAMReaderID readerID) { + this.readerID = readerID; + } + + public ReaderInitializer call() { + final File indexFile = findIndexFile(readerID.samFile); + if (threadAllocation.getNumIOThreads() > 0) { + blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); + reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); + reader.setValidationStringency(validationStringency); + return this; + } + } + private class ReleasingIterator implements StingSAMIterator { /** * The resource acting as the source of the data. @@ -988,12 +1051,12 @@ public class SAMDataSource { return // Read ends on a later contig, or... read.getReferenceIndex() > intervalContigIndices[currentBound] || - // Read ends of this contig... - (read.getReferenceIndex() == intervalContigIndices[currentBound] && - // either after this location, or... - (read.getAlignmentEnd() >= intervalStarts[currentBound] || - // read is unmapped but positioned and alignment start is on or after this start point. - (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + // Read ends of this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or... + (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); } /** @@ -1005,8 +1068,8 @@ public class SAMDataSource { return // Read starts on a prior contig, or... read.getReferenceIndex() < intervalContigIndices[currentBound] || - // Read starts on this contig and the alignment start is registered before this end point. - (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + // Read starts on this contig and the alignment start is registered before this end point. + (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); } } From 4fddac9f22335388c13ce4c1719a6c99f56354b3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 16:24:43 -0500 Subject: [PATCH 16/29] Updating busted integration tests --- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 7 +++++-- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f185af0c3..7e5f388b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -817,7 +817,7 @@ public class UnifiedGenotyperEngine { // create the new likelihoods array from the alleles we are allowed to use final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - final double[] newLikelihoods; + double[] newLikelihoods; if ( likelihoodIndexesToUse == null ) { newLikelihoods = originalLikelihoods; } else { @@ -825,6 +825,9 @@ public class UnifiedGenotyperEngine { int newIndex = 0; for ( int oldIndex : likelihoodIndexesToUse ) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); } // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample @@ -846,7 +849,7 @@ public class UnifiedGenotyperEngine { if ( numNewAltAlleles == 0 ) attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); else - attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, newLikelihoods); + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4ae00431c..6041f80b8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "94e53320f14c5ff29d62f68d36b46fcd" ); - e.put( "--output_mode EMIT_ALL_SITES", "73ad1cc41786b12c5f0e6f3e9ec2b728" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "274bd9d1b9c7857690fa5f0228ffc6d7" ); + e.put( "--output_mode EMIT_ALL_SITES", "594c6d3c48bbc73289de7727d768644d" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( From c85100ce9cee310c013b8564a308dcd608f96098 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 15:34:11 -0500 Subject: [PATCH 17/29] Fix ClippingOp bug when performing multiple hardclip ops bug: When performing multiple hard clip operations in a read that has indels, if the N+1 hardclip requests to clip inside an indel that has been removed by one of the (1..N) previous hardclips, the hard clipper would go out of bounds. fix: dynamically adjust the boundaries according to the new hardclipped read length. (this maintains the current contract that hardclipping will never return a read starting or ending in indels). --- .../sting/utils/clipreads/ReadClipper.java | 9 ++++++++- .../sting/utils/clipreads/ClipReadsTestUtils.java | 2 +- .../sting/utils/clipreads/ReadClipperUnitTest.java | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 8c1061494..c3684034c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -168,7 +168,14 @@ public class ReadClipper { try { GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); for (ClippingOp op : getOps()) { - clippedRead = op.apply(algorithm, clippedRead); + //check if the clipped read can still be clipped in the range requested + if (op.start < clippedRead.getReadLength()) { + ClippingOp fixedOperation = op; + if (op.stop > clippedRead.getReadLength()) + fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); + + clippedRead = fixedOperation.apply(algorithm, clippedRead); + } } wasClipped = true; ops.clear(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java index a5524e6f1..4a5b9107b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java @@ -48,7 +48,7 @@ public class ClipReadsTestUtils { Assert.assertTrue(read.isEmpty()); } - private static byte[] subtractToArray(byte[] array, int n) { + public static byte[] subtractToArray(byte[] array, int n) { if (array == null) return null; diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index ff33e3184..264b73663 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -231,7 +231,7 @@ public class ReadClipperUnitTest extends BaseTest { @Test(enabled = true) public void testHardClipLowQualEnds() { // Needs a thorough redesign - logger.warn("Executing testHardClipByReferenceCoordinates"); + logger.warn("Executing testHardClipLowQualEnds"); //Clip whole read Assert.assertEquals(readClipper.hardClipLowQualEnds((byte) 64), new GATKSAMRecord(readClipper.read.getHeader())); From 128bdf9c091cf4bb48ee34b2bc5c7f1d454314fe Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 16:51:55 -0500 Subject: [PATCH 18/29] Create artificial reads with "default" parameters * added functions to create synthetic reads for unit testing with reasonable default parameters * added more functions to create synthetic reads based on cigar string + bases and quals. --- .../sting/utils/sam/ArtificialSAMUtils.java | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 26fabade2..cedd56bdf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -200,6 +200,48 @@ public class ArtificialSAMUtils { return rec; } + /** + * Create an artificial read based on the parameters + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar ) { + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); + rec.setCigarString(cigar); + return rec; + } + + /** + * Create an artificial read with the following default parameters : + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 + * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); + } + + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); From 50dee86d7f76db5b57bd2b5c674f3f6b64854661 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 16:52:43 -0500 Subject: [PATCH 19/29] Added unit test to catch Ryan's exception Unit test to catch the special case that broke the clipping op, fixed in the previous commit. --- .../utils/clipreads/ClipReadsTestUtils.java | 12 +++++ .../utils/clipreads/ReadClipperUnitTest.java | 44 ++++++------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java index 4a5b9107b..de9d8fb50 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java @@ -24,6 +24,18 @@ public class ClipReadsTestUtils { final static String BASES = "ACTG"; final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { + // If they're both not empty, test their contents + if(!actual.isEmpty() && !expected.isEmpty()) { + Assert.assertEquals(actual.getReadBases(), expected.getReadBases()); + Assert.assertEquals(actual.getBaseQualities(), expected.getBaseQualities()); + Assert.assertEquals(actual.getCigarString(), expected.getCigarString()); + } + // Otherwise test if they're both empty + else + Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); + } + public static void testBaseQualCigar(GATKSAMRecord read, byte[] readBases, byte[] baseQuals, String cigar) { // Because quals to char start at 33 for visibility baseQuals = subtractToArray(baseQuals, 33); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index 264b73663..650d3f26e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; @@ -230,42 +231,25 @@ public class ReadClipperUnitTest extends BaseTest { @Test(enabled = true) public void testHardClipLowQualEnds() { - // Needs a thorough redesign logger.warn("Executing testHardClipLowQualEnds"); - //Clip whole read - Assert.assertEquals(readClipper.hardClipLowQualEnds((byte) 64), new GATKSAMRecord(readClipper.read.getHeader())); + // Testing clipping that ends inside an insertion + final byte[] BASES = {'A','C','G','T','A','C','G','T'}; + final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; + final String CIGAR = "1S1M5I1S"; - List testList = new LinkedList(); - testList.add(new TestParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(11, -1, 2, 4, "2H2M"));//clip 2 bases at start + final byte[] CLIPPED_BASES = {}; + final byte[] CLIPPED_QUALS = {}; + final String CLIPPED_CIGAR = ""; - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipLowQualEnds((byte) p.inputStart), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - /* todo find a better way to test lowqual tail clipping on both sides - // Reverse Quals sequence - readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - testList = new LinkedList(); - testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end - testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR); + + ReadClipper lowQualClipper = new ReadClipper(read); + ClipReadsTestUtils.assertEqualReads(lowQualClipper.hardClipLowQualEnds((byte) 2), expected); + - for ( testParameter p : testList ) { - init(); - readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); - testBaseQualCigar( readClipper.hardClipLowQualEnds( (byte)p.inputStart ), - BASES.substring(p.substringStart,p.substringStop).getBytes(), - QUALS.substring(p.substringStart,p.substringStop), - p.cigar ); - } - */ } @Test(enabled = false) From 6fb4be1a0985c7895c5cf65099feb920075f97b5 Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 14 Dec 2011 18:05:31 -0500 Subject: [PATCH 21/29] Cache header merger. --- .../sting/gatk/datasources/reads/SAMDataSource.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d70c63bd2..fdaca54de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -95,6 +95,11 @@ public class SAMDataSource { */ private final Map readerPositions = new HashMap(); + /** + * Cached representation of the merged header used to generate a merging iterator. + */ + private final SamFileHeaderMerger headerMerger; + /** * The merged header. */ @@ -287,7 +292,7 @@ public class SAMDataSource { initializeReaderPositions(readers); - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); + headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); mergedHeader = headerMerger.getMergedHeader(); hasReadGroupCollisions = headerMerger.hasReadGroupCollisions(); @@ -535,8 +540,6 @@ public class SAMDataSource { * @return An iterator over the selected data. */ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - // Set up merging to dynamically merge together multiple BAMs. MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); From 62a2e335bc1edb6b95cb2165b3032ea19842b9c1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 15 Dec 2011 11:08:19 -0500 Subject: [PATCH 24/29] Changing HardClipper contract to allow UNMAPPED reads shifted the contract to functions that operate on reference based coordinates. The clipper should do the right thing with unmapped reads, but it needs more testing (Ryan is using it at the moment and says it works). Will write some unit tests. --- .../org/broadinstitute/sting/utils/clipreads/ClippingOp.java | 2 +- .../org/broadinstitute/sting/utils/clipreads/ReadClipper.java | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 4a253b217..97855080c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -247,7 +247,7 @@ public class ClippingOp { return newCigar; } - @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"}) + @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { if (start == 0 && stop == read.getReadLength() - 1) return new GATKSAMRecord(read.getHeader()); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index c3684034c..4df899888 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -58,6 +58,7 @@ public class ReadClipper { return hardClipByReferenceCoordinates(refStart, -1); } + @Requires("!read.getReadUnmappedFlag()") protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); From 4748ae0a140613f56c47013c78a2deb0146c6c40 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 19:17:49 -0500 Subject: [PATCH 25/29] Bugfix: Softclips before Hardclips weren't being accounted for caught a bug in the hard clipper where it does not account for hard clipping softclipped bases in the resulting cigar string, if there is already a hard clipped base immediately after it. * updated unit test for hardClipSoftClippedBases with corresponding test-case. --- .../sting/utils/clipreads/ClippingOp.java | 4 + .../utils/clipreads/ReadClipperUnitTest.java | 97 ++++++++----------- 2 files changed, 42 insertions(+), 59 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 97855080c..06985041e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -373,6 +373,10 @@ public class ClippingOp { while(cigarElementIterator.hasNext()) { cigarElement = cigarElementIterator.next(); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + + // if the read had a HardClip operator in the end, combine it with the Hard Clip we are adding + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += cigarElement.getLength(); } newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index 650d3f26e..f998a4e78 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -252,77 +252,56 @@ public class ReadClipperUnitTest extends BaseTest { } - @Test(enabled = false) + @Test(enabled = true) public void testHardClipSoftClippedBases() { // Generate a list of cigars to test for (Cigar cigar : ClipReadsTestUtils.generateCigars()) { - //logger.warn("Testing Cigar: "+cigar.toString()); - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar(cigar)); + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + readClipper = new ReadClipper(read); + GATKSAMRecord clippedRead = readClipper.hardClipSoftClippedBases(); - int clipStart = 0; - int clipEnd = 0; - boolean expectEmptyRead = false; + int sumHardClips = 0; + int sumMatches = 0; - List cigarElements = cigar.getCigarElements(); - int CigarListLength = cigarElements.size(); + boolean tail = true; + for (CigarElement element : read.getCigar().getCigarElements()) { + // Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right) + if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP) + tail = true; - // It will know what needs to be clipped based on the start and end of the string, hardclips and softclips - // are added to the amount to clip - if (cigarElements.get(0).getOperator() == CigarOperator.HARD_CLIP) { - //clipStart += cigarElements.get(0).getLength(); - if (cigarElements.get(1).getOperator() == CigarOperator.SOFT_CLIP) { - clipStart += cigarElements.get(1).getLength(); - // Check for leading indel - if (cigarElements.get(2).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; - } - } - // Check for leading indel - else if (cigarElements.get(1).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; - } - } else if (cigarElements.get(0).getOperator() == CigarOperator.SOFT_CLIP) { - clipStart += cigarElements.get(0).getLength(); - // Check for leading indel - if (cigarElements.get(1).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; - } - } - //Check for leading indel - else if (cigarElements.get(0).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; + // Adds all H, S and D's (next to hard/soft clips). + // All these should be hard clips after clipping. + if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION)) + sumHardClips += element.getLength(); + + // this means we're no longer on the tail (insertions can still potentially be the tail because + // of the current contract of clipping out hanging insertions + else if (element.getOperator() != CigarOperator.INSERTION) + tail = false; + + // Adds all matches to verify that they remain the same after clipping + if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + sumMatches += element.getLength(); } - if (cigarElements.get(CigarListLength - 1).getOperator() == CigarOperator.HARD_CLIP) { - //clipEnd += cigarElements.get(CigarListLength - 1).getLength(); - if (cigarElements.get(CigarListLength - 2).getOperator() == CigarOperator.SOFT_CLIP) - clipEnd += cigarElements.get(CigarListLength - 2).getLength(); - } else if (cigarElements.get(CigarListLength - 1).getOperator() == CigarOperator.SOFT_CLIP) - clipEnd += cigarElements.get(CigarListLength - 1).getLength(); + for (CigarElement element : clippedRead.getCigar().getCigarElements()) { + // Test if clipped read has Soft Clips (shouldn't have any!) + Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString())); - String readBases = readClipper.read.getReadString(); - String baseQuals = readClipper.read.getBaseQualityString(); + // Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for + if (element.getOperator() == CigarOperator.HARD_CLIP) + sumHardClips -= element.getLength(); - // "*" is the default empty-sequence-string and for our test it needs to be changed to "" - if (readBases.equals("*")) - readBases = ""; - if (baseQuals.equals("*")) - baseQuals = ""; + // Make sure all matches are still there + if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + sumMatches -= element.getLength(); + } + Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips)); + Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - logger.warn(String.format("Testing cigar %s, expecting Base: %s and Qual: %s", - cigar.toString(), readBases.substring(clipStart, readBases.length() - clipEnd), - baseQuals.substring(clipStart, baseQuals.length() - clipEnd))); - //if (expectEmptyRead) - // testBaseQual( readClipper.hardClipSoftClippedBases(), new byte[0], new byte[0] ); - //else - ClipReadsTestUtils.testBaseQual(readClipper.hardClipSoftClippedBases(), - readBases.substring(clipStart, readBases.length() - clipEnd).getBytes(), - baseQuals.substring(clipStart, baseQuals.length() - clipEnd).getBytes()); - logger.warn("Cigar: " + cigar.toString() + " PASSED!"); + + logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); } - // We will use testParameter in the following way - // Right tail, left tail, - } } \ No newline at end of file From e61e5c75891154393af93c9c529b60d446bf4bac Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 15 Dec 2011 13:09:46 -0500 Subject: [PATCH 27/29] Refactor of ReadClipper unit tests * expanded the systematic cigar string space test framework Roger wrote to all tests * moved utility functions into Utils and ReadUtils * cleaned up unused classes --- .../org/broadinstitute/sting/utils/Utils.java | 14 + .../sting/utils/clipreads/ReadClipper.java | 2 +- .../sting/utils/sam/ReadUtils.java | 5 + .../utils/clipreads/CigarStringTestPair.java | 18 -- .../utils/clipreads/ClipReadsTestUtils.java | 306 +++++++++--------- .../utils/clipreads/ClippingOpUnitTest.java | 66 ++-- .../utils/clipreads/ReadClipperUnitTest.java | 233 +++++-------- .../sting/utils/clipreads/TestParameter.java | 24 -- 8 files changed, 280 insertions(+), 388 deletions(-) delete mode 100644 public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java delete mode 100644 public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b79770eb5..10bc050da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -654,4 +654,18 @@ public class Utils { // handle exception } } + + + public static byte [] arrayFromArrayWithLength(byte[] array, int length) { + byte [] output = new byte[length]; + for (int j = 0; j < length; j++) + output[j] = array[(j % array.length)]; + return output; + } + + public static void fillArrayWithByte(byte[] array, byte value) { + for (int i=0; i clippedRead.getReadLength()) + if (op.stop >= clippedRead.getReadLength()) fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); clippedRead = fixedOperation.apply(algorithm, clippedRead); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 8d9018045..ac1f00437 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -910,4 +910,9 @@ public class ReadUtils { return comp.compare(read1, read2); } + // TEST UTILITIES + + + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java deleted file mode 100644 index cc9021fae..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.broadinstitute.sting.utils.clipreads; - -/** - * Created by IntelliJ IDEA. - * User: roger - * Date: 11/29/11 - * Time: 4:53 PM - * To change this template use File | Settings | File Templates. - */ -public class CigarStringTestPair { - public String toTest; - public String expected; - - public CigarStringTestPair(String ToTest, String Expected) { - this.toTest = ToTest; - this.expected = Expected; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java index de9d8fb50..c6dc3a833 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java @@ -2,8 +2,8 @@ package org.broadinstitute.sting.utils.clipreads; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.TextCigarCodec; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -21,9 +21,149 @@ public class ClipReadsTestUtils { //Should contain all the utils needed for tests to mass produce //reads, cigars, and other needed classes - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + final static byte [] BASES = {'A', 'C', 'T', 'G'}; + final static byte [] QUALS = {2, 15, 25, 30}; + final static String CIGAR = "4M"; + final static CigarElement[] cigarElements = { new CigarElement(1, CigarOperator.HARD_CLIP), + new CigarElement(1, CigarOperator.SOFT_CLIP), + new CigarElement(1, CigarOperator.INSERTION), + new CigarElement(1, CigarOperator.DELETION), + new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH)}; + + public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { + return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); + } + + /** + * This function generates every valid permutation of cigar strings with a given length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ + public static List generateCigarList(int maximumLength) { + int numCigarElements = cigarElements.length; + LinkedList cigarList = new LinkedList(); + byte [] cigarCombination = new byte[maximumLength]; + + Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array. + int currentIndex = 0; + while (true) { + Cigar cigar = createCigarFromCombination(cigarCombination); // create the cigar + cigar = combineAdjacentCigarElements(cigar); // combine adjacent elements + if (isCigarValid(cigar)) { // check if it's valid + cigarList.add(cigar); // add it + } + + boolean currentIndexChanged = false; + while (currentIndex < maximumLength && cigarCombination[currentIndex] == numCigarElements - 1) { + currentIndex++; // find the next index to increment + currentIndexChanged = true; // keep track of the fact that we have changed indices! + } + + if (currentIndex == maximumLength) // if we hit the end of the array, we're done. + break; + + cigarCombination[currentIndex]++; // otherwise advance the current index + + if (currentIndexChanged) { // if we have changed index, then... + for (int i = 0; i < currentIndex; i++) + cigarCombination[i] = 0; // reset everything from 0->currentIndex + currentIndex = 0; // go back to the first index + } + } + + return cigarList; + } + + private static boolean isCigarValid(Cigar cigar) { + if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) + + Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator + CigarOperator startingOp = null; + CigarOperator endingOp = null; + + // check if it doesn't start with deletions + boolean readHasStarted = false; // search the list of elements for the starting operator + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (!readHasStarted) { + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + startingOp = cigarElement.getOperator(); + } + } + cigarElementStack.push(cigarElement); + } + + readHasStarted = false; // search the inverted list of elements (stack) for the stopping operator + while (!cigarElementStack.empty()) { + CigarElement cigarElement = cigarElementStack.pop(); + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + endingOp = cigarElement.getOperator(); + break; + } + } + + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) + } + + return false; + } + + private static Cigar createCigarFromCombination(byte[] cigarCombination) { + Cigar cigar = new Cigar(); + for (byte i : cigarCombination) { + cigar.add(cigarElements[i]); + } + return cigar; + } + + + /** + * Combines equal adjacent elements of a Cigar object + * + * @param rawCigar the cigar object + * @return a combined cigar object + */ + private static Cigar combineAdjacentCigarElements(Cigar rawCigar) { + Cigar combinedCigar = new Cigar(); + CigarElement lastElement = null; + int lastElementLength = 0; + for (CigarElement cigarElement : rawCigar.getCigarElements()) { + if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) + lastElementLength += cigarElement.getLength(); + else + { + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + lastElement = cigarElement; + lastElementLength = cigarElement.getLength(); + } + } + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + return combinedCigar; + } + + public static GATKSAMRecord makeRead() { + return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + } + + /** + * Asserts that the two reads have the same bases, qualities and cigar strings + * + * @param actual the calculated read + * @param expected the expected read + */ public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { // If they're both not empty, test their contents if(!actual.isEmpty() && !expected.isEmpty()) { @@ -36,162 +176,4 @@ public class ClipReadsTestUtils { Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); } - public static void testBaseQualCigar(GATKSAMRecord read, byte[] readBases, byte[] baseQuals, String cigar) { - // Because quals to char start at 33 for visibility - baseQuals = subtractToArray(baseQuals, 33); - - Assert.assertEquals(read.getReadBases(), readBases); - Assert.assertEquals(read.getBaseQualities(), baseQuals); - Assert.assertEquals(read.getCigarString(), cigar); - } - - public static void testCigar(GATKSAMRecord read, String cigar) { - Assert.assertEquals(read.getCigarString(), cigar); - } - - public static void testBaseQual(GATKSAMRecord read, byte[] readBases, byte[] baseQuals) { - // Because quals to chars start at 33 for visibility - baseQuals = subtractToArray(baseQuals, 33); - - if (readBases.length > 0 && baseQuals.length > 0) { - Assert.assertEquals(read.getReadBases(), readBases); - Assert.assertEquals(read.getBaseQualities(), baseQuals); - } else - Assert.assertTrue(read.isEmpty()); - } - - public static byte[] subtractToArray(byte[] array, int n) { - if (array == null) - return null; - - byte[] output = new byte[array.length]; - - for (int i = 0; i < array.length; i++) - output[i] = (byte) (array[i] - n); - - return output; - } - - // What the test read looks like - // Ref: 10 11 12 13 14 15 16 17 - // Read: 0 1 2 3 - - - - - // -------------------------------- - // Bases: A C T G - - - - - // Quals: ! + 5 ? - - - - - - public static GATKSAMRecord makeRead() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - GATKSAMRecord output = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, BASES.length()); - output.setReadBases(new String(BASES).getBytes()); - output.setBaseQualityString(new String(QUALS)); - - return output; - } - - public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - GATKSAMRecord output = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, cigar.getReadLength()); - output.setReadBases(cycleString(BASES, cigar.getReadLength()).getBytes()); - output.setBaseQualityString(cycleString(QUALS, cigar.getReadLength())); - output.setCigar(cigar); - - return output; - } - - private static String cycleString(String string, int length) { - String output = ""; - int cycles = (length / string.length()) + 1; - - for (int i = 1; i < cycles; i++) - output += string; - - for (int j = 0; output.length() < length; j++) - output += string.charAt(j % string.length()); - - return output; - } - - public static Set generateCigars() { - - // This function generates every permutation of cigar strings we need. - - LinkedHashSet output = new LinkedHashSet(); - - List clippingOptionsStart = new LinkedList(); - clippingOptionsStart.add(new Cigar()); - clippingOptionsStart.add(TextCigarCodec.getSingleton().decode("1H1S")); - clippingOptionsStart.add(TextCigarCodec.getSingleton().decode("1S")); - clippingOptionsStart.add(TextCigarCodec.getSingleton().decode("1H")); - - LinkedList clippingOptionsEnd = new LinkedList(); - clippingOptionsEnd.add(new Cigar()); - clippingOptionsEnd.add(TextCigarCodec.getSingleton().decode("1S1H")); - clippingOptionsEnd.add(TextCigarCodec.getSingleton().decode("1S")); - clippingOptionsEnd.add(TextCigarCodec.getSingleton().decode("1H")); - - - LinkedList indelOptions1 = new LinkedList(); - indelOptions1.add(new Cigar()); - //indelOptions1.add( TextCigarCodec.getSingleton().decode("1I1D")); - //indelOptions1.add( TextCigarCodec.getSingleton().decode("1D1I") ); - indelOptions1.add(TextCigarCodec.getSingleton().decode("1I")); - indelOptions1.add(TextCigarCodec.getSingleton().decode("1D")); - - LinkedList indelOptions2 = new LinkedList(); - indelOptions2.add(new Cigar()); - indelOptions2.add(TextCigarCodec.getSingleton().decode("1I")); - indelOptions2.add(null); - - - // Start With M as base CigarElements, M, - - LinkedList base = new LinkedList(); - base.add(TextCigarCodec.getSingleton().decode("1M")); - base.add(TextCigarCodec.getSingleton().decode("5M")); - base.add(TextCigarCodec.getSingleton().decode("25M")); - // Should indel be added as a base? - - // Nested loops W00t! - for (Cigar Base : base) { - for (Cigar indelStart : indelOptions1) { - for (Cigar indelEnd : indelOptions2) { - for (Cigar clipStart : clippingOptionsStart) { - for (Cigar clipEnd : clippingOptionsEnd) { - // Create a list of Cigar Elements and construct Cigar - List CigarBuilder = new ArrayList(); - // add starting clipping (H/S) - CigarBuilder.addAll(clipStart.getCigarElements()); - // add first base (M) - CigarBuilder.addAll(Base.getCigarElements()); - // add first indel - CigarBuilder.addAll(indelStart.getCigarElements()); - // add second base (M) - CigarBuilder.addAll(Base.getCigarElements()); - // add another indel or nothing (M) - if (indelEnd != null) - CigarBuilder.addAll(indelEnd.getCigarElements()); - // add final clipping (S/H) - CigarBuilder.addAll(clipEnd.getCigarElements()); - - - output.add(new Cigar(removeConsecutiveElements(CigarBuilder))); - - } - } - } - } - } - - return output; - } - - private static List removeConsecutiveElements(List cigarBuilder) { - LinkedList output = new LinkedList(); - for (CigarElement E : cigarBuilder) { - if (output.isEmpty() || output.getLast().getOperator() != E.getOperator()) - output.add(E); - } - return output; - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java index 719d04287..dd674ff1c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java @@ -5,8 +5,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import java.util.LinkedList; -import java.util.List; /** * Created by IntelliJ IDEA. @@ -27,43 +25,43 @@ public class ClippingOpUnitTest extends BaseTest { @Test private void testHardClip() { - List testList = new LinkedList(); - testList.add(new TestParameter(0, 0, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(3, 3, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(0, 1, 2, 4, "2H2M"));//clip 2 bases at start - testList.add(new TestParameter(2, 3, 0, 2, "2M2H"));//clip 2 bases at end - testList.add(new TestParameter(0, 2, 3, 4, "3H1M"));//clip 3 bases at start - testList.add(new TestParameter(1, 3, 0, 1, "1M3H"));//clip 3 bases at end - - for (TestParameter p : testList) { - init(); - clippingOp = new ClippingOp(p.inputStart, p.inputStop); - logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.substringStart + "," + p.substringStop + "," + p.cigar); - ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.HARDCLIP_BASES, read), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } +// List testList = new LinkedList(); +// testList.add(new TestParameter(0, 0, 1, 4, "1H3M"));//clip 1 base at start +// testList.add(new TestParameter(3, 3, 0, 3, "3M1H"));//clip 1 base at end +// testList.add(new TestParameter(0, 1, 2, 4, "2H2M"));//clip 2 bases at start +// testList.add(new TestParameter(2, 3, 0, 2, "2M2H"));//clip 2 bases at end +// testList.add(new TestParameter(0, 2, 3, 4, "3H1M"));//clip 3 bases at start +// testList.add(new TestParameter(1, 3, 0, 1, "1M3H"));//clip 3 bases at end +// +// for (TestParameter p : testList) { +// init(); +// clippingOp = new ClippingOp(p.inputStart, p.inputStop); +// logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.substringStart + "," + p.substringStop + "," + p.cigar); +// ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.HARDCLIP_BASES, read), +// ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), +// ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), +// p.cigar); +// } } @Test private void testSoftClip() { - List testList = new LinkedList(); - testList.add(new TestParameter(0, 0, -1, -1, "1S3M"));//clip 1 base at start - testList.add(new TestParameter(3, 3, -1, -1, "3M1S"));//clip 1 base at end - testList.add(new TestParameter(0, 1, -1, -1, "2S2M"));//clip 2 bases at start - testList.add(new TestParameter(2, 3, -1, -1, "2M2S"));//clip 2 bases at end - testList.add(new TestParameter(0, 2, -1, -1, "3S1M"));//clip 3 bases at start - testList.add(new TestParameter(1, 3, -1, -1, "1M3S"));//clip 3 bases at end - - for (TestParameter p : testList) { - init(); - clippingOp = new ClippingOp(p.inputStart, p.inputStop); - logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.cigar); - ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.SOFTCLIP_BASES, read), - ClipReadsTestUtils.BASES.getBytes(), ClipReadsTestUtils.QUALS.getBytes(), p.cigar); - } +// List testList = new LinkedList(); +// testList.add(new TestParameter(0, 0, -1, -1, "1S3M"));//clip 1 base at start +// testList.add(new TestParameter(3, 3, -1, -1, "3M1S"));//clip 1 base at end +// testList.add(new TestParameter(0, 1, -1, -1, "2S2M"));//clip 2 bases at start +// testList.add(new TestParameter(2, 3, -1, -1, "2M2S"));//clip 2 bases at end +// testList.add(new TestParameter(0, 2, -1, -1, "3S1M"));//clip 3 bases at start +// testList.add(new TestParameter(1, 3, -1, -1, "1M3S"));//clip 3 bases at end +// +// for (TestParameter p : testList) { +// init(); +// clippingOp = new ClippingOp(p.inputStart, p.inputStop); +// logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.cigar); +// ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.SOFTCLIP_BASES, read), +// ClipReadsTestUtils.BASES.getBytes(), ClipReadsTestUtils.QUALS.getBytes(), p.cigar); +// } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index f998a4e78..fc1459ee0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -28,16 +28,15 @@ package org.broadinstitute.sting.utils.clipreads; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -import java.util.LinkedList; import java.util.List; /** @@ -53,157 +52,36 @@ public class ReadClipperUnitTest extends BaseTest { // TODO: add indels to all test cases - ReadClipper readClipper; + List cigarList; + int maximumCigarSize = 10; - @BeforeMethod + @BeforeClass public void init() { - readClipper = new ReadClipper(ClipReadsTestUtils.makeRead()); + cigarList = ClipReadsTestUtils.generateCigarList(maximumCigarSize); } @Test(enabled = true) public void testHardClipBothEndsByReferenceCoordinates() { - logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); - //int debug = 1; - //Clip whole read - Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(10, 10), new GATKSAMRecord(readClipper.read.getHeader())); - //clip 1 base - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipBothEndsByReferenceCoordinates(10, 13), - ClipReadsTestUtils.BASES.substring(1, 3).getBytes(), ClipReadsTestUtils.QUALS.substring(1, 3).getBytes(), - "1H2M1H"); - - List cigarStringTestPairs = new LinkedList(); - cigarStringTestPairs.add(new CigarStringTestPair("5M1D1M2I4M5I6M1D3M2I100M", "1H4M1D1M2I4M5I6M1D3M2I99M1H")); - //cigarStringTestPairs.add( new CigarStringTestPair("5M1I1M2I1M","1H4M1I1M2I1H")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1I1M1I1M1I1M1I1M1I1M1S", "1H1M1I1M1I1M1I1M1I1M1I1M1H")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1D1M1D1M1D1M1D1M1D1M1S", "1H1M1D1M1D1M1D1M1D1M1D1M1H")); - - for (CigarStringTestPair pair : cigarStringTestPairs) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar(TextCigarCodec.getSingleton().decode(pair.toTest))); - ClipReadsTestUtils.testCigar(readClipper.hardClipBothEndsByReferenceCoordinates( - ReadUtils.getRefCoordSoftUnclippedStart(readClipper.read), - ReadUtils.getRefCoordSoftUnclippedEnd(readClipper.read)), - pair.expected); - } - /* - for ( Cigar cigar: ClipReadsTestUtils.generateCigars() ) { - // The read has to be long enough to clip one base from each side - // This also filters a lot of cigars - if ( cigar.getReadLength() > 26 ) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar( cigar )); - System.out.println( "Testing Cigar: "+cigar.toString() ) ; - //cigar length reference plus soft clip - - ClipReadsTestUtils.testBaseQual( - readClipper.hardClipBothEndsByReferenceCoordinates( - ReadUtils.getRefCoordSoftUnclippedStart(readClipper.read), - ReadUtils.getRefCoordSoftUnclippedEnd(readClipper.read) ), - readClipper.read.getReadString().substring(1, (cigar.getReadLength() - 1)).getBytes(), - readClipper.read.getBaseQualityString().substring(1, (cigar.getReadLength() - 1)).getBytes()); - } - } - */ } @Test(enabled = true) public void testHardClipByReadCoordinates() { - logger.warn("Executing testHardClipByReadCoordinates"); - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReadCoordinates(0, 3), new GATKSAMRecord(readClipper.read.getHeader())); - - List testList = new LinkedList(); - testList.add(new TestParameter(0, 0, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(3, 3, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(0, 1, 2, 4, "2H2M"));//clip 2 bases at start - testList.add(new TestParameter(2, 3, 0, 2, "2M2H"));//clip 2 bases at end - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - } @Test(enabled = true) public void testHardClipByReferenceCoordinates() { logger.warn("Executing testHardClipByReferenceCoordinates"); - //logger.warn(debug); - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(10, 13), new GATKSAMRecord(readClipper.read.getHeader())); - List testList = new LinkedList(); - testList.add(new TestParameter(-1, 10, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(13, -1, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(-1, 11, 2, 4, "2H2M"));//clip 2 bases at start - testList.add(new TestParameter(12, -1, 0, 2, "2M2H"));//clip 2 bases at end - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReferenceCoordinates(p.inputStart, p.inputStop), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - - List cigarStringTestPairs = new LinkedList(); - cigarStringTestPairs.add(new CigarStringTestPair("5M1D1M2I4M5I6M1D3M2I100M", "1H4M1D1M2I4M5I6M1D3M2I100M")); - //cigarStringTestPairs.add( new CigarStringTestPair("5M1I1M2I1M","1H4M1I1M2I1M")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1I1M1I1M1I1M1I1M1I1M1S", "1H1M1I1M1I1M1I1M1I1M1I1M1S")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1D1M1D1M1D1M1D1M1D1M1S", "1H1M1D1M1D1M1D1M1D1M1D1M1S")); - - //Clips only first base - for (CigarStringTestPair pair : cigarStringTestPairs) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar(TextCigarCodec.getSingleton().decode(pair.toTest))); - ClipReadsTestUtils.testCigar(readClipper.hardClipByReadCoordinates(0, 0), pair.expected); - } - /* - for ( Cigar cigar: ClipReadsTestUtils.generateCigars() ) { - // The read has to be long enough to clip one base - // This also filters a lot of cigars - if ( cigar.getReadLength() > 26 ) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar( cigar )); - System.out.println( "Testing Cigar: "+cigar.toString() ) ; - //cigar length reference plus soft clip - - // Clip first read - ClipReadsTestUtils.testBaseQual( - readClipper.hardClipByReadCoordinates(0,0), - readClipper.read.getReadString().substring(1, cigar.getReadLength()).getBytes(), - readClipper.read.getBaseQualityString().substring(1, cigar.getReadLength()).getBytes()); - } - } - */ } @Test(enabled = true) public void testHardClipByReferenceCoordinatesLeftTail() { - init(); logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(13), new GATKSAMRecord(readClipper.read.getHeader())); - - List testList = new LinkedList(); - testList.add(new TestParameter(10, -1, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(11, -1, 2, 4, "2H2M"));//clip 2 bases at start - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - } @Test(enabled = true) @@ -211,29 +89,80 @@ public class ReadClipperUnitTest extends BaseTest { init(); logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(10), new GATKSAMRecord(readClipper.read.getHeader())); - - List testList = new LinkedList(); - testList.add(new TestParameter(-1, 13, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(-1, 12, 0, 2, "2M2H"));//clip 2 bases at end - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - } @Test(enabled = true) public void testHardClipLowQualEnds() { logger.warn("Executing testHardClipLowQualEnds"); - // Testing clipping that ends inside an insertion + final byte LOW_QUAL = 2; + final byte HIGH_QUAL = 30; + + // create a read for every cigar permutation + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + byte [] quals = new byte[readLength]; + + for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { + + // create a read with nLowQualBases in the left tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) + quals[addLeft] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipLowQualEnds(LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + testNoLowQualBases(clipLeft, LOW_QUAL); + + // Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); + + // create a read with nLowQualBases in the right tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addRight = 0; addRight < nLowQualBases; addRight++) + quals[readLength - addRight - 1] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipLowQualEnds(LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + testNoLowQualBases(clipRight, LOW_QUAL); + + // Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); + + // create a read with nLowQualBases in the both tails + if (nLowQualBases <= readLength/2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { + quals[addBoth] = LOW_QUAL; + quals[readLength - addBoth - 1] = LOW_QUAL; + } + read.setBaseQualities(quals); + GATKSAMRecord clipBoth = (new ReadClipper(read)).hardClipLowQualEnds(LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + testNoLowQualBases(clipBoth, LOW_QUAL); + + // Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); + } + } +// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString())); + } + + + + + + // ONE OFF Testing clipping that ends inside an insertion final byte[] BASES = {'A','C','G','T','A','C','G','T'}; final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; final String CIGAR = "1S1M5I1S"; @@ -248,17 +177,23 @@ public class ReadClipperUnitTest extends BaseTest { ReadClipper lowQualClipper = new ReadClipper(read); ClipReadsTestUtils.assertEqualReads(lowQualClipper.hardClipLowQualEnds((byte) 2), expected); + } - + private void testNoLowQualBases(GATKSAMRecord read, byte low_qual) { + if (!read.isEmpty()) { + byte [] quals = read.getBaseQualities(); + for (int i=0; i %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); +// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); } } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java deleted file mode 100644 index 155fe094e..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.utils.clipreads; - -/** - * Created by IntelliJ IDEA. - * User: roger - * Date: 11/28/11 - * Time: 4:07 PM - * To change this template use File | Settings | File Templates. - */ -public class TestParameter { - int inputStart; - int inputStop; - int substringStart; - int substringStop; - String cigar; - - public TestParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) { - inputStart = InputStart; - inputStop = InputStop; - substringStart = SubstringStart; - substringStop = SubstringStop; - cigar = Cigar; - } -} From 3642a73c07d9f6da85ee59c62ea3a8c9587a364e Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Fri, 16 Dec 2011 09:37:44 -0500 Subject: [PATCH 28/29] Performance improvements for dynamically merging BAMs in read walkers. This change and my previous change have dropped runtime when dynamically merging 2k BAM files from 72.6min/1M reads to 46.8sec/1M reads. Note that many of these changes are stopgaps -- the real problem is the way ReadWalkers interface with Picard, and I'll have to work with Tim&Co to produce a more maintainable patch. --- .../picard/sam/MergingSamRecordIterator.java | 247 ++++++ .../sf/picard/sam/SamFileHeaderMerger.java | 744 ++++++++++++++++++ .../gatk/datasources/reads/SAMDataSource.java | 34 +- 3 files changed, 1011 insertions(+), 14 deletions(-) create mode 100644 public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java create mode 100644 public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java diff --git a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java new file mode 100644 index 000000000..4b1c7a999 --- /dev/null +++ b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package net.sf.picard.sam; + +import net.sf.picard.PicardException; + +import java.util.*; +import java.lang.reflect.Constructor; + +import net.sf.samtools.*; +import net.sf.samtools.util.CloseableIterator; + +/** + * Provides an iterator interface for merging multiple underlying iterators into a single + * iterable stream. The underlying iterators/files must all have the same sort order unless + * the requested output format is unsorted, in which case any combination is valid. + */ +public class MergingSamRecordIterator implements CloseableIterator { + private final PriorityQueue pq; + private final SamFileHeaderMerger samHeaderMerger; + private final Collection readers; + private final SAMFileHeader.SortOrder sortOrder; + private final SAMRecordComparator comparator; + + private boolean initialized = false; + private boolean iterationStarted = false; + + /** + * Constructs a new merging iterator with the same set of readers and sort order as + * provided by the header merger parameter. + * @param headerMerger The merged header and contents of readers. + * @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order. + * @deprecated replaced by (SamFileHeaderMerger, Collection, boolean) + */ + public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) { + this(headerMerger, headerMerger.getReaders(), forcePresorted); + } + + /** + * Constructs a new merging iterator with the same set of readers and sort order as + * provided by the header merger parameter. + * @param headerMerger The merged header and contents of readers. + * @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order. + */ + public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection readers, final boolean assumeSorted) { + this.samHeaderMerger = headerMerger; + this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); + this.comparator = getComparator(); + this.readers = readers; + + this.pq = new PriorityQueue(readers.size()); + + for (final SAMFileReader reader : readers) { + if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted && + reader.getFileHeader().getSortOrder() != this.sortOrder){ + throw new PicardException("Files are not compatible with sort order"); + } + } + } + + /** + * Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval, + * rather than iterating over every read in the backing file or stream. + * @param reader Reader to add to the merging iterator. + * @param iterator Iterator traversing over reader contents. + */ + public void addIterator(final SAMFileReader reader, final CloseableIterator iterator) { + if(iterationStarted) + throw new PicardException("Cannot add another iterator; iteration has already begun"); + if(!samHeaderMerger.containsHeader(reader.getFileHeader())) + throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger"); + final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator); + addIfNotEmpty(comparableIterator); + initialized = true; + } + + private void startIterationIfRequired() { + if(initialized) + return; + for(SAMFileReader reader: readers) + addIterator(reader,reader.iterator()); + iterationStarted = true; + } + + /** + * Close down all open iterators. + */ + public void close() { + // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue. + for(CloseableIterator iterator: pq) + iterator.close(); + } + + /** Returns true if any of the underlying iterators has more records, otherwise false. */ + public boolean hasNext() { + startIterationIfRequired(); + return !this.pq.isEmpty(); + } + + /** Returns the next record from the top most iterator during merging. */ + public SAMRecord next() { + startIterationIfRequired(); + + final ComparableSamRecordIterator iterator = this.pq.poll(); + final SAMRecord record = iterator.next(); + addIfNotEmpty(iterator); + record.setHeader(this.samHeaderMerger.getMergedHeader()); + + // Fix the read group if needs be + if (this.samHeaderMerger.hasReadGroupCollisions()) { + final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); + if (oldGroupId != null ) { + final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId); + record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId); + } + } + + // Fix the program group if needs be + if (this.samHeaderMerger.hasProgramGroupCollisions()) { + final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID); + if (oldGroupId != null ) { + final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId); + record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId); + } + } + + // Fix up the sequence indexes if needs be + if (this.samHeaderMerger.hasMergedSequenceDictionary()) { + if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex())); + } + + if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex())); + } + } + + return record; + } + + /** + * Adds iterator to priority queue. If the iterator has more records it is added + * otherwise it is closed and not added. + */ + private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { + if (iterator.hasNext()) { + pq.offer(iterator); + } + else { + iterator.close(); + } + } + + /** Unsupported operation. */ + public void remove() { + throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); + } + + /** + * Get the right comparator for a given sort order (coordinate, alphabetic). In the + * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive + * ordering. + */ + private SAMRecordComparator getComparator() { + // For unsorted build a fake comparator that compares based on object ID + if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { + return new SAMRecordComparator() { + public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { + return System.identityHashCode(lhs) - System.identityHashCode(rhs); + } + + public int compare(final SAMRecord lhs, final SAMRecord rhs) { + return fileOrderCompare(lhs, rhs); + } + }; + } + if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { + return new MergedSequenceDictionaryCoordinateOrderComparator(); + } + + // Otherwise try and figure out what kind of comparator to return and build it + return this.sortOrder.getComparatorInstance(); + } + + /** Returns the merged header that the merging iterator is working from. */ + public SAMFileHeader getMergedHeader() { + return this.samHeaderMerger.getMergedHeader(); + } + + /** + * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged + * sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids + * more copy & paste. + */ + private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator { + + public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { + final int referenceIndex1 = getReferenceIndex(samRecord1); + final int referenceIndex2 = getReferenceIndex(samRecord2); + if (referenceIndex1 != referenceIndex2) { + if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return 1; + } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return -1; + } else { + return referenceIndex1 - referenceIndex2; + } + } + if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + // Both are unmapped. + return 0; + } + return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); + } + + private int getReferenceIndex(final SAMRecord samRecord) { + if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex()); + } + if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex()); + } + return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + } + } +} diff --git a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java new file mode 100644 index 000000000..f78cd81da --- /dev/null +++ b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java @@ -0,0 +1,744 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.picard.sam; + +import java.util.*; + +import net.sf.picard.PicardException; +import net.sf.samtools.AbstractSAMHeaderRecord; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.SequenceUtil; + +/** + * Merges SAMFileHeaders that have the same sequences into a single merged header + * object while providing read group translation for cases where read groups + * clash across input headers. + */ +public class SamFileHeaderMerger { + //Super Header to construct + private final SAMFileHeader mergedHeader; + private Collection readers; + private final Collection headers; + + //Translation of old group ids to new group ids + private final Map> samReadGroupIdTranslation = + new IdentityHashMap>(); + + //the read groups from different files use the same group ids + private boolean hasReadGroupCollisions = false; + + //the program records from different files use the same program record ids + private boolean hasProgramGroupCollisions = false; + + //Translation of old program group ids to new program group ids + private Map> samProgramGroupIdTranslation = + new IdentityHashMap>(); + + private boolean hasMergedSequenceDictionary = false; + + // Translation of old sequence dictionary ids to new dictionary ids + // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for + // large SAMFileHeaders. It is possible that two input files will have identical headers so that + // the regular HashMap would fold them together, but the value stored in each of the two + // Map entries will be the same, so it should not hurt anything. + private final Map> samSeqDictionaryIdTranslationViaHeader = + new IdentityHashMap>(); + + //HeaderRecordFactory that creates SAMReadGroupRecord instances. + private static final HeaderRecordFactory READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory() { + public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) { + return new SAMReadGroupRecord(id, srcReadGroupRecord); + } + }; + + //HeaderRecordFactory that creates SAMProgramRecord instances. + private static final HeaderRecordFactory PROGRAM_RECORD_FACTORY = new HeaderRecordFactory() { + public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) { + return new SAMProgramRecord(id, srcProgramRecord); + } + }; + + //comparator used to sort lists of program group and read group records + private static final Comparator RECORD_ID_COMPARATOR = new Comparator() { + public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) { + return o1.getId().compareTo(o2.getId()); + } + }; + + /** + * Create SAMFileHeader with additional information. Required that sequence dictionaries agree. + * + * @param readers sam file readers to combine + * @param sortOrder sort order new header should have + * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) + */ + public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { + this(readers, sortOrder, false); + } + + /** + * Create SAMFileHeader with additional information. + * + * @param readers sam file readers to combine + * @param sortOrder sort order new header should have + * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that + * all input sequence dictionaries be identical. + * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) + */ + public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) { + this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries); + this.readers = readers; + } + + /** + * Create SAMFileHeader with additional information.. This is the preferred constructor. + * + * @param sortOrder sort order new header should have + * @param headers sam file headers to combine + * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that + * all input sequence dictionaries be identical. + */ + public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection headers, final boolean mergeDictionaries) { + this.headers = headers; + this.mergedHeader = new SAMFileHeader(); + + SAMSequenceDictionary sequenceDictionary; + try { + sequenceDictionary = getSequenceDictionary(headers); + this.hasMergedSequenceDictionary = false; + } + catch (SequenceUtil.SequenceListsDifferException pe) { + if (mergeDictionaries) { + sequenceDictionary = mergeSequenceDictionaries(headers); + this.hasMergedSequenceDictionary = true; + } + else { + throw pe; + } + } + + this.mergedHeader.setSequenceDictionary(sequenceDictionary); + + // Set program that creates input alignments + for (final SAMProgramRecord program : mergeProgramGroups(headers)) { + this.mergedHeader.addProgramRecord(program); + } + + // Set read groups for merged header + final List readGroups = mergeReadGroups(headers); + this.mergedHeader.setReadGroups(readGroups); + this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); + + this.mergedHeader.setSortOrder(sortOrder); + + for (final SAMFileHeader header : headers) { + for (final String comment : header.getComments()) { + this.mergedHeader.addComment(comment); + } + } + } + + // Utilility method to make use with old constructor + private static List getHeadersFromReaders(Collection readers) { + List headers = new ArrayList(readers.size()); + for (SAMFileReader reader : readers) { + headers.add(reader.getFileHeader()); + } + return headers; + } + + + /** + * Checks to see if there are clashes where different readers are using the same read + * group IDs. If yes, then those IDs that collided are remapped. + * + * @param headers headers to combine + * @return new list of read groups constructed from all the readers + */ + private List mergeReadGroups(final Collection headers) { + //prepare args for mergeHeaderRecords(..) call + final HashSet idsThatAreAlreadyTaken = new HashSet(); + + final List> readGroupsToProcess = new LinkedList>(); + for (final SAMFileHeader header : headers) { + for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { + //verify that there are no existing id collisions in this input file + if(!idsThatAreAlreadyTaken.add(readGroup.getId())) + throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")"); + + readGroupsToProcess.add(new HeaderRecordAndFileHeader(readGroup, header)); + } + idsThatAreAlreadyTaken.clear(); + } + + final List result = new LinkedList(); + + hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result); + + //sort the result list by record id + Collections.sort(result, RECORD_ID_COMPARATOR); + + return result; + } + + + /** + * Checks to see if there are clashes where different readers are using the same program + * group IDs. If yes, then those IDs that collided are remapped. + * + * @param headers headers to combine + * @return new list of program groups constructed from all the readers + */ + private List mergeProgramGroups(final Collection headers) { + + final List overallResult = new LinkedList(); + + //this Set will accumulate all SAMProgramRecord ids that have been encountered so far. + final HashSet idsThatAreAlreadyTaken = new HashSet(); + + //need to process all program groups + List> programGroupsLeftToProcess = new LinkedList>(); + for (final SAMFileHeader header : headers) { + for (final SAMProgramRecord programGroup : header.getProgramRecords()) { + //verify that there are no existing id collisions in this input file + if(!idsThatAreAlreadyTaken.add(programGroup.getId())) + throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")"); + + programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader(programGroup, header)); + } + idsThatAreAlreadyTaken.clear(); + } + + //A program group header (lets say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to + //another program group header (lets say ID=1 PN=A) to indicate that the given file was + //processed by program A followed by program B. These PP attributes potentially + //connect headers into one or more tree structures. Merging is done by + //first merging all headers that don't have PP attributes (eg. tree roots), + //then updating and merging all headers whose PPs point to the tree-root headers, + //and so on until all program group headers are processed. + + //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots). + List< HeaderRecordAndFileHeader > currentProgramGroups = new LinkedList>(); + for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { + final HeaderRecordAndFileHeader pair = programGroupsLeftToProcessIterator.next(); + if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) { + programGroupsLeftToProcessIterator.remove(); + currentProgramGroups.add(pair); + } + } + + //merge currentProgramGroups + while(!currentProgramGroups.isEmpty()) + { + final List currentResult = new LinkedList(); + + hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult); + + //add currentResults to overallResults + overallResult.addAll(currentResult); + + //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess + currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false); + programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true); + + //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups), + //and move them to the list of programGroupsToProcessNext. + LinkedList> programGroupsToProcessNext = new LinkedList>(); + for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { + final HeaderRecordAndFileHeader pairLeftToProcess = programGroupsLeftToProcessIterator.next(); + final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); + //find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file) + for(final HeaderRecordAndFileHeader justProcessedPair : currentProgramGroups) { + String idJustProcessed = justProcessedPair.getHeaderRecord().getId(); + if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) { + programGroupsLeftToProcessIterator.remove(); + programGroupsToProcessNext.add(pairLeftToProcess); + break; + } + } + } + + currentProgramGroups = programGroupsToProcessNext; + } + + //verify that all records were processed + if(!programGroupsLeftToProcess.isEmpty()) { + StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n"); + for( final HeaderRecordAndFileHeader pair : programGroupsLeftToProcess ) { + SAMProgramRecord record = pair.getHeaderRecord(); + errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n"); + } + throw new PicardException(errorMsg.toString()); + } + + //sort the result list by record id + Collections.sort(overallResult, RECORD_ID_COMPARATOR); + + return overallResult; + } + + + /** + * Utility method that takes a list of program groups and remaps all their + * ids (including ppIds if requested) using the given idTranslationTable. + * + * NOTE: when remapping, this method creates new SAMProgramRecords and + * doesn't mutate any records in the programGroups list. + * + * @param programGroups The program groups to translate. + * @param idTranslationTable The translation table. + * @param translatePpIds Whether ppIds should be translated as well. + * + * @return The list of translated records. + */ + private List> translateIds( + List> programGroups, + Map> idTranslationTable, + boolean translatePpIds) { + + //go through programGroups and translate any IDs and PPs based on the idTranslationTable. + List> result = new LinkedList>(); + for(final HeaderRecordAndFileHeader pair : programGroups ) { + final SAMProgramRecord record = pair.getHeaderRecord(); + final String id = record.getProgramGroupId(); + final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); + + final SAMFileHeader header = pair.getFileHeader(); + final Map translations = idTranslationTable.get(header); + + //see if one or both ids need to be translated + SAMProgramRecord translatedRecord = null; + if(translations != null) + { + String translatedId = translations.get( id ); + String translatedPpId = translatePpIds ? translations.get( ppId ) : null; + + boolean needToTranslateId = translatedId != null && !translatedId.equals(id); + boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId); + + if(needToTranslateId && needToTranslatePpId) { + translatedRecord = new SAMProgramRecord(translatedId, record); + translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); + } else if(needToTranslateId) { + translatedRecord = new SAMProgramRecord(translatedId, record); + } else if(needToTranslatePpId) { + translatedRecord = new SAMProgramRecord(id, record); + translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); + } + } + + if(translatedRecord != null) { + result.add(new HeaderRecordAndFileHeader(translatedRecord, header)); + } else { + result.add(pair); //keep the original record + } + } + + return result; + } + + + /** + * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds + * records that have identical ids and attributes, it will collapse them + * into one record. If it finds records that have identical ids but + * non-identical attributes, this is treated as a collision. When collision happens, + * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable. + * + * NOTE: Non-collided records also get recorded in the idTranslationTable as + * old-id to old-id. This way, an idTranslationTable lookup should never return null. + * + * @param headerRecords The header records to merge. + * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord. + * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped. + * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id + * mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as + * old-id to old-id. This way, an idTranslationTable lookup should never return null. + * + * @param result The list of merged header records. + * + * @return True if there were collisions. + */ + private boolean mergeHeaderRecords(final List> headerRecords, HeaderRecordFactory headerRecordFactory, + final HashSet idsThatAreAlreadyTaken, Map> idTranslationTable, List result) { + + //The outer Map bins the header records by their ids. The nested Map further collapses + //header records which, in addition to having the same id, also have identical attributes. + //In other words, each key in the nested map represents one or more + //header records which have both identical ids and identical attributes. The List of + //SAMFileHeaders keeps track of which readers these header record(s) came from. + final Map>> idToRecord = + new HashMap>>(); + + //Populate the idToRecord and seenIds data structures + for (final HeaderRecordAndFileHeader pair : headerRecords) { + final RecordType record = pair.getHeaderRecord(); + final SAMFileHeader header = pair.getFileHeader(); + final String recordId = record.getId(); + Map> recordsWithSameId = idToRecord.get(recordId); + if(recordsWithSameId == null) { + recordsWithSameId = new LinkedHashMap>(); + idToRecord.put(recordId, recordsWithSameId); + } + + List fileHeaders = recordsWithSameId.get(record); + if(fileHeaders == null) { + fileHeaders = new LinkedList(); + recordsWithSameId.put(record, fileHeaders); + } + + fileHeaders.add(header); + } + + //Resolve any collisions between header records by remapping their ids. + boolean hasCollisions = false; + for (final Map.Entry>> entry : idToRecord.entrySet() ) + { + final String recordId = entry.getKey(); + final Map> recordsWithSameId = entry.getValue(); + + + for( Map.Entry> recordWithUniqueAttr : recordsWithSameId.entrySet()) { + final RecordType record = recordWithUniqueAttr.getKey(); + final List fileHeaders = recordWithUniqueAttr.getValue(); + + String newId; + if(!idsThatAreAlreadyTaken.contains(recordId)) { + //don't remap 1st record. If there are more records + //with this id, they will be remapped in the 'else'. + newId = recordId; + idsThatAreAlreadyTaken.add(recordId); + } else { + //there is more than one record with this id. + hasCollisions = true; + + //find a unique newId for this record + int idx=1; + while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++))) + ; + + idsThatAreAlreadyTaken.add( newId ); + } + + for(SAMFileHeader fileHeader : fileHeaders) { + Map readerTranslationTable = idTranslationTable.get(fileHeader); + if(readerTranslationTable == null) { + readerTranslationTable = new HashMap(); + idTranslationTable.put(fileHeader, readerTranslationTable); + } + readerTranslationTable.put(recordId, newId); + } + + result.add( headerRecordFactory.createRecord(newId, record) ); + } + } + + return hasCollisions; + } + + + /** + * Get the sequences off the SAMFileHeader. Throws runtime exception if the sequence + * are different from one another. + * + * @param headers headers to pull sequences from + * @return sequences from files. Each file should have the same sequence + */ + private SAMSequenceDictionary getSequenceDictionary(final Collection headers) { + SAMSequenceDictionary sequences = null; + for (final SAMFileHeader header : headers) { + + if (sequences == null) { + sequences = header.getSequenceDictionary(); + } + else { + final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); + SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences); + } + } + + return sequences; + } + + /** + * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries. + * + * @param headers headers to pull sequences from + * @return sequences from files. Each file should have the same sequence + */ + private SAMSequenceDictionary mergeSequenceDictionaries(final Collection headers) { + SAMSequenceDictionary sequences = new SAMSequenceDictionary(); + for (final SAMFileHeader header : headers) { + final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); + sequences = mergeSequences(sequences, currentSequences); + } + // second pass, make a map of the original seqeunce id -> new sequence id + createSequenceMapping(headers, sequences); + return sequences; + } + + /** + * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset. + * + * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals. + * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict. + * @return A new sequence dictionary that resulting from merging the two inputs. + */ + private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) { + + // a place to hold the sequences that we haven't found a home for, in the order the appear in mergeFromDict. + LinkedList holder = new LinkedList(); + + // Return value will be created from this. + LinkedList resultingDict = new LinkedList(); + for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) { + resultingDict.add(sequenceRecord); + } + + // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. + int prevloc = -1; + // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. + SAMSequenceRecord previouslyMerged = null; + + for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) { + // Does it already exist in resultingDict? + int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName()); + if (loc == -1) { + // If doesn't already exist in resultingDict, save it an decide where to insert it later. + holder.add(sequenceRecord.clone()); + } else if (prevloc > loc) { + // If sequenceRecord already exists in resultingDict, but prior to the previous one + // from mergeIntoDict that already existed, cannot merge. + throw new PicardException("Cannot merge sequence dictionaries because sequence " + + sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() + + " are in different orders in two input sequence dictionaries."); + } else { + // Since sequenceRecord already exists in resultingDict, don't need to add it. + // Add in all the sequences prior to it that have been held in holder. + resultingDict.addAll(loc, holder); + // Remember the index of sequenceRecord so can check for merge imcompatibility. + prevloc = loc + holder.size(); + previouslyMerged = sequenceRecord; + holder.clear(); + } + } + // Append anything left in holder. + if (holder.size() != 0) { + resultingDict.addAll(holder); + } + return new SAMSequenceDictionary(resultingDict); + } + + /** + * Find sequence in list. + * @param list List to search for the sequence name. + * @param sequenceName Name to search for. + * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found. + */ + private static int getIndexOfSequenceName(final List list, final String sequenceName) { + for (int i = 0; i < list.size(); ++i) { + if (list.get(i).getSequenceName().equals(sequenceName)) { + return i; + } + } + return -1; + } + + /** + * create the sequence mapping. This map is used to convert the unmerged header sequence ID's to the merged + * list of sequence id's. + * @param headers the collections of headers. + * @param masterDictionary the superset dictionary we've created. + */ + private void createSequenceMapping(final Collection headers, SAMSequenceDictionary masterDictionary) { + LinkedList resultingDictStr = new LinkedList(); + for (SAMSequenceRecord r : masterDictionary.getSequences()) { + resultingDictStr.add(r.getSequenceName()); + } + for (final SAMFileHeader header : headers) { + Map seqMap = new HashMap(); + SAMSequenceDictionary dict = header.getSequenceDictionary(); + for (SAMSequenceRecord rec : dict.getSequences()) { + seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName())); + } + this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap); + } + } + + + + /** + * Returns the read group id that should be used for the input read and RG id. + * + * @deprecated replaced by getReadGroupId(SAMFileHeader, String) + * */ + public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + return getReadGroupId(reader.getFileHeader(), originalReadGroupId); + } + + /** Returns the read group id that should be used for the input read and RG id. */ + public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) { + return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId); + } + + /** + * @param reader one of the input files + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + * @deprecated replaced by getProgramGroupId(SAMFileHeader, String) + */ + public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { + return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId); + } + + /** + * @param header one of the input headers + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + */ + public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) { + return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId); + } + + /** Returns true if there are read group duplicates within the merged headers. */ + public boolean hasReadGroupCollisions() { + return this.hasReadGroupCollisions; + } + + /** Returns true if there are program group duplicates within the merged headers. */ + public boolean hasProgramGroupCollisions() { + return hasProgramGroupCollisions; + } + + /** @return if we've merged the sequence dictionaries, return true */ + public boolean hasMergedSequenceDictionary() { + return hasMergedSequenceDictionary; + } + + /** Returns the merged header that should be written to any output merged file. */ + public SAMFileHeader getMergedHeader() { + return this.mergedHeader; + } + + /** Returns the collection of readers that this header merger is working with. May return null. + * @deprecated replaced by getHeaders() + */ + public Collection getReaders() { + return this.readers; + } + + /** Returns the collection of readers that this header merger is working with. + */ + public Collection getHeaders() { + return this.headers; + } + + /** + * Tells whether this header merger contains a given SAM file header. Note that header presence + * is confirmed / blocked by == equality, rather than actually testing SAMFileHeader.equals(), for + * reasons of performance. + * @param header header to check for. + * @return True if the header exists in this HeaderMerger. False otherwise. + */ + boolean containsHeader(SAMFileHeader header) { + for(SAMFileHeader headerMergerHeader: headers) { + if(headerMergerHeader == header) + return true; + } + return false; + } + + /** + * returns the new mapping for a specified reader, given it's old sequence index + * @param reader the reader + * @param oldReferenceSequenceIndex the old sequence (also called reference) index + * @return the new index value + * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer) + */ + public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) { + return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex); + } + + /** + * Another mechanism for getting the new sequence index, for situations in which the reader is not available. + * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work. + * @param header The original header for the input record in question. + * @param oldReferenceSequenceIndex The original sequence index. + * @return the new index value that is compatible with the merged sequence index. + */ + public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) { + final Map mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header); + if (mapping == null) { + throw new PicardException("No sequence dictionary mapping available for header: " + header); + } + + final Integer newIndex = mapping.get(oldReferenceSequenceIndex); + if (newIndex == null) { + throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header); + } + + return newIndex; + } + + + /** + * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate + * specific subclasses of AbstractSAMHeaderRecord. + */ + private static interface HeaderRecordFactory { + + /** + * Constructs a new instance of RecordType. + * @param id The id of the new record. + * @param srcRecord Except for the id, the new record will be a copy of this source record. + */ + public RecordType createRecord(final String id, RecordType srcRecord); + } + + /** + * Struct that groups together a subclass of AbstractSAMHeaderRecord with the + * SAMFileHeader that it came from. + */ + private static class HeaderRecordAndFileHeader { + private RecordType headerRecord; + private SAMFileHeader samFileHeader; + + public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) { + this.headerRecord = headerRecord; + this.samFileHeader = samFileHeader; + } + + public RecordType getHeaderRecord() { + return headerRecord; + } + public SAMFileHeader getFileHeader() { + return samFileHeader; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d5d8ab2ef..2729941bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -259,6 +259,11 @@ public class SAMDataSource { validationStringency = strictness; if(readBufferSize != null) ReadShard.setReadBufferSize(readBufferSize); + else { + // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively + // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. + ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); + } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); @@ -479,10 +484,13 @@ public class SAMDataSource { // Cache the most recently viewed read so that we can check whether we've reached the end of a pair. SAMRecord read = null; + Map positionUpdates = new IdentityHashMap(); + CloseableIterator iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate); while(!shard.isBufferFull() && iterator.hasNext()) { read = iterator.next(); - addReadToBufferingShard(shard,getReaderID(readers,read),read); + shard.addRead(read); + noteFilePositionUpdate(positionUpdates,read); } // If the reads are sorted in queryname order, ensure that all reads @@ -492,11 +500,21 @@ public class SAMDataSource { SAMRecord nextRead = iterator.next(); if(read == null || !read.getReadName().equals(nextRead.getReadName())) break; - addReadToBufferingShard(shard,getReaderID(readers,nextRead),nextRead); + shard.addRead(nextRead); + noteFilePositionUpdate(positionUpdates,nextRead); } } iterator.close(); + + // Make the updates specified by the reader. + for(Map.Entry positionUpdate: positionUpdates.entrySet()) + readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue()); + } + + private void noteFilePositionUpdate(Map positionMapping, SAMRecord read) { + GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); + positionMapping.put(read.getFileSource().getReader(),endChunk); } public StingSAMIterator seek(Shard shard) { @@ -574,18 +592,6 @@ public class SAMDataSource { readProperties.defaultBaseQualities()); } - /** - * Adds this read to the given shard. - * @param shard The shard to which to add the read. - * @param id The id of the given reader. - * @param read The read to add to the shard. - */ - private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { - GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); - shard.addRead(read); - readerPositions.put(id,endChunk); - } - /** * Filter reads based on user-specified criteria. * From 5aa79dacfc63895ab7ab96d1b17ea43b35230f37 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 16 Dec 2011 10:29:20 -0500 Subject: [PATCH 29/29] Changing hidden optimization argument to advanced. --- .../gatk/walkers/variantrecalibration/VariantRecalibrator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 339b8e0e6..7cc5b1625 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -175,7 +175,7 @@ public class VariantRecalibrator extends RodWalker