From c3ea96d85616a3ce0b4b90d40e2d141b1efd16cf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 08:42:01 -0400 Subject: [PATCH 001/113] Removing many unused functions of unquestionable purpose --- .../sting/utils/QualityUtils.java | 101 ++---------------- 1 file changed, 10 insertions(+), 91 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index fad2320fc..093da7dd6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double MIN_REASONABLE_ERROR = 0.0001; public final static byte MAX_REASONABLE_Q_SCORE = 40; public final static byte MIN_USABLE_Q_SCORE = 6; - public final static int MAPPING_QUALITY_UNAVAILABLE = 255; + private static double qualToErrorProbCache[] = new double[256]; + static { + for (byte i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i); + } + /** * Private constructor. No instantiating this class! */ @@ -33,10 +36,6 @@ public class QualityUtils { return 1.0 - qualToErrorProb(qual); } - static public double qualToProb(int qual) { - return qualToProb( (double)qual ); - } - static public double qualToProb(double qual) { return 1.0 - Math.pow(10.0, qual/(-10.0)); } @@ -48,10 +47,14 @@ public class QualityUtils { * @param qual a quality score (0-40) * @return a probability (0.0-1.0) */ - static public double qualToErrorProb(byte qual) { + static public double qualToErrorProbRaw(byte qual) { return Math.pow(10.0, ((double) qual)/-10.0); } + static public double qualToErrorProb(byte qual) { + return qualToErrorProbCache[qual]; + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. * @@ -110,88 +113,4 @@ public class QualityUtils { //return (byte) Math.min(qual, maxQual); return (byte) Math.max(Math.min(qual, maxQual), 1); } - - /** - * Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field. - * Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis. - * Another note: the probability encoded here gets rounded to the nearest 1%. - * - * @param baseIndex the base index - * @param prob the base probability - * @return a byte containing the index and the probability - */ - static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) { - byte compressedQual = 0; - - compressedQual = (byte) baseIndex; - - byte cprob = (byte) (100.0*prob); - byte qualmask = (byte) 252; - compressedQual += ((cprob << 2) & qualmask); - - return compressedQual; - } - - /** - * From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T) - * - * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality - * @return base index - */ - static public int compressedQualityToBaseIndex(byte compressedQual) { - return (int) (compressedQual & 0x3); - } - - /** - * From a compressed base, extract the base probability - * - * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality - * @return the probability - */ - static public double compressedQualityToProb(byte compressedQual) { - // Because java natives are signed, extra care must be taken to avoid - // shifting a 1 into the sign bit in the implicit promotion of 2 to an int. - int x2 = ((int) compressedQual) & 0xff; - x2 = (x2 >>> 2); - - return ((double) x2)/100.0; - } - - /** - * Return the complement of a compressed quality - * - * @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality) - * @return the complementary compressed quality - */ - static public byte complementCompressedQuality(byte compressedQual) { - int baseIndex = compressedQualityToBaseIndex(compressedQual); - double prob = compressedQualityToProb(compressedQual); - - return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob); - } - - /** - * Return the reverse complement of a byte array of compressed qualities - * - * @param compressedQuals a byte array of compressed quality scores - * @return the reverse complement of the byte array - */ - static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) { - byte[] rcCompressedQuals = new byte[compressedQuals.length]; - - for (int pos = 0; pos < compressedQuals.length; pos++) { - rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]); - } - - return rcCompressedQuals; - } - - /** - * Return the reverse of a byte array of qualities (compressed or otherwise) - * @param quals the array of bytes to be reversed - * @return the reverse of the quality array - */ - static public byte[] reverseQualityArray( byte[] quals ) { - return Utils.reverse(quals); // no sense in duplicating functionality - } } From c57198a1b998ba25b7facac526cafa04f9b8f77a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 08:46:17 -0400 Subject: [PATCH 002/113] Optimizations in VCFCodec -- Don't create an empty LinkedHashSet() for PASS fields. Just return Collections.emptySet() instead. -- For filter fields with actual values, returns an unmodifiableSet instead of one that can be changed --- .../broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index fa030ef5f..cd320b332 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -110,11 +110,8 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; - // empty set for passes filters - LinkedHashSet fFields = new LinkedHashSet(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return fFields; + return Collections.emptySet(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4"); if ( filterString.length() == 0 ) @@ -124,6 +121,8 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterHash.containsKey(filterString) ) return filterHash.get(filterString); + // empty set for passes filters + LinkedHashSet fFields = new LinkedHashSet(); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) fFields.add(filterString); @@ -132,7 +131,7 @@ public class VCFCodec extends AbstractVCFCodec { filterHash.put(filterString, fFields); - return fFields; + return Collections.unmodifiableSet(fFields); } From 82f213177730123def312b6a5b64878f7f665769 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 12:27:11 -0400 Subject: [PATCH 003/113] Simplied getAttributeAsX interfaces -- Removed versions getAttribriteAsX(key) that except on not having the value. -- Removed version that getAttributeAsXNoException(key) -- The only available assessors are now getAttributeAsX(key, default). -- This single accessors properly handle their argument types, so if the value is a double it is returned directly for getAttributeAsDouble(), or if it's a string it's converted to a double. If the key isn't found, default is returned. --- .../gatk/walkers/annotator/SBByDepth.java | 2 +- .../indels/HaplotypeIndelErrorModel.java | 2 +- .../gatk/walkers/phasing/PhasingRead.java | 2 +- .../walkers/phasing/RefSeqDataParser.java | 10 ++-- .../varianteval/evaluators/CountVariants.java | 6 +-- .../evaluators/GenotypePhasingEvaluator.java | 3 +- .../evaluators/SimpleMetricsByAC.java | 2 +- .../evaluators/TiTvVariantEvaluator.java | 2 +- .../evaluators/ValidationReport.java | 2 +- .../stratifications/AlleleCount.java | 2 +- .../stratifications/AlleleFrequency.java | 2 +- .../stratifications/Degeneracy.java | 10 ++-- .../stratifications/FunctionalClass.java | 4 +- .../VQSRCalibrationCurve.java | 4 +- .../walkers/variantutils/SelectVariants.java | 2 +- .../walkers/variantutils/VariantsToTable.java | 2 +- .../sting/utils/codecs/vcf/VCFCodec.java | 14 +++-- .../sting/utils/variantcontext/Genotype.java | 9 ---- .../InferredGeneticContext.java | 53 ++++++++++++------- .../utils/variantcontext/VariantContext.java | 10 ---- .../variantcontext/VariantContextUtils.java | 16 +++--- 21 files changed, 79 insertions(+), 80 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java index 180bed24d..d2c4d24ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java @@ -26,7 +26,7 @@ public class SBByDepth extends AnnotationByDepth { if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY)) return null; - double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY)); + double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1); final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index e68aa31e0..232e468f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel { baseMatchArray = new double[MAX_CACHED_QUAL+1]; baseMismatchArray = new double[MAX_CACHED_QUAL+1]; for (int k=1; k <= MAX_CACHED_QUAL; k++) { - double baseProb = QualityUtils.qualToProb(k); + double baseProb = QualityUtils.qualToProb((byte)k); baseMatchArray[k] = probToQual(baseProb); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index a56c9e21e..63fb33295 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray { public PhasingRead(int length, int mappingQual) { super(length); - this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual)); + this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual)); this.baseProbs = new PreciseNonNegativeDouble[length]; Arrays.fill(this.baseProbs, null); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java index 55da1c152..f94140814 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java @@ -44,12 +44,12 @@ public class RefSeqDataParser { String nameKeyToUseMultiplePrefix = nameKeyToUse + "_"; Map entriesToNames = new HashMap(); - Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY); - if (numRecords != null) { + int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1); + if (numRecords != -1) { boolean done = false; if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1": - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); done = true; @@ -59,14 +59,14 @@ public class RefSeqDataParser { if (!done) { for (int i = 1; i <= numRecords; i++) { String key = nameKeyToUseMultiplePrefix + i; - String name = vc.getAttributeAsStringNoException(key); + String name = vc.getAttributeAsString(key, null); if (name != null) entriesToNames.put(key, name); } } } else { // no entry with the # of records: - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 59ef3d992..fd379dfda 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -109,12 +109,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval { case SNP: nVariantLoci++; nSNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case MNP: nVariantLoci++; nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case INDEL: nVariantLoci++; @@ -136,7 +136,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { String refStr = vc1.getReference().getBaseString().toUpperCase(); - String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null; + String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null; // if (aaStr.equals(".")) { // aaStr = refStr; // } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index a476a2680..e69dbfb28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { } public static Double getPQ(Genotype gt) { - return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + return d == -1 ? null : d; } public static boolean topMatchesTop(AllelePair b1, AllelePair b2) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java index d466645ea..38cbf1c45 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java @@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval if ( eval.hasGenotypes() ) ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); + ac = eval.getAttributeAsInt("AC", -1); } if ( ac != -1 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index be957abd7..ee58012a0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -50,7 +50,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } String refStr = vc.getReference().getBaseString().toUpperCase(); - String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase(); + String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase(); if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) { BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 9c331b577..7fa56785b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -130,7 +130,7 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { //// System.out.printf(" ac = %d%n", ac); } else - ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY); + ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO; } else if ( vc.hasGenotypes() ) { return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 5cdea4e00..56b06d032 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -45,7 +45,7 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { - AC = eval.getAttributeAsInt("AC"); + AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index 96d9f30ec..ac1ee9e0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier { if (eval != null) { try { - relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3)))); + relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3)))); } catch (Exception e) { return relevantStates; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index cc878e975..06ac05ec8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -92,8 +92,8 @@ public class Degeneracy extends VariantStratifier { Integer frame = null; if (eval.hasAttribute("refseq.functionalClass")) { - aa = eval.getAttributeAsString("refseq.variantAA"); - frame = eval.getAttributeAsInt("refseq.frame"); + aa = eval.getAttributeAsString("refseq.variantAA", null); + frame = eval.getAttributeAsInt("refseq.frame", 0); } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -101,7 +101,7 @@ public class Degeneracy extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); + String newtype = eval.getAttributeAsString(key, null); if ( newtype != null && ( type == null || @@ -111,13 +111,13 @@ public class Degeneracy extends VariantStratifier { type = newtype; String aakey = String.format("refseq.variantAA_%d", annotationId); - aa = eval.getAttributeAsString(aakey); + aa = eval.getAttributeAsString(aakey, null); if (aa != null) { String framekey = String.format("refseq.frame_%d", annotationId); if (eval.hasAttribute(framekey)) { - frame = eval.getAttributeAsInt(framekey); + frame = eval.getAttributeAsInt(framekey, 0); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 0de871fe6..4af12fbd1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -32,7 +32,7 @@ public class FunctionalClass extends VariantStratifier { String type = null; if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); + type = eval.getAttributeAsString("refseq.functionalClass", null); } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -40,7 +40,7 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); + String newtype = eval.getAttributeAsString(key, null); if ( newtype != null && !newtype.equalsIgnoreCase("null") && ( type == null || diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java index bc7252ec2..04ba3ff14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -115,7 +115,7 @@ public class VQSRCalibrationCurve { if ( vc.isFiltered() ) return 0.0; else if ( vc.hasAttribute(VQSRQualKey) ) { - double qual = vc.getAttributeAsDouble(VQSRQualKey); + double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0); return probTrueVariant(qual); } else { throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc); @@ -143,7 +143,7 @@ public class VQSRCalibrationCurve { for ( int i = 0; i < log10Likelihoods.length; i++) { double p = Math.pow(10, log10Likelihoods[i]); double q = alpha * p + (1-alpha) * noInfoPr; - if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q); + if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q); updated[i] = Math.log10(q); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bb3cd82a1..ceafb0cf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -575,7 +575,7 @@ public class SelectVariants extends RodWalker { // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. // We then pick a variant with probablity AF*desiredFraction if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { - String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); + String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null); double af; double afBoost = 1.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..aafbe4db4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker { if ( getters.containsKey(field) ) { val = getters.get(field).get(vc); } else if ( vc.hasAttribute(field) ) { - val = vc.getAttributeAsString(field); + val = vc.getAttributeAsString(field, null); } else if ( isWildCard(field) ) { Set wildVals = new HashSet(); for ( Map.Entry elt : vc.getAttributes().entrySet()) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index cd320b332..94e40fc98 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -105,7 +105,10 @@ public class VCFCodec extends AbstractVCFCodec { * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) */ protected Set parseFilters(String filterString) { + return parseFilters(filterHash, lineNo, filterString); + } + public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; @@ -113,13 +116,13 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) return Collections.emptySet(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4"); + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status"); + generateException("The VCF specification requires a valid filter status", lineNo); // do we have the filter string cached? - if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); + if ( cache != null && cache.containsKey(filterString) ) + return Collections.unmodifiableSet(cache.get(filterString)); // empty set for passes filters LinkedHashSet fFields = new LinkedHashSet(); @@ -129,7 +132,8 @@ public class VCFCodec extends AbstractVCFCodec { else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - filterHash.put(filterString, fFields); + fFields = fFields; + if ( cache != null ) cache.put(filterString, fFields); return Collections.unmodifiableSet(fFields); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index fdf3d97db..85d752003 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -293,17 +293,8 @@ public class Genotype { return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - - public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java index 3d162adb0..4266fb4b5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java @@ -204,27 +204,40 @@ public final class InferredGeneticContext { return defaultValue; } -// public AttributedObject getAttributes(Collection keys) { -// AttributedObject selected = new AttributedObject(); -// -// for ( Object key : keys ) -// selected.putAttribute(key, this.getAttribute(key)); -// -// return selected; -// } + public String getAttributeAsString(String key, String defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof String ) return (String)x; + return String.valueOf(x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" - public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } - public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } - public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } + public int getAttributeAsInt(String key, int defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Integer ) return (Integer)x; + return Integer.valueOf((String)x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); } + public double getAttributeAsDouble(String key, double defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Double ) return (Double)x; + return Double.valueOf((String)x); // throws an exception if this isn't a string + } - public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } - public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } - public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } - public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } + public boolean getAttributeAsBoolean(String key, boolean defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Boolean ) return (Boolean)x; + return Boolean.valueOf((String)x); // throws an exception if this isn't a string + } + +// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" +// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } +// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } +// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } +// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } +// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } +// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } +// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 673fe4529..e6637a5d9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } - - // --------------------------------------------------------------------------------------------------------- // // Working with alleles diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 986d6305c..d5c541b19 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -565,11 +565,11 @@ public class VariantContextUtils { // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY)); + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (rsID == null && vc.hasID()) rsID = vc.getID(); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY); + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); @@ -1147,9 +1147,7 @@ public class VariantContextUtils { for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { - Boolean val = vc.getAttributeAsBooleanNoException(orAttrib); - if (val != null) - attribVal = (attribVal || val); + attribVal = vc.getAttributeAsBoolean(orAttrib, false); if (attribVal) // already true, so no reason to continue: break; } @@ -1159,7 +1157,7 @@ public class VariantContextUtils { // Merge ID fields: String iDVal = null; for (VariantContext vc : vcList) { - String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY); + String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { if (iDVal == null) iDVal = val; @@ -1239,8 +1237,10 @@ public class VariantContextUtils { public PhaseAndQuality(Genotype gt) { this.isPhased = gt.isPhased(); - if (this.isPhased) - this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } } } From 124ef6c4834d8a948629291762b7f9d7d9696d50 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:12:28 -0400 Subject: [PATCH 004/113] MISSING_VALUE now gets defaultValue in getAttribute functions --- .../sting/utils/variantcontext/InferredGeneticContext.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java index 4266fb4b5..bf16cd1cf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; + import java.util.*; @@ -213,7 +215,7 @@ public final class InferredGeneticContext { public int getAttributeAsInt(String key, int defaultValue) { Object x = getAttribute(key); - if ( x == null ) return defaultValue; + if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; if ( x instanceof Integer ) return (Integer)x; return Integer.valueOf((String)x); // throws an exception if this isn't a string } From 03aa04e37c8a2cf31d650cba6a8c703fbb033978 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:13:08 -0400 Subject: [PATCH 005/113] Simple refactoring to make formating functions public --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 2 +- .../sting/utils/codecs/vcf/StandardVCFWriter.java | 11 +++++++++-- .../sting/utils/codecs/vcf/VCFCodec.java | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index bb212e128..624d06a71 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, throw new UserException.MalformedVCF(message, lineNo); } - private static void generateException(String message, int lineNo) { + protected static void generateException(String message, int lineNo) { throw new UserException.MalformedVCF(message, lineNo); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index d3705813c..e28cd7598 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -275,7 +275,7 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); // FILTER - String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + String filters = getFilterString(vc, filtersWereAppliedToContext); mWriter.write(filters); mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -319,7 +319,14 @@ public class StandardVCFWriter implements VCFWriter { } catch (IOException e) { throw new RuntimeException("Unable to write the VCF object to " + locationString()); } + } + public static final String getFilterString(final VariantContext vc) { + return getFilterString(vc, false); + } + + public static final String getFilterString(final VariantContext vc, boolean forcePASS) { + return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); } private String getQualValue(double qual) { @@ -462,7 +469,7 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(encoding); } - private static String formatVCFField(Object val) { + public static String formatVCFField(Object val) { String result; if ( val == null ) result = VCFConstants.MISSING_VALUE_v4; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 94e40fc98..42ea05355 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -118,7 +118,7 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status", lineNo); + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); // do we have the filter string cached? if ( cache != null && cache.containsKey(filterString) ) From 048202d18e42444e47e6237246b31d24c57dec9a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:13:28 -0400 Subject: [PATCH 006/113] Bugfix for cached quals --- .../java/src/org/broadinstitute/sting/utils/QualityUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 093da7dd6..19e03a19d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -17,7 +17,7 @@ public class QualityUtils { private static double qualToErrorProbCache[] = new double[256]; static { - for (byte i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i); + for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i); } /** From d471617c65f6d1f3885ea6628b7676ed6bbc6f8d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:15:19 -0400 Subject: [PATCH 007/113] GATK binary VCF (gvcf) prototype format for efficiency testing -- Very minimal working version that can read / write binary VCFs with genotypes -- Already 10x faster for sites, 5x for fully parsed genotypes, and 1000x for skipping genotypes when reading --- .../broadinstitute/sting/utils/gvcf/GVCF.java | 252 ++++++++++++++++++ .../sting/utils/gvcf/GVCFGenotype.java | 147 ++++++++++ .../sting/utils/gvcf/GVCFHeader.java | 180 +++++++++++++ .../sting/utils/gvcf/GVCFHeaderBuilder.java | 80 ++++++ 4 files changed, 659 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java new file mode 100644 index 000000000..8568c1aab --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.*; + +/** + * GATK binary VCF record + * + * @author Your Name + * @since Date created + */ +public class GVCF { + private final static int RECORD_TERMINATOR = 123456789; + private int chromOffset; + private int start, stop; + private String id; + private List alleleMap; + private int alleleOffsets[]; + private float qual; + private byte refPad; + private String info; + private int filterOffset; + + private List genotypes = Collections.emptyList(); + + public GVCF(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { + chromOffset = gvcfHeaderBuilder.encodeString(vc.getChr()); + start = vc.getStart(); + stop = vc.getEnd(); + refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0; + id = vc.getID(); + + // encode alleles + alleleMap = new ArrayList(vc.getNAlleles()); + alleleOffsets = new int[vc.getNAlleles()]; + alleleMap.add(vc.getReference()); + alleleOffsets[0] = gvcfHeaderBuilder.encodeAllele(vc.getReference()); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + alleleMap.add(vc.getAlternateAllele(i)); + alleleOffsets[i+1] = gvcfHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); + } + + qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); + info = infoFieldString(vc, gvcfHeaderBuilder); + filterOffset = gvcfHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); + + if ( ! skipGenotypes ) { + genotypes = encodeGenotypes(gvcfHeaderBuilder, vc); + } + } + + public GVCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + chromOffset = inputStream.readInt(); + start = inputStream.readInt(); + stop = inputStream.readInt(); + id = inputStream.readUTF(); + refPad = inputStream.readByte(); + alleleOffsets = readIntArray(inputStream); + qual = inputStream.readFloat(); + info = inputStream.readUTF(); + filterOffset = inputStream.readInt(); + + int nGenotypes = inputStream.readInt(); + int sizeOfGenotypes = inputStream.readInt(); + if ( skipGenotypes ) { + genotypes = Collections.emptyList(); + inputStream.skipBytes(sizeOfGenotypes); + } else { + genotypes = new ArrayList(nGenotypes); + for ( int i = 0; i < nGenotypes; i++ ) + genotypes.add(new GVCFGenotype(this, inputStream)); + } + + int recordDone = inputStream.readInt(); + if ( recordDone != RECORD_TERMINATOR ) + throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); + } + + public VariantContext decode(final String source, final GVCFHeader header) { + final String contig = header.getString(chromOffset); + alleleMap = header.getAlleles(alleleOffsets); + double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); + Set filters = header.getFilters(filterOffset); + Map attributes = new HashMap(); + attributes.put("INFO", info); + Byte refPadByte = refPad == 0 ? null : refPad; + Map genotypes = decodeGenotypes(header); + + return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); + } + + private Map decodeGenotypes(final GVCFHeader header) { + if ( genotypes.isEmpty() ) + return VariantContext.NO_GENOTYPES; + else { + Map map = new TreeMap(); + + for ( int i = 0; i < genotypes.size(); i++ ) { + final String sampleName = header.getSample(i); + final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap); + map.put(sampleName, g); + } + + return map; + } + } + + private List encodeGenotypes(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc) { + int nGenotypes = vc.getNSamples(); + if ( nGenotypes > 0 ) { + List genotypes = new ArrayList(nGenotypes); + for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); + + for ( Genotype g : vc.getGenotypes().values() ) { + int i = gvcfHeaderBuilder.encodeSample(g.getSampleName()); + genotypes.set(i, new GVCFGenotype(gvcfHeaderBuilder, alleleMap, g)); + } + + return genotypes; + } else { + return Collections.emptyList(); + } + } + + public int getNAlleles() { return alleleOffsets.length; } + + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeInt(chromOffset); + outputStream.writeInt(start); + outputStream.writeInt(stop); + outputStream.writeUTF(id); + outputStream.writeByte(refPad); + writeIntArray(alleleOffsets, outputStream, true); + outputStream.writeFloat(qual); + outputStream.writeUTF(info); + outputStream.writeInt(filterOffset); + + int nGenotypes = genotypes.size(); + int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; + outputStream.writeInt(nGenotypes); + outputStream.writeInt(expectedSizeOfGenotypes); + int obsSizeOfGenotypes = 0; + for ( GVCFGenotype g : genotypes ) + obsSizeOfGenotypes += g.write(outputStream); + if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) + throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); + + outputStream.writeInt(RECORD_TERMINATOR); + return outputStream.size() - startSize; + } + + private final String infoFieldString(VariantContext vc, final GVCFHeaderBuilder gvcfHeaderBuilder) { + StringBuilder s = new StringBuilder(); + + boolean first = true; + for ( Map.Entry field : vc.getAttributes().entrySet() ) { + String key = field.getKey(); + if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) + continue; + int stringIndex = gvcfHeaderBuilder.encodeString(key); + String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); + if ( outputValue != null ) { + if ( ! first ) s.append(";"); + s.append(stringIndex).append("=").append(outputValue); + first = false; + } + } + + return s.toString(); + } + + private final static int BUFFER_SIZE = 1048576; // 2**20 + public static DataOutputStream createOutputStream(final File file) throws FileNotFoundException { + return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + } + + public static DataInputStream createInputStream(final File file) throws FileNotFoundException { + return new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE)); + } + + protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException { + return readIntArray(inputStream, inputStream.readInt()); + } + + protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException { + int[] array = new int[size]; + for ( int i = 0; i < array.length; i++ ) + array[i] = inputStream.readInt(); + return array; + } + + protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException { + if ( writeSize ) outputStream.writeInt(array.length); + for ( int i : array ) + outputStream.writeInt(i); + } + + protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException { + return readByteArray(inputStream, inputStream.readInt()); + } + + protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException { + byte[] array = new byte[size]; + for ( int i = 0; i < array.length; i++ ) + array[i] = inputStream.readByte(); + return array; + } + + protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException { + if ( writeSize ) outputStream.writeInt(array.length); + for ( byte i : array ) + outputStream.writeByte(i); + } + + protected final static byte qualToByte(double phredScaledQual) { + return (byte)Math.round(Math.min(phredScaledQual, 255)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java new file mode 100644 index 000000000..2ef6d9b3a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.*; + +/** + * GATK binary VCF record + * + * @author Your Name + * @since Date created + */ +public class GVCFGenotype { + private byte gq; + private int gt; + private int dp; + private int ad[]; + private byte[] pl; + + // todo -- what to do about phasing? Perhaps we shouldn't support it + // todo -- is the FL field generic or just a flag? Should we even support per sample filtering? + + public GVCFGenotype(final GVCFHeaderBuilder gvcfHeaderBuilder, final List allAlleles, Genotype genotype) { + gq = GVCF.qualToByte(genotype.getPhredScaledQual()); + gt = encodeAlleles(genotype.getAlleles(), allAlleles); + + dp = genotype.getAttributeAsInt("DP", 0); + + int nAlleles = allAlleles.size(); + ad = new int[nAlleles]; + + int npls = nAllelesToNPls(nAlleles); + pl = new byte[npls]; + } + + private int nAllelesToNPls( int nAlleles ) { + return nAlleles*(nAlleles+1) / 2; + } + + public GVCFGenotype(GVCF gvcf, DataInputStream inputStream) throws IOException { + int gqInt = inputStream.readUnsignedByte(); + gq = (byte)gqInt; + gt = inputStream.readInt(); + dp = inputStream.readInt(); + ad = GVCF.readIntArray(inputStream, gvcf.getNAlleles()); + pl = GVCF.readByteArray(inputStream, nAllelesToNPls(gvcf.getNAlleles())); + } + + // 2 alleles => 1 + 8 + 8 + 3 => 20 + protected int sizeInBytes() { + return 1 // gq + + 4 * 2 // gt + dp + + 4 * ad.length // ad + + 1 * pl.length; // pl + } + + public Genotype decode(final String sampleName, final GVCFHeader header, GVCF gvcf, List alleleIndex) { + final List alleles = decodeAlleles(gt, alleleIndex); + final double negLog10PError = gq / 10.0; + final Set filters = Collections.emptySet(); + final Map attributes = new HashMap(); + attributes.put("DP", dp); + attributes.put("AD", ad); + attributes.put("PL", pl); + + return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false); + } + + private static int encodeAlleles(List gtList, List allAlleles) { + final int nAlleles = gtList.size(); + if ( nAlleles > 4 ) + throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList); + + int gtInt = 0; + for ( int i = 0; i < nAlleles ; i++ ) { + final int bitOffset = i * 8; + final int allelei = getAlleleIndex(gtList.get(i), allAlleles); + final int gti = (allelei + 1) << bitOffset; + gtInt = gtInt | gti; + } + + return gtInt; + } + + private static int getAlleleIndex(Allele q, List allAlleles) { + if ( q.isNoCall() ) + return 254; + for ( int i = 0; i < allAlleles.size(); i++ ) + if ( q.equals(allAlleles.get(i)) ) + return i; + throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles); + } + + private static List decodeAlleles(int gtInt, List alleleIndex) { + List alleles = new ArrayList(4); + + for ( int i = 0; i < 32; i += 8 ) { + final int gi = (gtInt & (0x000000FF << i)) >> i; + if ( gi != 0 ) { + final int allelei = gi - 1; + alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) ); + } else { + break; + } + } + + return alleles; + } + + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeByte(gq); + outputStream.writeInt(gt); + outputStream.writeInt(dp); + GVCF.writeIntArray(ad, outputStream, false); + GVCF.writeByteArray(pl, outputStream, false); + return outputStream.size() - startSize; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java new file mode 100644 index 000000000..c52c975bd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.*; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GVCFHeader { + final protected static Logger logger = Logger.getLogger(GVCFHeader.class); + + private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); + final List alleles; + final List strings; + final List samples; + final List> filters; + + public GVCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + this.alleles = linearize(allelesIn); + this.strings = linearize(stringIn); + this.samples = linearize(samplesIn); + this.filters = null; // not used with this constructor + } + + public GVCFHeader(DataInputStream inputStream) throws IOException { + byte[] headerTest = new byte[MAGIC_HEADER.length]; + inputStream.read(headerTest); + if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { + throw new UserException("Could not read GVCF file. MAGIC_HEADER missing. Saw " + headerTest); + } else { + alleles = stringsToAlleles(readStrings(inputStream)); + strings = readStrings(inputStream); + samples = readStrings(inputStream); + logger.info(String.format("Allele map of %d elements", alleles.size())); + logger.info(String.format("String map of %d elements", strings.size())); + logger.info(String.format("Sample map of %d elements", samples.size())); + filters = initializeFilterCache(); + } + } + + public int write(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.write(MAGIC_HEADER); + write(outputStream, allelesToStrings(alleles)); + write(outputStream, strings); + write(outputStream, samples); + return outputStream.size() - startBytes; + } + + public void write(DataOutputStream outputStream, List l) throws IOException { + outputStream.writeInt(l.size()); + for ( String elt : l ) outputStream.writeUTF(elt); + } + + private List allelesToStrings(List alleles) { + List strings = new ArrayList(alleles.size()); + for ( Allele allele : alleles ) strings.add(allele.toString()); + return strings; + } + + private List> initializeFilterCache() { + // required to allow offset -> set lookup + List> l = new ArrayList>(strings.size()); + for ( int i = 0; i < strings.size(); i++ ) l.add(null); + return l; + } + + private static List stringsToAlleles(final List strings) { + final List alleles = new ArrayList(strings.size()); + for ( String string : strings ) { + boolean isRef = string.endsWith("*"); + if ( isRef ) string = string.substring(0, string.length() - 1); + alleles.add(Allele.create(string, isRef)); + } + return alleles; + } + + private static List readStrings(final DataInputStream inputStream) throws IOException { + final int nStrings = inputStream.readInt(); + + final List strings = new ArrayList(nStrings); + for ( int i = 0; i < nStrings; i++ ) { + strings.add(inputStream.readUTF()); + } + + return strings; + } + + private static List linearize(final Map map) { + final ArrayList l = new ArrayList(map.size()); + for ( int i = 0; i < map.size(); i++ ) l.add(null); + for ( final Map.Entry elt : map.entrySet() ) + l.set(elt.getValue(), elt.getKey()); + return l; + } + + public String getSample(final int offset) { return samples.get(offset); } + public String getString(final int offset) { return strings.get(offset); } + public Allele getAllele(final int offset) { return alleles.get(offset); } + public List getAlleles(final int[] offsets) { + final List alleles = new ArrayList(offsets.length); + for ( int i : offsets ) alleles.add(getAllele(i)); + return alleles; + } + + public Set getFilters(final int offset) { + Set cached = filters.get(offset); + + if ( cached != null ) + return cached; + else { + final String filterString = getString(offset); + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; // UNFILTERED records are represented by null + else { + Set set = VCFCodec.parseFilters(null, -1, filterString); + filters.set(offset, set); // remember the result + return set; + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java new file mode 100644 index 000000000..2d045b8ea --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.HashMap; +import java.util.Map; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GVCFHeaderBuilder { + Map alleles = new HashMap(); + Map strings = new HashMap(); + Map samples = new HashMap(); + + public GVCFHeader createHeader() { + return new GVCFHeader(alleles, strings, samples); + } + + public int encodeString(final String chr) { return encode(strings, chr); } + public int encodeAllele(final Allele allele) { return encode(alleles, allele); } + public int encodeSample(final String sampleName) { return encode(samples, sampleName); } + + private int encode(Map map, T key) { + Integer v = map.get(key); + if ( v == null ) { + v = map.size(); + map.put(key, v); + } + return v; + } +} From 6ff432e1f24860e8821e9f55fe71a0e470dce202 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:50:17 -0400 Subject: [PATCH 008/113] BugFix for TF argument to VariantEval, actually making it work properly --- .../varianteval/VariantEvalWalker.java | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index fe4729bdc..65e3d3e5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.util.*; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; @@ -24,6 +25,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -224,12 +226,6 @@ public class VariantEvalWalker extends RodWalker implements Tr } sampleNamesForStratification.add(ALL_SAMPLE_NAME); - // Initialize select expressions - for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { - SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); - jexlExpressions.add(sjexl); - } - // Add select expressions for anything in the tranches file if ( TRANCHE_FILENAME != null ) { // we are going to build a few select names automatically from the tranches file @@ -240,16 +236,27 @@ public class VariantEvalWalker extends RodWalker implements Tr } } + // Initialize select expressions + for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { + SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); + jexlExpressions.add(sjexl); + } + // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); + boolean usingJEXL = false; for ( VariantStratifier vs : getStratificationObjects() ) { if ( vs.getClass().getSimpleName().equals("Filter") ) byFilterIsEnabled = true; else if ( vs.getClass().getSimpleName().equals("Sample") ) perSampleIsEnabled = true; + usingJEXL = usingJEXL || vs.getClass().equals(JexlExpression.class); } + if ( TRANCHE_FILENAME != null && ! usingJEXL ) + throw new UserException.BadArgumentValue("tf", "Requires the JexlExpression ST to enabled"); + // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); From d23d62049439870ea33fb4d1759c2349f2ad154d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:52:33 -0400 Subject: [PATCH 009/113] Pushing traversal engine timer start to as close to actual start as possible -- Should make initial timings more accurate --- .../gatk/executive/LinearMicroScheduler.java | 2 +- .../sting/gatk/executive/ShardTraverser.java | 1 + .../sting/gatk/traversals/TraversalEngine.java | 15 ++++++++++----- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 65ff27497..09ab4bd44 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -44,7 +44,6 @@ public class LinearMicroScheduler extends MicroScheduler { * @param shardStrategy A strategy for sharding the data. */ public Object execute(Walker walker, ShardStrategy shardStrategy) { - traversalEngine.startTimers(); walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); @@ -54,6 +53,7 @@ public class LinearMicroScheduler extends MicroScheduler { if ( done || shard == null ) // we ran out of shards that aren't owned break; + traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 6136bd68d..2b6488ada 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -57,6 +57,7 @@ public class ShardTraverser implements Callable { public Object call() { try { + traversalEngine.startTimersIfNecessary(); long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 89a179d0e..dc6ab240e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -115,7 +115,7 @@ public abstract class TraversalEngine,Provide LinkedList history = new LinkedList(); /** We use the SimpleTimer to time our run */ - private SimpleTimer timer = new SimpleTimer("Traversal"); + private SimpleTimer timer = null; // How long can we go without printing some progress info? private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; @@ -209,11 +209,16 @@ public abstract class TraversalEngine,Provide } } /** - * Should be called to indicate that we're going to process records and the timer should start ticking + * Should be called to indicate that we're going to process records and the timer should start ticking. This + * function should be called right before any traversal work is done, to avoid counting setup costs in the + * processing costs and inflating the estimated runtime. */ - public void startTimers() { - timer.start(); - lastProgressPrintTime = timer.currentTime(); + public void startTimersIfNecessary() { + if ( timer == null ) { + timer = new SimpleTimer("Traversal"); + timer.start(); + lastProgressPrintTime = timer.currentTime(); + } } /** From 7e9e20fed0ad5d9f2491f782de3c3850ad341a57 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:54:52 -0400 Subject: [PATCH 010/113] Forgot to delete previous call --- .../sting/gatk/executive/HierarchicalMicroScheduler.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 59fb4aa9e..3b9e35311 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -97,7 +97,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); - traversalEngine.startTimers(); ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); From 430da2344609582d8de24edf52ed34cd2a426607 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 13:13:07 -0400 Subject: [PATCH 011/113] At least 2 minutes must pass before a status message is printed, further stabilizing time estimates --- .../broadinstitute/sting/gatk/traversals/TraversalEngine.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index dc6ab240e..27fd173cb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -121,6 +121,7 @@ public abstract class TraversalEngine,Provide private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? + private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; @@ -229,7 +230,8 @@ public abstract class TraversalEngine,Provide * @return true if the maximum interval (in millisecs) has passed since the last printing */ private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { - return (curTime - lastPrintTime) > printFreq; + long elapsed = curTime - lastPrintTime; + return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; } /** From 5f22ef9a8c4b5fdf04b5730dc5d27ffa63c6f73c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 7 Sep 2011 13:21:11 -0400 Subject: [PATCH 012/113] Added missing javadoc info to Beagle arguments --- .../beagle/BeagleOutputToVCFWalker.java | 21 +++++++++++++++++++ .../beagle/ProduceBeagleInputWalker.java | 3 +++ 2 files changed, 24 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 60f0fcb0a..880dba5d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -79,24 +79,45 @@ public class BeagleOutputToVCFWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * If this argument is present, the original allele frequencies and counts from this vcf are added as annotations ACH,AFH and ANH. at each record present in this vcf + */ @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public RodBinding comp; + + /** + * This required argument is used to annotate each site in the vcf INFO field with R2 annotation. Will be NaN if Beagle determined there are no variant samples. + */ @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true) public RodBinding beagleR2; + /** + * These values will populate the GL field for each sample and contain the posterior probability of each genotype given the data after phasing and imputation. + */ @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true) public RodBinding beagleProbs; + /** + * By default, all genotypes will be marked in the VCF as "phased", using the "|" separator after Beagle. + */ @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true) public RodBinding beaglePhased; @Output(doc="VCF File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; + /** + * If this argument is absent, and if Beagle determines that there is no sample in a site that has a variant genotype, the site will be marked as filtered (Default behavior). + * If the argument is present, the site won't be marked as filtered under this condition even if there are no variant genotypes. + */ @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false) public boolean DONT_FILTER_MONOMORPHIC_SITES = false; + /** + * Value between 0 and 1. If the probability of getting a genotype correctly (based on the posterior genotype probabilities and the actual genotype) is below this threshold, + * a genotype will be substitute by a no-call. + */ @Argument(fullName="no" + "call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) private double noCallThreshold = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 07793fd7b..87695077d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -112,6 +112,9 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) VCFWriter bootstrapVCFOutput = null; + /** + * If sample gender is known, this flag should be set to true to ensure that Beagle treats male Chr X properly. + */ @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false) public boolean CHECK_IS_MALE_ON_CHR_X = false; From ee9d59955857f3b3f0346c027251dc16592dde71 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 7 Sep 2011 13:31:20 -0400 Subject: [PATCH 013/113] Just cleaning up clean up old commented code from tha data processing pipeline. --- .../qscripts/DataProcessingPipeline.scala | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 2a135496d..f97ce4884 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -37,11 +37,6 @@ class DataProcessingPipeline extends QScript { * Optional Parameters ****************************************************************************/ - -// @Input(doc="path to Picard's SortSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_sort_jar", shortName="sort", required=false) -// var sortSamJar: File = _ -// - @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) var indels: List[File] = List() @@ -132,24 +127,6 @@ class DataProcessingPipeline extends QScript { } } return sampleTable.toMap - -// println("\n\n*** INPUT FILES ***\n") -// // Creating one file for each sample in the dataset -// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] -// for ((sample, flist) <- sampleTable) { -// -// println(sample + ":") -// for (f <- flist) -// println (f) -// println() -// -// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") -// sampleBamFiles(sample) = sampleFileName -// //add(writeList(flist, sampleFileName)) -// } -// println("*** INPUT FILES ***\n\n") -// -// return sampleBamFiles.toMap } // Rebuilds the Read Group string to give BWA From 3a04955a3085cd87bfec758eb144e78d5bf19b20 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 7 Sep 2011 14:01:42 -0400 Subject: [PATCH 014/113] We already had isPolymorphic and isMonomorphic in the VariantContext, but the implementation was incorrect for many edge cases (e.g. sites-only files, sites with samples who were no-called). Fixing. Moving on to VE now. --- .../sting/utils/variantcontext/VariantContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 673fe4529..699133e38 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -983,7 +983,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return true if it's monomorphic */ public boolean isMonomorphic() { - return ! isVariant() || getChromosomeCount(getReference()) == getChromosomeCount(); + return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); } /** From 9127849f5d2871621945a4f005af91dc7cfa8dd9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 14:54:10 -0400 Subject: [PATCH 016/113] BugFix for unit test --- .../sting/gatk/traversals/TraverseReadsUnitTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index c0d32a05b..7f4d96add 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -127,6 +127,7 @@ public class TraverseReadsUnitTest extends BaseTest { Object accumulator = countReadWalker.reduceInit(); while (shardStrategy.hasNext()) { + traversalEngine.startTimersIfNecessary(); Shard shard = shardStrategy.next(); if (shard == null) { From aa9e32f2f115a81b643b52317d40fc46e79195ef Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 7 Sep 2011 15:48:06 -0400 Subject: [PATCH 017/113] Reverting Mark's previous commit as per the open discussion. Now the eval modules check isPolymorphic() before accruing stats when appropriate. Fixed the IndelLengthHistogram module not to error out if the indel isn't simple (that would have been bad). Only integration test that needed to be updated was the tranches one based on a separate commit from Mark. --- .../varianteval/evaluators/CompOverlap.java | 2 +- .../varianteval/evaluators/CountVariants.java | 3 +- .../evaluators/IndelLengthHistogram.java | 15 +-- .../evaluators/IndelStatistics.java | 2 +- .../evaluators/SimpleMetricsByAC.java | 2 +- .../evaluators/ThetaVariantEvaluator.java | 103 +++++++++--------- .../evaluators/TiTvVariantEvaluator.java | 2 +- .../evaluators/ValidationReport.java | 5 +- .../evaluators/VariantQualityScore.java | 2 +- .../varianteval/util/VariantEvalUtils.java | 2 +- .../VariantEvalIntegrationTest.java | 2 +- 11 files changed, 69 insertions(+), 71 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 2ea64c49c..5ccacac37 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -75,7 +75,7 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { } public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean evalIsGood = eval != null && eval.isVariant(); + boolean evalIsGood = eval != null && eval.isPolymorphic(); boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType()); if (compIsGood) nCompVariants++; // count the number of comp events diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 59ef3d992..2913c97a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -100,11 +100,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval { // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've // added in a proxy check for monomorphic status here. // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. - if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) { + if ( vc1.isMonomorphic() ) { nRefLoci++; } else { switch (vc1.getType()) { case NO_VARIATION: + // shouldn't get here break; case SNP: nVariantLoci++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index 35fffd815..ffe7c185f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator { public int getComparisonOrder() { return 1; } // need only the evals public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ! vc1.isBiallelic() && vc1.isIndel() ) { - //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); - return vc1.toString(); // biallelic sites are output - } - if ( vc1.isIndel() ) { + if ( vc1.isIndel() && vc1.isPolymorphic() ) { + + if ( ! vc1.isBiallelic() ) { + //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); + return vc1.toString(); // biallelic sites are output + } + + // only count simple insertions/deletions, not complex indels if ( vc1.isSimpleInsertion() ) { indelHistogram.update(vc1.getAlternateAllele(0).length()); } else if ( vc1.isSimpleDeletion() ) { indelHistogram.update(-vc1.getReference().length()); - } else { - throw new ReviewedStingException("Indel type that is not insertion or deletion."); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index fc347339d..f70e6c2de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -270,7 +270,7 @@ public class IndelStatistics extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (eval != null ) { + if (eval != null && eval.isPolymorphic()) { if ( indelStats == null ) { indelStats = new IndelStats(eval); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java index d466645ea..203c15a85 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java @@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval } } - if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) { + if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) { metrics.incrValue(eval); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index ec43cbd55..e51623c3c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator { } public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) { + if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) { return null; //no interesting sites } - if (vc.hasGenotypes()) { + //this maps allele to a count + ConcurrentMap alleleCounts = new ConcurrentHashMap(); - //this maps allele to a count - ConcurrentMap alleleCounts = new ConcurrentHashMap(); + int numHetsHere = 0; + float numGenosHere = 0; + int numIndsHere = 0; - int numHetsHere = 0; - float numGenosHere = 0; - int numIndsHere = 0; + for (Genotype genotype : vc.getGenotypes().values()) { + numIndsHere++; + if (!genotype.isNoCall()) { + //increment stats for heterozygosity + if (genotype.isHet()) { + numHetsHere++; + } - for (Genotype genotype : vc.getGenotypes().values()) { - numIndsHere++; - if (!genotype.isNoCall()) { - //increment stats for heterozygosity - if (genotype.isHet()) { - numHetsHere++; - } + numGenosHere++; + //increment stats for pairwise mismatches - numGenosHere++; - //increment stats for pairwise mismatches - - for (Allele allele : genotype.getAlleles()) { - if (allele.isNonNull() && allele.isCalled()) { - String alleleString = allele.toString(); - alleleCounts.putIfAbsent(alleleString, 0); - alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); - } + for (Allele allele : genotype.getAlleles()) { + if (allele.isNonNull() && allele.isCalled()) { + String alleleString = allele.toString(); + alleleCounts.putIfAbsent(alleleString, 0); + alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); } } } - if (numGenosHere > 0) { - //only if have one called genotype at least - this.numSites++; + } + if (numGenosHere > 0) { + //only if have one called genotype at least + this.numSites++; - this.totalHet += numHetsHere / numGenosHere; + this.totalHet += numHetsHere / numGenosHere; - //compute based on num sites - float harmonicFactor = 0; - for (int i = 1; i <= numIndsHere; i++) { - harmonicFactor += 1.0 / i; - } - this.thetaRegionNumSites += 1.0 / harmonicFactor; + //compute based on num sites + float harmonicFactor = 0; + for (int i = 1; i <= numIndsHere; i++) { + harmonicFactor += 1.0 / i; + } + this.thetaRegionNumSites += 1.0 / harmonicFactor; - //now compute pairwise mismatches - float numPairwise = 0; - float numDiffs = 0; - for (String allele1 : alleleCounts.keySet()) { - int allele1Count = alleleCounts.get(allele1); + //now compute pairwise mismatches + float numPairwise = 0; + float numDiffs = 0; + for (String allele1 : alleleCounts.keySet()) { + int allele1Count = alleleCounts.get(allele1); - for (String allele2 : alleleCounts.keySet()) { - if (allele1.compareTo(allele2) < 0) { - continue; - } - if (allele1 .compareTo(allele2) == 0) { - numPairwise += allele1Count * (allele1Count - 1) * .5; + for (String allele2 : alleleCounts.keySet()) { + if (allele1.compareTo(allele2) < 0) { + continue; + } + if (allele1 .compareTo(allele2) == 0) { + numPairwise += allele1Count * (allele1Count - 1) * .5; - } - else { - int allele2Count = alleleCounts.get(allele2); - numPairwise += allele1Count * allele2Count; - numDiffs += allele1Count * allele2Count; - } + } + else { + int allele2Count = alleleCounts.get(allele2); + numPairwise += allele1Count * allele2Count; + numDiffs += allele1Count * allele2Count; } } + } - if (numPairwise > 0) { - this.totalAvgDiffs += numDiffs / numPairwise; - } + if (numPairwise > 0) { + this.totalAvgDiffs += numDiffs / numPairwise; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index be957abd7..1feb37e01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } public void updateTiTv(VariantContext vc, boolean updateStandard) { - if (vc != null && vc.isSNP() && vc.isBiallelic()) { + if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) { if (VariantContextUtils.isTransition(vc)) { if (updateStandard) nTiInComp++; else nTi++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 9c331b577..307b4f684 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { public SiteStatus calcSiteStatus(VariantContext vc) { if ( vc == null ) return SiteStatus.NO_CALL; if ( vc.isFiltered() ) return SiteStatus.FILTERED; - if ( ! vc.isVariant() ) return SiteStatus.MONO; + if ( vc.isMonomorphic() ) return SiteStatus.MONO; + if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { int ac = 0; @@ -132,8 +133,6 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { else ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY); return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO; - } else if ( vc.hasGenotypes() ) { - return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO; } else { return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do //return SiteStatus.NO_CALL; // we can't figure out what to do diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index b6ad55b18..263227938 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final String interesting = null; - if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) + if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) if( titvStats == null ) { titvStats = new TiTvStats(); } titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 3cc039141..92e7c6554 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -277,7 +277,7 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { - VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values()); + VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles()); HashMap newAts = new HashMap(vcsub.getAttributes()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3503a2353..7b6d13223 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -264,7 +264,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testTranches() { String extraArgs = "-T VariantEval -R "+ hg18Reference +" --eval " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -EV TiTvVariantEvaluator -L chr1 -noEV -ST CpG -tf " + testDir + "tranches.6.txt"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("984df6e94a546294fc7e0846cbac2dfe")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("6af2b9959aa1778a5b712536de453952")); executeTestParallel("testTranches",spec); } From 2ded0277628e97e0363b8af051580a87786f0459 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 16:09:24 -0400 Subject: [PATCH 018/113] Removed dysfunctional tranches support from VariantEval --- .../walkers/varianteval/VariantEvalWalker.java | 18 ------------------ .../VariantEvalIntegrationTest.java | 2 +- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 65e3d3e5a..0d09b7033 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -149,9 +149,6 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; - @Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false) - private String TRANCHE_FILENAME = null; - @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; @@ -226,16 +223,6 @@ public class VariantEvalWalker extends RodWalker implements Tr } sampleNamesForStratification.add(ALL_SAMPLE_NAME); - // Add select expressions for anything in the tranches file - if ( TRANCHE_FILENAME != null ) { - // we are going to build a few select names automatically from the tranches file - for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) { - logger.info("Adding select for all variant above the pCut of : " + t); - SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod)); - SELECT_NAMES.add(String.format("TS-%.2f", t.ts)); - } - } - // Initialize select expressions for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); @@ -245,18 +232,13 @@ public class VariantEvalWalker extends RodWalker implements Tr // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); - boolean usingJEXL = false; for ( VariantStratifier vs : getStratificationObjects() ) { if ( vs.getClass().getSimpleName().equals("Filter") ) byFilterIsEnabled = true; else if ( vs.getClass().getSimpleName().equals("Sample") ) perSampleIsEnabled = true; - usingJEXL = usingJEXL || vs.getClass().equals(JexlExpression.class); } - if ( TRANCHE_FILENAME != null && ! usingJEXL ) - throw new UserException.BadArgumentValue("tf", "Requires the JexlExpression ST to enabled"); - // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 7b6d13223..6c4393d6a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -261,7 +261,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { return String.format("%s -select '%s' -selectName %s", cmd, select, name); } - @Test + @Test(enabled = false) // no longer supported in the GATK public void testTranches() { String extraArgs = "-T VariantEval -R "+ hg18Reference +" --eval " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -EV TiTvVariantEvaluator -L chr1 -noEV -ST CpG -tf " + testDir + "tranches.6.txt"; WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("6af2b9959aa1778a5b712536de453952")); From 9604fb2ba34d619d96459938e32d757758216c91 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 7 Sep 2011 16:49:16 -0400 Subject: [PATCH 019/113] Necessary but not sufficient step to fix GenotypeGivenAlleles mode in UG which is now busted --- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 06455df6d..b1332bdf9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; @@ -239,7 +236,8 @@ public class UnifiedGenotyperEngine { VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles); if ( vcInput == null ) return null; - vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()); + vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase()); + } else { // deal with bad/non-standard reference bases if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) From 01b6177ce15c2d4270ac863da1a2e4e43e020411 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 17:10:56 -0400 Subject: [PATCH 020/113] Renaming GVCF -> GCF --- .../utils/{gvcf/GVCF.java => gcf/GCF.java} | 47 +++++++++---------- .../GCFGenotype.java} | 20 ++++---- .../GVCFHeader.java => gcf/GCFHeader.java} | 12 ++--- .../GCFHeaderBuilder.java} | 8 ++-- 4 files changed, 41 insertions(+), 46 deletions(-) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCF.java => gcf/GCF.java} (83%) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCFGenotype.java => gcf/GCFGenotype.java} (86%) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCFHeader.java => gcf/GCFHeader.java} (92%) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCFHeaderBuilder.java => gcf/GCFHeaderBuilder.java} (93%) diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java similarity index 83% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index 8568c1aab..5ab241ebf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -22,12 +22,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -42,7 +39,7 @@ import java.util.*; * @author Your Name * @since Date created */ -public class GVCF { +public class GCF { private final static int RECORD_TERMINATOR = 123456789; private int chromOffset; private int start, stop; @@ -54,10 +51,10 @@ public class GVCF { private String info; private int filterOffset; - private List genotypes = Collections.emptyList(); + private List genotypes = Collections.emptyList(); - public GVCF(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { - chromOffset = gvcfHeaderBuilder.encodeString(vc.getChr()); + public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { + chromOffset = GCFHeaderBuilder.encodeString(vc.getChr()); start = vc.getStart(); stop = vc.getEnd(); refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0; @@ -67,22 +64,22 @@ public class GVCF { alleleMap = new ArrayList(vc.getNAlleles()); alleleOffsets = new int[vc.getNAlleles()]; alleleMap.add(vc.getReference()); - alleleOffsets[0] = gvcfHeaderBuilder.encodeAllele(vc.getReference()); + alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference()); for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { alleleMap.add(vc.getAlternateAllele(i)); - alleleOffsets[i+1] = gvcfHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); + alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); } qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); - info = infoFieldString(vc, gvcfHeaderBuilder); - filterOffset = gvcfHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); + info = infoFieldString(vc, GCFHeaderBuilder); + filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); if ( ! skipGenotypes ) { - genotypes = encodeGenotypes(gvcfHeaderBuilder, vc); + genotypes = encodeGenotypes(GCFHeaderBuilder, vc); } } - public GVCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { chromOffset = inputStream.readInt(); start = inputStream.readInt(); stop = inputStream.readInt(); @@ -99,9 +96,9 @@ public class GVCF { genotypes = Collections.emptyList(); inputStream.skipBytes(sizeOfGenotypes); } else { - genotypes = new ArrayList(nGenotypes); + genotypes = new ArrayList(nGenotypes); for ( int i = 0; i < nGenotypes; i++ ) - genotypes.add(new GVCFGenotype(this, inputStream)); + genotypes.add(new GCFGenotype(this, inputStream)); } int recordDone = inputStream.readInt(); @@ -109,7 +106,7 @@ public class GVCF { throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); } - public VariantContext decode(final String source, final GVCFHeader header) { + public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); @@ -122,7 +119,7 @@ public class GVCF { return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); } - private Map decodeGenotypes(final GVCFHeader header) { + private Map decodeGenotypes(final GCFHeader header) { if ( genotypes.isEmpty() ) return VariantContext.NO_GENOTYPES; else { @@ -138,15 +135,15 @@ public class GVCF { } } - private List encodeGenotypes(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc) { + private List encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) { int nGenotypes = vc.getNSamples(); if ( nGenotypes > 0 ) { - List genotypes = new ArrayList(nGenotypes); + List genotypes = new ArrayList(nGenotypes); for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); for ( Genotype g : vc.getGenotypes().values() ) { - int i = gvcfHeaderBuilder.encodeSample(g.getSampleName()); - genotypes.set(i, new GVCFGenotype(gvcfHeaderBuilder, alleleMap, g)); + int i = GCFHeaderBuilder.encodeSample(g.getSampleName()); + genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g)); } return genotypes; @@ -174,7 +171,7 @@ public class GVCF { outputStream.writeInt(nGenotypes); outputStream.writeInt(expectedSizeOfGenotypes); int obsSizeOfGenotypes = 0; - for ( GVCFGenotype g : genotypes ) + for ( GCFGenotype g : genotypes ) obsSizeOfGenotypes += g.write(outputStream); if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); @@ -183,7 +180,7 @@ public class GVCF { return outputStream.size() - startSize; } - private final String infoFieldString(VariantContext vc, final GVCFHeaderBuilder gvcfHeaderBuilder) { + private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) { StringBuilder s = new StringBuilder(); boolean first = true; @@ -191,7 +188,7 @@ public class GVCF { String key = field.getKey(); if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) continue; - int stringIndex = gvcfHeaderBuilder.encodeString(key); + int stringIndex = GCFHeaderBuilder.encodeString(key); String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); if ( outputValue != null ) { if ( ! first ) s.append(";"); diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java similarity index 86% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java index 2ef6d9b3a..dd1fb091c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -38,7 +38,7 @@ import java.util.*; * @author Your Name * @since Date created */ -public class GVCFGenotype { +public class GCFGenotype { private byte gq; private int gt; private int dp; @@ -48,8 +48,8 @@ public class GVCFGenotype { // todo -- what to do about phasing? Perhaps we shouldn't support it // todo -- is the FL field generic or just a flag? Should we even support per sample filtering? - public GVCFGenotype(final GVCFHeaderBuilder gvcfHeaderBuilder, final List allAlleles, Genotype genotype) { - gq = GVCF.qualToByte(genotype.getPhredScaledQual()); + public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List allAlleles, Genotype genotype) { + gq = GCF.qualToByte(genotype.getPhredScaledQual()); gt = encodeAlleles(genotype.getAlleles(), allAlleles); dp = genotype.getAttributeAsInt("DP", 0); @@ -65,13 +65,13 @@ public class GVCFGenotype { return nAlleles*(nAlleles+1) / 2; } - public GVCFGenotype(GVCF gvcf, DataInputStream inputStream) throws IOException { + public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException { int gqInt = inputStream.readUnsignedByte(); gq = (byte)gqInt; gt = inputStream.readInt(); dp = inputStream.readInt(); - ad = GVCF.readIntArray(inputStream, gvcf.getNAlleles()); - pl = GVCF.readByteArray(inputStream, nAllelesToNPls(gvcf.getNAlleles())); + ad = GCF.readIntArray(inputStream, GCF.getNAlleles()); + pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles())); } // 2 alleles => 1 + 8 + 8 + 3 => 20 @@ -82,7 +82,7 @@ public class GVCFGenotype { + 1 * pl.length; // pl } - public Genotype decode(final String sampleName, final GVCFHeader header, GVCF gvcf, List alleleIndex) { + public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) { final List alleles = decodeAlleles(gt, alleleIndex); final double negLog10PError = gq / 10.0; final Set filters = Collections.emptySet(); @@ -140,8 +140,8 @@ public class GVCFGenotype { outputStream.writeByte(gq); outputStream.writeInt(gt); outputStream.writeInt(dp); - GVCF.writeIntArray(ad, outputStream, false); - GVCF.writeByteArray(pl, outputStream, false); + GCF.writeIntArray(ad, outputStream, false); + GCF.writeByteArray(pl, outputStream, false); return outputStream.size() - startSize; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java similarity index 92% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java index c52c975bd..d0c765cc4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java @@ -22,11 +22,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -64,8 +62,8 @@ import java.util.*; * @author Your Name * @since Date created */ -public class GVCFHeader { - final protected static Logger logger = Logger.getLogger(GVCFHeader.class); +public class GCFHeader { + final protected static Logger logger = Logger.getLogger(GCFHeader.class); private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); final List alleles; @@ -73,14 +71,14 @@ public class GVCFHeader { final List samples; final List> filters; - public GVCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { this.alleles = linearize(allelesIn); this.strings = linearize(stringIn); this.samples = linearize(samplesIn); this.filters = null; // not used with this constructor } - public GVCFHeader(DataInputStream inputStream) throws IOException { + public GCFHeader(DataInputStream inputStream) throws IOException { byte[] headerTest = new byte[MAGIC_HEADER.length]; inputStream.read(headerTest); if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java similarity index 93% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java index 2d045b8ea..40e01ec72 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -56,13 +56,13 @@ import java.util.Map; * @author Your Name * @since Date created */ -public class GVCFHeaderBuilder { +public class GCFHeaderBuilder { Map alleles = new HashMap(); Map strings = new HashMap(); Map samples = new HashMap(); - public GVCFHeader createHeader() { - return new GVCFHeader(alleles, strings, samples); + public GCFHeader createHeader() { + return new GCFHeader(alleles, strings, samples); } public int encodeString(final String chr) { return encode(strings, chr); } From fe5724b6ea7c77f3f38f35b2d29d118860b6fc2a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 23:27:08 -0400 Subject: [PATCH 023/113] Refactored indexing part of StandardVCFWriter into superclass -- Now other implementations of the VCFWriter can easily share common functions, such as writing an index on the fly --- .../utils/codecs/vcf/IndexingVCFWriter.java | 116 ++++++++++++++++++ .../utils/codecs/vcf/StandardVCFWriter.java | 107 +++++----------- 2 files changed, 148 insertions(+), 75 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java new file mode 100644 index 000000000..632bf8ed3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.vcf; + +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; +import org.broad.tribble.index.DynamicIndexCreator; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * this class writes VCF files + */ +public abstract class IndexingVCFWriter implements VCFWriter { + final private File indexFile; + final private String name; + + private PositionalStream positionalStream; + private DynamicIndexCreator indexer; + private LittleEndianOutputStream idxStream; + + protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) { + this.name = name; + + if ( enableOnTheFlyIndexing ) { + indexFile = Tribble.indexFile(location); + try { + idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + //System.out.println("Creating index on the fly for " + location); + indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + indexer.initialize(location, indexer.defaultBinSize()); + positionalStream = new PositionalStream(output); + } catch ( IOException ex ) { + // No matter what we keep going, since we don't care if we can't create the index file + } + } else { + idxStream = null; + indexer = null; + positionalStream = null; + indexFile = null; + } + } + + public String getStreamName() { + return name; + } + + public abstract void writeHeader(VCFHeader header); + + /** + * attempt to close the VCF file + */ + public void close() { + // try to close the index stream (keep it separate to help debugging efforts) + if ( indexer != null ) { + try { + Index index = indexer.finalizeIndex(positionalStream.getPosition()); + index.write(idxStream); + idxStream.close(); + } catch (IOException e) { + throw new ReviewedStingException("Unable to close index for " + getStreamName(), e); + } + } + } + + /** + * add a record to the file + * + * @param vc the Variant Context object + */ + public void add(VariantContext vc) { + // if we are doing on the fly indexing, add the record ***before*** we write any bytes + if ( indexer != null ) + indexer.addFeature(vc, positionalStream.getPosition()); + } + + protected static final String writerName(File location, OutputStream stream) { + return location == null ? stream.toString() : location.getAbsolutePath(); + } + + protected static OutputStream openOutputStream(File location) { + try { + return new FileOutputStream(location); + } catch (FileNotFoundException e) { + throw new ReviewedStingException("Unable to create VCF file at location: " + location, e); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index e28cd7598..ebcba9635 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -44,26 +44,19 @@ import java.util.*; /** * this class writes VCF files */ -public class StandardVCFWriter implements VCFWriter { +public class StandardVCFWriter extends IndexingVCFWriter { + // the print stream we're writing to + final protected BufferedWriter mWriter; + + // should we write genotypes or just sites? + final protected boolean doNotWriteGenotypes; // the VCF header we're storing protected VCFHeader mHeader = null; - // the print stream we're writing to - protected BufferedWriter mWriter; - protected PositionalStream positionalStream = null; - // were filters applied? protected boolean filtersWereAppliedToContext = false; - // should we write genotypes or just sites? - protected boolean doNotWriteGenotypes = false; - - protected DynamicIndexCreator indexer = null; - protected File indexFile = null; - LittleEndianOutputStream idxStream = null; - File location = null; - /** * create a VCF writer, given a file to write to * @@ -93,32 +86,22 @@ public class StandardVCFWriter implements VCFWriter { * @param doNotWriteGenotypes do not write genotypes */ public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) { - mWriter = new BufferedWriter(new OutputStreamWriter(output)); - this.doNotWriteGenotypes = doNotWriteGenotypes; + this(null, output, false, doNotWriteGenotypes); } public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - this.location = location; - - if ( enableOnTheFlyIndexing ) { - indexFile = Tribble.indexFile(location); - try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - //System.out.println("Creating index on the fly for " + location); - indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - indexer.initialize(location, indexer.defaultBinSize()); - positionalStream = new PositionalStream(output); - output = positionalStream; - } catch ( IOException ex ) { - // No matter what we keep going, since we don't care if we can't create the index file - } - } - - //mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output))); - mWriter = new BufferedWriter(new OutputStreamWriter(output)); + super(writerName(location, output), location, output, enableOnTheFlyIndexing); + mWriter = new BufferedWriter(new OutputStreamWriter(output)); // todo -- fix buffer size this.doNotWriteGenotypes = doNotWriteGenotypes; } + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override public void writeHeader(VCFHeader header) { mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header; @@ -158,44 +141,24 @@ public class StandardVCFWriter implements VCFWriter { mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new TribbleException("IOException writing the VCF header to " + locationString(), e); + throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e); } } - private String locationString() { - return location == null ? mWriter.toString() : location.getAbsolutePath(); - } - /** * attempt to close the VCF file */ + @Override public void close() { // try to close the vcf stream try { mWriter.flush(); mWriter.close(); } catch (IOException e) { - throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage()); + throw new ReviewedStingException("Unable to close " + getStreamName(), e); } - // try to close the index stream (keep it separate to help debugging efforts) - if ( indexer != null ) { - try { - Index index = indexer.finalizeIndex(positionalStream.getPosition()); - index.write(idxStream); - idxStream.close(); - } catch (IOException e) { - throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage()); - } - } - } - - protected static OutputStream openOutputStream(File location) { - try { - return new FileOutputStream(location); - } catch (FileNotFoundException e) { - throw new TribbleException("Unable to create VCF file at location: " + location); - } + super.close(); } /** @@ -203,28 +166,17 @@ public class StandardVCFWriter implements VCFWriter { * * @param vc the Variant Context object */ + @Override public void add(VariantContext vc) { - add(vc, false); - } - - /** - * add a record to the file - * - * @param vc the Variant Context object - * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD) - */ - public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) { if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString()); + throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName()); if ( doNotWriteGenotypes ) vc = VariantContext.modifyGenotypes(vc, null); try { - vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles); - - // if we are doing on the fly indexing, add the record ***before*** we write any bytes - if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition()); + vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + super.add(vc); Map alleleMap = new HashMap(vc.getAlleles().size()); alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup @@ -317,10 +269,16 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to " + locationString()); + throw new RuntimeException("Unable to write the VCF object to " + getStreamName()); } } + // -------------------------------------------------------------------------------- + // + // implementation functions + // + // -------------------------------------------------------------------------------- + public static final String getFilterString(final VariantContext vc) { return getFilterString(vc, false); } @@ -531,12 +489,11 @@ public class StandardVCFWriter implements VCFWriter { } - public static int countOccurrences(char c, String s) { + private static int countOccurrences(char c, String s) { int count = 0; for (int i = 0; i < s.length(); i++) { count += s.charAt(i) == c ? 1 : 0; } return count; } - } From cd2c511c4ae8a7d13ca6fe3604308ca5fdea5c00 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 23:28:46 -0400 Subject: [PATCH 024/113] GCF improvements -- Support for streaming VCF writing via the VCFWriter interface -- GCF now has a header and a footer. The header is minimal, and contains a forward pointer to the position of the footer in the file. -- Readers now read the header, and then jump to the footer to get the rest of the "header" information -- Version now a field in GCF --- .../broadinstitute/sting/utils/gcf/GCF.java | 69 +++++----- .../sting/utils/gcf/GCFHeader.java | 49 +++++-- .../sting/utils/gcf/GCFWriter.java | 122 ++++++++++++++++++ 3 files changed, 198 insertions(+), 42 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index 5ab241ebf..ef0d9ca42 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -79,8 +79,13 @@ public class GCF { } } - public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException { chromOffset = inputStream.readInt(); + + // have we reached the footer? + if ( chromOffset == GCFHeader.FOOTER_START_MARKER ) + throw new EOFException(); + start = inputStream.readInt(); stop = inputStream.readInt(); id = inputStream.readUTF(); @@ -106,6 +111,32 @@ public class GCF { throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); } + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeInt(chromOffset); + outputStream.writeInt(start); + outputStream.writeInt(stop); + outputStream.writeUTF(id); + outputStream.writeByte(refPad); + writeIntArray(alleleOffsets, outputStream, true); + outputStream.writeFloat(qual); + outputStream.writeUTF(info); + outputStream.writeInt(filterOffset); + + int nGenotypes = genotypes.size(); + int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; + outputStream.writeInt(nGenotypes); + outputStream.writeInt(expectedSizeOfGenotypes); + int obsSizeOfGenotypes = 0; + for ( GCFGenotype g : genotypes ) + obsSizeOfGenotypes += g.write(outputStream); + if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) + throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); + + outputStream.writeInt(RECORD_TERMINATOR); + return outputStream.size() - startSize; + } + public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); @@ -154,31 +185,6 @@ public class GCF { public int getNAlleles() { return alleleOffsets.length; } - public int write(DataOutputStream outputStream) throws IOException { - int startSize = outputStream.size(); - outputStream.writeInt(chromOffset); - outputStream.writeInt(start); - outputStream.writeInt(stop); - outputStream.writeUTF(id); - outputStream.writeByte(refPad); - writeIntArray(alleleOffsets, outputStream, true); - outputStream.writeFloat(qual); - outputStream.writeUTF(info); - outputStream.writeInt(filterOffset); - - int nGenotypes = genotypes.size(); - int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; - outputStream.writeInt(nGenotypes); - outputStream.writeInt(expectedSizeOfGenotypes); - int obsSizeOfGenotypes = 0; - for ( GCFGenotype g : genotypes ) - obsSizeOfGenotypes += g.write(outputStream); - if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) - throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); - - outputStream.writeInt(RECORD_TERMINATOR); - return outputStream.size() - startSize; - } private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) { StringBuilder s = new StringBuilder(); @@ -200,13 +206,14 @@ public class GCF { return s.toString(); } - private final static int BUFFER_SIZE = 1048576; // 2**20 - public static DataOutputStream createOutputStream(final File file) throws FileNotFoundException { - return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + protected final static int BUFFER_SIZE = 1048576; // 2**20 + + public static DataInputStream createDataInputStream(final InputStream stream) { + return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE)); } - public static DataInputStream createInputStream(final File file) throws FileNotFoundException { - return new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE)); + public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException { + return new FileInputStream(file); } protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException { diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java index d0c765cc4..6d96eda56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java @@ -30,9 +30,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; +import java.io.*; import java.util.*; /** @@ -65,25 +63,45 @@ import java.util.*; public class GCFHeader { final protected static Logger logger = Logger.getLogger(GCFHeader.class); - private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); + public final static int GCF_VERSION = 1; + public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes(); + public final static int FOOTER_START_MARKER = -1; + public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version + + final int version; + long footerPosition; final List alleles; final List strings; final List samples; final List> filters; public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + version = GCF_VERSION; + footerPosition = 0; this.alleles = linearize(allelesIn); this.strings = linearize(stringIn); this.samples = linearize(samplesIn); this.filters = null; // not used with this constructor } - public GCFHeader(DataInputStream inputStream) throws IOException { - byte[] headerTest = new byte[MAGIC_HEADER.length]; + public GCFHeader(FileInputStream fileInputStream) throws IOException { + DataInputStream inputStream = new DataInputStream(fileInputStream); + byte[] headerTest = new byte[GCF_FILE_START_MARKER.length]; inputStream.read(headerTest); - if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { - throw new UserException("Could not read GVCF file. MAGIC_HEADER missing. Saw " + headerTest); + if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) { + throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest)); } else { + version = inputStream.readInt(); + logger.info("Read GCF version " + version); + footerPosition = inputStream.readLong(); + logger.info("Read footer position of " + footerPosition); + long lastPos = fileInputStream.getChannel().position(); + logger.info(" Last position is " + lastPos); + + // seek to the footer + fileInputStream.getChannel().position(footerPosition); + if ( inputStream.readInt() != FOOTER_START_MARKER ) + throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker"); alleles = stringsToAlleles(readStrings(inputStream)); strings = readStrings(inputStream); samples = readStrings(inputStream); @@ -91,19 +109,28 @@ public class GCFHeader { logger.info(String.format("String map of %d elements", strings.size())); logger.info(String.format("Sample map of %d elements", samples.size())); filters = initializeFilterCache(); + fileInputStream.getChannel().position(lastPos); } } - public int write(final DataOutputStream outputStream) throws IOException { + public static int writeHeader(final DataOutputStream outputStream) throws IOException { int startBytes = outputStream.size(); - outputStream.write(MAGIC_HEADER); + outputStream.write(GCF_FILE_START_MARKER); + outputStream.writeInt(GCF_VERSION); + outputStream.writeLong(0); + return outputStream.size() - startBytes; + } + + public int writeFooter(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding write(outputStream, allelesToStrings(alleles)); write(outputStream, strings); write(outputStream, samples); return outputStream.size() - startBytes; } - public void write(DataOutputStream outputStream, List l) throws IOException { + private void write(DataOutputStream outputStream, List l) throws IOException { outputStream.writeInt(l.size()); for ( String elt : l ) outputStream.writeUTF(elt); } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java new file mode 100644 index 000000000..7ff6e27a2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * GCFWriter implementing the VCFWriter interface + * @author Your Name + * @since Date created + */ +public class GCFWriter extends IndexingVCFWriter { + final boolean skipGenotypes; + final FileOutputStream fileOutputStream; + final DataOutputStream dataOutputStream; + final GCFHeaderBuilder gcfHeaderBuilder; + int nbytes = 0; + VCFHeader header = null; + File location; + + // -------------------------------------------------------------------------------- + // + // Constructors + // + // -------------------------------------------------------------------------------- + + public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, enableOnTheFlyIndexing); + this.location = location; + this.skipGenotypes = doNotWriteGenotypes; + + // write the output + try { + fileOutputStream = new FileOutputStream(location); + dataOutputStream = createDataOutputStream(fileOutputStream); + gcfHeaderBuilder = new GCFHeaderBuilder(); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(location, e); + } + } + + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override + public void writeHeader(VCFHeader header) { + this.header = header; + try { + nbytes += GCFHeader.writeHeader(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e); + } + } + + @Override + public void add(VariantContext vc) { + super.add(vc); + GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes); + try { + nbytes += gcf.write(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e); + } + } + + @Override + public void close() { + // todo -- write out VCF header lines + GCFHeader gcfHeader = gcfHeaderBuilder.createHeader(); + try { + long headerPosition = nbytes; + nbytes += gcfHeader.writeFooter(dataOutputStream); + dataOutputStream.close(); + //System.out.println("Writing forward reference to " + headerPosition); + + RandomAccessFile raFile = new RandomAccessFile(location, "rw"); + raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET); + raFile.writeLong(headerPosition); + raFile.close(); + } catch ( IOException e ) { + throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e); + } + + super.close(); + } + + private static final DataOutputStream createDataOutputStream(final OutputStream stream) { + return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE)); + } + +} From 59841f82324d543d2943a3bfeb88cbd83f93532e Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 8 Sep 2011 08:41:16 -0400 Subject: [PATCH 025/113] Fixing genotype given alleles for indels. Only take the records that start at this locus. --- ...NPGenotypeLikelihoodsCalculationModel.java | 23 +--------------- .../genotyper/UnifiedGenotyperEngine.java | 26 +++++++++++++++++-- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 477155241..6905ce4a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -58,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; } - public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { - if ( tracker == null || ref == null || logger == null ) - throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger); - VariantContext vc = null; - - // search for usable record - for( final VariantContext vc_input : tracker.getValues(allelesBinding) ) { - if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { - if ( vc == null ) { - vc = vc_input; - } else { - logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record"); - } - } - } - - return vc; - } - public Allele getLikelihoods(RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, @@ -96,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( alternateAlleleToUse != null ) { bestAlternateAllele = alternateAlleleToUse.getBases()[0]; } else if ( useAlleleFromVCF ) { - VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger, UAC.alleles); + VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a variant if ( vc == null ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index b1332bdf9..c558ecfbe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; @@ -36,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -233,7 +235,7 @@ public class UnifiedGenotyperEngine { private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { VariantContext vc; if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles); + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null ) return null; vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase()); @@ -633,7 +635,7 @@ public class UnifiedGenotyperEngine { // no extended event pileup // if we're genotyping given alleles and we have a requested SNP at this position, do SNP if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { - VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger, UAC.alleles); + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); if (vcInput == null) return null; @@ -739,4 +741,24 @@ public class UnifiedGenotyperEngine { return afcm; } + + public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { + if ( tracker == null || ref == null || logger == null ) + throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger); + VariantContext vc = null; + + // search for usable record + for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { + //System.out.println(vc_input); + if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { + if ( vc == null ) { + vc = vc_input; + } else { + logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); + } + } + } + + return vc; + } } From 29c968ab604bb982600ebb9b5a3c6be035a482b1 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 8 Sep 2011 08:42:43 -0400 Subject: [PATCH 026/113] clean up --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index c558ecfbe..87dd37bf6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -749,7 +749,6 @@ public class UnifiedGenotyperEngine { // search for usable record for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { - //System.out.println(vc_input); if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { if ( vc == null ) { vc = vc_input; From 6e6bf796d5f3a39c0dcab76893825a1c6e80d549 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 8 Sep 2011 08:46:38 -0400 Subject: [PATCH 027/113] first version of somatic detector --- .../walkers/cancer/AssignSomaticStatus.java | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java new file mode 100644 index 000000000..d77621b6b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.cancer; + +import net.sf.picard.util.MathUtil; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Assigns somatic status to a set of calls + */ +public class AssignSomaticStatus extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Argument(shortName="t", fullName="tumorsample", required=true, doc="List of tumor samples") + public Set tumorSamplesArg; + + @Argument(shortName="somaticPriorQ", fullName="somaticPriorQ", required=false, doc="Phred-scaled probability that a site is a somatic mutation") + public byte somaticPriorQ = 60; + + @Output + protected VCFWriter vcfWriter = null; + + private final String SOMATIC_TAG_NAME = "SOMATIC"; + private final String SOURCE_NAME = "AssignSomaticStatus"; + + private Set tumorSamples = new HashSet(); + private Set normalSamples = new HashSet(); + + /** + * Parse the familial relationship specification, and initialize VCF writer + */ + public void initialize() { + List rodNames = new ArrayList(); + rodNames.add(variantCollection.variants.getName()); + + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + + // set up tumor and normal samples + for ( final String sample : vcfSamples ) { + if ( tumorSamplesArg.contains(sample) ) + tumorSamples.add(sample); + else + normalSamples.add(sample); + } + logger.info("N tumor samples: " + tumorSamples.size()); + logger.info("N normal samples: " + normalSamples.size()); + if ( tumorSamples.size() != normalSamples.size() ) + logger.warn("Number of tumor samples isn't equal the number of normal samples"); + + Set headerLines = new HashSet(); + headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); + headerLines.add(new VCFFormatHeaderLine(SOMATIC_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the site is a somatic mutation")); + headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + } + + private double log10pNonRefInSamples(final VariantContext vc, final Set samples) { + return log10pSumInSamples(vc, samples, false); + } + + private double log10pRefInSamples(final VariantContext vc, final Set samples) { + return log10pSumInSamples(vc, samples, true); + } + + private double log10pSumInSamples(final VariantContext vc, final Set samples, boolean calcRefP) { + double log10p = 0; + + for ( final String sample : samples ) { + Genotype g = vc.getGenotype(sample); + if ( g.isNoCall() ) { + log10p += 0; + } else { + double[] gLikelihoods = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); + double log10pNonRefSample = Math.log10(calcRefP ? gLikelihoods[0] : 1 - gLikelihoods[0]); + log10p += log10pNonRefSample; + } + } + + return log10p; + } + + private double calcLog10pSomatic(final VariantContext vc) { + // walk over tumors, and calculate pNonRef + double log10pNonRefInTumors = log10pNonRefInSamples(vc, tumorSamples); + double log10pRefInNormals = log10pRefInSamples(vc, normalSamples); + double log10SomaticPrior = MathUtils.phredScaleToLog10Probability(somaticPriorQ); + double log10Somatic = log10SomaticPrior + log10pNonRefInTumors - log10pRefInNormals; + return log10Somatic; + } + + /** + * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype + * + * @param tracker the reference meta-data tracker + * @param ref the reference context + * @param context the alignment context + * @return null + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker != null) { + for ( final VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation()) ) { + double log10pSomatic = calcLog10pSomatic(vc); + + // write in the somatic status probability + Map attrs = new HashMap(); // vc.getAttributes()); + attrs.put(SOMATIC_TAG_NAME, MathUtils.log10ProbabilityToPhredScale(log10pSomatic)); + VariantContext newvc = VariantContext.modifyAttributes(vc, attrs); + + vcfWriter.add(newvc); + } + + return null; + } + + return null; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + @Override + public Integer reduceInit() { + return null; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return null; + } +} From e0020b2b295b0da8b3b58dff9781d6f807653d43 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 8 Sep 2011 08:58:37 -0400 Subject: [PATCH 028/113] Fixing PrintRODs. Now has input and only prints out one copy of each record --- .../sting/gatk/walkers/PrintRODsWalker.java | 10 +++--- .../UnifiedGenotyperIntegrationTest.java | 34 ++++--------------- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java index 84549b13a..7960f5c35 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java @@ -26,21 +26,23 @@ package org.broadinstitute.sting.gatk.walkers; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import java.io.PrintStream; -import java.util.Iterator; /** * Prints out all of the RODs in the input data set. Data is rendered using the toString() method * of the given ROD. */ public class PrintRODsWalker extends RodWalker { + @Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true) + public RodBinding input; + @Output PrintStream out; @@ -62,7 +64,7 @@ public class PrintRODsWalker extends RodWalker { if ( tracker == null ) return 0; - for ( Feature feature : tracker.getValues(Feature.class) ) { + for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) { out.println(feature.toString()); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index da0c8f81f..f0164b7c4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -32,24 +32,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test MultiSample Pilot1", spec); } - // @Test - // todo - currently not working because when calling indels, using GENOTYPE_GIVEN_ALLELES yields a different result than in normal mode. To be fixed when extended events are removed. - public void testMultiSamplePilot2AndRecallingWithAlleles() { - String md5 = "b45636b29891f9df573ad2af6f507ee0"; - - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList(md5)); - List result = executeTest("test MultiSample Pilot2", spec1).getFirst(); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList(md5)); - executeTest("test MultiSample Pilot2 with alleles passed in", spec2); - } - @Test public void testWithAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( @@ -87,15 +69,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test compressed output", spec); } - // todo -- fixme -// @Test -// public void testCompressedOutputParallel() { -// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( -// baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 -nt 4", 1, -// Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); -// executeTest("testCompressedOutput-nt4", spec); -// } - // -------------------------------------------------------------------------------------------------------------- // // testing parallelization @@ -296,6 +269,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, Arrays.asList("94977d6e42e764280e9deaf4e3ac8c80")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("408d3aba4d094c067fc00a43992c2292")); + executeTest("test MultiSample Pilot2 indels with complicated records", spec3); + } From 9cba1019c83167c1d56f7e0d2a4d45b7eee1bbec Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 8 Sep 2011 09:25:13 -0400 Subject: [PATCH 029/113] Another fix for genotype given alleles for indels. Expanding the indel integration tests to include multiallelics and indel records that overlap --- .../genotyper/IndelGenotypeLikelihoodsCalculationModel.java | 2 +- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 41b340058..07f02de57 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -321,7 +321,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood haplotypeMap.clear(); if (getAlleleListFromVCF) { - for( final VariantContext vc_input : tracker.getValues(UAC.alleles) ) { + for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { if( vc_input != null && allowableTypes.contains(vc_input.getType()) && ref.getLocus().getStart() == vc_input.getStart()) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f0164b7c4..185880401 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -271,9 +271,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("408d3aba4d094c067fc00a43992c2292")); + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, + Arrays.asList("e66b7321e2ac91742ad3ef91040daafd")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } From 2636d216dee775999c8a3ff8a66be83cd51bafed Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 8 Sep 2011 10:38:13 -0400 Subject: [PATCH 030/113] Adding indel vqsr integration test --- ...ntRecalibrationWalkersIntegrationTest.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index a5b0412e8..f3fd08cdd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -73,5 +73,53 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(params.cutVCFMD5)); executeTest("testApplyRecalibration-"+params.inVCF, spec); } + + VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", + "6d7ee4cb651c8b666e4a4523363caaff", // tranches + "4759b111a5aa53975d46e0f22c7983bf", // recal file + "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF + + @DataProvider(name = "VRIndelTest") + public Object[][] createData2() { + return new Object[][]{ {indel} }; + } + + @Test(dataProvider = "VRIndelTest") + public void testVariantRecalibratorIndel(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -training:prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + + " -truth:prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " -an QD -an ReadPosRankSum -an HaplotypeScore" + + " -percentBad 0.08" + + " -mode INDEL -mG 3" + + " --minNumBadVariants 0" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") + public void testApplyRecalibrationIndel(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " -mode INDEL" + + " -NO_HEADER" + + " -input " + params.inVCF + + " -o %s" + + " -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); + } } From 7557f4a03a0d4e62d5de121cd44e051f4c74e929 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 8 Sep 2011 11:54:14 -0400 Subject: [PATCH 031/113] AssignSomaticStatus, now with the correct mathematical model --- .../walkers/cancer/AssignSomaticStatus.java | 71 ++++++++++++++----- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java index d77621b6b..389e3d49a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java @@ -60,10 +60,13 @@ public class AssignSomaticStatus extends RodWalker { @Argument(shortName="somaticPriorQ", fullName="somaticPriorQ", required=false, doc="Phred-scaled probability that a site is a somatic mutation") public byte somaticPriorQ = 60; + @Argument(shortName="somaticMinLOD", fullName="somaticMinLOD", required=false, doc="Phred-scaled min probability that a site should be called somatic mutation") + public byte somaticMinLOD = 1; + @Output protected VCFWriter vcfWriter = null; - private final String SOMATIC_TAG_NAME = "SOMATIC"; + private final String SOMATIC_LOD_TAG_NAME = "SOMATIC_LOD"; private final String SOURCE_NAME = "AssignSomaticStatus"; private Set tumorSamples = new HashSet(); @@ -93,43 +96,75 @@ public class AssignSomaticStatus extends RodWalker { Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(SOMATIC_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the site is a somatic mutation")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Is this a confidently called somatic mutation")); + headerLines.add(new VCFFormatHeaderLine(SOMATIC_LOD_TAG_NAME, 1, VCFHeaderLineType.Float, "log10 probability that the site is a somatic mutation")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); } private double log10pNonRefInSamples(final VariantContext vc, final Set samples) { - return log10pSumInSamples(vc, samples, false); - } + double[] log10ps = log10PLFromSamples(vc, samples, false); + return MathUtils.log10sumLog10(log10ps); // product of probs => prod in real space + } private double log10pRefInSamples(final VariantContext vc, final Set samples) { - return log10pSumInSamples(vc, samples, true); + double[] log10ps = log10PLFromSamples(vc, samples, true); + return MathUtils.sum(log10ps); // product is sum } - private double log10pSumInSamples(final VariantContext vc, final Set samples, boolean calcRefP) { - double log10p = 0; + private double[] log10PLFromSamples(final VariantContext vc, final Set samples, boolean calcRefP) { + double[] log10p = new double[samples.size()]; + int i = 0; for ( final String sample : samples ) { Genotype g = vc.getGenotype(sample); - if ( g.isNoCall() ) { - log10p += 0; - } else { + double log10pSample = -1000; + if ( ! g.isNoCall() ) { double[] gLikelihoods = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); - double log10pNonRefSample = Math.log10(calcRefP ? gLikelihoods[0] : 1 - gLikelihoods[0]); - log10p += log10pNonRefSample; + log10pSample = Math.log10(calcRefP ? gLikelihoods[0] : 1 - gLikelihoods[0]); + log10pSample = Double.isInfinite(log10pSample) ? -10000 : log10pSample; } + log10p[i++] = log10pSample; } return log10p; } + /** + * P(somatic | D) + * = P(somatic) * P(D | somatic) + * = P(somatic) * P(D | normals are ref) * P(D | tumors are non-ref) + * + * P(! somatic | D) + * = P(! somatic) * P(D | ! somatic) + * = P(! somatic) * + * * ( P(D | normals are non-ref) * P(D | tumors are non-ref) [germline] + * + P(D | normals are ref) * P(D | tumors are ref)) [no-variant at all] + * + * @param vc + * @return + */ private double calcLog10pSomatic(final VariantContext vc) { - // walk over tumors, and calculate pNonRef + // walk over tumors double log10pNonRefInTumors = log10pNonRefInSamples(vc, tumorSamples); + double log10pRefInTumors = log10pRefInSamples(vc, tumorSamples); + + // walk over normals + double log10pNonRefInNormals = log10pNonRefInSamples(vc, normalSamples); double log10pRefInNormals = log10pRefInSamples(vc, normalSamples); - double log10SomaticPrior = MathUtils.phredScaleToLog10Probability(somaticPriorQ); - double log10Somatic = log10SomaticPrior + log10pNonRefInTumors - log10pRefInNormals; - return log10Somatic; + + // priors + double log10pSomaticPrior = MathUtils.phredScaleToLog10Probability(somaticPriorQ); + double log10pNotSomaticPrior = Math.log10(1 - MathUtils.phredScaleToProbability(somaticPriorQ)); + + double log10pNotSomaticGermline = log10pNonRefInNormals + log10pNonRefInTumors; + double log10pNotSomaticNoVariant = log10pRefInNormals + log10pRefInTumors; + + double log10pNotSomatic = log10pNotSomaticPrior + MathUtils.log10sumLog10(new double[]{log10pNotSomaticGermline, log10pNotSomaticNoVariant}); + double log10pSomatic = log10pSomaticPrior + log10pNonRefInTumors + log10pRefInNormals; + double lod = log10pSomatic - log10pNotSomatic; + + return Double.isInfinite(lod) ? -10000 : lod; } /** @@ -148,7 +183,9 @@ public class AssignSomaticStatus extends RodWalker { // write in the somatic status probability Map attrs = new HashMap(); // vc.getAttributes()); - attrs.put(SOMATIC_TAG_NAME, MathUtils.log10ProbabilityToPhredScale(log10pSomatic)); + attrs.put(SOMATIC_LOD_TAG_NAME, log10pSomatic); + if ( log10pSomatic > somaticMinLOD ) + attrs.put(VCFConstants.SOMATIC_KEY, true); VariantContext newvc = VariantContext.modifyAttributes(vc, attrs); vcfWriter.add(newvc); From 5edc8f8578240ae83f710d87bf631d335df436ee Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 8 Sep 2011 11:54:55 -0400 Subject: [PATCH 032/113] Moved to private package (intended home) --- .../walkers/cancer/AssignSomaticStatus.java | 221 ------------------ 1 file changed, 221 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java deleted file mode 100644 index 389e3d49a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/cancer/AssignSomaticStatus.java +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.cancer; - -import net.sf.picard.util.MathUtil; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * Assigns somatic status to a set of calls - */ -public class AssignSomaticStatus extends RodWalker { - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Argument(shortName="t", fullName="tumorsample", required=true, doc="List of tumor samples") - public Set tumorSamplesArg; - - @Argument(shortName="somaticPriorQ", fullName="somaticPriorQ", required=false, doc="Phred-scaled probability that a site is a somatic mutation") - public byte somaticPriorQ = 60; - - @Argument(shortName="somaticMinLOD", fullName="somaticMinLOD", required=false, doc="Phred-scaled min probability that a site should be called somatic mutation") - public byte somaticMinLOD = 1; - - @Output - protected VCFWriter vcfWriter = null; - - private final String SOMATIC_LOD_TAG_NAME = "SOMATIC_LOD"; - private final String SOURCE_NAME = "AssignSomaticStatus"; - - private Set tumorSamples = new HashSet(); - private Set normalSamples = new HashSet(); - - /** - * Parse the familial relationship specification, and initialize VCF writer - */ - public void initialize() { - List rodNames = new ArrayList(); - rodNames.add(variantCollection.variants.getName()); - - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - - // set up tumor and normal samples - for ( final String sample : vcfSamples ) { - if ( tumorSamplesArg.contains(sample) ) - tumorSamples.add(sample); - else - normalSamples.add(sample); - } - logger.info("N tumor samples: " + tumorSamples.size()); - logger.info("N normal samples: " + normalSamples.size()); - if ( tumorSamples.size() != normalSamples.size() ) - logger.warn("Number of tumor samples isn't equal the number of normal samples"); - - Set headerLines = new HashSet(); - headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Is this a confidently called somatic mutation")); - headerLines.add(new VCFFormatHeaderLine(SOMATIC_LOD_TAG_NAME, 1, VCFHeaderLineType.Float, "log10 probability that the site is a somatic mutation")); - headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); - vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); - } - - private double log10pNonRefInSamples(final VariantContext vc, final Set samples) { - double[] log10ps = log10PLFromSamples(vc, samples, false); - return MathUtils.log10sumLog10(log10ps); // product of probs => prod in real space - } - - private double log10pRefInSamples(final VariantContext vc, final Set samples) { - double[] log10ps = log10PLFromSamples(vc, samples, true); - return MathUtils.sum(log10ps); // product is sum - } - - private double[] log10PLFromSamples(final VariantContext vc, final Set samples, boolean calcRefP) { - double[] log10p = new double[samples.size()]; - - int i = 0; - for ( final String sample : samples ) { - Genotype g = vc.getGenotype(sample); - double log10pSample = -1000; - if ( ! g.isNoCall() ) { - double[] gLikelihoods = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); - log10pSample = Math.log10(calcRefP ? gLikelihoods[0] : 1 - gLikelihoods[0]); - log10pSample = Double.isInfinite(log10pSample) ? -10000 : log10pSample; - } - log10p[i++] = log10pSample; - } - - return log10p; - } - - /** - * P(somatic | D) - * = P(somatic) * P(D | somatic) - * = P(somatic) * P(D | normals are ref) * P(D | tumors are non-ref) - * - * P(! somatic | D) - * = P(! somatic) * P(D | ! somatic) - * = P(! somatic) * - * * ( P(D | normals are non-ref) * P(D | tumors are non-ref) [germline] - * + P(D | normals are ref) * P(D | tumors are ref)) [no-variant at all] - * - * @param vc - * @return - */ - private double calcLog10pSomatic(final VariantContext vc) { - // walk over tumors - double log10pNonRefInTumors = log10pNonRefInSamples(vc, tumorSamples); - double log10pRefInTumors = log10pRefInSamples(vc, tumorSamples); - - // walk over normals - double log10pNonRefInNormals = log10pNonRefInSamples(vc, normalSamples); - double log10pRefInNormals = log10pRefInSamples(vc, normalSamples); - - // priors - double log10pSomaticPrior = MathUtils.phredScaleToLog10Probability(somaticPriorQ); - double log10pNotSomaticPrior = Math.log10(1 - MathUtils.phredScaleToProbability(somaticPriorQ)); - - double log10pNotSomaticGermline = log10pNonRefInNormals + log10pNonRefInTumors; - double log10pNotSomaticNoVariant = log10pRefInNormals + log10pRefInTumors; - - double log10pNotSomatic = log10pNotSomaticPrior + MathUtils.log10sumLog10(new double[]{log10pNotSomaticGermline, log10pNotSomaticNoVariant}); - double log10pSomatic = log10pSomaticPrior + log10pNonRefInTumors + log10pRefInNormals; - double lod = log10pSomatic - log10pNotSomatic; - - return Double.isInfinite(lod) ? -10000 : lod; - } - - /** - * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype - * - * @param tracker the reference meta-data tracker - * @param ref the reference context - * @param context the alignment context - * @return null - */ - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker != null) { - for ( final VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation()) ) { - double log10pSomatic = calcLog10pSomatic(vc); - - // write in the somatic status probability - Map attrs = new HashMap(); // vc.getAttributes()); - attrs.put(SOMATIC_LOD_TAG_NAME, log10pSomatic); - if ( log10pSomatic > somaticMinLOD ) - attrs.put(VCFConstants.SOMATIC_KEY, true); - VariantContext newvc = VariantContext.modifyAttributes(vc, attrs); - - vcfWriter.add(newvc); - } - - return null; - } - - return null; - } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - @Override - public Integer reduceInit() { - return null; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - @Override - public Integer reduce(Integer value, Integer sum) { - return null; - } -} From eaaba6eb5136a3fe5b4f91d9a72077cf1d9fc514 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 8 Sep 2011 13:17:34 -0400 Subject: [PATCH 034/113] Confirming that when stratifying by sample in VE the monomorphic sites for a given sample are not counted for the relevant metrics. Adding integration test to cover it. --- .../VariantEvalIntegrationTest.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 6c4393d6a..699c8fac7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -14,6 +14,26 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; + @Test + public void testStratifySamplesAndExcludeMonomorphicSites() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-ST Sample", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("6a71b17c19f5914c277a99f45f5d9c39") + ); + executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); + } + @Test public void testFundamentalsCountVariantsSNPsAndIndels() { WalkerTestSpec spec = new WalkerTestSpec( From 48461b34afc6af2a545f961ac3563b7b0a602725 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 8 Sep 2011 15:01:13 -0400 Subject: [PATCH 035/113] Added TYPE argument to print out VariantType --- .../sting/gatk/walkers/variantutils/VariantsToTable.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..bf9ff35de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker { getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } }); getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); From 388c9a9c55119dc9b882b4e6f0704ccdb1330037 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 2 Sep 2011 14:34:31 -0400 Subject: [PATCH 036/113] Enable public-only tests. Public-only tests will allow us to check for runtime public -> private dependencies when bamboo updates the github repository (currently, we only check for *compile-time* public -> private dependencies). To compile/run only public tests, append ".public" to the name of an existing test target: ant test.public ant integrationtest.public ant performancetest.public ant pipelinetest.public ant pipelinetestrun.public --- build.xml | 282 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 155 insertions(+), 127 deletions(-) diff --git a/build.xml b/build.xml index 275cb5555..beca6bce0 100644 --- a/build.xml +++ b/build.xml @@ -709,53 +709,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -769,20 +722,116 @@ - - - + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -792,10 +841,10 @@ - + - - + - - + + + + - - + + + + + + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - + + + - + - + + + - + - + + + - + - + + + - + + + + + + - - + + - - + + - - - - - - + + From 367bbee25a6bd73248a9aa2834c5c1fb5e625e83 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Fri, 9 Sep 2011 01:33:25 -0400 Subject: [PATCH 038/113] Fixed typo when printing the contents or last N lines of a file. Thanks to larryns. --- .../org/broadinstitute/sting/queue/engine/FunctionEdge.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala index 68bc7ae61..162ed1b3c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala @@ -154,7 +154,7 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod val maxLines = 100 val tailLines = IOUtils.tail(errorFile, maxLines) val nl = "%n".format() - val summary = if (tailLines.size <= maxLines) "Last %d lines".format(maxLines) else "Contents" + val summary = if (tailLines.size > maxLines) "Last %d lines".format(maxLines) else "Contents" logger.error("%s of %s:%n%s".format(summary, errorFile, tailLines.mkString(nl))) } else { logger.error("Unable to access log file: %s".format(errorFile)) From 6ad8943ca07bc9b89edc8cfb70123c4966f003df Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 9 Sep 2011 09:45:24 -0400 Subject: [PATCH 039/113] CompOverlap no longer keeps track of the number of comp sites since it wasn't (and cannot) keeping track of them correctly. --- .../gatk/walkers/varianteval/evaluators/CompOverlap.java | 6 +----- .../walkers/varianteval/VariantEvalIntegrationTest.java | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 5ccacac37..9facb11b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { @DataPoint(description = "number of eval SNP sites") long nEvalVariants = 0; - @DataPoint(description = "number of comp SNP sites") - long nCompVariants = 0; - @DataPoint(description = "number of eval sites outside of comp sites") long novelSites = 0; @@ -76,9 +73,8 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { boolean evalIsGood = eval != null && eval.isPolymorphic(); - boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType()); + boolean compIsGood = comp != null && comp.isNotFiltered(); - if (compIsGood) nCompVariants++; // count the number of comp events if (evalIsGood) nEvalVariants++; // count the number of eval events if (compIsGood && evalIsGood) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 699c8fac7..f94c20ff6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -291,7 +291,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("462d4784dd55294ef9d5118217b157a5")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("009ecc8376a20dce81ff5299ef6bfecb")); executeTestParallel("testCompOverlap",spec); } @@ -332,13 +332,13 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a3c2177849cb00fdff99574cff7f0e4f")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("0b81d97f843ec4a1a4222d1f9949bfca")); executeTestParallel("testMultipleCompTracks",spec); } @Test public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "dab415cc76846e18fcf8c78f2b2ee033"; + String md5 = "b0565ac61b2860248e4abd478a177b5e"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( From 51eb95d6388d5f62e70d088fe433efda4f87bbe8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 9 Sep 2011 11:46:37 -0400 Subject: [PATCH 041/113] Missed these tests before --- .../walkers/varianteval/VariantEvalIntegrationTest.java | 8 ++++---- .../walkers/variantutils/VCFStreamingIntegrationTest.java | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index f94c20ff6..e992684bc 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -256,7 +256,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("2df4f8911ffc3c8d042298723ed465f8")); + 1, Arrays.asList("f70997b6a3e7fdc89d11e1d61a2463d4")); executeTestParallel("testSelect1", spec); } @@ -273,7 +273,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ed54aa127b173d8ad8b6482f2a929a42")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("407682de41dcf139ea635e9cda21b912")); executeTestParallel("testCompVsEvalAC",spec); } @@ -303,7 +303,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18c44636e36d6657110bf984f8eac181")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("424c9d438b1faa59b2c29413ba32f37b")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -315,7 +315,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("1b8ae4fd10de0888bd843f833859d990")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18fa0b89ebfff51141975d7e4ce7a159")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 3801e132d..00044f859 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("ea09bf764adba9765b99921c5ba2c709") + Arrays.asList("d46a735ffa898f4aa6b3758c5b03f06d") ); executeTest("testVCFStreamingChain", selectTestSpec); From 60a36188453cea7e36785e80f02b7ecbbae1e779 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 9 Sep 2011 11:45:57 -0400 Subject: [PATCH 042/113] Added "alltests" build targets. To run the same set of tests as the bamboo "All Tests" plan (unit tests, integration tests, and pipeline tests): ant alltests To do the same as above on only the public portion of the codebase: ant alltests.public --- build.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/build.xml b/build.xml index beca6bce0..a192ab6f3 100644 --- a/build.xml +++ b/build.xml @@ -896,6 +896,18 @@ + + + + + + + + + + + + From 6bd8a53efd218857a831133e2621b9c4d5dd6375 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 9 Sep 2011 12:04:41 -0400 Subject: [PATCH 043/113] Fix nasty bug involving the build report generation when multiple test targets are specified on the same command line. Ant immutable properties: sometimes your friend, often your enemy. --- build.xml | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/build.xml b/build.xml index a192ab6f3..e5ad9daf0 100644 --- a/build.xml +++ b/build.xml @@ -821,6 +821,7 @@ + @@ -828,10 +829,6 @@ - - - - @@ -841,10 +838,10 @@ - + - - - + + - + @@ -913,7 +910,7 @@ - + @@ -921,7 +918,7 @@ - + @@ -929,7 +926,7 @@ - + @@ -937,7 +934,7 @@ - + @@ -946,24 +943,24 @@ - + - + - + - + - + From 06cb20f2a5fd2681a95613ae0b8b8a53c6002f4b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 12:56:45 -0400 Subject: [PATCH 044/113] Intermediate commit cleaning up scatter intervals -- Adding unit tests to ensure uniformity of intervals --- .../sting/utils/interval/IntervalUtils.java | 57 +- .../utils/interval/IntervalUtilsUnitTest.java | 1032 +++++++++-------- .../queue/extensions/gatk/GATKIntervals.scala | 130 +-- .../gatk/IntervalScatterFunction.scala | 4 +- .../gatk/GATKIntervalsUnitTest.scala | 10 +- 5 files changed, 658 insertions(+), 575 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f551e1368..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -334,24 +334,44 @@ public class IntervalUtils { } /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. + * Splits an interval list into multiple sublists. * @param locs The genome locs to split. * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. + * @return A list of lists of genome locs, split according to splits */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - int fileIndex = 0; + public static List> splitIntervalsToSubLists(List locs, List splits) { int locIndex = 1; int start = 0; + List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { - IntervalList intervalList = new IntervalList(fileHeader); + List curList = new ArrayList(); for (int i = start; i < stop; i++) - intervalList.add(toInterval(locs.get(i), locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); + curList.add(locs.get(i)); start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); } } @@ -361,17 +381,15 @@ public class IntervalUtils { * @param numParts Number of parts to split the locs into. * @return The stop points to split the genome locs. */ - public static List splitFixedIntervals(List locs, int numParts) { + public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - long locsSize = 0; - for (GenomeLoc loc: locs) - locsSize += loc.size(); - List splitPoints = new ArrayList(); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList(); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); Collections.sort(splitPoints); splitPoints.add(locs.size()); - return splitPoints; + return splitIntervalsToSubLists(locs, splitPoints); } private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { @@ -441,4 +459,11 @@ public class IntervalUtils { return merged; } } + + public static final long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bb892eec8..bd6bf9591 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest { private SAMFileHeader hg19Header; private GenomeLocParser hg19GenomeLocParser; private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return locs; + } @BeforeClass public void init() { @@ -54,511 +68,555 @@ public class IntervalUtilsUnitTest extends BaseTest { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); hg19GenomeLocParser = new GenomeLocParser(seq); hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false)); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(hg19Ref,ex); } } - @Test(expectedExceptions=UserException.class) - public void testMergeListsBySetOperatorNoOverlap() { - // a couple of lists we'll use for the testing - List listEveryTwoFromOne = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - else - listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 100); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 0); - } - - @Test - public void testMergeListsBySetOperatorAllOverlap() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 150); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 50); - } - - @Test - public void testMergeListsBySetOperator() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 5 == 0) { - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - } - - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 40); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 20); - } - - @Test - public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); - Assert.assertEquals((long)lengths.get("chr1"), 247249719); - Assert.assertEquals((long)lengths.get("chr2"), 242951149); - Assert.assertEquals((long)lengths.get("chr3"), 199501827); - Assert.assertEquals((long)lengths.get("chr20"), 62435964); - Assert.assertEquals((long)lengths.get("chrX"), 154913754); - } - - private List getLocs(String... intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList(); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; - } - - @Test - public void testParseIntervalArguments() { - Assert.assertEquals(getLocs().size(), 45); - Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); - Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); - } - - @Test - public void testIsIntervalFile() { - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); - - List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); - for (String extension: extensions) { - Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); } } - @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) - public void testMissingIntervalFile() { - IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { +// new IntervalSlicingTest(1, 0); +// new IntervalSlicingTest(2, 0.1); + new IntervalSlicingTest(5, 0.1); +// new IntervalSlicingTest(10, 0.1); +// new IntervalSlicingTest(67, 0.1); +// new IntervalSlicingTest(100, 0.1); +// new IntervalSlicingTest(500, 0.1); +// new IntervalSlicingTest(1000, 0.1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } - @Test - public void testFixedScatterIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + @Test(dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); - List files = testFiles("basic.", 3, ".intervals"); + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; - List locs = getLocs("chr1", "chr2", "chr3"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterFixedIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("less.", 3, ".intervals"); - - List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testSplitFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - IntervalUtils.splitFixedIntervals(locs, files.size()); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - } - @Test - public void testScatterFixedIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs2.get(0), chr1b); - Assert.assertEquals(locs3.get(0), chr2); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs3.get(0), chr2b); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3a); - Assert.assertEquals(locs3.get(0), chr3b); - } - - @Test - public void testScatterFixedIntervalsFile() { - List files = testFiles("sg.", 20, ".intervals"); - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - - int[] counts = { - 125, 138, 287, 291, 312, 105, 155, 324, - 295, 298, 141, 121, 285, 302, 282, 88, - 116, 274, 282, 248 -// 5169, 5573, 10017, 10567, 10551, -// 5087, 4908, 10120, 10435, 10399, -// 5391, 4735, 10621, 10352, 10654, -// 5227, 5256, 10151, 9649, 9825 - }; - - //String splitCounts = ""; - for (int lastIndex = 0, i = 0; i < splits.size(); i++) { - int splitIndex = splits.get(i); - int splitCount = (splitIndex - lastIndex); - //splitCounts += ", " + splitCount; - lastIndex = splitIndex; - Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); } - //System.out.println(splitCounts.substring(2)); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - int locIndex = 0; - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); - for (GenomeLoc parsedLoc: parsedLocs) - Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); - } - Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } - @Test - public void testScatterFixedIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - @Test - public void testScatterContigIntervalsOrder() { - List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr2); - Assert.assertEquals(locs2.get(0), chr1); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - - List files = testFiles("contig_basic.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("contig_less.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterContigIntervalsMoreFiles() { - List files = testFiles("contig_more.", 3, ".intervals"); - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); - } - - @Test - public void testScatterContigIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_start.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs1.get(1), chr1b); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_middle.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 2); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs2.get(1), chr2b); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("contig_split_end.", 3 ,".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3a); - Assert.assertEquals(locs3.get(1), chr3b); - } - - @Test - public void testScatterContigIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - private List testFiles(String prefix, int count, String suffix) { - ArrayList files = new ArrayList(); - for (int i = 1; i <= count; i++) { - files.add(createTempFile(prefix + i, suffix)); - } - return files; - } - - @DataProvider(name="unmergedIntervals") - public Object[][] getUnmergedIntervals() { - return new Object[][] { - new Object[] {"small_unmerged_picard_intervals.list"}, - new Object[] {"small_unmerged_gatk_intervals.list"} - }; - } - - @Test(dataProvider="unmergedIntervals") - public void testUnmergedIntervals(String unmergedIntervals) { - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); - Assert.assertEquals(locs.size(), 2); - - List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); - Assert.assertEquals(merged.size(), 1); - } +// @Test(expectedExceptions=UserException.class) +// public void testMergeListsBySetOperatorNoOverlap() { +// // a couple of lists we'll use for the testing +// List listEveryTwoFromOne = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 2 == 0) +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// else +// listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 100); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 0); +// } +// +// @Test +// public void testMergeListsBySetOperatorAllOverlap() { +// // a couple of lists we'll use for the testing +// List allSites = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 2 == 0) +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 150); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 50); +// } +// +// @Test +// public void testMergeListsBySetOperator() { +// // a couple of lists we'll use for the testing +// List allSites = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 5 == 0) { +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 40); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 20); +// } +// +// @Test +// public void testGetContigLengths() { +// Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); +// Assert.assertEquals((long)lengths.get("chr1"), 247249719); +// Assert.assertEquals((long)lengths.get("chr2"), 242951149); +// Assert.assertEquals((long)lengths.get("chr3"), 199501827); +// Assert.assertEquals((long)lengths.get("chr20"), 62435964); +// Assert.assertEquals((long)lengths.get("chrX"), 154913754); +// } +// +// @Test +// public void testParseIntervalArguments() { +// Assert.assertEquals(getLocs().size(), 45); +// Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); +// Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); +// } +// +// @Test +// public void testIsIntervalFile() { +// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); +// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); +// +// List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); +// for (String extension: extensions) { +// Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); +// } +// } +// +// @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) +// public void testMissingIntervalFile() { +// IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); +// } +// +// @Test +// public void testFixedScatterIntervalsBasic() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// +// List files = testFiles("basic.", 3, ".intervals"); +// +// List locs = getLocs("chr1", "chr2", "chr3"); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsLessFiles() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); +// +// List files = testFiles("less.", 3, ".intervals"); +// +// List locs = getLocs("chr1", "chr2", "chr3", "chr4"); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// Assert.assertEquals(locs3.get(1), chr4); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testSplitFixedIntervalsMoreFiles() { +// List files = testFiles("more.", 3, ".intervals"); +// List locs = getLocs("chr1", "chr2"); +// IntervalUtils.splitFixedIntervals(locs, files.size()); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testScatterFixedIntervalsMoreFiles() { +// List files = testFiles("more.", 3, ".intervals"); +// List locs = getLocs("chr1", "chr2"); +// List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// } +// @Test +// public void testScatterFixedIntervalsStart() { +// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); +// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); +// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1a); +// Assert.assertEquals(locs2.get(0), chr1b); +// Assert.assertEquals(locs3.get(0), chr2); +// Assert.assertEquals(locs3.get(1), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsMiddle() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); +// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2a); +// Assert.assertEquals(locs3.get(0), chr2b); +// Assert.assertEquals(locs3.get(1), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsEnd() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); +// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); +// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 2); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs1.get(1), chr2); +// Assert.assertEquals(locs2.get(0), chr3a); +// Assert.assertEquals(locs3.get(0), chr3b); +// } +// +// @Test +// public void testScatterFixedIntervalsFile() { +// List files = testFiles("sg.", 20, ".intervals"); +// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// +// int[] counts = { +// 125, 138, 287, 291, 312, 105, 155, 324, +// 295, 298, 141, 121, 285, 302, 282, 88, +// 116, 274, 282, 248 +//// 5169, 5573, 10017, 10567, 10551, +//// 5087, 4908, 10120, 10435, 10399, +//// 5391, 4735, 10621, 10352, 10654, +//// 5227, 5256, 10151, 9649, 9825 +// }; +// +// //String splitCounts = ""; +// for (int lastIndex = 0, i = 0; i < splits.size(); i++) { +// int splitIndex = splits.get(i); +// int splitCount = (splitIndex - lastIndex); +// //splitCounts += ", " + splitCount; +// lastIndex = splitIndex; +// Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); +// } +// //System.out.println(splitCounts.substring(2)); +// +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// int locIndex = 0; +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); +// for (GenomeLoc parsedLoc: parsedLocs) +// Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); +// } +// Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); +// } +// +// @Test +// public void testScatterFixedIntervalsMax() { +// List files = testFiles("sg.", 85, ".intervals"); +// List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); +// +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); +// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); +// } +// } +// +// @Test +// public void testScatterContigIntervalsOrder() { +// List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr2); +// Assert.assertEquals(locs2.get(0), chr1); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsBasic() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// +// List files = testFiles("contig_basic.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsLessFiles() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); +// +// List files = testFiles("contig_less.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// Assert.assertEquals(locs3.get(1), chr4); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testScatterContigIntervalsMoreFiles() { +// List files = testFiles("contig_more.", 3, ".intervals"); +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); +// } +// +// @Test +// public void testScatterContigIntervalsStart() { +// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); +// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); +// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("contig_split_start.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 2); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1a); +// Assert.assertEquals(locs1.get(1), chr1b); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsMiddle() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); +// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("contig_split_middle.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 2); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2a); +// Assert.assertEquals(locs2.get(1), chr2b); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsEnd() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); +// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); +// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); +// +// List files = testFiles("contig_split_end.", 3 ,".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3a); +// Assert.assertEquals(locs3.get(1), chr3b); +// } +// +// @Test +// public void testScatterContigIntervalsMax() { +// List files = testFiles("sg.", 85, ".intervals"); +// IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); +// +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); +// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); +// } +// } +// +// private List testFiles(String prefix, int count, String suffix) { +// ArrayList files = new ArrayList(); +// for (int i = 1; i <= count; i++) { +// files.add(createTempFile(prefix + i, suffix)); +// } +// return files; +// } +// +// @DataProvider(name="unmergedIntervals") +// public Object[][] getUnmergedIntervals() { +// return new Object[][] { +// new Object[] {"small_unmerged_picard_intervals.list"}, +// new Object[] {"small_unmerged_gatk_intervals.list"} +// }; +// } +// +// @Test(dataProvider="unmergedIntervals") +// public void testUnmergedIntervals(String unmergedIntervals) { +// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); +// Assert.assertEquals(locs.size(), 2); +// +// List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); +// Assert.assertEquals(merged.size(), 1); +// } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index aae5e438c..0fb997f43 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -1,65 +1,65 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.extensions.gatk - -import java.io.File -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource -import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} - -case class GATKIntervals(reference: File, intervals: List[String]) { - private lazy val referenceDataSource = new ReferenceDataSource(reference) - private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] - - lazy val samFileHeader = { - val header = new SAMFileHeader - header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) - header - } - - lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals, false) - Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) - } - - lazy val contigs = locs.map(_.getContig).distinct.toList - - def getSplits(size: Int) = { - splitsBySize.getOrElse(size, { - val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) - splitsBySize += size -> splits - splits - }) - } -} +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import java.io.File +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource +import net.sf.samtools.SAMFileHeader +import java.util.Collections +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} + +case class GATKIntervals(reference: File, intervals: List[String]) { + private lazy val referenceDataSource = new ReferenceDataSource(reference) +// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] + + lazy val samFileHeader = { + val header = new SAMFileHeader + header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) + header + } + + lazy val locs: java.util.List[GenomeLoc] = { + val parser = new GenomeLocParser(referenceDataSource.getReference) + val parsedLocs = + if (intervals.isEmpty) + GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList + else + IntervalUtils.parseIntervalArguments(parser, intervals, false) + Collections.sort(parsedLocs) + Collections.unmodifiableList(parsedLocs) + } + + lazy val contigs = locs.map(_.getContig).distinct.toList + +// def getSplits(size: Int) = { +// splitsBySize.getOrElse(size, { +// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) +// splitsBySize += size -> splits +// splits +// }) +// } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index d88d272b9..f65d5ab29 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, - gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) + val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b3a2d23ae..38abe24ef 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -53,8 +53,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) - Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) - Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) +// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) +// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } @Test(timeOut = 30000) @@ -65,7 +65,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. - Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -74,8 +74,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) - Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) - Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) +// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) +// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } @Test From 91c949db74c3bc67e02f7bc7ef99d062ed3a0c53 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 9 Sep 2011 12:57:14 -0400 Subject: [PATCH 045/113] Fixing ValidateVariants so that it validates deletion records. Fixing GATKdocs. --- .../gatk/walkers/variantutils/ValidateVariants.java | 5 ----- .../sting/utils/variantcontext/VariantContext.java | 11 ++++++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index 2c7902914..fdfca982c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -56,11 +56,6 @@ import java.util.Set; * A variant set to filter. *

* - *

Output

- *

- * A filtered VCF. - *

- * *

Examples

*
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 699133e38..1c65102ae 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -1085,14 +1085,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
     }
 
     public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
-        // don't validate if we're an insertion or complex event
-        if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) {
-            throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
+        // don't validate if we're a complex event
+        if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) {
+            throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
         }
 
         // we also need to validate the padding base for simple indels
-        if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) )
-            throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), (char)getReferenceBaseForIndel().byteValue(), (char)paddedRefBase.byteValue()));
+        if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) {
+            throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue()));
+        }
     }
 
     public void validateRSIDs(Set rsIDs) {

From 354529bff37626c5f46de89a60987d3d3fd40aec Mon Sep 17 00:00:00 2001
From: Ryan Poplin 
Date: Fri, 9 Sep 2011 13:15:24 -0400
Subject: [PATCH 046/113] adding Validate Variants integration test with a
 deletion

---
 .../ValidateVariantsIntegrationTest.java             | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
index adf3b21a8..3d41be1ae 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
@@ -113,4 +113,16 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
 
         executeTest("test bad alt allele", spec);
     }
+
+    @Test
+    public void testBadAllele2() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+            baseTestString("validationExampleBad3.vcf", "ALLELES"),
+            0,
+            UserException.MalformedFile.class
+        );
+
+        executeTest("test bad alt allele", spec);
+    }
+
 }

From 1953edcd2d1a2292117cb07ff4bb0e48e2605f0e Mon Sep 17 00:00:00 2001
From: Ryan Poplin 
Date: Fri, 9 Sep 2011 13:39:08 -0400
Subject: [PATCH 047/113] updating Validate Variants deletion integration test

---
 .../walkers/variantutils/ValidateVariantsIntegrationTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
index 3d41be1ae..5f71f82fd 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java
@@ -117,12 +117,12 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
     @Test
     public void testBadAllele2() {
         WalkerTestSpec spec = new WalkerTestSpec(
-            baseTestString("validationExampleBad3.vcf", "ALLELES"),
+            baseTestString("validationExampleBad3.vcf", "REF"),
             0,
             UserException.MalformedFile.class
         );
 
-        executeTest("test bad alt allele", spec);
+        executeTest("test bad ref allele in deletion", spec);
     }
 
 }

From 7f9000382e4d9ef30e25739401b72e7d1a7c2fcc Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro 
Date: Fri, 9 Sep 2011 14:09:11 -0400
Subject: [PATCH 048/113] Making indel calls default in the MDCP

You can turn off indel calling by using -noIndels.
---
 .../queue/qscripts/MethodsDevelopmentCallingPipeline.scala  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
index 80bfe03d1..17d614290 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
@@ -22,8 +22,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
   @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
   var noBAQ: Boolean = false
 
-  @Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false)
-  var callIndels: Boolean = false
+  @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false)
+  var noIndels: Boolean = false
 
   @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false)
   var LOCAL_ET: Boolean = false
@@ -165,7 +165,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
     val goldStandard = true
     for (target <- targets) {
       if( !skipCalling ) {
-        if (callIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target))
+        if (!noIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target))
         add(new snpCall(target))
         add(new VQSR(target, !goldStandard))
         add(new applyVQSR(target, !goldStandard))

From 87dc5cfb24a8065a07f0fdec3d09a0943210e4ae Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Fri, 9 Sep 2011 14:23:13 -0400
Subject: [PATCH 049/113] Whitespace cleanup

---
 public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
index b96923589..b66198713 100644
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
@@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
     
     @Override
     public int hashCode() {
-        return (int)( start << 16 + stop << 4 + contigIndex );
+        return start << 16 | stop << 4 | contigIndex;
     }
 
 

From c6436ee5f0f3359912e8210f99828a33680c745c Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Fri, 9 Sep 2011 14:24:29 -0400
Subject: [PATCH 050/113] Whitespace cleanup

---
 public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
index b66198713..ba4919175 100644
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
@@ -307,6 +307,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
     @Override
     public int hashCode() {
         return start << 16 | stop << 4 | contigIndex;
+
     }
 
 

From 3c8445b934c127581919d6be960ebc372be21342 Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Fri, 9 Sep 2011 14:25:37 -0400
Subject: [PATCH 051/113] Performance bugfix for GenomeLoc.hashcode

-- old version overflowed so most GenomeLocs had 0 hashcode.  Now uses or not plus to combine
---
 public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 1 -
 1 file changed, 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
index ba4919175..b66198713 100644
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
@@ -307,7 +307,6 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
     @Override
     public int hashCode() {
         return start << 16 | stop << 4 | contigIndex;
-
     }
 
 

From 72536e5d6db56f495d560f1bcf2536c6896a49c0 Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Fri, 9 Sep 2011 15:44:47 -0400
Subject: [PATCH 052/113] Done

---
 build.xml                                     |    4 +-
 .../sting/utils/interval/IntervalUtils.java   |   70 +-
 .../utils/interval/IntervalUtilsUnitTest.java | 1000 +++++++++--------
 3 files changed, 522 insertions(+), 552 deletions(-)

diff --git a/build.xml b/build.xml
index beca6bce0..efefdd438 100644
--- a/build.xml
+++ b/build.xml
@@ -855,8 +855,8 @@
                 
                 
                 
-
-
+                
+                
                 
                     
                     
diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
index 41cbbe59f..2cfcc19a9 100644
--- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
@@ -333,28 +333,6 @@ public class IntervalUtils {
             throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size()));
     }
 
-    /**
-     * Splits an interval list into multiple sublists.
-     * @param locs The genome locs to split.
-     * @param splits The stop points for the genome locs returned by splitFixedIntervals.
-     * @return A list of lists of genome locs, split according to splits
-     */
-    public static List> splitIntervalsToSubLists(List locs, List splits) {
-        int locIndex = 1;
-        int start = 0;
-        List> sublists = new ArrayList>(splits.size());
-        for (Integer stop: splits) {
-            List curList = new ArrayList();
-            for (int i = start; i < stop; i++)
-                curList.add(locs.get(i));
-            start = stop;
-            sublists.add(curList);
-        }
-
-        return sublists;
-    }
-
-
     /**
      * Splits an interval list into multiple files.
      * @param fileHeader The sam file header.
@@ -384,39 +362,27 @@ public class IntervalUtils {
     public static List> splitFixedIntervals(List locs, int numParts) {
         if (locs.size() < numParts)
             throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
+
         final long locsSize = intervalSize(locs);
-        final List splitPoints = new ArrayList();
-        addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
-        Collections.sort(splitPoints);
-        splitPoints.add(locs.size());
-        return splitIntervalsToSubLists(locs, splitPoints);
-    }
+        final double idealSplitSize = locsSize / numParts;
+        final List> splits = new ArrayList>(numParts);
+        final LinkedList remainingLocs = new LinkedList(locs);
 
-    private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) {
-        if (numParts < 2)
-            return;
-        int halfParts = (numParts + 1) / 2;
-        Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts);
-        int splitIndex = splitPoint.first;
-        long splitSize = splitPoint.second;
-        splitPoints.add(splitIndex);
-        addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts);
-        addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts);
-    }
+        for ( int i = 0; i < numParts; i++ ) {
+            long splitSize = 0;
+            List split = new ArrayList();
+            while ( ! remainingLocs.isEmpty() ) {
+                final GenomeLoc toAdd = remainingLocs.pop();
+                splitSize += toAdd.size();
+                split.add(toAdd);
+                final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size();
+                if ( splitSize + (i % 2 == 0 ? 0 : nextEltSize) > idealSplitSize )
+                    break;
+            }
+            splits.add(split);
+        }
 
-    private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) {
-        int splitIndex = startIndex;
-        long splitSize = 0;
-        for (int i = 0; i < minLocs; i++) {
-            splitSize += locs.get(splitIndex).size();
-            splitIndex++;
-        }
-        long halfSize = locsSize / 2;
-        while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) {
-            splitSize += locs.get(splitIndex).size();
-            splitIndex++;
-        }
-        return new Pair(splitIndex, splitSize);
+        return splits;
     }
 
     /**
diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
index bd6bf9591..4809f1b5c 100644
--- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
@@ -1,6 +1,7 @@
 package org.broadinstitute.sting.utils.interval;
 
 import net.sf.picard.reference.ReferenceSequenceFile;
+import net.sf.picard.util.IntervalUtil;
 import net.sf.samtools.SAMFileHeader;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
@@ -99,19 +100,26 @@ public class IntervalUtilsUnitTest extends BaseTest {
 
     @DataProvider(name = "intervalslicingdata")
     public Object[][] createTrees() {
-//        new IntervalSlicingTest(1, 0);
-//        new IntervalSlicingTest(2, 0.1);
-        new IntervalSlicingTest(5, 0.1);
-//        new IntervalSlicingTest(10, 0.1);
-//        new IntervalSlicingTest(67, 0.1);
-//        new IntervalSlicingTest(100, 0.1);
-//        new IntervalSlicingTest(500, 0.1);
-//        new IntervalSlicingTest(1000, 0.1);
+        new IntervalSlicingTest(1, 0);
+        new IntervalSlicingTest(2, 0.1);
+        new IntervalSlicingTest(3, 0.1);
+        new IntervalSlicingTest(7, 0.1);
+        new IntervalSlicingTest(10, 0.1);
+        new IntervalSlicingTest(31, 0.1);
+        new IntervalSlicingTest(67, 0.1);
+        new IntervalSlicingTest(100, 0.1);
+        new IntervalSlicingTest(127, 0.1);
+        // starts to become a bit less efficiency with larger cuts
+        new IntervalSlicingTest(500, 0.5);
+        new IntervalSlicingTest(1000, 1);
+        new IntervalSlicingTest(10000, 10);
         return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
     }
 
     @Test(dataProvider = "intervalslicingdata")
     public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
+        Set locsSet = new HashSet(hg19exomeIntervals);
+        Set notFoundSet = new HashSet(hg19exomeIntervals);
         List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
 
         long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
@@ -122,501 +130,497 @@ public class IntervalUtilsUnitTest extends BaseTest {
         for ( final List split : splits ) {
             long splitSize = IntervalUtils.intervalSize(split);
             double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize);
-            logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma));
+            //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma));
             counter++;
             sumOfSplitSizes += splitSize;
             Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
+
+            for ( final GenomeLoc loc : split ) {
+                Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs");
+                notFoundSet.remove(loc);
+            }
         }
 
-        Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
+        Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals");
+        Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in the split set");
     }
 
-//    @Test(expectedExceptions=UserException.class)
-//    public void testMergeListsBySetOperatorNoOverlap() {
-//        // a couple of lists we'll use for the testing
-//        List listEveryTwoFromOne = new ArrayList();
-//        List listEveryTwoFromTwo = new ArrayList();
-//
-//        // create the two lists we'll use
-//        for (int x = 1; x < 101; x++) {
-//            if (x % 2 == 0)
-//                listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
-//            else
-//                listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
-//        }
-//
-//        List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION);
-//        Assert.assertEquals(ret.size(), 100);
-//        ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION);
-//        Assert.assertEquals(ret.size(), 0);
-//    }
-//
-//    @Test
-//    public void testMergeListsBySetOperatorAllOverlap() {
-//        // a couple of lists we'll use for the testing
-//        List allSites = new ArrayList();
-//        List listEveryTwoFromTwo = new ArrayList();
-//
-//        // create the two lists we'll use
-//        for (int x = 1; x < 101; x++) {
-//            if (x % 2 == 0)
-//                listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
-//            allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
-//        }
-//
-//        List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION);
-//        Assert.assertEquals(ret.size(), 150);
-//        ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION);
-//        Assert.assertEquals(ret.size(), 50);
-//    }
-//
-//    @Test
-//    public void testMergeListsBySetOperator() {
-//        // a couple of lists we'll use for the testing
-//        List allSites = new ArrayList();
-//        List listEveryTwoFromTwo = new ArrayList();
-//
-//        // create the two lists we'll use
-//        for (int x = 1; x < 101; x++) {
-//            if (x % 5 == 0) {
-//                listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
-//                allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
-//            }
-//        }
-//
-//        List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION);
-//        Assert.assertEquals(ret.size(), 40);
-//        ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION);
-//        Assert.assertEquals(ret.size(), 20);
-//    }
-//
-//    @Test
-//    public void testGetContigLengths() {
-//        Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference));
-//        Assert.assertEquals((long)lengths.get("chr1"), 247249719);
-//        Assert.assertEquals((long)lengths.get("chr2"), 242951149);
-//        Assert.assertEquals((long)lengths.get("chr3"), 199501827);
-//        Assert.assertEquals((long)lengths.get("chr20"), 62435964);
-//        Assert.assertEquals((long)lengths.get("chrX"), 154913754);
-//    }
-//
-//    @Test
-//    public void testParseIntervalArguments() {
-//        Assert.assertEquals(getLocs().size(), 45);
-//        Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3);
-//        Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4);
-//    }
-//
-//    @Test
-//    public void testIsIntervalFile() {
-//        Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list"));
-//        Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true));
-//
-//        List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard");
-//        for (String extension: extensions) {
-//            Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension);
-//        }
-//    }
-//
-//    @Test(expectedExceptions = UserException.CouldNotReadInputFile.class)
-//    public void testMissingIntervalFile() {
-//        IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list");
-//    }
-//
-//    @Test
-//    public void testFixedScatterIntervalsBasic() {
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
-//
-//        List files = testFiles("basic.", 3, ".intervals");
-//
-//        List locs = getLocs("chr1", "chr2", "chr3");
-//        List splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 1);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterFixedIntervalsLessFiles() {
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
-//        GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4");
-//
-//        List files = testFiles("less.", 3, ".intervals");
-//
-//        List locs = getLocs("chr1", "chr2", "chr3", "chr4");
-//        List splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 2);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//        Assert.assertEquals(locs3.get(1), chr4);
-//    }
-//
-//    @Test(expectedExceptions=UserException.BadArgumentValue.class)
-//    public void testSplitFixedIntervalsMoreFiles() {
-//        List files = testFiles("more.", 3, ".intervals");
-//        List locs = getLocs("chr1", "chr2");
-//        IntervalUtils.splitFixedIntervals(locs, files.size());
-//    }
-//
-//    @Test(expectedExceptions=UserException.BadArgumentValue.class)
-//    public void testScatterFixedIntervalsMoreFiles() {
-//        List files = testFiles("more.", 3, ".intervals");
-//        List locs = getLocs("chr1", "chr2");
-//        List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//    }
-//    @Test
-//    public void testScatterFixedIntervalsStart() {
-//        List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
-//        GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2");
-//        GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
-//
-//        List files = testFiles("split.", 3, ".intervals");
-//
-//        List locs = getLocs(intervals);
-//        List splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 2);
-//
-//        Assert.assertEquals(locs1.get(0), chr1a);
-//        Assert.assertEquals(locs2.get(0), chr1b);
-//        Assert.assertEquals(locs3.get(0), chr2);
-//        Assert.assertEquals(locs3.get(1), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterFixedIntervalsMiddle() {
-//        List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
-//        GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2");
-//        GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
-//
-//        List files = testFiles("split.", 3, ".intervals");
-//
-//        List locs = getLocs(intervals);
-//        List splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 2);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2a);
-//        Assert.assertEquals(locs3.get(0), chr2b);
-//        Assert.assertEquals(locs3.get(1), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterFixedIntervalsEnd() {
-//        List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2");
-//        GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2");
-//        GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5");
-//
-//        List files = testFiles("split.", 3, ".intervals");
-//
-//        List locs = getLocs(intervals);
-//        List splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 2);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 1);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs1.get(1), chr2);
-//        Assert.assertEquals(locs2.get(0), chr3a);
-//        Assert.assertEquals(locs3.get(0), chr3b);
-//    }
-//
-//    @Test
-//    public void testScatterFixedIntervalsFile() {
-//        List files = testFiles("sg.", 20, ".intervals");
-//        List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
-//        List splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-//
-//        int[] counts = {
-//                125, 138, 287, 291, 312, 105, 155, 324,
-//                295, 298, 141, 121, 285, 302, 282, 88,
-//                116, 274, 282, 248
-////                5169, 5573, 10017, 10567, 10551,
-////                5087, 4908, 10120, 10435, 10399,
-////                5391, 4735, 10621, 10352, 10654,
-////                5227, 5256, 10151, 9649, 9825
-//        };
-//
-//        //String splitCounts = "";
-//        for (int lastIndex = 0, i = 0; i < splits.size(); i++) {
-//            int splitIndex = splits.get(i);
-//            int splitCount = (splitIndex - lastIndex);
-//            //splitCounts += ", " + splitCount;
-//            lastIndex = splitIndex;
-//            Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
-//        }
-//        //System.out.println(splitCounts.substring(2));
-//
-//        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
-//
-//        int locIndex = 0;
-//        for (int i = 0; i < files.size(); i++) {
-//            String file = files.get(i).toString();
-//            List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false);
-//            Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file);
-//            for (GenomeLoc parsedLoc: parsedLocs)
-//                Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i));
-//        }
-//        Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs");
-//    }
-//
-//    @Test
-//    public void testScatterFixedIntervalsMax() {
-//        List files = testFiles("sg.", 85, ".intervals");
-//        List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
-//        IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files);
-//
-//        for (int i = 0; i < files.size(); i++) {
-//            String file = files.get(i).toString();
-//            List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false);
-//            Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()");
-//            Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()");
-//        }
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsOrder() {
-//        List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2");
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
-//
-//        List files = testFiles("split.", 3, ".intervals");
-//
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 1);
-//
-//        Assert.assertEquals(locs1.get(0), chr2);
-//        Assert.assertEquals(locs2.get(0), chr1);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsBasic() {
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
-//
-//        List files = testFiles("contig_basic.", 3, ".intervals");
-//
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 1);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsLessFiles() {
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
-//        GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4");
-//
-//        List files = testFiles("contig_less.", 3, ".intervals");
-//
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 2);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//        Assert.assertEquals(locs3.get(1), chr4);
-//    }
-//
-//    @Test(expectedExceptions=UserException.BadArgumentValue.class)
-//    public void testScatterContigIntervalsMoreFiles() {
-//        List files = testFiles("contig_more.", 3, ".intervals");
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files);
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsStart() {
-//        List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
-//        GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2");
-//        GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
-//
-//        List files = testFiles("contig_split_start.", 3, ".intervals");
-//
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 2);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 1);
-//
-//        Assert.assertEquals(locs1.get(0), chr1a);
-//        Assert.assertEquals(locs1.get(1), chr1b);
-//        Assert.assertEquals(locs2.get(0), chr2);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsMiddle() {
-//        List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
-//        GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2");
-//        GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5");
-//        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
-//
-//        List files = testFiles("contig_split_middle.", 3, ".intervals");
-//
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 2);
-//        Assert.assertEquals(locs3.size(), 1);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2a);
-//        Assert.assertEquals(locs2.get(1), chr2b);
-//        Assert.assertEquals(locs3.get(0), chr3);
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsEnd() {
-//        List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
-//        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
-//        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2");
-//        GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2");
-//        GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5");
-//
-//        List files = testFiles("contig_split_end.", 3 ,".intervals");
-//
-//        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
-//
-//        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
-//        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
-//        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
-//
-//        Assert.assertEquals(locs1.size(), 1);
-//        Assert.assertEquals(locs2.size(), 1);
-//        Assert.assertEquals(locs3.size(), 2);
-//
-//        Assert.assertEquals(locs1.get(0), chr1);
-//        Assert.assertEquals(locs2.get(0), chr2);
-//        Assert.assertEquals(locs3.get(0), chr3a);
-//        Assert.assertEquals(locs3.get(1), chr3b);
-//    }
-//
-//    @Test
-//    public void testScatterContigIntervalsMax() {
-//        List files = testFiles("sg.", 85, ".intervals");
-//        IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files);
-//
-//        for (int i = 0; i < files.size(); i++) {
-//            String file = files.get(i).toString();
-//            List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false);
-//            Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()");
-//            Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()");
-//        }
-//    }
-//
-//    private List testFiles(String prefix, int count, String suffix) {
-//        ArrayList files = new ArrayList();
-//        for (int i = 1; i <= count; i++) {
-//            files.add(createTempFile(prefix + i, suffix));
-//        }
-//        return files;
-//    }
-//
-//    @DataProvider(name="unmergedIntervals")
-//    public Object[][] getUnmergedIntervals() {
-//        return new Object[][] {
-//                new Object[] {"small_unmerged_picard_intervals.list"},
-//                new Object[] {"small_unmerged_gatk_intervals.list"}
-//        };
-//    }
-//
-//    @Test(dataProvider="unmergedIntervals")
-//    public void testUnmergedIntervals(String unmergedIntervals) {
-//        List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false);
-//        Assert.assertEquals(locs.size(), 2);
-//
-//        List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
-//        Assert.assertEquals(merged.size(), 1);
-//    }
+    @Test(expectedExceptions=UserException.class)
+    public void testMergeListsBySetOperatorNoOverlap() {
+        // a couple of lists we'll use for the testing
+        List listEveryTwoFromOne = new ArrayList();
+        List listEveryTwoFromTwo = new ArrayList();
+
+        // create the two lists we'll use
+        for (int x = 1; x < 101; x++) {
+            if (x % 2 == 0)
+                listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
+            else
+                listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
+        }
+
+        List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION);
+        Assert.assertEquals(ret.size(), 100);
+        ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION);
+        Assert.assertEquals(ret.size(), 0);
+    }
+
+    @Test
+    public void testMergeListsBySetOperatorAllOverlap() {
+        // a couple of lists we'll use for the testing
+        List allSites = new ArrayList();
+        List listEveryTwoFromTwo = new ArrayList();
+
+        // create the two lists we'll use
+        for (int x = 1; x < 101; x++) {
+            if (x % 2 == 0)
+                listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
+            allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
+        }
+
+        List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION);
+        Assert.assertEquals(ret.size(), 150);
+        ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION);
+        Assert.assertEquals(ret.size(), 50);
+    }
+
+    @Test
+    public void testMergeListsBySetOperator() {
+        // a couple of lists we'll use for the testing
+        List allSites = new ArrayList();
+        List listEveryTwoFromTwo = new ArrayList();
+
+        // create the two lists we'll use
+        for (int x = 1; x < 101; x++) {
+            if (x % 5 == 0) {
+                listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
+                allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x));
+            }
+        }
+
+        List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION);
+        Assert.assertEquals(ret.size(), 40);
+        ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION);
+        Assert.assertEquals(ret.size(), 20);
+    }
+
+    @Test
+    public void testGetContigLengths() {
+        Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference));
+        Assert.assertEquals((long)lengths.get("chr1"), 247249719);
+        Assert.assertEquals((long)lengths.get("chr2"), 242951149);
+        Assert.assertEquals((long)lengths.get("chr3"), 199501827);
+        Assert.assertEquals((long)lengths.get("chr20"), 62435964);
+        Assert.assertEquals((long)lengths.get("chrX"), 154913754);
+    }
+
+    @Test
+    public void testParseIntervalArguments() {
+        Assert.assertEquals(getLocs().size(), 45);
+        Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3);
+        Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4);
+    }
+
+    @Test
+    public void testIsIntervalFile() {
+        Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list"));
+        Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true));
+
+        List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard");
+        for (String extension: extensions) {
+            Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension);
+        }
+    }
+
+    @Test(expectedExceptions = UserException.CouldNotReadInputFile.class)
+    public void testMissingIntervalFile() {
+        IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list");
+    }
+
+    @Test
+    public void testFixedScatterIntervalsBasic() {
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
+
+        List files = testFiles("basic.", 3, ".intervals");
+
+        List locs = getLocs("chr1", "chr2", "chr3");
+        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs2.get(0), chr2);
+        Assert.assertEquals(locs3.get(0), chr3);
+    }
+
+    @Test
+    public void testScatterFixedIntervalsLessFiles() {
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
+        GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4");
+
+        List files = testFiles("less.", 3, ".intervals");
+
+        List locs = getLocs("chr1", "chr2", "chr3", "chr4");
+        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 2);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs1.get(1), chr2);
+        Assert.assertEquals(locs2.get(0), chr3);
+        Assert.assertEquals(locs3.get(0), chr4);
+    }
+
+    @Test(expectedExceptions=UserException.BadArgumentValue.class)
+    public void testSplitFixedIntervalsMoreFiles() {
+        List files = testFiles("more.", 3, ".intervals");
+        List locs = getLocs("chr1", "chr2");
+        IntervalUtils.splitFixedIntervals(locs, files.size());
+    }
+
+    @Test(expectedExceptions=UserException.BadArgumentValue.class)
+    public void testScatterFixedIntervalsMoreFiles() {
+        List files = testFiles("more.", 3, ".intervals");
+        List locs = getLocs("chr1", "chr2");
+        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files);
+    }
+    @Test
+    public void testScatterFixedIntervalsStart() {
+        List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
+        GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2");
+        GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
+
+        List files = testFiles("split.", 3, ".intervals");
+
+        List locs = getLocs(intervals);
+        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 2);
+
+        Assert.assertEquals(locs1.get(0), chr1a);
+        Assert.assertEquals(locs2.get(0), chr1b);
+        Assert.assertEquals(locs3.get(0), chr2);
+        Assert.assertEquals(locs3.get(1), chr3);
+    }
+
+    @Test
+    public void testScatterFixedIntervalsMiddle() {
+        List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
+        GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2");
+        GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
+
+        List files = testFiles("split.", 3, ".intervals");
+
+        List locs = getLocs(intervals);
+        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 2);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs2.get(0), chr2a);
+        Assert.assertEquals(locs3.get(0), chr2b);
+        Assert.assertEquals(locs3.get(1), chr3);
+    }
+
+    @Test
+    public void testScatterFixedIntervalsEnd() {
+        List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2");
+        GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2");
+        GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5");
+
+        List files = testFiles("split.", 3, ".intervals");
+
+        List locs = getLocs(intervals);
+        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 2);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs1.get(1), chr2);
+        Assert.assertEquals(locs2.get(0), chr3a);
+        Assert.assertEquals(locs3.get(0), chr3b);
+    }
+
+    @Test
+    public void testScatterFixedIntervalsFile() {
+        List files = testFiles("sg.", 20, ".intervals");
+        List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+
+        int[] counts = {
+                125, 138, 287, 291, 312, 105, 155, 324,
+                295, 298, 141, 121, 285, 302, 282, 88,
+                116, 274, 282, 248
+//                5169, 5573, 10017, 10567, 10551,
+//                5087, 4908, 10120, 10435, 10399,
+//                5391, 4735, 10621, 10352, 10654,
+//                5227, 5256, 10151, 9649, 9825
+        };
+
+        //String splitCounts = "";
+        for (int i = 0; i < splits.size(); i++) {
+            long splitCount = splits.get(i).size();
+            Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
+        }
+        //System.out.println(splitCounts.substring(2));
+
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
+
+        int locIndex = 0;
+        for (int i = 0; i < files.size(); i++) {
+            String file = files.get(i).toString();
+            List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false);
+            Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file);
+            for (GenomeLoc parsedLoc: parsedLocs)
+                Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i));
+        }
+        Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs");
+    }
+
+    @Test
+    public void testScatterFixedIntervalsMax() {
+        List files = testFiles("sg.", 85, ".intervals");
+        IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files);
+
+        for (int i = 0; i < files.size(); i++) {
+            String file = files.get(i).toString();
+            List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false);
+            Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()");
+            Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()");
+        }
+    }
+
+    @Test
+    public void testScatterContigIntervalsOrder() {
+        List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2");
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
+
+        List files = testFiles("split.", 3, ".intervals");
+
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr2);
+        Assert.assertEquals(locs2.get(0), chr1);
+        Assert.assertEquals(locs3.get(0), chr3);
+    }
+
+    @Test
+    public void testScatterContigIntervalsBasic() {
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
+
+        List files = testFiles("contig_basic.", 3, ".intervals");
+
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs2.get(0), chr2);
+        Assert.assertEquals(locs3.get(0), chr3);
+    }
+
+    @Test
+    public void testScatterContigIntervalsLessFiles() {
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
+        GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4");
+
+        List files = testFiles("contig_less.", 3, ".intervals");
+
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 2);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs2.get(0), chr2);
+        Assert.assertEquals(locs3.get(0), chr3);
+        Assert.assertEquals(locs3.get(1), chr4);
+    }
+
+    @Test(expectedExceptions=UserException.BadArgumentValue.class)
+    public void testScatterContigIntervalsMoreFiles() {
+        List files = testFiles("contig_more.", 3, ".intervals");
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files);
+    }
+
+    @Test
+    public void testScatterContigIntervalsStart() {
+        List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
+        GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2");
+        GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
+
+        List files = testFiles("contig_split_start.", 3, ".intervals");
+
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 2);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr1a);
+        Assert.assertEquals(locs1.get(1), chr1b);
+        Assert.assertEquals(locs2.get(0), chr2);
+        Assert.assertEquals(locs3.get(0), chr3);
+    }
+
+    @Test
+    public void testScatterContigIntervalsMiddle() {
+        List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
+        GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2");
+        GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5");
+        GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
+
+        List files = testFiles("contig_split_middle.", 3, ".intervals");
+
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 2);
+        Assert.assertEquals(locs3.size(), 1);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs2.get(0), chr2a);
+        Assert.assertEquals(locs2.get(1), chr2b);
+        Assert.assertEquals(locs3.get(0), chr3);
+    }
+
+    @Test
+    public void testScatterContigIntervalsEnd() {
+        List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
+        GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
+        GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2");
+        GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2");
+        GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5");
+
+        List files = testFiles("contig_split_end.", 3 ,".intervals");
+
+        IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files);
+
+        List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
+        List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
+        List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
+
+        Assert.assertEquals(locs1.size(), 1);
+        Assert.assertEquals(locs2.size(), 1);
+        Assert.assertEquals(locs3.size(), 2);
+
+        Assert.assertEquals(locs1.get(0), chr1);
+        Assert.assertEquals(locs2.get(0), chr2);
+        Assert.assertEquals(locs3.get(0), chr3a);
+        Assert.assertEquals(locs3.get(1), chr3b);
+    }
+
+    @Test
+    public void testScatterContigIntervalsMax() {
+        List files = testFiles("sg.", 85, ".intervals");
+        IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files);
+
+        for (int i = 0; i < files.size(); i++) {
+            String file = files.get(i).toString();
+            List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false);
+            Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()");
+            Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()");
+        }
+    }
+
+    private List testFiles(String prefix, int count, String suffix) {
+        ArrayList files = new ArrayList();
+        for (int i = 1; i <= count; i++) {
+            files.add(createTempFile(prefix + i, suffix));
+        }
+        return files;
+    }
+
+    @DataProvider(name="unmergedIntervals")
+    public Object[][] getUnmergedIntervals() {
+        return new Object[][] {
+                new Object[] {"small_unmerged_picard_intervals.list"},
+                new Object[] {"small_unmerged_gatk_intervals.list"}
+        };
+    }
+
+    @Test(dataProvider="unmergedIntervals")
+    public void testUnmergedIntervals(String unmergedIntervals) {
+        List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false);
+        Assert.assertEquals(locs.size(), 2);
+
+        List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
+        Assert.assertEquals(merged.size(), 1);
+    }
 }

From 9e650dfc17621a76e421184ae33b03c2515f193c Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro 
Date: Fri, 9 Sep 2011 16:25:31 -0400
Subject: [PATCH 053/113] Fixing SelectVariants documentation

getting rid of messages telling users to go for the YAML file. The idea is to not support these anymore.
---
 .../gatk/walkers/variantutils/SelectVariants.java   | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index 35ff66243..018c4dcc2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -145,10 +145,9 @@ import java.util.*;
  *   -R ref.fasta \
  *   -T SelectVariants \
  *   --variant input.vcf \
- *   -o output.vcf \
- *   -SM family.yaml \
  *   -family NA12891+NA12892=NA12878 \
- *   -mvq 50
+ *   -mvq 50 \
+ *   -o violations.vcf
  *
  * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
@@ -265,17 +264,17 @@ public class SelectVariants extends RodWalker {
     private File AF_FILE = new File("");
 
     @Hidden
-    @Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+    @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false)
     private File FAMILY_STRUCTURE_FILE = null;
 
     /**
      * String formatted as dad+mom=child where these parameters determine which sample names are examined.
      */
-    @Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
+    @Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
     private String FAMILY_STRUCTURE = "";
 
     /**
-     * Sample metadata information will be taken from a YAML file (see the -SM argument).
+     * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
      */
     @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
     private Boolean MENDELIAN_VIOLATIONS = false;
@@ -306,7 +305,7 @@ public class SelectVariants extends RodWalker {
 
 
     @Hidden
-    @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+    @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
     private String outMVFile = null;
 
     /* Private class used to store the intermediate variants in the integer random selection process */

From a807205fc3966855a4ab122ac4aa0548e425178b Mon Sep 17 00:00:00 2001
From: Guillermo del Angel 
Date: Fri, 9 Sep 2011 18:00:23 -0400
Subject: [PATCH 054/113] a) Minor optimization to softMax() computation to
 avoid redundant operations, results in about 5-10% increase in speed in indel
 calling. b) Added (but left commented out since it may affect integration
 tests and to isolate commits) fix to per-sample DP reporting, so that
 deletions are included in count. c) Bug fix to avoid having non-reference
 genotypes assigned to samples with PL=0,0,0. Correct behavior should be to
 no-call these samples, and to ignore these samples when computing AC
 distribution since their likelihoods are not informative.

---
 .../genotyper/ExactAFCalculationModel.java    | 84 +++++++++++++------
 ...elGenotypeLikelihoodsCalculationModel.java | 20 ++++-
 .../broadinstitute/sting/utils/MathUtils.java | 54 +++++-------
 3 files changed, 95 insertions(+), 63 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java
index cd006a3cf..6ae437b27 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java
@@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
 
     private boolean SIMPLE_GREEDY_GENOTYPER = false;
 
-
+    private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
 
     final private ExactCalculation calcToUse;
     protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
 
     }
 
-    private static final double[][] getGLs(Map GLs) {
-        double[][] genotypeLikelihoods = new double[GLs.size()+1][];
+    private static final ArrayList getGLs(Map GLs) {
+        ArrayList genotypeLikelihoods = new ArrayList();
 
-        int j = 0;
+        //int j = 0;
+        genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
         for ( Genotype sample : GLs.values() ) {
-            j++;
-
             if ( sample.hasLikelihoods() ) {
                 //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
-                genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
+                double[] gls = sample.getLikelihoods().getAsVector();
+
+                if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
+                    genotypeLikelihoods.add(gls);
             }
         }
 
         return genotypeLikelihoods;
     }
 
+
     // -------------------------------------------------------------------------------------
     //
     // Linearized, ~O(N), implementation.
@@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
     public int linearExact(Map GLs,
                            double[] log10AlleleFrequencyPriors,
                            double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
-        final int numSamples = GLs.size();
+        final ArrayList genotypeLikelihoods = getGLs(GLs);
+        final int numSamples = genotypeLikelihoods.size()-1;
         final int numChr = 2*numSamples;
-        final double[][] genotypeLikelihoods = getGLs(GLs);
 
         final ExactACCache logY = new ExactACCache(numSamples+1);
         logY.getkMinus0()[0] = 0.0; // the zero case
@@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
 
             if ( k == 0 ) { // special case for k = 0
                 for ( int j=1; j <= numSamples; j++ ) {
-                    kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
+                    kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
                 }
             } else { // k > 0
                 final double[] kMinus1 = logY.getkMinus1();
                 final double[] kMinus2 = logY.getkMinus2();
 
                 for ( int j=1; j <= numSamples; j++ ) {
-                    final double[] gl = genotypeLikelihoods[j];
+                    final double[] gl = genotypeLikelihoods.get(j);
                     final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
 
                     double aa = Double.NEGATIVE_INFINITY;
@@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
         if ( !vc.isVariant() )
             throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
 
-        boolean multiAllelicRecord = false;
-
-        if (vc.getAlternateAlleles().size() > 1)
-            multiAllelicRecord = true;
 
         Map GLs = vc.getGenotypes();
         double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
         pathMetricArray[0][0] = 0.0;
 
         // todo = can't deal with optimal dynamic programming solution with multiallelic records
-        if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
+        if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
             sampleIndices.addAll(GLs.keySet());
             sampleIdx = GLs.size();
         }
@@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
                     continue;
 
                 double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
+
+                if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL)     {
+                    //System.out.print(sample.getKey()+":");
+                    //for (int k=0; k < likelihoods.length; k++)
+                    //   System.out.format("%4.2f ",likelihoods[k]);
+                    //System.out.println();
+                    // all likelihoods are essentially the same: skip this sample and will later on force no call.
+                    //sampleIdx++;
+                    continue;
+                }
+
                 sampleIndices.add(sample.getKey());
 
                 for (int k=0; k <= AFofMaxLikelihood; k++) {
@@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
             Genotype g = GLs.get(sample);
             if ( !g.hasLikelihoods() )
                 continue;
-
-            if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
-                bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
-            else {
-                int newIdx = tracebackArray[k][startIdx];
-                bestGTguess = startIdx - newIdx;
-                startIdx = newIdx;
-            }
-
+            // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
+            // and will add no-call genotype to GL's in a second pass
             ArrayList myAlleles = new ArrayList();
 
             double qual = Double.NEGATIVE_INFINITY;
             double[] likelihoods = g.getLikelihoods().getAsVector();
+
+            if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
+                bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
+            }
+            else {
+                int newIdx = tracebackArray[k][startIdx];;
+                bestGTguess = startIdx - newIdx;
+                startIdx = newIdx;
+            }
+
             /*           System.out.format("Sample: %s GL:",sample);
                     for (int i=0; i < likelihoods.length; i++)
-                        System.out.format("%1.4f ",likelihoods[i]);
+                        System.out.format("%1.4f, ",likelihoods[i]);
             */
 
             for (int i=0; i < likelihoods.length; i++) {
@@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
 
         }
 
+        for ( Map.Entry sample : GLs.entrySet() ) {
+
+            if ( !sample.getValue().hasLikelihoods() )
+                continue;
+            Genotype g = GLs.get(sample.getKey());
+
+            double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
+
+            if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
+                continue; // regular likelihoods
+
+            ArrayList myAlleles = new ArrayList();
+
+            double qual = Genotype.NO_NEG_LOG_10PERROR;
+            myAlleles.add(Allele.NO_CALL);
+            myAlleles.add(Allele.NO_CALL);
+            //System.out.println(myAlleles.toString());
+            calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
+        }
         return calls;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index 07f02de57..2a99f1aad 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
 import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.exceptions.StingException;
 import org.broadinstitute.sting.utils.genotype.Haplotype;
@@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
 
             if (pileup != null ) {
                 double[] genotypeLikelihoods;
+
                 if (useOldWrongHorribleHackedUpLikelihoodModel)
                    genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
                 else
                     genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
 
 
-
-                // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
-                // for 3 alleles it's 00 01 11 02 12 22
-                 GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
+                GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
                         alleleList,
                         genotypeLikelihoods,
                         getFilteredDepth(pileup)));
@@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
         return indelLikelihoodMap.get();
     }
 
+    // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
+    // so that per-sample DP will include deletions covering the event.
+    protected int getFilteredDepth(ReadBackedPileup pileup) {
+        int count = 0;
+        for ( PileupElement p : pileup ) {
+            if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) )
+                count++;
+        }
+
+        return count;
+    }
+
 }
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
index f4c057c15..0d85f9606 100644
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@@ -1056,42 +1056,30 @@ public class MathUtils {
     }
 
     static public double softMax(final double x, final double y) {
-         if (Double.isInfinite(x))
-             return y;
+        // we need to compute log10(10^x + 10^y)
+        // By Jacobian logarithm identity, this is equal to
+        // max(x,y) + log10(1+10^-abs(x-y))
+        // we compute the second term as a table lookup
+        // with integer quantization
 
-         if (Double.isInfinite(y))
-             return x;
+        // slow exact version:
+        // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
 
-         if (y >= x + MAX_JACOBIAN_TOLERANCE)
-             return y;
-         if (x >= y + MAX_JACOBIAN_TOLERANCE)
-             return x;
+        double diff = x-y;
 
-         // OK, so |y-x| < tol: we use the following identity then:
-         // we need to compute log10(10^x + 10^y)
-         // By Jacobian logarithm identity, this is equal to
-         // max(x,y) + log10(1+10^-abs(x-y))
-         // we compute the second term as a table lookup
-         // with integer quantization
-
-         //double diff = Math.abs(x-y);
-         double diff = x-y;
-         double t1 =x;
-         if (diff<0) { //
-             t1 = y;
-             diff= -diff;
-         }
-         // t has max(x,y), diff has abs(x-y)
-         // we have pre-stored correction for 0,0.1,0.2,... 10.0
-         //int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP);
-         int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
-         // gdebug+
-         //double z =Math.log10(1+Math.pow(10.0,-diff));
-         //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
-         //gdebug-
-         return t1+jacobianLogTable[ind];
-         // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
-     }
+        if (diff > MAX_JACOBIAN_TOLERANCE)
+            return x;
+        else if (diff < -MAX_JACOBIAN_TOLERANCE)
+            return y;
+        else if (diff >= 0) {
+            int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
+            return x + jacobianLogTable[ind];
+        }
+        else {
+            int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5);
+            return y + jacobianLogTable[ind];
+        }
+    }
 
     public static double phredScaleToProbability (byte q) {
         return Math.pow(10,(-q)/10.0);

From b399424a9cd4c842e8dac0e2e0f9c17ba4002ff4 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel 
Date: Fri, 9 Sep 2011 20:44:47 -0400
Subject: [PATCH 055/113] Fix integration test affected by non-calling all-zero
 PL samples, and add a more complicated multi-sample integration test from a
 phase 1 case, GBR with mixed technologies and complex input alleles

---
 .../genotyper/UnifiedGenotyperIntegrationTest.java     | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index 185880401..e212e07ea 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -18,6 +18,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
 
     private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129;
     private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b36dbSNP129;
+    private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b37dbSNP132;
 
     // --------------------------------------------------------------------------------------------------------------
     //
@@ -28,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     public void testMultiSamplePilot1() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
-                Arrays.asList("149e6ad9b3fd23551254a691286a96b3"));
+                Arrays.asList("4bd3e874d071c4df250dce32cf441aab"));
         executeTest("test MultiSample Pilot1", spec);
     }
 
@@ -276,7 +277,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                 Arrays.asList("e66b7321e2ac91742ad3ef91040daafd"));
         executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
 
+        WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
+                baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
+                        "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
+                Arrays.asList("b6c3f771e8844a64681187ebb2b620f1"));
+        executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);
+
     }
 
 
+
 }

From 9344938360d6d7d1312dc5993023fedd4f540701 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel 
Date: Sat, 10 Sep 2011 19:41:01 -0400
Subject: [PATCH 056/113] Uncomment code to add deleted bases covering an indel
 to per-sample genotype reporting, update integration tests accordingly

---
 ...elGenotypeLikelihoodsCalculationModel.java |  2 +-
 .../UnifiedGenotyperIntegrationTest.java      | 24 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index 2a99f1aad..ec5eefd60 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -449,7 +449,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
     protected int getFilteredDepth(ReadBackedPileup pileup) {
         int count = 0;
         for ( PileupElement p : pileup ) {
-            if (/*p.isDeletion() ||*/ BaseUtils.isRegularBase(p.getBase()) )
+            if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) )
                 count++;
         }
 
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index e212e07ea..41496bdf1 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     public void testMultiSamplePilot1() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
-                Arrays.asList("4bd3e874d071c4df250dce32cf441aab"));
+                Arrays.asList("e6639ea2dc81635c706e6c35921406d7"));
         executeTest("test MultiSample Pilot1", spec);
     }
 
@@ -50,7 +50,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     public void testSingleSamplePilot2() {
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                 baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
-                Arrays.asList("82d469145c174486ccc494884852cc58"));
+                Arrays.asList("d1cbd1fb9f3f7323941a95bc2def7e5a"));
         executeTest("test SingleSample Pilot2", spec);
     }
 
@@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     //
     // --------------------------------------------------------------------------------------------------------------
 
-    private final static String COMPRESSED_OUTPUT_MD5 = "a5a9f38c645d6004d4640765a8b77ce4";
+    private final static String COMPRESSED_OUTPUT_MD5 = "2732b169cdccb21eb3ea00429619de79";
 
     @Test
     public void testCompressedOutput() {
@@ -81,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
 
         // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
 
-        String md5 = "0a45761c0e557d9c2080eb9e7f4f6c41";
+        String md5 = "cbac3960bbcb9d6192c57549208c182c";
 
         WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
                 baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@@ -160,8 +160,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     @Test
     public void testHeterozyosity() {
         HashMap e = new HashMap();
-        e.put( 0.01, "af5199fbc0853cf5888acdcc88f012bc" );
-        e.put( 1.0 / 1850, "4e6938645ccde1fdf204ffbf4e88170f" );
+        e.put( 0.01, "aed69402ddffe7f2ed5ca98563bfba02" );
+        e.put( 1.0 / 1850, "fa94a059f08c1821b721335d93ed2ea5" );
 
         for ( Map.Entry entry : e.entrySet() ) {
             WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@@ -185,7 +185,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                         " -o %s" +
                         " -L 1:10,000,000-10,100,000",
                 1,
-                Arrays.asList("213ebaaaacf850312d885e918eb33500"));
+                Arrays.asList("1c080e6596d4c830bb5d147b04e2a82c"));
 
         executeTest(String.format("test multiple technologies"), spec);
     }
@@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                         " -L 1:10,000,000-10,100,000" +
                         " -baq CALCULATE_AS_NECESSARY",
                 1,
-                Arrays.asList("3aecba34a89f3525afa57a38dc20e6cd"));
+                Arrays.asList("9129ad748ca3be2d3b321d2d7e83ae5b"));
 
         executeTest(String.format("test calling with BAQ"), spec);
     }
@@ -223,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                         " -o %s" +
                         " -L 1:10,000,000-10,500,000",
                 1,
-                Arrays.asList("043973c719a85de29a35a33a674616fb"));
+                Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41"));
 
         executeTest(String.format("test indel caller in SLX"), spec);
     }
@@ -238,7 +238,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                         " -minIndelCnt 1" +
                         " -L 1:10,000,000-10,100,000",
                 1,
-                Arrays.asList("68d4e6c1849e892467aed61c33e7bf24"));
+                Arrays.asList("5fe98ee853586dc9db58f0bc97daea63"));
 
         executeTest(String.format("test indel caller in SLX witn low min allele count"), spec);
     }
@@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                          " -o %s" +
                          " -L 1:10,000,000-10,500,000",
                  1,
-                 Arrays.asList("f86d453c5d2d2f33fb28ae2050658a5e"));
+                 Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae"));
 
          executeTest(String.format("test indel calling, multiple technologies"), spec);
      }
@@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
         WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
                 baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
                         "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
-                Arrays.asList("b6c3f771e8844a64681187ebb2b620f1"));
+                Arrays.asList("4be308fd9e8167ebee677f62a7a753b7"));
         executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);
 
     }

From 07d365ce392bc5c38ad1f3dfc7348819b7017438 Mon Sep 17 00:00:00 2001
From: Ryan Poplin 
Date: Mon, 12 Sep 2011 09:01:34 -0400
Subject: [PATCH 059/113] Fixing units in queue job report Gantt plots

---
 public/R/queueJobReport.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R
index a24d269c9..31916361e 100644
--- a/public/R/queueJobReport.R
+++ b/public/R/queueJobReport.R
@@ -140,6 +140,8 @@ print(paste("Project          :", inputFileName))
 convertUnits <- function(gatkReportData) {
   convertGroup <- function(g) {
     g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
+    g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS
+    g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS
     g
   }
   lapply(gatkReportData, convertGroup)

From 60ebe68aff12290f18527faed04f3f7bb356d962 Mon Sep 17 00:00:00 2001
From: Ryan Poplin 
Date: Mon, 12 Sep 2011 09:43:23 -0400
Subject: [PATCH 060/113] Fixing issue in VariantEval in which insertion and
 deletion events weren't treated symmetrically. Added new option to require
 strict allele matching.

---
 .../gatk/walkers/varianteval/VariantEvalWalker.java | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 0d09b7033..266b97af0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -55,7 +55,7 @@ import java.util.*;
  *
  * 

Output

*

- * Evaluation tables. + * Evaluation tables detailing the results of the eval modules which were applied. *

* *

Examples

@@ -152,6 +152,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; + @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) + private boolean requireStrictAlleleMatch = false; + // Variables private Set jexlExpressions = new TreeSet(); @@ -360,16 +363,16 @@ public class VariantEvalWalker extends RodWalker implements Tr if ( matchingComps.size() == 0 ) return null; - // find the comp which matches the alternate allele from eval + // find the comp which matches both the reference allele and alternate allele from eval Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); for ( VariantContext comp : matchingComps ) { Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); - if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) ) + if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) ) return comp; } - // if none match, just return the first one - return matchingComps.get(0); + // if none match, just return the first one unless we require a strict match + return (requireStrictAlleleMatch ? null : matchingComps.get(0)); } public Integer treeReduce(Integer lhs, Integer rhs) { return null; } From 981b78ea50708139cc3157ccff990fca6cb3e7e8 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 12 Sep 2011 12:17:43 -0400 Subject: [PATCH 061/113] Changing the VQSR command line syntax back to the parsed tags approach. This cleans up the code and makes sure we won't be parsing the same rod file multiple times. I've tried to update the appropriate qscripts. --- .../variantrecalibration/TrainingSet.java | 76 ++++++++++++++++ .../VariantDataManager.java | 88 +++++++++---------- .../VariantRecalibrator.java | 66 ++++---------- ...ntRecalibrationWalkersIntegrationTest.java | 13 ++- .../MethodsDevelopmentCallingPipeline.scala | 10 +-- 5 files changed, 147 insertions(+), 106 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java new file mode 100755 index 000000000..5f688d001 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/12/11 + */ + +public class TrainingSet { + + public RodBinding rodBinding; + public boolean isKnown = false; + public boolean isTraining = false; + public boolean isAntiTraining = false; + public boolean isTruth = false; + public boolean isConsensus = false; + public double prior = 0.0; + + protected final static Logger logger = Logger.getLogger(TrainingSet.class); + + public TrainingSet( final RodBinding rodBinding) { + this.rodBinding = rodBinding; + + final Tags tags = rodBinding.getTags(); + final String name = rodBinding.getName(); + + // Parse the tags to decide which tracks have which properties + if( tags != null ) { + isKnown = tags.containsKey("known") && tags.getValue("known").equals("true"); + isTraining = tags.containsKey("training") && tags.getValue("training").equals("true"); + isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true"); + isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true"); + isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true"); + prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior ); + } + + // Report back to the user which tracks were found and the properties that were detected + if( !isConsensus && !isAntiTraining ) { + logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) ); + } else if( isConsensus ) { + logger.info( String.format( "Found consensus track: %s", name) ); + } else { + logger.info( String.format( "Found bad sites training track: %s", name) ); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 429becfc7..e04bfab76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -51,10 +51,10 @@ public class VariantDataManager { private ExpandingArrayList data; private final double[] meanVector; private final double[] varianceVector; // this is really the standard deviation - public final ArrayList annotationKeys; + public final List annotationKeys; private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); - + protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = null; @@ -62,6 +62,7 @@ public class VariantDataManager { this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; varianceVector = new double[this.annotationKeys.size()]; + trainingSets = new ArrayList(); } public void setData( final ExpandingArrayList data ) { @@ -104,6 +105,31 @@ public class VariantDataManager { } } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } + + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } + + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } + public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); for( final VariantDatum datum : data ) { @@ -232,57 +258,35 @@ public class VariantDataManager { return value; } - public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap rodToPriorMap, - final List> training, final List> truth, final List> known, final List> badSites, final List> resource) { + public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { datum.isKnown = false; datum.atTruthSite = false; datum.atTrainingSite = false; datum.atAntiTrainingSite = false; datum.prior = 2.0; - //BUGBUG: need to clean this up - - for( final RodBinding rod : training ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { + for( final TrainingSet trainingSet : trainingSets ) { + for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.isKnown = datum.isKnown || trainingSet.isKnown; + datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; + datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; + datum.prior = Math.max( datum.prior, trainingSet.prior ); + datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 ); } - } - } - for( final RodBinding rod : truth ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTruthSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : known ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.isKnown = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : resource ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : badSites ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { if( trainVC != null ) { - datum.atAntiTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining; } } } } + private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && + ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + } + public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", @@ -290,10 +294,4 @@ public class VariantDataManager { (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); } } - - private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { - return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && - ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index df4faebd1..529d17285 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -77,16 +77,15 @@ import java.util.*; *

* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * - *

Examples

+ *

Example

*
  * java -Xmx4g -jar GenomeAnalysisTK.jar \
  *   -T VariantRecalibrator \
  *   -R reference/human_g1k_v37.fasta \
  *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
- *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
  *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
  *   -recalFile path/to/output.recal \
  *   -tranchesFile path/to/output.tranches \
@@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker> input;
 
     /**
-     * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
-     */
-    @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
-    public List> training;
-
-    /**
-     * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
-     * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
-     */
-    @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
-    public List> truth;
-
-    /**
-     * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
-     * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
-     */
-    @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
-    public List> known = Collections.emptyList();
-
-    /**
-     * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
-     * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
-     */
-    @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
-    public List> badSites = Collections.emptyList();
-
-    /**
-     * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
+     * Any set of VCF files to use as lists of training, truth, or known sites.
+     * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
+     * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
+     * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
+     * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
      */
     @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
     public List> resource = Collections.emptyList();
@@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet();
     private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
-    private final HashMap rodToPriorMap = new HashMap();
 
     //---------------------------------------------------------------------------------------------------------------
     //
@@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker> allInputBindings = new ArrayList>();
-        allInputBindings.addAll(truth);
-        allInputBindings.addAll(training);
-        allInputBindings.addAll(known);
-        allInputBindings.addAll(badSites);
-        allInputBindings.addAll(resource);
-        for( final RodBinding rod : allInputBindings ) {
-            try {
-                rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
-            } catch( NumberFormatException e ) {
-                throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
-            }
+        for( RodBinding rod : resource ) {
+            dataManager.addTrainingSet( new TrainingSet( rod ) );
+        }
+
+        if( !dataManager.checkHasTrainingSet() ) {
+            throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
+        }
+        if( !dataManager.checkHasTruthSet() ) {
+            throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
         }
     }
 
@@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker= 10) {

From 9d9d438bc4b1804631126565ad15f4a69f649b0b Mon Sep 17 00:00:00 2001
From: David Roazen 
Date: Mon, 12 Sep 2011 12:28:23 -0400
Subject: [PATCH 062/113] New VariantAnnotatorEngine capability: an
 initialize() method for all annotation classes.

All VariantAnnotator annotation classes may now have an (optional) initialize() method
that gets called by the VariantAnnotatorEngine ONCE before annotation starts.

As an example of how this can be used, the SnpEff annotation class will use the initialize()
method to check whether the SnpEff version number stored in the vcf header is a supported
version, and also to verify that its required RodBinding is present.
---
 .../gatk/walkers/annotator/VariantAnnotator.java  |  3 +++
 .../walkers/annotator/VariantAnnotatorEngine.java | 15 +++++++++++----
 .../interfaces/AnnotatorCompatibleWalker.java     |  1 +
 .../interfaces/VariantAnnotatorAnnotation.java    |  9 +++------
 .../gatk/walkers/genotyper/UnifiedGenotyper.java  |  1 +
 5 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 96a400c68..971727727 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -86,6 +86,7 @@ public class VariantAnnotator extends RodWalker implements Ann
 
     @ArgumentCollection
     protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+    public RodBinding getVariantRodBinding() { return variantCollection.variants; }
 
     /**
      * The INFO field will be annotated with information on the most biologically-significant effect
@@ -208,6 +209,8 @@ public class VariantAnnotator extends RodWalker implements Ann
             engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
         engine.initializeExpressions(expressionsToUse);
 
+        engine.invokeAnnotationInitializationMethods();
+
         // setup the header fields
         // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
         Set hInfo = new HashSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index 01926a7f3..17830f129 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -29,10 +29,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.Genotype;
@@ -113,6 +110,16 @@ public class VariantAnnotatorEngine {
             dbAnnotations.put(rod, rod.getName());
     }
 
+    public void invokeAnnotationInitializationMethods() {
+        for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
+            annotation.initialize(walker);
+        }
+
+        for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
+            annotation.initialize(walker);
+        }
+    }
+
     public Set getVCFAnnotationDescriptions() {
 
         Set descriptions = new HashSet();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
index 20a2aea0e..9dda57ae3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
@@ -9,6 +9,7 @@ import java.util.List;
 public interface AnnotatorCompatibleWalker {
 
     // getter methods for various used bindings
+    public abstract RodBinding getVariantRodBinding();
     public abstract RodBinding getSnpEffRodBinding();
     public abstract RodBinding getDbsnpRodBinding();
     public abstract List> getCompRodBindings();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
index f33d61df9..9e48de9c3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
@@ -24,18 +24,15 @@
 
 package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
 
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
 import java.util.List;
-import java.util.Map;
 
 @DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
 public abstract class VariantAnnotatorAnnotation {
     // return the INFO keys
     public abstract List getKeyNames();
+
+    // initialization method (optional for subclasses, and therefore non-abstract)
+    public void initialize ( AnnotatorCompatibleWalker walker ) { }
 }
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index d5dbdedd6..4ee2d5f44 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -127,6 +127,7 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; }
+    public RodBinding getVariantRodBinding() { return null; }
     public RodBinding getSnpEffRodBinding() { return null; }
     public List> getCompRodBindings() { return Collections.emptyList(); }
     public List> getResourceRodBindings() { return Collections.emptyList(); }

From ec4b30de6d8e9634ca0014c65215460bda066b64 Mon Sep 17 00:00:00 2001
From: Eric Banks 
Date: Mon, 12 Sep 2011 14:45:53 -0400
Subject: [PATCH 063/113] Patch from Laurent: typo leads to bad error messages.

---
 .../sting/commandline/ArgumentTypeDescriptor.java               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index 16358d05f..5fff8f609 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                     }
 
                     if ( tribbleType == null )
-                        if ( ! file.canRead() | !! file.isFile() ) {
+                        if ( ! file.canRead() | ! file.isFile() ) {
                             throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
                         } else {
                             throw new UserException.CommandLineException(

From 4e116760f4f8577bfaf677ed7711cfc599a6c90d Mon Sep 17 00:00:00 2001
From: Eric Banks 
Date: Mon, 12 Sep 2011 15:09:25 -0400
Subject: [PATCH 064/113] Removing some old cruft from the packages dir. 
 Updating AnalyzeCovariates to include all Covariates.

---
 public/packages/AnalyzeCovariates.xml         |  5 +---
 .../packages/FindContaminatingReadGroups.xml  | 10 -------
 public/packages/GATKResources.xml             | 20 --------------
 public/packages/IndelGenotyper.xml            | 11 --------
 .../packages/LocalRealignmentAroundIndels.xml | 12 ---------
 .../packages/QualityScoresRecalibration.xml   | 18 -------------
 public/packages/RMDIndexer.xml                | 13 ----------
 public/packages/UnifiedGenotyper.xml          | 11 --------
 public/packages/VariantAnnotator.xml          | 26 -------------------
 public/packages/VariantEval.xml               | 18 -------------
 public/packages/VariantFiltration.xml         | 13 ----------
 public/packages/VariantRecalibration.xml      | 12 ---------
 12 files changed, 1 insertion(+), 168 deletions(-)
 delete mode 100644 public/packages/FindContaminatingReadGroups.xml
 delete mode 100755 public/packages/GATKResources.xml
 delete mode 100644 public/packages/IndelGenotyper.xml
 delete mode 100644 public/packages/LocalRealignmentAroundIndels.xml
 delete mode 100644 public/packages/QualityScoresRecalibration.xml
 delete mode 100644 public/packages/RMDIndexer.xml
 delete mode 100644 public/packages/UnifiedGenotyper.xml
 delete mode 100644 public/packages/VariantAnnotator.xml
 delete mode 100644 public/packages/VariantEval.xml
 delete mode 100644 public/packages/VariantFiltration.xml
 delete mode 100644 public/packages/VariantRecalibration.xml

diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml
index 7e31934df..a6675a63d 100644
--- a/public/packages/AnalyzeCovariates.xml
+++ b/public/packages/AnalyzeCovariates.xml
@@ -6,10 +6,7 @@
     
       
       
-      
-      
-      
-      
+      
     
   
   
diff --git a/public/packages/FindContaminatingReadGroups.xml b/public/packages/FindContaminatingReadGroups.xml
deleted file mode 100644
index 880f64a81..000000000
--- a/public/packages/FindContaminatingReadGroups.xml
+++ /dev/null
@@ -1,10 +0,0 @@
-
-
-  
-    
-    
-    
-      
-    
-  
-
diff --git a/public/packages/GATKResources.xml b/public/packages/GATKResources.xml
deleted file mode 100755
index 87e6e0e50..000000000
--- a/public/packages/GATKResources.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-  
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-   
-
diff --git a/public/packages/IndelGenotyper.xml b/public/packages/IndelGenotyper.xml
deleted file mode 100644
index c9e3ae0f6..000000000
--- a/public/packages/IndelGenotyper.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-    
-  
-
diff --git a/public/packages/LocalRealignmentAroundIndels.xml b/public/packages/LocalRealignmentAroundIndels.xml
deleted file mode 100644
index 46960e69f..000000000
--- a/public/packages/LocalRealignmentAroundIndels.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-    
-  
-
diff --git a/public/packages/QualityScoresRecalibration.xml b/public/packages/QualityScoresRecalibration.xml
deleted file mode 100644
index 95e8b7c63..000000000
--- a/public/packages/QualityScoresRecalibration.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-      
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/RMDIndexer.xml b/public/packages/RMDIndexer.xml
deleted file mode 100644
index 5d40876de..000000000
--- a/public/packages/RMDIndexer.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/UnifiedGenotyper.xml b/public/packages/UnifiedGenotyper.xml
deleted file mode 100644
index 67a17640c..000000000
--- a/public/packages/UnifiedGenotyper.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-    
-  
-
diff --git a/public/packages/VariantAnnotator.xml b/public/packages/VariantAnnotator.xml
deleted file mode 100644
index 88c0701f0..000000000
--- a/public/packages/VariantAnnotator.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/VariantEval.xml b/public/packages/VariantEval.xml
deleted file mode 100644
index 791066fb7..000000000
--- a/public/packages/VariantEval.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-      
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/VariantFiltration.xml b/public/packages/VariantFiltration.xml
deleted file mode 100644
index 48fa0ff37..000000000
--- a/public/packages/VariantFiltration.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-      
-    
-  
-
diff --git a/public/packages/VariantRecalibration.xml b/public/packages/VariantRecalibration.xml
deleted file mode 100644
index 6fe6b1eff..000000000
--- a/public/packages/VariantRecalibration.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-  
-    
-    
-    
-      
-      
-      
-    
-  
-

From e63d9d8f8edeaab24ad3bff06a97cc15b14b0043 Mon Sep 17 00:00:00 2001
From: Matt Hanna 
Date: Mon, 12 Sep 2011 21:50:59 -0400
Subject: [PATCH 065/113] Mauricio pointed out to me that dynamic merging the
 unmapped regions of multiple BAMs ('-L unmapped' with a BAM list) was
 completely broken.  Sorry about this!  Fixed.

---
 .../gatk/datasources/reads/BAMScheduler.java  |  3 +-
 .../gatk/datasources/reads/GATKBAMIndex.java  | 39 +++++++++++++++++++
 .../datasources/reads/ReadShardStrategy.java  | 21 ++--------
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index 467aebac5..47eb55b28 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
 
 import net.sf.picard.util.PeekableIterator;
 import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocSortedSet;
 
@@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator {
             if(currentLocus == GenomeLoc.UNMAPPED) {
                 nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
                 for(SAMReaderID id: dataSource.getReaderIDs())
-                    nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
+                    nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
                 currentLocus = null;
                 continue;
             }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
index 5d0c38b78..dc703ff23 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
@@ -215,6 +215,45 @@ public class GATKBAMIndex {
         return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
     }
 
+    /**
+     * Use to get close to the unmapped reads at the end of a BAM file.
+     * @return The file offset of the first record in the last linear bin, or -1
+     * if there are no elements in linear bins (i.e. no mapped reads).
+     */
+    public long getStartOfLastLinearBin() {
+        openIndexFile();
+
+        seek(4);
+
+        final int sequenceCount = readInteger();
+        // Because no reads may align to the last sequence in the sequence dictionary,
+        // grab the last element of the linear index for each sequence, and return
+        // the last one from the last sequence that has one.
+        long lastLinearIndexPointer = -1;
+        for (int i = 0; i < sequenceCount; i++) {
+            // System.out.println("# Sequence TID: " + i);
+            final int nBins = readInteger();
+            // System.out.println("# nBins: " + nBins);
+            for (int j1 = 0; j1 < nBins; j1++) {
+                // Skip bin #
+                skipBytes(4);
+                final int nChunks = readInteger();
+                // Skip chunks
+                skipBytes(16 * nChunks);
+            }
+            final int nLinearBins = readInteger();
+            if (nLinearBins > 0) {
+                // Skip to last element of list of linear bins
+                skipBytes(8 * (nLinearBins - 1));
+                lastLinearIndexPointer = readLongs(1)[0];
+            }
+        }
+
+        closeIndexFile();
+
+        return lastLinearIndexPointer;
+    }
+
     /**
      * Gets the possible number of bins for a given reference sequence.
      * @return How many bins could possibly be used according to this indexing scheme to index a single contig.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
index c2235ec73..5ea75dbb0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java
@@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
             Map selectedReaders = new HashMap();
             while(selectedReaders.size() == 0 && currentFilePointer != null) {
                 shardPosition = currentFilePointer.fileSpans;
+
                 for(SAMReaderID id: shardPosition.keySet()) {
-                    // If the region contains location information (in other words, it is not at
-                    // the start of the unmapped region), add the region.
-                    if(currentFilePointer.isRegionUnmapped) {
-                        // If the region is unmapped and no location data exists, add a null as an indicator to
-                        // start at the next unmapped region.
-                        if(!isIntoUnmappedRegion) {
-                            selectedReaders.put(id,null);
-                            isIntoUnmappedRegion = true;
-                        }
-                        else
-                            selectedReaders.put(id,position.get(id));
-                    }
-                    else {
-                        SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
-                        if(!fileSpan.isEmpty())
-                            selectedReaders.put(id,fileSpan);
-                    }
+                    SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
+                    if(!fileSpan.isEmpty())
+                        selectedReaders.put(id,fileSpan);
                 }
 
                 if(selectedReaders.size() > 0) {

From 2316b6aad3e81cc0cd88980acd73d716fd4cdb2d Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Mon, 12 Sep 2011 22:02:42 -0400
Subject: [PATCH 066/113] Trying to fix problems with S3 uploading behind
 firewalls

-- Cannot reproduce the very long waits reported by some users.
-- Fixed problem that exception might result in an undeleted file, which is now fixed with deleteOnExit()
---
 .../sting/gatk/phonehome/GATKRunReport.java               | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index 4d94130a8..70307380b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -293,15 +293,16 @@ public class GATKRunReport {
      * That is, postReport() is guarenteed not to fail for any reason.
      */
     private File postReportToLocalDisk(File rootDir) {
+        String filename = getID() + ".report.xml.gz";
+        File file = new File(rootDir, filename);
         try {
-            String filename = getID() + ".report.xml.gz";
-            File file = new File(rootDir, filename);
             postReportToFile(file);
             logger.debug("Wrote report to " + file);
             return file;
         } catch ( Exception e ) {
             // we catch everything, and no matter what eat the error
             exceptDuringRunReport("Couldn't read report file", e);
+            file.delete();
             return null;
         }
     }
@@ -312,6 +313,7 @@ public class GATKRunReport {
         File localFile = postReportToLocalDisk(new File("./"));
         logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
         if ( localFile != null ) { // we succeeded in creating the local file
+            localFile.deleteOnExit();
             try {
                 // stop us from printing the annoying, and meaningless, mime types warning
                 Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
@@ -342,8 +344,6 @@ public class GATKRunReport {
                 exceptDuringRunReport("Couldn't calculate MD5", e);
             } catch ( IOException e ) {
                 exceptDuringRunReport("Couldn't read report file", e);
-            } finally {
-                localFile.delete();
             }
         }
     }

From edf29d0616c576ece9a99af23cd42a54feb83e87 Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Mon, 12 Sep 2011 22:16:52 -0400
Subject: [PATCH 068/113] Explicit info message about uploading S3 log

---
 .../org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index 70307380b..5a7658031 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -338,6 +338,7 @@ public class GATKRunReport {
                 //logger.info("Uploading " + localFile + " to AWS bucket");
                 S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
                 logger.debug("Uploaded to AWS: " + s3Object);
+                logger.info("Uploaded run statistics report to AWS S3");
             } catch ( S3ServiceException e ) {
                 exceptDuringRunReport("S3 exception occurred", e);
             } catch ( NoSuchAlgorithmException e ) {

From c6672f2397fd55de79caa05420fc0ff8201d0e4d Mon Sep 17 00:00:00 2001
From: Guillermo del Angel 
Date: Tue, 13 Sep 2011 16:57:37 -0400
Subject: [PATCH 069/113] Intermediate (but necessary) fix for Beagle walkers:
 if a marker is absent in the Beagle output files, but present in the input
 vcf, there's no reason why it should be omitted in the output vcf. Rather,
 the vc is written as is from the input vcf

---
 .../beagle/BeagleOutputToVCFWalker.java       | 19 +++++++------------
 .../walkers/beagle/BeagleIntegrationTest.java |  4 ++--
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
index 880dba5d0..7f6dabeec 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@@ -175,21 +175,16 @@ public class BeagleOutputToVCFWalker  extends RodWalker {
         }
 
         BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2);
-        // ignore places where we don't have a variant
-        if ( beagleR2Feature == null )
-            return 0;
-
-
         BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs);
-
-        // ignore places where we don't have a variant
-        if ( beagleProbsFeature == null )
-            return 0;
-
         BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased);
+
         // ignore places where we don't have a variant
-        if ( beaglePhasedFeature == null )
-            return 0;
+        if ( beagleR2Feature == null || beagleProbsFeature == null ||  beaglePhasedFeature == null)
+        {
+           vcfWriter.add(vc_input);
+           return 1;
+        }
+
 
         // get reference base for current position
         byte refByte = ref.getBase();
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java
index 5f759fdbf..1a01ef8e8 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java
@@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest {
                         "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
                         "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
                         "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
-                        "-o %s -NO_HEADER", 1, Arrays.asList("3531451e84208264104040993889aaf4"));
+                        "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847"));
         executeTest("test BeagleOutputToVCF", spec);
     }
    
@@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest {
                 "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
                 "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
                 "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
-                "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("8dd6ec53994fb46c5c22af8535d22965"));
+                "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee"));
 
         executeTest("testBeagleChangesSitesToRef",spec);
     }

From 1213b2f8c67bb871c418f13b16728a702c877786 Mon Sep 17 00:00:00 2001
From: David Roazen 
Date: Fri, 9 Sep 2011 16:10:30 -0400
Subject: [PATCH 070/113] SnpEff 2.0.2 support

-Rewrote SnpEff support in VariantAnnotator to support the latest SnpEff release (version 2.0.2)
-Removed support for SnpEff 1.9.6 (and associated tribble codec)
-Will refuse to parse SnpEff output files produced by unsupported versions (or without a version tag)
-Correctly matches ref/alt alleles before annotating a record, unlike the previous version
-Correctly handles indels (again, unlike the previous version
---
 .../sting/gatk/walkers/annotator/SnpEff.java  | 482 ++++++++++++++----
 .../walkers/annotator/VariantAnnotator.java   |   9 +-
 .../annotator/VariantAnnotatorEngine.java     |  12 +-
 .../interfaces/AnnotatorCompatibleWalker.java |   5 +-
 .../VariantAnnotatorAnnotation.java           |   3 +-
 .../walkers/genotyper/UnifiedGenotyper.java   |   5 +-
 .../utils/codecs/snpEff/SnpEffCodec.java      | 282 ----------
 .../utils/codecs/snpEff/SnpEffConstants.java  | 115 -----
 .../utils/codecs/snpEff/SnpEffFeature.java    | 423 ---------------
 .../sting/utils/codecs/vcf/VCFHeader.java     |  12 +
 .../utils/variantcontext/VariantContext.java  |  22 +
 .../walkers/annotator/SnpEffUnitTest.java     |  86 ++++
 .../VariantAnnotatorIntegrationTest.java      |  21 +-
 .../codecs/snpEff/SnpEffCodecUnitTest.java    | 259 ----------
 14 files changed, 539 insertions(+), 1197 deletions(-)
 delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
 delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java
 delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java
 create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java
 delete mode 100644 public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index 350c683c2..14abbca5b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -24,7 +24,9 @@
 
 package org.broadinstitute.sting.gatk.walkers.annotator;
 
+import org.apache.log4j.Logger;
 import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
 import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
@@ -46,134 +45,421 @@ import java.util.*;
  * (http://snpeff.sourceforge.net/).
  *
  * For each variant, chooses one of the effects of highest biological impact from the SnpEff
- * output file (which must be provided on the command line via --snpEffFile:SnpEff ),
+ * output file (which must be provided on the command line via --snpEffFile .vcf),
  * and adds annotations on that effect.
  *
- * The possible biological effects and their associated impacts are defined in the class:
- * org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants
- *
  * @author David Roazen
  */
 public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation {
 
-    // SnpEff annotation key names:
-    public static final String GENE_ID_KEY = "GENE_ID";
-    public static final String GENE_NAME_KEY = "GENE_NAME";
-    public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID";
-    public static final String EXON_ID_KEY = "EXON_ID";
-    public static final String EXON_RANK_KEY = "EXON_RANK";
-    public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE";
-    public static final String EFFECT_KEY = "EFFECT";
-    public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT";
-    public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION";
-    public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA";
-    public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON";
-    public static final String CODON_NUM_KEY = "CODON_NUM";
-    public static final String CDS_SIZE_KEY = "CDS_SIZE";
+    private static Logger logger = Logger.getLogger(SnpEff.class);
+
+    // We refuse to parse SnpEff output files generated by unsupported versions, or
+    // lacking a SnpEff version number in the VCF header:
+    public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
+    public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
+
+    // SnpEff aggregates all effects (and effect metadata) together into a single INFO
+    // field annotation with the key EFF:
+    public static final String SNPEFF_INFO_FIELD_KEY = "EFF";
+    public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]";
+    public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|";
+
+    // Key names for the INFO field annotations we will add to each record, along
+    // with parsing-related information:
+    public enum InfoFieldKey {
+        EFF                   (-1),
+        EFF_IMPACT            (0),
+        EFF_CODON_CHANGE      (1),
+        EFF_AMINO_ACID_CHANGE (2),
+        EFF_GENE_NAME         (3),
+        EFF_GENE_BIOTYPE      (4),
+        EFF_TRANSCRIPT_ID     (6),
+        EFF_EXON_ID           (7);
+
+        // Index within the effect metadata subfields from the SnpEff EFF annotation
+        // where each key's associated value can be found during parsing.
+        private final int fieldIndex;
+
+        InfoFieldKey ( int fieldIndex ) {
+            this.fieldIndex = fieldIndex;
+        }
+
+        public int getFieldIndex() {
+            return fieldIndex;
+        }
+    }
+
+    // Possible SnpEff biological effects. All effect names found in the SnpEff input file
+    // are validated against this list.
+    public enum EffectType {
+        NONE,
+        CHROMOSOME,
+        INTERGENIC,
+        UPSTREAM,
+        UTR_5_PRIME,
+        UTR_5_DELETED,
+        START_GAINED,
+        SPLICE_SITE_ACCEPTOR,
+        SPLICE_SITE_DONOR,
+        START_LOST,
+        SYNONYMOUS_START,
+        NON_SYNONYMOUS_START,
+        CDS,
+        GENE,
+        TRANSCRIPT,
+        EXON,
+        EXON_DELETED,
+        NON_SYNONYMOUS_CODING,
+        SYNONYMOUS_CODING,
+        FRAME_SHIFT,
+        CODON_CHANGE,
+        CODON_INSERTION,
+        CODON_CHANGE_PLUS_CODON_INSERTION,
+        CODON_DELETION,
+        CODON_CHANGE_PLUS_CODON_DELETION,
+        STOP_GAINED,
+        SYNONYMOUS_STOP,
+        NON_SYNONYMOUS_STOP,
+        STOP_LOST,
+        INTRON,
+        UTR_3_PRIME,
+        UTR_3_DELETED,
+        DOWNSTREAM,
+        INTRON_CONSERVED,
+        INTERGENIC_CONSERVED,
+        REGULATION,
+        CUSTOM,
+        WITHIN_NON_CODING_GENE
+    }
+
+    // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact.
+    public enum EffectImpact {
+        LOW       (1),
+        MODERATE  (2),
+        HIGH      (3);
+
+        private final int severityRating;
+
+        EffectImpact ( int severityRating ) {
+            this.severityRating = severityRating;
+        }
+
+        public boolean isHigherImpactThan ( EffectImpact other ) {
+            return this.severityRating > other.severityRating;
+        }
+    }
+
+    // SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information.
+    public enum EffectCoding {
+        CODING,
+        NON_CODING,
+        UNKNOWN
+    }
+
+
+    public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
+        validateRodBinding(walker.getSnpEffRodBinding());
+        checkSnpEffVersion(walker, toolkit);
+    }
 
     public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) {
-        RodBinding snpEffRodBinding = walker.getSnpEffRodBinding();
-        validateRodBinding(snpEffRodBinding);
+        RodBinding snpEffRodBinding = walker.getSnpEffRodBinding();
 
-        List features = tracker.getValues(snpEffRodBinding, ref.getLocus());
+        // Get only SnpEff records that start at this locus, not merely span it:
+        List snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus());
 
-        // Add only annotations for one of the most biologically-significant effects as defined in
-        // the SnpEffConstants class:
-        SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features);
-
-        if ( mostSignificantEffect == null ) {
+        // Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate.
+        // If there is more than one such record, we only need to pick the first one, since the biological
+        // effects will be the same across all such records:
+        VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc);
+        if ( matchingRecord == null ) {
             return null;
         }
 
-        return generateAnnotations(mostSignificantEffect);
+        // Parse the SnpEff INFO field annotation from the matching record into individual effect objects:
+        List effects = parseSnpEffRecord(matchingRecord);
+        if ( effects.size() == 0 ) {
+            return null;
+        }
+
+        // Add only annotations for one of the most biologically-significant effects from this set:
+        SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects);
+        return mostSignificantEffect.getAnnotations();
     }
 
-    private void validateRodBinding ( RodBinding snpEffRodBinding ) {
+    private void validateRodBinding ( RodBinding snpEffRodBinding ) {
         if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) {
-            throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " +
-                                    "as a rodbinding on the command line, but no SnpEff rodbinding was found.");
+            throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " +
+                                    "as a rodbinding on the command line via the --snpEffFile option, but " +
+                                    "no SnpEff rodbinding was found.");
         }
     }
 
-    private SnpEffFeature getMostSignificantEffect ( List snpEffFeatures ) {
-        SnpEffFeature mostSignificantEffect = null;
+    private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) {
+        RodBinding snpEffRodBinding = walker.getSnpEffRodBinding();
 
-        for ( SnpEffFeature snpEffFeature : snpEffFeatures ) {
+        VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName());
+        VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY);
+
+        if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) {
+            throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " +
+                                    "input file, and so could not verify that the file was generated by a supported version of SnpEff (" +
+                                    Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")");
+        }
+
+        String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0];
+
+        if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) {
+            throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " +
+                                    "is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS));
+        }
+    }
+
+    private boolean isSupportedSnpEffVersion ( String versionString ) {
+        for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) {
+            if ( supportedVersion.equals(versionString) ) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) {
+        for ( VariantContext snpEffRecord : snpEffRecords ) {
+            if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) {
+                return snpEffRecord;
+            }
+        }
+
+        return null;
+    }
+
+    private List parseSnpEffRecord ( VariantContext snpEffRecord ) {
+        List parsedEffects = new ArrayList();
+
+        Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
+        List individualEffects;
+
+        // The VCF codec stores multi-valued fields as a List, and single-valued fields as a String.
+        // We can have either in the case of SnpEff, since there may be one or more than one effect in this record.
+        if ( effectFieldValue instanceof List ) {
+            individualEffects = (List)effectFieldValue;
+        }
+        else {
+            individualEffects = Arrays.asList((String)effectFieldValue);
+        }
+
+        for ( String effectString : individualEffects ) {
+            String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);
+
+            if ( effectNameAndMetadata.length != 2 ) {
+                logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s",
+                                          snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
+                continue;
+            }
+
+            String effectName = effectNameAndMetadata[0];
+            String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);
+
+            SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);
+
+            if ( parsedEffect.isWellFormed() ) {
+                parsedEffects.add(parsedEffect);
+            }
+            else {
+                logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
+                                          snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString));
+            }
+        }
+
+        return parsedEffects;
+    }
+
+    private SnpEffEffect getMostSignificantEffect ( List effects ) {
+        SnpEffEffect mostSignificantEffect = null;
+
+        for ( SnpEffEffect effect : effects ) {
             if ( mostSignificantEffect == null ||
-                 snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) {
+                 effect.isHigherImpactThan(mostSignificantEffect) ) {
 
-                mostSignificantEffect = snpEffFeature;
+                mostSignificantEffect = effect;
             }
         }
 
         return mostSignificantEffect;
     }
 
-    private Map generateAnnotations ( SnpEffFeature mostSignificantEffect ) {
-        Map annotations = new LinkedHashMap(Utils.optimumHashSize(getKeyNames().size()));
-
-        if ( mostSignificantEffect.hasGeneID() )
-            annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID());
-        if ( mostSignificantEffect.hasGeneName() )
-            annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName());
-        if ( mostSignificantEffect.hasTranscriptID() )
-            annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID());
-        if ( mostSignificantEffect.hasExonID() )
-            annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID());
-        if ( mostSignificantEffect.hasExonRank() )
-            annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank()));
-        if ( mostSignificantEffect.isNonCodingGene() )
-            annotations.put(WITHIN_NON_CODING_GENE_KEY, null);
-
-        annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString());
-        annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString());
-        if ( mostSignificantEffect.hasEffectExtraInformation() )
-            annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation());
-
-        if ( mostSignificantEffect.hasOldAndNewAA() )
-            annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA());
-        if ( mostSignificantEffect.hasOldAndNewCodon() )
-            annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon());
-        if ( mostSignificantEffect.hasCodonNum() )
-            annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum()));
-        if ( mostSignificantEffect.hasCdsSize() )
-            annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize()));
-
-        return annotations;
-    }
-
     public List getKeyNames() {
-        return Arrays.asList( GENE_ID_KEY,
-                              GENE_NAME_KEY,
-                              TRANSCRIPT_ID_KEY,
-                              EXON_ID_KEY,
-                              EXON_RANK_KEY,
-                              WITHIN_NON_CODING_GENE_KEY,
-                              EFFECT_KEY,
-                              EFFECT_IMPACT_KEY,
-                              EFFECT_EXTRA_INFORMATION_KEY,
-                              OLD_NEW_AA_KEY,
-                              OLD_NEW_CODON_KEY,
-                              CODON_NUM_KEY,
-                              CDS_SIZE_KEY
+        return Arrays.asList( InfoFieldKey.EFF.toString(),
+                              InfoFieldKey.EFF_IMPACT.toString(),
+                              InfoFieldKey.EFF_CODON_CHANGE.toString(),
+                              InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(),
+                              InfoFieldKey.EFF_GENE_NAME.toString(),
+                              InfoFieldKey.EFF_GENE_BIOTYPE.toString(),
+                              InfoFieldKey.EFF_TRANSCRIPT_ID.toString(),
+                              InfoFieldKey.EFF_EXON_ID.toString()
                             );
     }
 
     public List getDescriptions() {
         return Arrays.asList(
-            new VCFInfoHeaderLine(GENE_ID_KEY,                  1, VCFHeaderLineType.String,  "Gene ID for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(GENE_NAME_KEY,                1, VCFHeaderLineType.String,  "Gene name for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY,            1, VCFHeaderLineType.String,  "Transcript ID for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(EXON_ID_KEY,                  1, VCFHeaderLineType.String,  "Exon ID for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(EXON_RANK_KEY,                1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY,   0, VCFHeaderLineType.Flag,    "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
-            new VCFInfoHeaderLine(EFFECT_KEY,                   1, VCFHeaderLineType.String,  "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
-            new VCFInfoHeaderLine(EFFECT_IMPACT_KEY,            1, VCFHeaderLineType.String,  "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
-            new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String,  "Additional information about the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(OLD_NEW_AA_KEY,               1, VCFHeaderLineType.String,  "Old/New amino acid for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(OLD_NEW_CODON_KEY,            1, VCFHeaderLineType.String,  "Old/New codon for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(CODON_NUM_KEY,                1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
-            new VCFInfoHeaderLine(CDS_SIZE_KEY,                 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
+            new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(),                   1, VCFHeaderLineType.String,  "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(),            1, VCFHeaderLineType.String,  "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(),      1, VCFHeaderLineType.String,  "Old/New codon for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String,  "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(),         1, VCFHeaderLineType.String,  "Gene name for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(),      1, VCFHeaderLineType.String,  "Gene biotype for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(),     1, VCFHeaderLineType.String,  "Transcript ID for the highest-impact effect resulting from the current variant"),
+            new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(),           1, VCFHeaderLineType.String,  "Exon ID for the highest-impact effect resulting from the current variant")
         );
     }
+
+    /**
+     * Helper class to parse, validate, and store a single SnpEff effect and its metadata.
+     */
+    protected static class SnpEffEffect {
+        private EffectType effect;
+        private EffectImpact impact;
+        private String codonChange;
+        private String aminoAcidChange;
+        private String geneName;
+        private String geneBiotype;
+        private EffectCoding coding;
+        private String transcriptID;
+        private String exonID;
+
+        private String parseError = null;
+        private boolean isWellFormed = true;
+
+        private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
+        private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
+        private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
+
+        // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
+        // errors come after warnings, not vice versa:
+        private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
+        private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
+
+        private static final int SNPEFF_CODING_FIELD_INDEX = 5;
+
+        public SnpEffEffect ( String effectName, String[] effectMetadata ) {
+            parseEffectName(effectName);
+            parseEffectMetadata(effectMetadata);
+        }
+
+        private void parseEffectName ( String effectName ) {
+            try {
+                effect = EffectType.valueOf(effectName);
+            }
+            catch ( IllegalArgumentException e ) {
+                parseError(String.format("%s is not a recognized effect type", effectName));
+            }
+        }
+
+        private void parseEffectMetadata ( String[] effectMetadata ) {
+            if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
+                if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
+                    parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
+                }
+                else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
+                    parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
+                }
+                else {
+                    parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
+                                             EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length));
+                }
+
+                return;
+            }
+
+            try {
+                impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]);
+            }
+            catch ( IllegalArgumentException e ) {
+                parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]));
+            }
+
+            codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()];
+            aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()];
+            geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()];
+            geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()];
+
+            if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
+                try {
+                    coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
+                }
+                catch ( IllegalArgumentException e ) {
+                    parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX]));
+                }
+            }
+            else {
+                coding = EffectCoding.UNKNOWN;
+            }
+
+            transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()];
+            exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()];
+        }
+
+        private void parseError ( String message ) {
+            isWellFormed = false;
+
+            // Cache only the first error encountered:
+            if ( parseError == null ) {
+                parseError = message;
+            }
+        }
+
+        public boolean isWellFormed() {
+            return isWellFormed;
+        }
+
+        public String getParseError() {
+            return parseError == null ? "" : parseError;
+        }
+
+        public boolean isCoding() {
+            return coding == EffectCoding.CODING;
+        }
+
+        public boolean isHigherImpactThan ( SnpEffEffect other ) {
+            // If one effect is within a coding gene and the other is not, the effect that is
+            // within the coding gene has higher impact:
+
+            if ( isCoding() && ! other.isCoding() ) {
+                return true;
+            }
+            else if ( ! isCoding() && other.isCoding() ) {
+                return false;
+            }
+
+            // Otherwise, both effects are either in or not in a coding gene, so we compare the impacts
+            // of the effects themselves:
+
+            return impact.isHigherImpactThan(other.impact);
+        }
+
+        public Map getAnnotations() {
+            Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length));
+
+            addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString());
+            addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString());
+            addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange);
+            addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange);
+            addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName);
+            addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype);
+            addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID);
+            addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID);
+
+            return annotations;
+        }
+
+        private void addAnnotation ( Map annotations, String keyName, String keyValue ) {
+            // Only add annotations for keys associated with non-empty values:
+            if ( keyValue != null && keyValue.trim().length() > 0 ) {
+                annotations.put(keyName, keyValue);
+            }
+        }
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 971727727..fb3dbc3cf 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.utils.classloader.PluginManager;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@@ -93,8 +92,8 @@ public class VariantAnnotator extends RodWalker implements Ann
      * listed in the SnpEff output file for each variant.
      */
     @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false)
-    public RodBinding snpEffFile;
-    public RodBinding getSnpEffRodBinding() { return snpEffFile; }
+    public RodBinding snpEffFile;
+    public RodBinding getSnpEffRodBinding() { return snpEffFile; }
 
     /**
       * rsIDs from this file are used to populate the ID column of the output.  Also, the DB INFO flag will be set when appropriate.
@@ -204,9 +203,9 @@ public class VariantAnnotator extends RodWalker implements Ann
         }
 
         if ( USE_ALL_ANNOTATIONS )
-            engine = new VariantAnnotatorEngine(this);
+            engine = new VariantAnnotatorEngine(this, getToolkit());
         else
-            engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
+            engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit());
         engine.initializeExpressions(expressionsToUse);
 
         engine.invokeAnnotationInitializationMethods();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index 17830f129..68cd07803 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -26,6 +26,7 @@
 package org.broadinstitute.sting.gatk.walkers.annotator;
 
 import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -46,6 +47,7 @@ public class VariantAnnotatorEngine {
 
     private HashMap, String> dbAnnotations = new HashMap, String>();
     private AnnotatorCompatibleWalker walker;
+    private GenomeAnalysisEngine toolkit;
 
     private static class VAExpression {
 
@@ -71,16 +73,18 @@ public class VariantAnnotatorEngine {
     }
 
     // use this constructor if you want all possible annotations
-    public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) {
+    public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
         this.walker = walker;
+        this.toolkit = toolkit;
         requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations();
         requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations();
         initializeDBs();
     }
 
     // use this constructor if you want to select specific annotations (and/or interfaces)
-    public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker) {
+    public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) {
         this.walker = walker;
+        this.toolkit = toolkit;
         initializeAnnotations(annotationGroupsToUse, annotationsToUse);
         initializeDBs();
     }
@@ -112,11 +116,11 @@ public class VariantAnnotatorEngine {
 
     public void invokeAnnotationInitializationMethods() {
         for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
-            annotation.initialize(walker);
+            annotation.initialize(walker, toolkit);
         }
 
         for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
-            annotation.initialize(walker);
+            annotation.initialize(walker, toolkit);
         }
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
index 9dda57ae3..7200f841b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java
@@ -1,7 +1,6 @@
 package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
 
 import org.broadinstitute.sting.commandline.RodBinding;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
 import java.util.List;
@@ -10,8 +9,8 @@ public interface AnnotatorCompatibleWalker {
 
     // getter methods for various used bindings
     public abstract RodBinding getVariantRodBinding();
-    public abstract RodBinding getSnpEffRodBinding();
+    public abstract RodBinding getSnpEffRodBinding();
     public abstract RodBinding getDbsnpRodBinding();
     public abstract List> getCompRodBindings();
     public abstract List> getResourceRodBindings();
-}
\ No newline at end of file
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
index 9e48de9c3..160a3d258 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java
@@ -24,6 +24,7 @@
 
 package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
 
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
 
 import java.util.List;
@@ -34,5 +35,5 @@ public abstract class VariantAnnotatorAnnotation {
     public abstract List getKeyNames();
 
     // initialization method (optional for subclasses, and therefore non-abstract)
-    public void initialize ( AnnotatorCompatibleWalker walker ) { }
+    public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { }
 }
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index 4ee2d5f44..428f97e2a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
@@ -128,7 +127,7 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; }
     public RodBinding getVariantRodBinding() { return null; }
-    public RodBinding getSnpEffRodBinding() { return null; }
+    public RodBinding getSnpEffRodBinding() { return null; }
     public List> getCompRodBindings() { return Collections.emptyList(); }
     public List> getResourceRodBindings() { return Collections.emptyList(); }
 
@@ -211,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker
- * This format has 23 tab-delimited fields:
- *
- * 
- * Chromosome
- * Position
- * Reference
- * Change
- * Change Type: {SNP, MNP, INS, DEL}
- * Zygosity: {Hom, Het}
- * Quality
- * Coverage
- * Warnings
- * Gene ID
- * Gene Name
- * Bio Type
- * Transcript ID
- * Exon ID
- * Exon Rank
- * Effect
- * Old/New Amino Acid
- * Old/New Codon
- * Codon Num
- * CDS Size
- * Codons Around
- * Amino Acids Around
- * Custom Interval ID
- * 
- * Note that we treat all except the Chromosome, Position, and Effect fields as optional. - *

- * - *

- * See also: @see SNPEff project page - *

- * - * @author David Roazen - * @since 2011 - */ -public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { - - public static final int EXPECTED_NUMBER_OF_FIELDS = 23; - public static final String FIELD_DELIMITER_PATTERN = "\\t"; - public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]"; - public static final String HEADER_LINE_START = "# "; - public static final String[] HEADER_FIELD_NAMES = { "Chromo", - "Position", - "Reference", - "Change", - "Change type", - "Homozygous", - "Quality", - "Coverage", - "Warnings", - "Gene_ID", - "Gene_name", - "Bio_type", - "Trancript_ID", // yes, this is how it's spelled in the SnpEff output - "Exon_ID", - "Exon_Rank", - "Effect", - "old_AA/new_AA", - "Old_codon/New_codon", - "Codon_Num(CDS)", - "CDS_size", - "Codons around", - "AAs around", - "Custom_interval_ID" - }; - - // The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line: - public static final int[] REQUIRED_FIELDS = { 0, 1, 15 }; - - public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE"; - - - public Feature decodeLoc ( String line ) { - return decode(line); - } - - public Feature decode ( String line ) { - String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1); - - if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) { - throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS + - ") number of fields: found " + tokens.length + " fields.", line); - } - - try { - trimAllFields(tokens); - checkForRequiredFields(tokens, line); - - String contig = tokens[0]; - long position = Long.parseLong(tokens[1]); - - String reference = tokens[2].isEmpty() ? null : tokens[2]; - String change = tokens[3].isEmpty() ? null : tokens[3]; - ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]); - Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]); - Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]); - Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]); - String warnings = tokens[8].isEmpty() ? null : tokens[8]; - String geneID = tokens[9].isEmpty() ? null : tokens[9]; - String geneName = tokens[10].isEmpty() ? null : tokens[10]; - String bioType = tokens[11].isEmpty() ? null : tokens[11]; - String transcriptID = tokens[12].isEmpty() ? null : tokens[12]; - String exonID = tokens[13].isEmpty() ? null : tokens[13]; - Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]); - - boolean isNonCodingGene = isNonCodingGene(tokens[15]); - - // Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present, - // otherwise split it into two subfields. We need this limit to prevent the extra effect-related information - // in the final field (when present) from being inappropriately tokenized: - - int effectFieldTokenLimit = isNonCodingGene ? 3 : 2; - String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit); - EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene); - String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene); - - String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16]; - String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17]; - Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]); - Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]); - String codonsAround = tokens[20].isEmpty() ? null : tokens[20]; - String aasAround = tokens[21].isEmpty() ? null : tokens[21]; - String customIntervalID = tokens[22].isEmpty() ? null : tokens[22]; - - return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage, - warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene, - effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize, - codonsAround, aasAround, customIntervalID); - } - catch ( NumberFormatException e ) { - throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line); - } - catch ( IllegalArgumentException e ) { - throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line); - } - } - - private void trimAllFields ( String[] tokens ) { - for ( int i = 0; i < tokens.length; i++ ) { - tokens[i] = tokens[i].trim(); - } - } - - private void checkForRequiredFields ( String[] tokens, String line ) { - for ( int requiredFieldIndex : REQUIRED_FIELDS ) { - if ( tokens[requiredFieldIndex].isEmpty() ) { - throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" + - HEADER_FIELD_NAMES[requiredFieldIndex] + "\"", - line); - } - } - } - - private boolean isNonCodingGene ( String effectField ) { - return effectField.startsWith(NON_CODING_GENE_FLAG); - } - - private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) { - String effectName = ""; - - // If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield, - // otherwise it will be in the first subfield: - - if ( effectFieldTokens.length > 1 && isNonCodingGene ) { - effectName = effectFieldTokens[1].trim(); - } - else { - effectName = effectFieldTokens[0].trim(); - } - - return EffectType.valueOf(effectName); - } - - private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) { - - // The extra effect-related information, if present, will always be the last subfield: - - if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) { - return effectFieldTokens[effectFieldTokens.length - 1].trim(); - } - - return null; - } - - public Class getFeatureType() { - return SnpEffFeature.class; - } - - public Object readHeader ( LineReader reader ) { - String headerLine = ""; - - try { - headerLine = reader.readLine(); - } - catch ( IOException e ) { - throw new TribbleException("Unable to read header line from input file."); - } - - validateHeaderLine(headerLine); - return headerLine; - } - - private void validateHeaderLine ( String headerLine ) { - if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) { - throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START); - } - - String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN); - - if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) { - throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" + - EXPECTED_NUMBER_OF_FIELDS + ") of columns."); - } - - for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) { - if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) { - throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" + - HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" + - headerTokens[columnIndex] + "\""); - } - } - } - - public boolean canDecode ( final File potentialInput ) { - try { - LineReader reader = new AsciiLineReader(new FileInputStream(potentialInput)); - readHeader(reader); - } - catch ( Exception e ) { - return false; - } - - return true; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java deleted file mode 100644 index 270db470f..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -/** - * A set of constants associated with the SnpEff codec. - * - * @author David Roazen - */ -public class SnpEffConstants { - - // Possible SnpEff biological effects and their associated impacts: - public enum EffectType { - START_GAINED (EffectImpact.HIGH), - START_LOST (EffectImpact.HIGH), - EXON_DELETED (EffectImpact.HIGH), - FRAME_SHIFT (EffectImpact.HIGH), - STOP_GAINED (EffectImpact.HIGH), - STOP_LOST (EffectImpact.HIGH), - SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH), - SPLICE_SITE_DONOR (EffectImpact.HIGH), - - NON_SYNONYMOUS_CODING (EffectImpact.MODERATE), - UTR_5_DELETED (EffectImpact.MODERATE), - UTR_3_DELETED (EffectImpact.MODERATE), - CODON_INSERTION (EffectImpact.MODERATE), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE), - CODON_DELETION (EffectImpact.MODERATE), - CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE), - - NONE (EffectImpact.LOW), - CHROMOSOME (EffectImpact.LOW), - INTERGENIC (EffectImpact.LOW), - UPSTREAM (EffectImpact.LOW), - UTR_5_PRIME (EffectImpact.LOW), - SYNONYMOUS_START (EffectImpact.LOW), - NON_SYNONYMOUS_START (EffectImpact.LOW), - CDS (EffectImpact.LOW), - GENE (EffectImpact.LOW), - TRANSCRIPT (EffectImpact.LOW), - EXON (EffectImpact.LOW), - SYNONYMOUS_CODING (EffectImpact.LOW), - CODON_CHANGE (EffectImpact.LOW), - SYNONYMOUS_STOP (EffectImpact.LOW), - NON_SYNONYMOUS_STOP (EffectImpact.LOW), - INTRON (EffectImpact.LOW), - UTR_3_PRIME (EffectImpact.LOW), - DOWNSTREAM (EffectImpact.LOW), - INTRON_CONSERVED (EffectImpact.LOW), - INTERGENIC_CONSERVED (EffectImpact.LOW), - CUSTOM (EffectImpact.LOW); - - private final EffectImpact impact; - - EffectType ( EffectImpact impact ) { - this.impact = impact; - } - - public EffectImpact getImpact() { - return impact; - } - } - - public enum EffectImpact { - LOW (1), - MODERATE (2), - HIGH (3); - - private final int severityRating; - - EffectImpact ( int severityRating ) { - this.severityRating = severityRating; - } - - public boolean isHigherImpactThan ( EffectImpact other ) { - return this.severityRating > other.severityRating; - } - } - - // The kinds of variants supported by the SnpEff output format: - public enum ChangeType { - SNP, - MNP, - INS, - DEL - } - - // Possible zygosities of SnpEff variants: - public enum Zygosity { - Hom, - Het - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java deleted file mode 100644 index 2f120b7d2..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.broad.tribble.Feature; - -import java.util.NoSuchElementException; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -/** - * Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output. - * - * Many fields are optional, and missing values are represented by nulls. You should always call the - * hasX() method before calling the corresponding getX() method. Required fields can never be null - * and do not have a hasX() method. - * - * @author David Roazen - */ -public class SnpEffFeature implements Feature { - - private String contig; // REQUIRED FIELD - private long position; // REQUIRED FIELD - private String reference; - private String change; - private ChangeType changeType; - private Zygosity zygosity; - private Double quality; - private Long coverage; - private String warnings; - private String geneID; - private String geneName; - private String bioType; - private String transcriptID; - private String exonID; - private Integer exonRank; - private boolean isNonCodingGene; // REQUIRED FIELD - private EffectType effect; // REQUIRED FIELD - private String effectExtraInformation; - private String oldAndNewAA; - private String oldAndNewCodon; - private Integer codonNum; - private Integer cdsSize; - private String codonsAround; - private String aasAround; - private String customIntervalID; - - public SnpEffFeature ( String contig, - long position, - String reference, - String change, - ChangeType changeType, - Zygosity zygosity, - Double quality, - Long coverage, - String warnings, - String geneID, - String geneName, - String bioType, - String transcriptID, - String exonID, - Integer exonRank, - boolean isNonCodingGene, - EffectType effect, - String effectExtraInformation, - String oldAndNewAA, - String oldAndNewCodon, - Integer codonNum, - Integer cdsSize, - String codonsAround, - String aasAround, - String customIntervalID ) { - - if ( contig == null || effect == null ) { - throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields"); - } - - this.contig = contig; - this.position = position; - this.reference = reference; - this.change = change; - this.changeType = changeType; - this.zygosity = zygosity; - this.quality = quality; - this.coverage = coverage; - this.warnings = warnings; - this.geneID = geneID; - this.geneName = geneName; - this.bioType = bioType; - this.transcriptID = transcriptID; - this.exonID = exonID; - this.exonRank = exonRank; - this.isNonCodingGene = isNonCodingGene; - this.effect = effect; - this.effectExtraInformation = effectExtraInformation; - this.oldAndNewAA = oldAndNewAA; - this.oldAndNewCodon = oldAndNewCodon; - this.codonNum = codonNum; - this.cdsSize = cdsSize; - this.codonsAround = codonsAround; - this.aasAround = aasAround; - this.customIntervalID = customIntervalID; - } - - public boolean isHigherImpactThan ( SnpEffFeature other ) { - - // If one effect is in a non-coding gene and the other is not, the effect NOT in the - // non-coding gene has higher impact: - - if ( ! isNonCodingGene() && other.isNonCodingGene() ) { - return true; - } - else if ( isNonCodingGene() && ! other.isNonCodingGene() ) { - return false; - } - - // Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts - // of the effects themselves as defined in the SnpEffConstants class: - - return getEffectImpact().isHigherImpactThan(other.getEffectImpact()); - } - - public String getChr() { - return contig; - } - - public int getStart() { - return (int)position; - } - - public int getEnd() { - return (int)position; - } - - public boolean hasReference() { - return reference != null; - } - - public String getReference() { - if ( reference == null ) throw new NoSuchElementException("This feature has no reference field"); - return reference; - } - - public boolean hasChange() { - return change != null; - } - - public String getChange() { - if ( change == null ) throw new NoSuchElementException("This feature has no change field"); - return change; - } - - public boolean hasChangeType() { - return changeType != null; - } - - public ChangeType getChangeType() { - if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field"); - return changeType; - } - - public boolean hasZygosity() { - return zygosity != null; - } - - public Zygosity getZygosity() { - if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field"); - return zygosity; - } - - public boolean hasQuality() { - return quality != null; - } - - public Double getQuality() { - if ( quality == null ) throw new NoSuchElementException("This feature has no quality field"); - return quality; - } - - public boolean hasCoverage() { - return coverage != null; - } - - public Long getCoverage() { - if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field"); - return coverage; - } - - public boolean hasWarnings() { - return warnings != null; - } - - public String getWarnings() { - if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field"); - return warnings; - } - - public boolean hasGeneID() { - return geneID != null; - } - - public String getGeneID() { - if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field"); - return geneID; - } - - public boolean hasGeneName() { - return geneName != null; - } - - public String getGeneName() { - if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field"); - return geneName; - } - - public boolean hasBioType() { - return bioType != null; - } - - public String getBioType() { - if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field"); - return bioType; - } - - public boolean hasTranscriptID() { - return transcriptID != null; - } - - public String getTranscriptID() { - if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field"); - return transcriptID; - } - - public boolean hasExonID() { - return exonID != null; - } - - public String getExonID() { - if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field"); - return exonID; - } - - public boolean hasExonRank() { - return exonRank != null; - } - - public Integer getExonRank() { - if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field"); - return exonRank; - } - - public boolean isNonCodingGene() { - return isNonCodingGene; - } - - public EffectType getEffect() { - return effect; - } - - public EffectImpact getEffectImpact() { - return effect.getImpact(); - } - - public boolean hasEffectExtraInformation() { - return effectExtraInformation != null; - } - - public String getEffectExtraInformation() { - if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field"); - return effectExtraInformation; - } - - public boolean hasOldAndNewAA() { - return oldAndNewAA != null; - } - - public String getOldAndNewAA() { - if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field"); - return oldAndNewAA; - } - - public boolean hasOldAndNewCodon() { - return oldAndNewCodon != null; - } - - public String getOldAndNewCodon() { - if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field"); - return oldAndNewCodon; - } - - public boolean hasCodonNum() { - return codonNum != null; - } - - public Integer getCodonNum() { - if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field"); - return codonNum; - } - - public boolean hasCdsSize() { - return cdsSize != null; - } - - public Integer getCdsSize() { - if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field"); - return cdsSize; - } - - public boolean hasCodonsAround() { - return codonsAround != null; - } - - public String getCodonsAround() { - if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field"); - return codonsAround; - } - - public boolean hadAasAround() { - return aasAround != null; - } - - public String getAasAround() { - if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field"); - return aasAround; - } - - public boolean hasCustomIntervalID() { - return customIntervalID != null; - } - - public String getCustomIntervalID() { - if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field"); - return customIntervalID; - } - - public boolean equals ( Object o ) { - if ( o == null || ! (o instanceof SnpEffFeature) ) { - return false; - } - - SnpEffFeature other = (SnpEffFeature)o; - - return contig.equals(other.contig) && - position == other.position && - (reference == null ? other.reference == null : reference.equals(other.reference)) && - (change == null ? other.change == null : change.equals(other.change)) && - changeType == other.changeType && - zygosity == other.zygosity && - (quality == null ? other.quality == null : quality.equals(other.quality)) && - (coverage == null ? other.coverage == null : coverage.equals(other.coverage)) && - (warnings == null ? other.warnings == null : warnings.equals(other.warnings)) && - (geneID == null ? other.geneID == null : geneID.equals(other.geneID)) && - (geneName == null ? other.geneName == null : geneName.equals(other.geneName)) && - (bioType == null ? other.bioType == null : bioType.equals(other.bioType)) && - (transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) && - (exonID == null ? other.exonID == null : exonID.equals(other.exonID)) && - (exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) && - isNonCodingGene == other.isNonCodingGene && - effect == other.effect && - (effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) && - (oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) && - (oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) && - (codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) && - (cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) && - (codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) && - (aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) && - (customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID)); - } - - public String toString() { - return "[Contig: " + contig + - " Position: " + position + - " Reference: " + reference + - " Change: " + change + - " Change Type: " + changeType + - " Zygosity: " + zygosity + - " Quality: " + quality + - " Coverage: " + coverage + - " Warnings: " + warnings + - " Gene ID: " + geneID + - " Gene Name: " + geneName + - " Bio Type: " + bioType + - " Transcript ID: " + transcriptID + - " Exon ID: " + exonID + - " Exon Rank: " + exonRank + - " Non-Coding Gene: " + isNonCodingGene + - " Effect: " + effect + - " Effect Extra Information: " + effectExtraInformation + - " Old/New AA: " + oldAndNewAA + - " Old/New Codon: " + oldAndNewCodon + - " Codon Num: " + codonNum + - " CDS Size: " + cdsSize + - " Codons Around: " + codonsAround + - " AAs Around: " + aasAround + - " Custom Interval ID: " + customIntervalID + - "]"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index eb01e5dca..fd1c74993 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -24,6 +24,7 @@ public class VCFHeader { private final Set mMetaData; private final Map mInfoMetaData = new HashMap(); private final Map mFormatMetaData = new HashMap(); + private final Map mOtherMetaData = new HashMap(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -110,6 +111,9 @@ public class VCFHeader { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; mFormatMetaData.put(formatLine.getName(), formatLine); } + else { + mOtherMetaData.put(line.getKey(), line); + } } } @@ -185,6 +189,14 @@ public class VCFHeader { public VCFFormatHeaderLine getFormatHeaderLine(String key) { return mFormatMetaData.get(key); } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFHeaderLine getOtherHeaderLine(String key) { + return mOtherMetaData.get(key); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1c65102ae..cfd59b504 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -817,6 +817,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this); } + /** + * @param other VariantContext whose alternate alleles to compare against + * @return true if this VariantContext has the same alternate alleles as other, + * regardless of ordering. Otherwise returns false. + */ + public boolean hasSameAlternateAllelesAs ( VariantContext other ) { + Set thisAlternateAlleles = getAlternateAlleles(); + Set otherAlternateAlleles = other.getAlternateAlleles(); + + if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { + return false; + } + + for ( Allele allele : thisAlternateAlleles ) { + if ( ! otherAlternateAlleles.contains(allele) ) { + return false; + } + } + + return true; + } + // --------------------------------------------------------------------------------------------------------- // // Working with genotypes diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java new file mode 100644 index 000000000..462abeba1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.testng.Assert; +import org.testng.annotations.Test; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.SnpEffEffect; + +public class SnpEffUnitTest { + + @Test + public void testParseWellFormedEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); + } + + @Test + public void testParseInvalidEffectNameEffect() { + String effectName = "MADE_UP_EFFECT"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseInvalidEffectImpactEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseWrongNumberOfMetadataFieldsEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseSnpEffWarningEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + } + + @Test + public void testParseSnpEffErrorEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 832079807..f902ce276 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -129,12 +130,24 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testSnpEffAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -NO_HEADER -o %s -A SnpEff --variant " + - validationDataLocation + "1000G.exomes.vcf --snpEffFile " + validationDataLocation + - "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000", + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000", 1, - Arrays.asList("03eae1dab19a9358250890594bf53607") + Arrays.asList("a1c3ba9efc28ee0606339604095076ea") ); executeTest("Testing SnpEff annotations", spec); } + + @Test + public void testSnpEffAnnotationsUnsupportedVersion() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000", + 1, + UserException.class + ); + executeTest("Testing SnpEff annotations (unsupported version)", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java deleted file mode 100644 index 6d492565b..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.apache.commons.io.input.ReaderInputStream; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.testng.Assert; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -import java.io.StringReader; - -public class SnpEffCodecUnitTest { - - @Test - public void testParseWellFormedSnpEffHeaderLine() { - String wellFormedSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around\tCustom_interval_ID"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wellFormedSnpEffHeaderLine))); - String headerReturned = (String)codec.readHeader(reader); - - Assert.assertEquals(headerReturned, wellFormedSnpEffHeaderLine); - } - - @Test(expectedExceptions = TribbleException.InvalidHeader.class) - public void testParseWrongNumberOfFieldsSnpEffHeaderLine() { - String wrongNumberOfFieldsSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wrongNumberOfFieldsSnpEffHeaderLine))); - codec.readHeader(reader); - } - - @Test(expectedExceptions = TribbleException.InvalidHeader.class) - public void testParseMisnamedColumnSnpEffHeaderLine() { - String misnamedColumnSnpEffHeaderLine = "# Chromo\tPosition\tRef\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around\tCustom_interval_ID"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(misnamedColumnSnpEffHeaderLine))); - codec.readHeader(reader); - } - - @Test - public void testParseSimpleEffectSnpEffLine() { - String simpleEffectSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 69428l, - "T", - "G", - ChangeType.SNP, - Zygosity.Hom, - 6049.69, - 61573l, - null, - "ENSG00000177693", - "OR4F5", - "mRNA", - "ENST00000326183", - "exon_1_69055_70108", - 1, - false, - EffectType.NON_SYNONYMOUS_CODING, - null, - "F/C", - "TTT/TGT", - 113, - 918, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(simpleEffectSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseNonCodingRegionSnpEffLine() { - String nonCodingRegionSnpEffLine = "1\t1337592\tG\tC\tSNP\tHom\t1935.52\t21885\t\tENSG00000250188\t" + - "RP4-758J18.5\tmRNA\tENST00000514958\texon_1_1337454_1338076\t2\tWITHIN_NON_CODING_GENE, NON_SYNONYMOUS_CODING\t" + - "L/V\tCTA/GTA\t272\t952\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 1337592l, - "G", - "C", - ChangeType.SNP, - Zygosity.Hom, - 1935.52, - 21885l, - null, - "ENSG00000250188", - "RP4-758J18.5", - "mRNA", - "ENST00000514958", - "exon_1_1337454_1338076", - 2, - true, - EffectType.NON_SYNONYMOUS_CODING, - null, - "L/V", - "CTA/GTA", - 272, - 952, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(nonCodingRegionSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseExtraEffectInformationSnpEffLine() { - String extraEffectInformationSnpEffLine = "1\t879537\tT\tC\tSNP\tHom\t341.58\t13733\t\tENSG00000187634\tSAMD11\t" + - "mRNA\tENST00000341065\t\t\tUTR_3_PRIME: 4 bases from transcript end\t\t\t\t\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 879537l, - "T", - "C", - ChangeType.SNP, - Zygosity.Hom, - 341.58, - 13733l, - null, - "ENSG00000187634", - "SAMD11", - "mRNA", - "ENST00000341065", - null, - null, - false, - EffectType.UTR_3_PRIME, - "4 bases from transcript end", - null, - null, - null, - null, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(extraEffectInformationSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseMultiEffectSnpEffLine() { - String multiEffectSnpEffLine = "1\t901901\tC\tT\tSNP\tHom\t162.91\t4646\t\tENSG00000187583\tPLEKHN1\tmRNA\t" + - "ENST00000379410\texon_1_901877_901994\t1\tSTART_GAINED: ATG, UTR_5_PRIME: 11 bases from TSS\t\t\t\t\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 901901l, - "C", - "T", - ChangeType.SNP, - Zygosity.Hom, - 162.91, - 4646l, - null, - "ENSG00000187583", - "PLEKHN1", - "mRNA", - "ENST00000379410", - "exon_1_901877_901994", - 1, - false, - EffectType.START_GAINED, - "ATG, UTR_5_PRIME: 11 bases from TSS", - null, - null, - null, - null, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(multiEffectSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseWrongNumberOfFieldsSnpEffLine() { - String wrongNumberOfFieldsSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t"; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(wrongNumberOfFieldsSnpEffLine); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseBlankEffectFieldSnpEffLine() { - String blankEffectFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\t\tF/C\tTTT/TGT\t113\t918\t\t\t"; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(blankEffectFieldSnpEffLine); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseInvalidNumericFieldSnpEffLine() { - String invalidNumericFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\tfoo\t\t\t";; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(invalidNumericFieldSnpEffLine); - } -} From e0c8c0ddcb48617d5f9146c2d9ffe57d967dd899 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 14 Sep 2011 06:04:32 -0400 Subject: [PATCH 071/113] Modified VariantEval FunctionalClass stratification to remove hardcoded GenomicAnnotator keynames This is a temporary and hopefully short-lived solution. I've modified the FunctionalClass stratification to stratify by effect impact as defined by SnpEff annotations (high, moderate, and low impact) rather than by the silent/missense/nonsense categories. If we want to bring back the silent/missense/nonsense stratification, we should probably take the approach of asking the SnpEff author to add it as a feature to SnpEff rather than coding it ourselves, since the whole point of moving to SnpEff was to outsource genomic annotation. --- .../stratifications/FunctionalClass.java | 53 ++++++++----------- .../VariantEvalIntegrationTest.java | 7 ++- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 193a65591..c675b111c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,21 +2,29 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; import java.util.List; /** - * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation. + * Stratifies by low-, moderate-, and high-impact genomic effect using SnpEff annotations produced by VariantAnnotator */ public class FunctionalClass extends VariantStratifier { + + public static final String LOW_IMPACT_STATE_NAME = "low-impact"; + public static final String MODERATE_IMPACT_STATE_NAME = "moderate-impact"; + public static final String HIGH_IMPACT_STATE_NAME = "high-impact"; + + public static final String EFFECT_IMPACT_ATTRIBUTE_KEY = SnpEff.InfoFieldKey.EFF_IMPACT.toString(); + @Override public void initialize() { states.add("all"); - states.add("silent"); - states.add("missense"); - states.add("nonsense"); + states.add(LOW_IMPACT_STATE_NAME); + states.add(MODERATE_IMPACT_STATE_NAME); + states.add(HIGH_IMPACT_STATE_NAME); } @@ -25,36 +33,17 @@ public class FunctionalClass extends VariantStratifier { relevantStates.add("all"); - if (eval != null && eval.isVariant()) { - String type = null; + if ( eval != null && eval.isVariant() && eval.hasAttribute(EFFECT_IMPACT_ATTRIBUTE_KEY) ) { + String effectImpact = eval.getAttributeAsString(EFFECT_IMPACT_ATTRIBUTE_KEY); - if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); - } else if (eval.hasAttribute("refseq.functionalClass_1")) { - int annotationId = 1; - String key; - - do { - key = String.format("refseq.functionalClass_%d", annotationId); - - String newtype = eval.getAttributeAsString(key); - - if ( newtype != null && !newtype.equalsIgnoreCase("null") && - ( type == null || - ( type.equals("silent") && !newtype.equals("silent") ) || - ( type.equals("missense") && newtype.equals("nonsense") ) ) - ) { - type = newtype; - } - - annotationId++; - } while (eval.hasAttribute(key)); + if ( effectImpact.equals(SnpEff.EffectImpact.LOW.toString()) ) { + relevantStates.add(LOW_IMPACT_STATE_NAME); } - - if (type != null) { - if (type.equals("silent")) { relevantStates.add("silent"); } - else if (type.equals("missense")) { relevantStates.add("missense"); } - else if (type.equals("nonsense")) { relevantStates.add("nonsense"); } + else if ( effectImpact.equals(SnpEff.EffectImpact.MODERATE.toString()) ) { + relevantStates.add(MODERATE_IMPACT_STATE_NAME); + } + else if ( effectImpact.equals(SnpEff.EffectImpact.HIGH.toString()) ) { + relevantStates.add(HIGH_IMPACT_STATE_NAME); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index e992684bc..00ecd5b67 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -123,9 +123,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T VariantEval", - "-R " + b37KGReference, - "--dbsnp " + b37dbSNP132, - "--eval " + fundamentalTestVCF, + "-R " + hg19Reference, + "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV CountVariants", "-noST", @@ -134,7 +133,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e40b77e7ed6581328e373a24b93cd170") + Arrays.asList("e93b3d66a5c150cbf1ae4262ec075d2d") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } From 3db457ed01f7d99eaa0d62b9924cba6f1d269dad Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 14 Sep 2011 10:47:28 -0400 Subject: [PATCH 072/113] Revert "Modified VariantEval FunctionalClass stratification to remove hardcoded GenomicAnnotator keynames" After discussing this with Mark, it seems clear that the old version of the VariantEval FunctionalClass stratification is preferable to this version. By reverting, we maintain backwards compatibility with legacy output files from the old GenomicAnnotator, and can add SnpEff support later without breaking that backwards compatibility. This reverts commit b44acd1abd9ab6eec37111a19fa797f9e2ca3326. --- .../stratifications/FunctionalClass.java | 53 +++++++++++-------- .../VariantEvalIntegrationTest.java | 7 +-- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index c675b111c..193a65591 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,29 +2,21 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; import java.util.List; /** - * Stratifies by low-, moderate-, and high-impact genomic effect using SnpEff annotations produced by VariantAnnotator + * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation. */ public class FunctionalClass extends VariantStratifier { - - public static final String LOW_IMPACT_STATE_NAME = "low-impact"; - public static final String MODERATE_IMPACT_STATE_NAME = "moderate-impact"; - public static final String HIGH_IMPACT_STATE_NAME = "high-impact"; - - public static final String EFFECT_IMPACT_ATTRIBUTE_KEY = SnpEff.InfoFieldKey.EFF_IMPACT.toString(); - @Override public void initialize() { states.add("all"); - states.add(LOW_IMPACT_STATE_NAME); - states.add(MODERATE_IMPACT_STATE_NAME); - states.add(HIGH_IMPACT_STATE_NAME); + states.add("silent"); + states.add("missense"); + states.add("nonsense"); } @@ -33,17 +25,36 @@ public class FunctionalClass extends VariantStratifier { relevantStates.add("all"); - if ( eval != null && eval.isVariant() && eval.hasAttribute(EFFECT_IMPACT_ATTRIBUTE_KEY) ) { - String effectImpact = eval.getAttributeAsString(EFFECT_IMPACT_ATTRIBUTE_KEY); + if (eval != null && eval.isVariant()) { + String type = null; - if ( effectImpact.equals(SnpEff.EffectImpact.LOW.toString()) ) { - relevantStates.add(LOW_IMPACT_STATE_NAME); + if (eval.hasAttribute("refseq.functionalClass")) { + type = eval.getAttributeAsString("refseq.functionalClass"); + } else if (eval.hasAttribute("refseq.functionalClass_1")) { + int annotationId = 1; + String key; + + do { + key = String.format("refseq.functionalClass_%d", annotationId); + + String newtype = eval.getAttributeAsString(key); + + if ( newtype != null && !newtype.equalsIgnoreCase("null") && + ( type == null || + ( type.equals("silent") && !newtype.equals("silent") ) || + ( type.equals("missense") && newtype.equals("nonsense") ) ) + ) { + type = newtype; + } + + annotationId++; + } while (eval.hasAttribute(key)); } - else if ( effectImpact.equals(SnpEff.EffectImpact.MODERATE.toString()) ) { - relevantStates.add(MODERATE_IMPACT_STATE_NAME); - } - else if ( effectImpact.equals(SnpEff.EffectImpact.HIGH.toString()) ) { - relevantStates.add(HIGH_IMPACT_STATE_NAME); + + if (type != null) { + if (type.equals("silent")) { relevantStates.add("silent"); } + else if (type.equals("missense")) { relevantStates.add("missense"); } + else if (type.equals("nonsense")) { relevantStates.add("nonsense"); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 00ecd5b67..e992684bc 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -123,8 +123,9 @@ public class VariantEvalIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T VariantEval", - "-R " + hg19Reference, - "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -133,7 +134,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e93b3d66a5c150cbf1ae4262ec075d2d") + Arrays.asList("e40b77e7ed6581328e373a24b93cd170") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } From a942fa38ef87c1e1565d6a1cff041d8a62aaeb0a Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 15 Sep 2011 10:22:28 -0400 Subject: [PATCH 073/113] Refine the way we merge records in CombineVariants of different types. As of before, two records of different types were not combined and were kept separate. This is still the case, except when the alleles of one record are a strict subset of alleles of another record. For example, a SNP with alleles {A*,T} and a mixed record with alleles {A*,T, AAT} are now combined when start position matches. --- .../walkers/variantutils/CombineVariants.java | 35 +++++++++++++++++-- .../variantcontext/VariantContextUtils.java | 12 +++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 7062f17e5..3e3b29a7f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -234,16 +234,47 @@ public class CombineVariants extends RodWalker { if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - List mergedVCs = new ArrayList(); + List preMergedVCs = new ArrayList(); Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); // iterate over the types so that it's deterministic for ( VariantContext.Type type : VariantContext.Type.values() ) { if ( VCsByType.containsKey(type) ) - mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } + List mergedVCs = new ArrayList(); + // se have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record, + // we will still merge those records. + if (preMergedVCs.size() > 1) { + for (VariantContext vc1 : preMergedVCs) { + VariantContext newvc = vc1; + boolean merged = false; + for (int k=0; k < mergedVCs.size(); k++) { + VariantContext vc2 = mergedVCs.get(k); + + if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) { + // all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merget v1 into v2 + List vcpair = new ArrayList(); + vcpair.add(vc1); + vcpair.add(vc2); + newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair, + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC); + mergedVCs.set(k,newvc); + merged = true; + break; + } + } + if (!merged) + mergedVCs.add(vc1); + } + } + else { + mergedVCs = preMergedVCs; + } + for ( VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 986d6305c..506bb3b33 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -663,6 +663,18 @@ public class VariantContextUtils { return merged; } + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) + return false; + } + + return true; + } public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; From 1e682deb26aad12d4421cf9ff7f08318c1cd4ab3 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 15 Sep 2011 13:07:50 -0400 Subject: [PATCH 074/113] Minor html-formatting-related documentation fix to the SnpEff class. --- .../org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 14abbca5b..bb3685fb5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -45,7 +45,7 @@ import java.util.*; * (http://snpeff.sourceforge.net/). * * For each variant, chooses one of the effects of highest biological impact from the SnpEff - * output file (which must be provided on the command line via --snpEffFile .vcf), + * output file (which must be provided on the command line via --snpEffFile filename.vcf), * and adds annotations on that effect. * * @author David Roazen From 202405b1a165db3f1d97f687da10826df181c849 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 13:52:31 -0400 Subject: [PATCH 075/113] Updating the FunctionalClass stratification in VariantEval to handle the snpEff annotations; this change really needs to be in before the release so that the pipeline can output semi-meaningful plots. This commit maintains backwards compatibility with the crappy Genomic Annotator output. However, I did clean up the code a bit so that we now use an Enum instead of hard-coded values (so it's now much easier to change things if we choose to do so in the future). I do not see this as the final commit on this topic - I think we need to make some changes to the snpEff annotator to preferentially choose certain annotations within effect classes; Mark, let's chat about this for a bit when you get back next week. Also, for the record, I should be blamed for David's temporary commit the other day because I gave him the green light (since when do you care about backwards compatibility anyways?). In any case, at least now we have something that works for both the old and new annotations. --- .../stratifications/FunctionalClass.java | 53 +++++++++++++------ .../VariantEvalIntegrationTest.java | 25 ++++++++- 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 193a65591..a32857ffc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; @@ -11,12 +12,19 @@ import java.util.List; * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation. */ public class FunctionalClass extends VariantStratifier { + + public enum FunctionalType { + silent, + missense, + nonsense + } + + @Override public void initialize() { states.add("all"); - states.add("silent"); - states.add("missense"); - states.add("nonsense"); + for ( FunctionalType type : FunctionalType.values() ) + states.add(type.name()); } @@ -26,10 +34,12 @@ public class FunctionalClass extends VariantStratifier { relevantStates.add("all"); if (eval != null && eval.isVariant()) { - String type = null; + FunctionalType type = null; if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); + try { + type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass")); + } catch ( Exception e ) {} // don't error out if the type isn't supported } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); - - if ( newtype != null && !newtype.equalsIgnoreCase("null") && - ( type == null || - ( type.equals("silent") && !newtype.equals("silent") ) || - ( type.equals("missense") && newtype.equals("nonsense") ) ) - ) { - type = newtype; + String newtypeStr = eval.getAttributeAsString(key); + if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) { + try { + FunctionalType newType = FunctionalType.valueOf(newtypeStr); + if ( type == null || + ( type == FunctionalType.silent && newType != FunctionalType.silent ) || + ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) { + type = newType; + } + } catch ( Exception e ) {} // don't error out if the type isn't supported } annotationId++; } while (eval.hasAttribute(key)); + + } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFF.name() ) ) { + SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFF.name()).toString()); + if ( snpEffType == SnpEff.EffectType.STOP_GAINED ) + type = FunctionalType.nonsense; + else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING ) + type = FunctionalType.missense; + else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING ) + type = FunctionalType.silent; } - if (type != null) { - if (type.equals("silent")) { relevantStates.add("silent"); } - else if (type.equals("missense")) { relevantStates.add("missense"); } - else if (type.equals("nonsense")) { relevantStates.add("nonsense"); } + if ( type != null ) { + relevantStates.add(type.name()); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index e992684bc..d8f7ad3b6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.Arrays; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval"; + private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf"; @@ -14,6 +14,27 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; + @Test + public void testFunctionClassWithSnpeff() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-noST", + "-ST FunctionalClass", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12") + ); + executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); + } + @Test public void testStratifySamplesAndExcludeMonomorphicSites() { WalkerTestSpec spec = new WalkerTestSpec( @@ -21,7 +42,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", + "--eval " + variantEvalTestDataRoot + "CEU.trio.callsForVE.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-ST Sample", From d369d105932b457a01b47166b7b7a1cd41d6b337 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 13:56:23 -0400 Subject: [PATCH 076/113] Adding documentation before the release for GATK wiki page --- .../broadinstitute/sting/gatk/filters/PlatformFilter.java | 2 +- .../sting/gatk/walkers/PrintReadsWalker.java | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java index 30b2f828d..8e241bb2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * @version 0.1 */ public class PlatformFilter extends ReadFilter { - @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false) + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) protected String[] PLFilterNames; public boolean filterOut(SAMRecord rec) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index fdfac6bf7..4f072e88c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * -I input1.bam \ * -I input2.bam \ * --read_filter MappingQualityZero + * + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input.bam \ + * -n 2000 *
* */ From ce73dc40712510a360738df49e802b4c21d95621 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 15 Sep 2011 15:33:09 -0400 Subject: [PATCH 079/113] Update to the bindings for liftOverVCF.pl (to -V from -B) --- public/perl/liftOverVCF.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl index 21cb8bb6b..ba4198292 100755 --- a/public/perl/liftOverVCF.pl +++ b/public/perl/liftOverVCF.pl @@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; if ($recordOriginalLocation) { $cmd .= " -recordOriginalLocation"; } @@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out"; system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); # clean up From f04e51c6c2b74a79644e9473230410a8ba85fe92 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 15:38:56 -0400 Subject: [PATCH 080/113] Adding docs from Andrey since his repo was all screwed up. --- .../indels/SomaticIndelDetectorWalker.java | 143 ++++++++++++------ 1 file changed, 94 insertions(+), 49 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index e5ad3106d..8bba8eac2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; import java.util.*; + /** + * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well, + * but this latter functionality is now superceded by UnifiedGenotyper. + * + *

* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing - * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read - * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample, - * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified, - * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline - * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic - * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains - * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords. + * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs + * include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many + * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional + * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will + * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional + * metrics for the post-processing tools to make the final decision). The calls are performed by default + * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I + * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments, + * respectively. Indels are called from the tumor sample and annotated as germline + * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic + * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling + * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold + * in tumor sample, it will not be reported. * - * If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you, - * please email asivache at broadinstitute dot org and he will gladly explain everything in more detail. + * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input + * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged + * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups). * + *

Input

+ *

+ * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). + *

+ * + *

Output

+ *

+ * Indel calls with associated metrics. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SomaticIndelDetector \
+ *   -o indels.vcf \
+ *   -verbose indels.txt
+ *   -I:normal normal.bam \
+ *   -I:tumor tumor.bam
+ * 
* */ + @ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class}) public class SomaticIndelDetectorWalker extends ReadWalker { // @Output // PrintStream out; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to write variants (indels) in VCF format",required=true) protected VCFWriter vcf_writer = null; @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) @@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker { @Hidden @Argument(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) + doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) public String genotypeIntervalsFile = null; @Hidden @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. "+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") + doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ + "if the list turns out to be unsorted, it will throw an exception. "+ + "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ + "to sort and keep it in memory (increases memory usage!).") protected boolean GENOTYPE_NOT_SORTED = false; @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) + @Argument(fullName="unpaired", shortName="unpaired", + doc="Perform unpaired calls (no somatic status detection)", required=false) boolean call_unpaired = false; - boolean call_somatic ; + boolean call_somatic ; - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; + @Argument(fullName="verboseOutput", shortName="verbose", + doc="Verbose output file in text format", required=false) + java.io.File verboseOutput = null; @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) + doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) java.io.File bedOutput = null; - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false) - int minCoverage = 6; + @Argument(fullName="minCoverage", shortName="minCoverage", + doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ + "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false) + int minCoverage = 6; - @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", - doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) - int minNormalCoverage = 4; + @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", + doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+ + "or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) + int minNormalCoverage = 4; - @Argument(fullName="minFraction", shortName="minFraction", - doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ - " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) - double minFraction = 0.3; + @Argument(fullName="minFraction", shortName="minFraction", + doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ + " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) + double minFraction = 0.3; - @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", - doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false) - double minConsensusFraction = 0.7; + @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", + doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+ + "all indel observations at the site exceeds this threshold", required=false) + double minConsensusFraction = 0.7; - @Argument(fullName="minIndelCount", shortName="minCnt", - doc="Minimum count of reads supporting consensus indel required for making the call. "+ - " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ - "(minIndelCount not met) will not pass.", required=false) - int minIndelCount = 0; + @Argument(fullName="minIndelCount", shortName="minCnt", + doc="Minimum count of reads supporting consensus indel required for making the call. "+ + " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ + "(minIndelCount not met) will not pass.", required=false) + int minIndelCount = 0; - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) - String RefseqFileName = null; + @Argument(fullName="refseq", shortName="refseq", + doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+ + "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) + String RefseqFileName = null; - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ - "by this application, so they will not contribute indels to consider and will not be counted.", required=false) - PlatformUnitFilterHelper dummy; - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false; +//@Argument(fullName="blacklistedLanes", shortName="BL", +// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ +// "by this application, so they will not contribute indels to consider and will not be counted.", required=false) +//PlatformUnitFilterHelper dummy; + + @Hidden + @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on", + required=false) Boolean DEBUG = false; @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200; + "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+ + "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+ + "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+ + "if window is too small",required=false) int WINDOW_SIZE = 200; @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; + + private WindowContext tumor_context; private WindowContext normal_context; private int currentContigIndex = -1; From fe474b77f85f325ed20d6cb6c50dc298d024d03e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 16:05:39 -0400 Subject: [PATCH 081/113] Updating docs so printing looks nicer --- .../gatk/walkers/variantutils/VariantValidationAssessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index b98646270..ea8549474 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; /** - * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes) + * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes) * *

* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). From 4ef6a4598c3704fd5aac5f5302a148ddfedd3958 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 16:10:34 -0400 Subject: [PATCH 082/113] Updating docs to include output --- .../walkers/varianteval/VariantEvalWalker.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 266b97af0..28f4f2a56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -56,6 +56,22 @@ import java.util.*; *

Output

*

* Evaluation tables detailing the results of the eval modules which were applied. + * For example: + *

+ * output.eval.gatkreport:
+ * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
+ * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
+ * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
+ * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
+ * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
+ * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
+ * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
+ * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
+ * ...
+ * 
*

* *

Examples

From 6d02a34bfba1537f294f5a077b24702e539b87a5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 16:17:54 -0400 Subject: [PATCH 083/113] Updating docs to include output --- .../variantutils/VariantValidationAssessor.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index ea8549474..8eaf976d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -57,7 +57,16 @@ import java.util.*; * *

Output

*

- * An annotated VCF. + * An annotated VCF. Additionally, a table like the following will be output: + *

+ *     Total number of samples assayed:                  185
+ *     Total number of records processed:                152
+ *     Number of Hardy-Weinberg violations:              34 (22%)
+ *     Number of no-call violations:                     12 (7%)
+ *     Number of homozygous variant violations:          0 (0%)
+ *     Number of records passing all filters:            106 (69%)
+ *     Number of passing records that are polymorphic:   98 (92%)
+ * 
*

* *

Examples

From fd1831b4a520e68b15b6b5b958aa2d04ade4e287 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 16:25:03 -0400 Subject: [PATCH 084/113] Updating docs to include more details --- .../gatk/walkers/fasta/FastaAlternateReferenceWalker.java | 6 ++++-- .../sting/gatk/walkers/fasta/FastaReferenceWalker.java | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index fd912334f..4e2c17bf6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -43,8 +43,10 @@ import java.util.List; * Generates an alternative reference sequence over the specified interval. * *

- * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). - * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'. + * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * Note that if there are multiple variants at a site, it takes the first one seen. + * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * *

Input

*

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java index 5f3b37cc8..7ae5c5c75 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java @@ -42,6 +42,9 @@ import java.io.PrintStream; * *

* The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). * *

Input

*

From 2f58fdb369a3cd4857281dd210427fac6352ca88 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 15 Sep 2011 16:26:11 -0400 Subject: [PATCH 085/113] Adding expected output doc to CountCovariates --- .../recalibration/CountCovariatesWalker.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 98c8950e3..1bdb70bdd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -76,6 +76,42 @@ import java.util.Map; *

Output

*

* A recalibration table file in CSV format that is used by the TableRecalibration walker. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * + * The first 20 lines of such a file is shown below. + * * The file begins with a series of comment lines describing: + * ** The number of counted loci + * ** The number of counted bases + * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. + * + *

+ * # Counted Sites    19451059
+ * # Counted Bases    56582018
+ * # Skipped Sites    82666
+ * # Fraction Skipped 1 / 235 bp
+ * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+ * SRR006446,11,65,CA,9,1,10
+ * SRR006446,11,48,TA,10,0,40
+ * SRR006446,11,67,AA,27,0,40
+ * SRR006446,11,61,GA,11,1,10
+ * SRR006446,12,34,CA,47,1,17
+ * SRR006446,12,30,GA,52,1,17
+ * SRR006446,12,36,AA,352,1,25
+ * SRR006446,12,17,TA,182,11,12
+ * SRR006446,11,48,TG,2,0,40
+ * SRR006446,11,67,AG,1,0,40
+ * SRR006446,12,34,CG,9,0,40
+ * SRR006446,12,30,GG,43,0,40
+ * ERR001876,4,31,AG,1,0,40
+ * ERR001876,4,31,AT,2,2,1
+ * ERR001876,4,31,CA,1,0,40
+ * 
*

* *

Examples

From 9dc6354130b23683c31a7b2c1ef8c2ed94da1946 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Sep 2011 16:55:24 -0400 Subject: [PATCH 086/113] Oops didn't mean to touch this test before --- .../gatk/walkers/varianteval/VariantEvalIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d8f7ad3b6..99622cbf6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -42,7 +42,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + variantEvalTestDataRoot + "CEU.trio.callsForVE.vcf", + "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-ST Sample", From d78e00e5b2cd5e8a1b1aa75209100b039e521442 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 15 Sep 2011 16:09:07 -0400 Subject: [PATCH 087/113] Renaming VariantAnnotator SnpEff keys This is to head off potential confusion with the output from the SnpEff tool itself, which also uses a key named EFF. --- .../sting/gatk/walkers/annotator/SnpEff.java | 90 ++++++++++--------- .../stratifications/FunctionalClass.java | 4 +- .../VariantAnnotatorIntegrationTest.java | 2 +- .../VariantEvalIntegrationTest.java | 2 +- 4 files changed, 53 insertions(+), 45 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index bb3685fb5..4ead77506 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -68,23 +68,31 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio // Key names for the INFO field annotations we will add to each record, along // with parsing-related information: public enum InfoFieldKey { - EFF (-1), - EFF_IMPACT (0), - EFF_CODON_CHANGE (1), - EFF_AMINO_ACID_CHANGE (2), - EFF_GENE_NAME (3), - EFF_GENE_BIOTYPE (4), - EFF_TRANSCRIPT_ID (6), - EFF_EXON_ID (7); + EFFECT_KEY ("SNPEFF_EFFECT", -1), + IMPACT_KEY ("SNPEFF_IMPACT", 0), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), + EXON_ID_KEY ("SNPEFF_EXON_ID", 7); + + // Actual text of the key + private final String keyName; // Index within the effect metadata subfields from the SnpEff EFF annotation // where each key's associated value can be found during parsing. private final int fieldIndex; - InfoFieldKey ( int fieldIndex ) { + InfoFieldKey ( String keyName, int fieldIndex ) { + this.keyName = keyName; this.fieldIndex = fieldIndex; } + public String getKeyName() { + return keyName; + } + public int getFieldIndex() { return fieldIndex; } @@ -292,27 +300,27 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio } public List getKeyNames() { - return Arrays.asList( InfoFieldKey.EFF.toString(), - InfoFieldKey.EFF_IMPACT.toString(), - InfoFieldKey.EFF_CODON_CHANGE.toString(), - InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), - InfoFieldKey.EFF_GENE_NAME.toString(), - InfoFieldKey.EFF_GENE_BIOTYPE.toString(), - InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), - InfoFieldKey.EFF_EXON_ID.toString() + return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), + InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), + InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), + InfoFieldKey.GENE_NAME_KEY.getKeyName(), + InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), + InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), - new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") + new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -375,16 +383,16 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio } try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]); + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); } - codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()]; - aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()]; - geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()]; - geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()]; + codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; + aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; + geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; + geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { @@ -398,8 +406,8 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio coding = EffectCoding.UNKNOWN; } - transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()]; - exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()]; + transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()]; + exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()]; } private void parseError ( String message ) { @@ -443,14 +451,14 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio public Map getAnnotations() { Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length)); - addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString()); - addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString()); - addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange); - addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange); - addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName); - addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype); - addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID); - addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID); + addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); + addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); + addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); + addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); + addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); + addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); + addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); return annotations; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index a32857ffc..88ffcaaeb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -62,8 +62,8 @@ public class FunctionalClass extends VariantStratifier { annotationId++; } while (eval.hasAttribute(key)); - } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFF.name() ) ) { - SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFF.name()).toString()); + } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) { + SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString()); if ( snpEffType == SnpEff.EffectType.STOP_GAINED ) type = FunctionalType.nonsense; else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING ) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index f902ce276..08baae7a7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000", 1, - Arrays.asList("a1c3ba9efc28ee0606339604095076ea") + Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9") ); executeTest("Testing SnpEff annotations", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 99622cbf6..b90e6d0ff 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { 1, Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12") ); - executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); + executeTest("testFunctionClassWithSnpeff", spec); } @Test From e6e9b08c9a47640f9be32b47f495174118636a5c Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 15 Sep 2011 18:51:09 -0400 Subject: [PATCH 089/113] Must provide alleles VCF to UGCallVariants --- .../sting/gatk/walkers/genotyper/UGCallVariants.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index 500b11360..d88e55687 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; From 9fdf1f8eb663858cacafd8fb339d098cdce4b96d Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 15 Sep 2011 21:05:22 -0400 Subject: [PATCH 090/113] Fix some doc formatting for Depth of Coverage --- .../gatk/walkers/coverage/DepthOfCoverageWalker.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 3a18fe610..86f97a36c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -69,14 +69,23 @@ import java.util.*; *

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + *

* - no suffix: per locus coverage + *

* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + *

* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + *

* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + *

* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + *

* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + *

* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + *

* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + *

* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *

* From 939babc820cc5174a1d97a8b6bdb992ca6cedc09 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 15 Sep 2011 21:05:51 -0400 Subject: [PATCH 091/113] Updating formating for ValidationAmplicons GATK docs --- .../sting/gatk/walkers/validation/ValidationAmplicons.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 01e8cd321..48cba6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -61,7 +61,7 @@ import java.util.List; * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC *
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: - * + *
  * Valid                     // amplicon is valid
  * SITE_IS_FILTERED=1        // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
  * VARIANT_TOO_NEAR_PROBE=1  // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
@@ -72,10 +72,10 @@ import java.util.List;
  * END_TOO_CLOSE,            // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
  * NO_VARIANTS_FOUND,        // no variants found within the amplicon region
  * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
- * 

+ *

* *

Examples

- *

+ * 
  *    java
  *      -jar GenomeAnalysisTK.jar
  *      -T ValidationAmplicons

From 33967a4e0c09e85cc4dc1d0eb83fe6feef80c46d Mon Sep 17 00:00:00 2001
From: Khalid Shakir 
Date: Fri, 16 Sep 2011 12:46:07 -0400
Subject: [PATCH 093/113] Fixed issue reported by chartl where cloned functions
 lost tags on @Inputs. Updated ExampleUnifiedGenotyper.scala with new syntax.

---
 .../examples/ExampleUnifiedGenotyper.scala    |  6 +--
 .../sting/queue/extensions/gatk/RodBind.scala |  2 +-
 .../queue/extensions/gatk/TaggedFile.scala    |  2 +-
 .../sting/queue/function/QFunction.scala      | 16 +-------
 .../{function => util}/FileExtension.scala    |  2 +-
 .../sting/queue/util/IOUtils.scala            | 40 ++++++++++++++-----
 6 files changed, 36 insertions(+), 32 deletions(-)
 rename public/scala/src/org/broadinstitute/sting/queue/{function => util}/FileExtension.scala (89%)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala
index 1d473b210..9bddfd97c 100644
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala
@@ -56,15 +56,15 @@ class ExampleUnifiedGenotyper extends QScript {
     genotyper.input_file :+= qscript.bamFile
     genotyper.out = swapExt(qscript.bamFile, "bam", "unfiltered.vcf")
 
-    evalUnfiltered.rodBind :+= RodBind("eval", "VCF", genotyper.out)
+    evalUnfiltered.eval :+= genotyper.out
     evalUnfiltered.out = swapExt(genotyper.out, "vcf", "eval")
 
-    variantFilter.rodBind :+= RodBind("variant", "VCF", genotyper.out)
+    variantFilter.variant = genotyper.out
     variantFilter.out = swapExt(qscript.bamFile, "bam", "filtered.vcf")
     variantFilter.filterName = filterNames
     variantFilter.filterExpression = filterExpressions.map("\"" + _ + "\"")
 
-    evalFiltered.rodBind :+= RodBind("eval", "VCF", variantFilter.out)
+    evalFiltered.eval :+= variantFilter.out
     evalFiltered.out = swapExt(variantFilter.out, "vcf", "eval")
 
     add(genotyper, evalUnfiltered)
diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala
index 42f63e225..b4c5d91d3 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala
@@ -1,7 +1,7 @@
 package org.broadinstitute.sting.queue.extensions.gatk
 
 import java.io.File
-import org.broadinstitute.sting.queue.function.FileExtension
+import org.broadinstitute.sting.queue.util.FileExtension
 import java.lang.String
 
 /**
diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala
index ed8158b49..b19f9e430 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala
@@ -1,7 +1,7 @@
 package org.broadinstitute.sting.queue.extensions.gatk
 
 import java.io.File
-import org.broadinstitute.sting.queue.function.FileExtension
+import org.broadinstitute.sting.queue.util.FileExtension
 
 /**
  * Used to provide tagged -I input_file arguments to the GATK.
diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala
index c905581fa..500f7b200 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala
@@ -387,25 +387,11 @@ trait QFunction extends Logging with QJobReport {
    */
   protected def canon(value: Any) = {
     value match {
-      case fileExtension: FileExtension =>
-        val newFile = absolute(fileExtension);
-        val newFileExtension = fileExtension.withPath(newFile.getPath)
-        newFileExtension
-      case file: File =>
-        if (file.getClass != classOf[File])
-          throw new QException("Extensions of file must also extend with FileExtension so that the path can be modified.");
-        absolute(file)
+      case file: File => IOUtils.absolute(commandDirectory, file)
       case x => x
     }
   }
 
-  /**
-   * Returns the absolute path to the file relative to the run directory and the job command directory.
-   * @param file File to root relative to the command directory if it is not already absolute.
-   * @return The absolute path to file.
-   */
-  private def absolute(file: File) = IOUtils.absolute(commandDirectory, file)
-
   /**
    * Scala sugar type for checking annotation required and exclusiveOf.
    */
diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala b/public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala
similarity index 89%
rename from public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala
rename to public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala
index e2394a5bf..9b6e52c8e 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala
@@ -1,4 +1,4 @@
-package org.broadinstitute.sting.queue.function
+package org.broadinstitute.sting.queue.util
 
 import java.io.File
 
diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala
index 79ffa8cb9..b17ccc0d5 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala
@@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.util
 import org.apache.commons.io.FileUtils
 import java.io.{FileReader, File}
 import org.broadinstitute.sting.utils.exceptions.UserException
+import org.broadinstitute.sting.queue.QException
 
 /**
  * A collection of utilities for modifying java.io.
@@ -12,7 +13,7 @@ object IOUtils extends Logging {
    * Checks if the temp directory has been setup and throws an exception if they user hasn't set it correctly.
    * @param tempDir Temporary directory.
    */
-  def checkTempDir(tempDir: File) = {
+  def checkTempDir(tempDir: File) {
     val tempDirPath = tempDir.getAbsolutePath
     // Keeps the user from leaving the temp directory as the default, and on Macs from having pluses
     // in the path which can cause problems with the Google Reflections library.
@@ -20,7 +21,7 @@ object IOUtils extends Logging {
     if (tempDirPath.startsWith("/var/folders/") || (tempDirPath == "/tmp") || (tempDirPath == "/tmp/"))
       throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set")
     if (!tempDir.exists && !tempDir.mkdirs)
-      throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath())
+      throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath)
   }
 
   /**
@@ -35,9 +36,9 @@ object IOUtils extends Logging {
        throw new UserException.BadTmpDir("Could not create temp directory: " + tempDirParent)
     val temp = File.createTempFile(prefix + "-", suffix, tempDirParent)
     if (!temp.delete)
-      throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath())
+      throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath)
     if (!temp.mkdir)
-      throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath())
+      throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath)
     absolute(temp)
   }
 
@@ -46,7 +47,7 @@ object IOUtils extends Logging {
    * @param file File to write to.
    * @param content Content to write.
    */
-  def writeContents(file: File, content: String) =  FileUtils.writeStringToFile(file, content)
+  def writeContents(file: File, content: String) { FileUtils.writeStringToFile(file, content) }
 
   /**
    * Reads content of a file into a string.
@@ -146,10 +147,12 @@ object IOUtils extends Logging {
    * @return The absolute path to the file in the parent dir if the path was not absolute, otherwise the original path.
    */
   def absolute(parent: File, file: File): File = {
-    if (file.isAbsolute)
-      absolute(file)
-    else
-      absolute(new File(parent, file.getPath))
+    val newPath =
+      if (file.isAbsolute)
+        absolutePath(file)
+      else
+        absolutePath(new File(parent, file.getPath))
+    replacePath(file, newPath)
   }
 
   /**
@@ -159,12 +162,16 @@ object IOUtils extends Logging {
    * @return the absolute path to the file.
    */
   def absolute(file: File) = {
+    replacePath(file, absolutePath(file))
+  }
+
+  private def absolutePath(file: File) = {
     var fileAbs = file.getAbsoluteFile
     var names = List.empty[String]
     while (fileAbs != null) {
       val name = fileAbs.getName
       fileAbs = fileAbs.getParentFile
-      
+
       if (name == ".") {
         /* skip */
 
@@ -190,7 +197,18 @@ object IOUtils extends Logging {
       }
     }
 
-    new File(names.mkString("/", "/", ""))
+    names.mkString("/", "/", "")
+  }
+
+  private def replacePath(file: File, path: String) = {
+    file match {
+      case fileExtension: FileExtension =>
+        fileExtension.withPath(path)
+      case file: File =>
+        if (file.getClass != classOf[File])
+          throw new QException("Sub classes of java.io.File must also implement FileExtension so that the path can be modified.")
+        new File(path)
+    }
   }
 
   /**

From 7fa1e237d9e1e41fbdcd42f069df7c658f523bc7 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel 
Date: Fri, 16 Sep 2011 12:53:54 -0400
Subject: [PATCH 094/113] Forgot to git stash pop new MD5's for CombineVariants
 integration test

---
 .../walkers/variantutils/CombineVariantsIntegrationTest.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java
index 3267173a7..35495d797 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java
@@ -90,7 +90,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
 
     @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f"); } // official project VCF files in tabix format
     @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9"); } // official project VCF files in tabix format
-    @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a"); }
+    @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "312a22aedb088b678bc891f1a1b03c91"); }
 
     @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083"); }
 
@@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
                         " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
                         " -genotypeMergeOptions UNIQUIFY -L 1"),
                 1,
-                Arrays.asList("1de95f91ca15d2a8856de35dee0ce33e"));
+                Arrays.asList("35acb0f15f9cd18c653ede4e15e365c9"));
         executeTest("threeWayWithRefs", spec);
     }
 

From cb4a50b1478bb54c19ae57703733e4f330bd7a2f Mon Sep 17 00:00:00 2001
From: Ryan Poplin 
Date: Sat, 17 Sep 2011 16:42:49 -0400
Subject: [PATCH 095/113] Adding ability to try both small and large kmer
 lengths. Highest likelihood wins.

---
 .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
index 87dd37bf6..a5cb00a5d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
+import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.variantcontext.*;
 

From bed78b47e090e19274273a1a552e0e40c82e0161 Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Sun, 18 Sep 2011 20:18:18 -0400
Subject: [PATCH 096/113] Marginally better formating, with hours the default
 time

---
 public/R/queueJobReport.R | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R
index a24d269c9..9f37aa038 100644
--- a/public/R/queueJobReport.R
+++ b/public/R/queueJobReport.R
@@ -12,14 +12,14 @@ if ( onCMDLine ) {
   inputFileName = args[1]
   outputPDF = args[2]
 } else {
-  #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
-  inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
+  inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
+  #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
   #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
   outputPDF = NA
 }
 
-RUNTIME_UNITS = "(sec)"
-ORIGINAL_UNITS_TO_SECONDS = 1/1000
+RUNTIME_UNITS = "(hours)"
+ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
 
 # 
 # Helper function to aggregate all of the jobs in the report across all tables
@@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
 #
 # Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
 #
-plotJobsGantt <- function(gatkReport, sortOverall) {
+plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
   allJobs = allJobsFromReport(gatkReport)
   if ( sortOverall ) {
     title = "All jobs, by analysis, by start time"
@@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
   }
   allJobs$index = 1:nrow(allJobs)
   minTime = min(allJobs$startTime)
-  allJobs$relStartTime = allJobs$startTime - minTime
-  allJobs$relDoneTime = allJobs$doneTime - minTime
+  allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
+  allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
   allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
   maxRelTime = max(allJobs$relDoneTime)
   p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
-  p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
-  p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
+  p <- p + theme_bw()
+  p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
+  if ( includeText )
+    p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
   p <- p + xlim(0, maxRelTime * 1.1)
   p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
-  p <- p + ylab("Job")
+  p <- p + ylab("Job number")
   p <- p + opts(title=title)
   print(p)
 }
@@ -155,8 +157,8 @@ if ( ! is.na(outputPDF) ) {
   pdf(outputPDF, height=8.5, width=11)
 } 
 
-plotJobsGantt(gatkReportData, T)
-plotJobsGantt(gatkReportData, F)
+plotJobsGantt(gatkReportData, T, F)
+plotJobsGantt(gatkReportData, F, F)
 plotProgressByTime(gatkReportData)
 for ( group in gatkReportData ) {
  plotGroup(group)

From 4ad330008ddb29e163089afa2c264f62dccd4c3f Mon Sep 17 00:00:00 2001
From: Mark DePristo 
Date: Mon, 19 Sep 2011 10:19:10 -0400
Subject: [PATCH 097/113] Final intervals cleanup

-- No functional changes (my algorithm wouldn't work)
-- Major structural cleanup (returning more basic data structures that allow us to development new algorithm)
-- Unit tests for the efficiency of interval partitioning
---
 build.xml                                     |  4 +-
 .../sting/utils/interval/IntervalUtils.java   | 70 ++++++++++++++-----
 .../utils/interval/IntervalUtilsUnitTest.java | 63 ++++++++---------
 3 files changed, 82 insertions(+), 55 deletions(-)

diff --git a/build.xml b/build.xml
index 1196f32dc..e5ad9daf0 100644
--- a/build.xml
+++ b/build.xml
@@ -852,8 +852,8 @@
                 
                 
                 
-                
-                
+
+
                 
                     
                     
diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
index 2cfcc19a9..41cbbe59f 100644
--- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
@@ -333,6 +333,28 @@ public class IntervalUtils {
             throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size()));
     }
 
+    /**
+     * Splits an interval list into multiple sublists.
+     * @param locs The genome locs to split.
+     * @param splits The stop points for the genome locs returned by splitFixedIntervals.
+     * @return A list of lists of genome locs, split according to splits
+     */
+    public static List> splitIntervalsToSubLists(List locs, List splits) {
+        int locIndex = 1;
+        int start = 0;
+        List> sublists = new ArrayList>(splits.size());
+        for (Integer stop: splits) {
+            List curList = new ArrayList();
+            for (int i = start; i < stop; i++)
+                curList.add(locs.get(i));
+            start = stop;
+            sublists.add(curList);
+        }
+
+        return sublists;
+    }
+
+
     /**
      * Splits an interval list into multiple files.
      * @param fileHeader The sam file header.
@@ -362,27 +384,39 @@ public class IntervalUtils {
     public static List> splitFixedIntervals(List locs, int numParts) {
         if (locs.size() < numParts)
             throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
-
         final long locsSize = intervalSize(locs);
-        final double idealSplitSize = locsSize / numParts;
-        final List> splits = new ArrayList>(numParts);
-        final LinkedList remainingLocs = new LinkedList(locs);
+        final List splitPoints = new ArrayList();
+        addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
+        Collections.sort(splitPoints);
+        splitPoints.add(locs.size());
+        return splitIntervalsToSubLists(locs, splitPoints);
+    }
 
-        for ( int i = 0; i < numParts; i++ ) {
-            long splitSize = 0;
-            List split = new ArrayList();
-            while ( ! remainingLocs.isEmpty() ) {
-                final GenomeLoc toAdd = remainingLocs.pop();
-                splitSize += toAdd.size();
-                split.add(toAdd);
-                final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size();
-                if ( splitSize + (i % 2 == 0 ? 0 : nextEltSize) > idealSplitSize )
-                    break;
-            }
-            splits.add(split);
+    private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) {
+        if (numParts < 2)
+            return;
+        int halfParts = (numParts + 1) / 2;
+        Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts);
+        int splitIndex = splitPoint.first;
+        long splitSize = splitPoint.second;
+        splitPoints.add(splitIndex);
+        addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts);
+        addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts);
+    }
+
+    private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) {
+        int splitIndex = startIndex;
+        long splitSize = 0;
+        for (int i = 0; i < minLocs; i++) {
+            splitSize += locs.get(splitIndex).size();
+            splitIndex++;
         }
-
-        return splits;
+        long halfSize = locsSize / 2;
+        while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) {
+            splitSize += locs.get(splitIndex).size();
+            splitIndex++;
+        }
+        return new Pair(splitIndex, splitSize);
     }
 
     /**
diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
index 4809f1b5c..98b878d23 100644
--- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
@@ -1,7 +1,6 @@
 package org.broadinstitute.sting.utils.interval;
 
 import net.sf.picard.reference.ReferenceSequenceFile;
-import net.sf.picard.util.IntervalUtil;
 import net.sf.samtools.SAMFileHeader;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
@@ -101,25 +100,18 @@ public class IntervalUtilsUnitTest extends BaseTest {
     @DataProvider(name = "intervalslicingdata")
     public Object[][] createTrees() {
         new IntervalSlicingTest(1, 0);
-        new IntervalSlicingTest(2, 0.1);
-        new IntervalSlicingTest(3, 0.1);
-        new IntervalSlicingTest(7, 0.1);
-        new IntervalSlicingTest(10, 0.1);
-        new IntervalSlicingTest(31, 0.1);
-        new IntervalSlicingTest(67, 0.1);
-        new IntervalSlicingTest(100, 0.1);
-        new IntervalSlicingTest(127, 0.1);
-        // starts to become a bit less efficiency with larger cuts
-        new IntervalSlicingTest(500, 0.5);
+        new IntervalSlicingTest(2, 1);
+        new IntervalSlicingTest(5, 1);
+        new IntervalSlicingTest(10, 1);
+        new IntervalSlicingTest(67, 1);
+        new IntervalSlicingTest(100, 1);
+        new IntervalSlicingTest(500, 1);
         new IntervalSlicingTest(1000, 1);
-        new IntervalSlicingTest(10000, 10);
         return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
     }
 
-    @Test(dataProvider = "intervalslicingdata")
+    @Test(enabled = true, dataProvider = "intervalslicingdata")
     public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
-        Set locsSet = new HashSet(hg19exomeIntervals);
-        Set notFoundSet = new HashSet(hg19exomeIntervals);
         List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
 
         long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
@@ -134,15 +126,9 @@ public class IntervalUtilsUnitTest extends BaseTest {
             counter++;
             sumOfSplitSizes += splitSize;
             Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
-
-            for ( final GenomeLoc loc : split ) {
-                Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs");
-                notFoundSet.remove(loc);
-            }
         }
 
-        Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals");
-        Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in the split set");
+        Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
     }
 
     @Test(expectedExceptions=UserException.class)
@@ -246,7 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
         List files = testFiles("basic.", 3, ".intervals");
 
         List locs = getLocs("chr1", "chr2", "chr3");
-        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
 
         List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
         List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@@ -271,20 +258,21 @@ public class IntervalUtilsUnitTest extends BaseTest {
         List files = testFiles("less.", 3, ".intervals");
 
         List locs = getLocs("chr1", "chr2", "chr3", "chr4");
-        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
 
         List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
         List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
         List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
 
-        Assert.assertEquals(locs1.size(), 2);
+        Assert.assertEquals(locs1.size(), 1);
         Assert.assertEquals(locs2.size(), 1);
-        Assert.assertEquals(locs3.size(), 1);
+        Assert.assertEquals(locs3.size(), 2);
 
         Assert.assertEquals(locs1.get(0), chr1);
-        Assert.assertEquals(locs1.get(1), chr2);
-        Assert.assertEquals(locs2.get(0), chr3);
-        Assert.assertEquals(locs3.get(0), chr4);
+        Assert.assertEquals(locs2.get(0), chr2);
+        Assert.assertEquals(locs3.get(0), chr3);
+        Assert.assertEquals(locs3.get(1), chr4);
     }
 
     @Test(expectedExceptions=UserException.BadArgumentValue.class)
@@ -298,7 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
     public void testScatterFixedIntervalsMoreFiles() {
         List files = testFiles("more.", 3, ".intervals");
         List locs = getLocs("chr1", "chr2");
-        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
     }
     @Test
     public void testScatterFixedIntervalsStart() {
@@ -311,7 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
         List files = testFiles("split.", 3, ".intervals");
 
         List locs = getLocs(intervals);
-        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
 
         List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
         List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@@ -338,7 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
         List files = testFiles("split.", 3, ".intervals");
 
         List locs = getLocs(intervals);
-        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
 
         List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
         List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@@ -365,7 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
         List files = testFiles("split.", 3, ".intervals");
 
         List locs = getLocs(intervals);
-        IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
 
         List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
         List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@@ -399,7 +391,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
 
         //String splitCounts = "";
         for (int i = 0; i < splits.size(); i++) {
-            long splitCount = splits.get(i).size();
+            int splitCount = splits.get(i).size();
             Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
         }
         //System.out.println(splitCounts.substring(2));
@@ -420,7 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
     @Test
     public void testScatterFixedIntervalsMax() {
         List files = testFiles("sg.", 85, ".intervals");
-        IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files);
+        List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg19Header, splits, files);
 
         for (int i = 0; i < files.size(); i++) {
             String file = files.get(i).toString();

From ca1b30e4a4822672803421278eb8301b14cff417 Mon Sep 17 00:00:00 2001
From: Christopher Hartl 
Date: Mon, 19 Sep 2011 10:29:06 -0400
Subject: [PATCH 098/113] Fix the -T argument in the DepthOfCoverage docs Add
 documentation for the RefSeqCodec, pointing users to the wiki page describing
 how to create the file

---
 .../coverage/DepthOfCoverageWalker.java       |  9 ++++---
 .../utils/codecs/refseq/RefSeqCodec.java      | 24 +++++++++++++++----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
index 86f97a36c..664c319ab 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
@@ -63,9 +63,12 @@ import java.util.*;
  * 

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics - * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- * + *

+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) + *

*

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -93,7 +96,7 @@ import java.util.*; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T VariantEval \
+ *   -T DepthOfCoverage \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index d94d9ff84..f142fa5aa 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;
 
 /**
- * TODO FOR CHRIS HARTL
+ * Allows for reading in RefSeq information
  *
  * 

- * Codec Description + * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, + * strandedness of transcription. *

* *

- * See also: link to file specification + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here + * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq *

+ *

Usage

+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example + *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ * 
+ * + * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

+ * If you want to define your own file for use, the format is (tab delimited): + * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) + * and exon frames, for example: + *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ * 
+ * for more information see here *

- * A BAM file containing exactly one sample. + * *

* * @author Mark DePristo From 3e93f246f7b8849a3126fab5e0757cfdee22e661 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 19 Sep 2011 11:40:45 -0400 Subject: [PATCH 099/113] Support for sample sets in AssignSomaticStatus -- Also cleaned up SampleUtils.getSamplesFromCommandLine() to return a set, not a list, and trim the sample names. --- .../sting/utils/SampleUtils.java | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index f9997bfd8..1b4703e4a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -190,11 +190,21 @@ public class SampleUtils { } - public static List getSamplesFromCommandLineInput(Collection sampleArgs) { + /** + * Returns a new set of samples, containing a final list of samples expanded from sampleArgs + * + * Each element E of sampleArgs can either be a literal sample name or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique sample names. + * + * @param sampleArgs + * @return + */ + public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our // sample list set, and treat the entries as if they had been specified on the command line. - List samplesFromFiles = new ArrayList(); + Set samplesFromFiles = new HashSet(); for (String SAMPLE_EXPRESSION : sampleArgs) { File sampleFile = new File(SAMPLE_EXPRESSION); @@ -203,7 +213,7 @@ public class SampleUtils { List lines = reader.readLines(); for (String line : lines) { - samplesFromFiles.add(line); + samplesFromFiles.add(line.trim()); } } catch (FileNotFoundException e) { samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample @@ -212,7 +222,8 @@ public class SampleUtils { return samplesFromFiles; } - return new ArrayList(); + + return new HashSet(); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { From 034b8685889a879eda0e7c1a001358a26845755a Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 19 Sep 2011 12:16:07 -0400 Subject: [PATCH 100/113] Revert "Fix the -T argument in the DepthOfCoverage docs" This reverts commit 0994efda998cf3a41b1a43696dbc852a441d5316. --- .../coverage/DepthOfCoverageWalker.java | 9 +++---- .../utils/codecs/refseq/RefSeqCodec.java | 24 ++++--------------- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 664c319ab..86f97a36c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,12 +63,9 @@ import java.util.*; *

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics + * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- *

- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *

- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) - *

+ * *

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -96,7 +93,7 @@ import java.util.*; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T DepthOfCoverage \
+ *   -T VariantEval \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index f142fa5aa..d94d9ff84 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,35 +12,19 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;
 
 /**
- * Allows for reading in RefSeq information
+ * TODO FOR CHRIS HARTL
  *
  * 

- * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, - * strandedness of transcription. + * Codec Description *

* *

- * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here - * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq + * See also: link to file specification *

- *

Usage

- * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example - *
- * -refSeqBinding:REFSEQ /path/to/refSeq.txt
- * 
- * - * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

- * If you want to define your own file for use, the format is (tab delimited): - * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) - * and exon frames, for example: - *
- * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
- * 
- * for more information see here *

- * + * A BAM file containing exactly one sample. *

* * @author Mark DePristo From 85626e7a5dbae8a263b8e2ff2e64bd25656d6e9c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Sep 2011 12:24:05 -0400 Subject: [PATCH 101/113] We no longer want people to use the August 2010 Dindel calls for indel realignment but instead Guillermo's new whole genome bi-allelic indel calls; updating the bundle accordingly. Also, there was some confusion by the 1000G data processing folks as to exactly what these indel files are, so I've renamed them so that it's clear. Wiki updated too. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 59c00b8cd..e8b8258c1 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -131,11 +131,11 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", "hapmap_3.3", b37, true, true)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf", - "1000G_indels_for_realignment", b37, true, false)) + addResource(new Resource("/humgen/1kg/processing/official_release/phase1/ALL.wgs.VQSR_consensus_biallelic.20101123.indels.sites.vcf", + "1000G_biallelic.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf", - "indels_mills_devine", b37, true, true)) + "Mills_Devine_2hit.indels", b37, true, true)) // // example call set for wiki tutorial From 8143def292a49844ab3ff302fbb00f5c866299f7 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 19 Sep 2011 10:29:06 -0400 Subject: [PATCH 102/113] Fix the -T argument in the DepthOfCoverage docs Add documentation for the RefSeqCodec, pointing users to the wiki page describing how to create the file --- .../coverage/DepthOfCoverageWalker.java | 9 ++++--- .../utils/codecs/refseq/RefSeqCodec.java | 24 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 86f97a36c..664c319ab 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,9 +63,12 @@ import java.util.*; *

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics - * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- * + *

+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) + *

*

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -93,7 +96,7 @@ import java.util.*; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T VariantEval \
+ *   -T DepthOfCoverage \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index d94d9ff84..f142fa5aa 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;
 
 /**
- * TODO FOR CHRIS HARTL
+ * Allows for reading in RefSeq information
  *
  * 

- * Codec Description + * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, + * strandedness of transcription. *

* *

- * See also: link to file specification + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here + * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq *

+ *

Usage

+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example + *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ * 
+ * + * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

+ * If you want to define your own file for use, the format is (tab delimited): + * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) + * and exon frames, for example: + *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ * 
+ * for more information see here *

- * A BAM file containing exactly one sample. + * *

* * @author Mark DePristo From 5e832254a4e024378f7fdee252abf7df9e289c6a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Sep 2011 13:28:41 -0400 Subject: [PATCH 103/113] Fixing ReadAndInterval overlap comments. --- .../sting/utils/sam/ReadUtils.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 62bbb0307..18fcdabf2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -118,31 +118,40 @@ public class ReadUtils { /** * This enum represents all the different ways in which a read can overlap an interval. * - * NO_OVERLAP: + * NO_OVERLAP_CONTIG: + * read and interval are in different contigs. + * + * NO_OVERLAP_LEFT: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * NO_OVERLAP_RIGHT: * the read does not overlap the interval. * * |----------------| (interval) * <----------------> (read) * - * LEFT_OVERLAP: + * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it * * |----------------| (interval) * <----------------> (read) * - * RIGHT_OVERLAP: + * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it * * |----------------| (interval) * <----------------> (read) * - * FULL_OVERLAP: + * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval * * |-----------| (interval) * <-------------------> (read) * - * CONTAINED: + * OVERLAP_CONTAINED: * the read starts and ends inside the interval * * |----------------| (interval) From ba150570f3d7747256f634a2828ab673a98953f7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Sep 2011 13:30:32 -0400 Subject: [PATCH 104/113] Updating to use new rod system syntax plus name change for CountRODs --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index e8b8258c1..036a77b58 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -300,9 +300,9 @@ class GATKResourcesBundle extends QScript { bamFile = bamIn } - class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRod with UNIVERSAL_GATK_ARGS { + class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRODs with UNIVERSAL_GATK_ARGS { //@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx") - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + this.rod :+= vcf this.reference_sequence = ref } @@ -313,7 +313,7 @@ class GATKResourcesBundle extends QScript { } class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", dbsnp) + this.variant = dbsnp this.select ++= List("\"dbSNPBuildID <= 129\"") this.reference_sequence = ref this.out = dbsnp129 From 080c9575470c505e10f7b09d59fa22fcb668867d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Sep 2011 13:53:08 -0400 Subject: [PATCH 105/113] Fixing contracts for SoftUnclippedEnd utils Now accepts reads that are entirely contained inside an insertion. --- .../broadinstitute/sting/utils/sam/ReadUtils.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 18fcdabf2..2de17db14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -667,7 +667,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(SAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -679,7 +679,7 @@ public class ReadUtils { return start; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { int stop = read.getUnclippedStart(); int shift = 0; @@ -695,6 +695,14 @@ public class ReadUtils { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + private static boolean readIsEntirelyInsertion(SAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + /** * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before * the alignment start of the read. From 56106d54ed620965aea0b39052de43c81671c817 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Sep 2011 14:00:00 -0400 Subject: [PATCH 106/113] Changing ReadUtils behavior to comply with GenomeLocParser Now the functions getRefCoordSoftUnclippedStart and getRefCoordSoftUnclippedEnd will return getUnclippedStart if the read is all contained within an insertion. Updated the contracts accordingly. This should give the same behavior as the GenomeLocParser now. --- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 2de17db14..5d3ef3086 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -667,7 +667,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(SAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -679,9 +679,13 @@ public class ReadUtils { return start; } - @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { int stop = read.getUnclippedStart(); + + if (readIsEntirelyInsertion(read)) + return stop; + int shift = 0; CigarOperator lastOperator = null; for (CigarElement cigarElement : read.getCigar().getCigarElements()) { From 61b89e236ab13b073a3572e983b6c730efd5331e Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Tue, 20 Sep 2011 00:14:35 -0400 Subject: [PATCH 107/113] To work around potential problem with invalid javax.mail 1.4.1 in ivy cache, added explicit javax.mail 1.4.4 along with build.xml code to remove 1.4.1. --- build.xml | 8 ++++++++ ivy.xml | 6 ++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/build.xml b/build.xml index e5ad9daf0..1f26e7b7a 100644 --- a/build.xml +++ b/build.xml @@ -163,6 +163,14 @@ + + + + diff --git a/ivy.xml b/ivy.xml index 115f4062a..f90b9a010 100644 --- a/ivy.xml +++ b/ivy.xml @@ -15,10 +15,8 @@ - - - - + + From 5d0705acd654cfed6e9bf2ba690a0c75ee5a50d8 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 20 Sep 2011 09:07:28 -0400 Subject: [PATCH 108/113] Adding quality scores to the VCF records created by the Haplotype Caller --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a5cb00a5d..87dd37bf6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -40,7 +40,6 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.*; From b7511c5ff3b36e16037bfbbbd17b9fd4c9ea47af Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 20 Sep 2011 10:53:18 -0400 Subject: [PATCH 109/113] Fixed long-standing bug in tribble index creation -- Previously, on the fly indices didn't have dictionary set on the fly, so the GATK would read, add dictionary, and rewrite the index. This is now fixed, so that the on the fly index contains the reference dictionary when first written, avoiding the unnecessary read and write -- Added a GenomeAnalysisEngine and Walker function called getMasterSequenceDictionary() that fetches the reference sequence dictionary. This can be used conveniently everywhere, and is what's written into the Tribble index -- Refactored tribble index utilities from RMDTrackBuilder into IndexDictionaryUtils -- VCFWriter now requires the master sequence dictionary -- Updated walkers that create VCFWriters to provide the master sequence dictionary --- .../sting/gatk/GenomeAnalysisEngine.java | 8 ++ .../gatk/io/storage/VCFWriterStorage.java | 4 +- .../sting/gatk/io/stubs/VCFWriterStub.java | 10 ++ .../gatk/refdata/indexer/RMDIndexer.java | 2 +- .../refdata/tracks/IndexDictionaryUtils.java | 106 ++++++++++++++++ .../gatk/refdata/tracks/RMDTrackBuilder.java | 113 ++++-------------- .../sting/gatk/walkers/Walker.java | 10 ++ .../variantutils/LiftoverVariants.java | 2 +- .../variantutils/RandomlySplitVariants.java | 2 +- .../utils/codecs/vcf/IndexingVCFWriter.java | 38 ++++-- .../utils/codecs/vcf/StandardVCFWriter.java | 26 ++-- .../sting/utils/gcf/GCFWriter.java | 5 +- .../org/broadinstitute/sting/WalkerTest.java | 2 +- .../tracks/RMDTrackBuilderUnitTest.java | 4 +- .../codecs/vcf/IndexFactoryUnitTest.java | 22 +++- .../utils/genotype/vcf/VCFWriterUnitTest.java | 5 +- 16 files changed, 228 insertions(+), 131 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 5b9ebd99b..972943e26 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -929,6 +929,14 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(reader); } + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + /** * Returns data source object encapsulating all essential info and handlers used to traverse * reads; header merger, individual file readers etc can be accessed through the returned data source object. diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index ebb4cbe66..4ca7b935f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { else if ( stub.getOutputStream() != null ) { this.file = null; this.stream = stub.getOutputStream(); - writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes()); + writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes()); } else throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); @@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { } // The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it. - return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); + return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 936243f9d..82cb43634 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.io.stubs; +import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub, VCFWriter { return isCompressed; } + /** + * Gets the master sequence dictionary from the engine associated with this stub + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return engine.getMasterSequenceDictionary(); + } + /** * Should we tell the VCF writer not to write genotypes? * @return true if the writer should not write genotypes. diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java index 029800aea..9e5a95d10 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java @@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram { Index index = IndexFactory.createIndex(inputFileSource, codec, approach); // add writing of the sequence dictionary, if supplied - builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false); + builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary()); // create the output stream, and write the index LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java new file mode 100644 index 000000000..d133439dc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broad.tribble.index.Index; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; + +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Utilities for working with Sequence Dictionaries embedded in tribble indices + * + * @author Your Name + * @since Date created + */ +public class IndexDictionaryUtils { + private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + index.addProperty(contig,length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 46eb79aa7..3b4558579 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureSource; @@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.util.LinkedHashSet; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - /** - * - * @author aaron + * + * @author aaron * ` * Class RMDTrackBuilder * @@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager { private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false; - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - // private sequence dictionary we use to set our tracks with private SAMSequenceDictionary dict = null; @@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager { try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } catch (ReviewedStingException e) { } - sequenceDictionary = getSequenceDictionaryFromProperties(index); + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (sequenceDictionary.size() == 0 && dict != null) { File indexFile = Tribble.indexFile(inputFile); - setIndexSequenceDictionary(inputFile,index,dict,indexFile,true); - sequenceDictionary = getSequenceDictionaryFromProperties(index); + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); } if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE ) @@ -363,96 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - setIndexSequenceDictionary(inputFile, idx, dict, null, false); + validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); return idx; } - - // --------------------------------------------------------------------------------------------------------- - // static functions to work with the sequence dictionaries of indexes - // --------------------------------------------------------------------------------------------------------- - - /** - * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { - LinkedHashSet seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - /** * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. * (that each contig in the index is in the sequence dictionary). * @param inputFile for proper error message formatting. * @param dict the sequence dictionary * @param index the index file - * @param indexFile the index file - * @param rewriteIndex should we rewrite the index when we're done? - * */ - public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { - if (dict == null) return; - - SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict); + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); - for (SAMSequenceRecord seq : currentDict.getSequences()) { - if (dict.getSequence(seq.getSequenceName()) == null) - continue; - index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength())); - } - // re-write the index - if (rewriteIndex) try { - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); - } + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); } - public void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - index.addProperty(contig,length); - } - } - - public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); - } + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index c88c7c3c4..10261112c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -77,6 +78,15 @@ public abstract class Walker { return toolkit; } + /** + * Gets the master sequence dictionary for this walker + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + protected SAMSequenceDictionary getMasterSequenceDictionary() { + return getToolkit().getMasterSequenceDictionary(); + } + /** * (conceptual static) method that states whether you want to see reads piling up at a locus * that contain a deletion at the locus. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 1c76a21ea..a932d44ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(metaData, samples); - writer = new StandardVCFWriter(file, false); + writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false); writer.writeHeader(vcfHeader); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java index 1fefd20fc..fa5093839 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java @@ -75,7 +75,7 @@ public class RandomlySplitVariants extends RodWalker { hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); vcfWriter1.writeHeader(new VCFHeader(hInfo, samples)); - vcfWriter2 = new StandardVCFWriter(file2, true); + vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true); vcfWriter2.writeHeader(new VCFHeader(hInfo, samples)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java index 4ae87ddcb..71ec4ce1b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java @@ -24,6 +24,9 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; import org.broad.tribble.index.DynamicIndexCreator; @@ -31,7 +34,9 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; @@ -41,21 +46,24 @@ import java.io.*; */ public abstract class IndexingVCFWriter implements VCFWriter { final private String name; + private final SAMSequenceDictionary refDict; - private File indexFile = null; private OutputStream outputStream; private PositionalStream positionalStream = null; private DynamicIndexCreator indexer = null; private LittleEndianOutputStream idxStream = null; - protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) { + @Requires({"name != null", + "! ( location == null && output == null )", + "! ( enableOnTheFlyIndexing && location == null )"}) + protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { outputStream = output; this.name = name; + this.refDict = refDict; if ( enableOnTheFlyIndexing ) { - indexFile = Tribble.indexFile(location); try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location))); //System.out.println("Creating index on the fly for " + location); indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); indexer.initialize(location, indexer.defaultBinSize()); @@ -66,15 +74,16 @@ public abstract class IndexingVCFWriter implements VCFWriter { idxStream = null; indexer = null; positionalStream = null; - indexFile = null; } } } + @Ensures("result != null") public OutputStream getOutputStream() { return outputStream; } + @Ensures("result != null") public String getStreamName() { return name; } @@ -89,6 +98,7 @@ public abstract class IndexingVCFWriter implements VCFWriter { if ( indexer != null ) { try { Index index = indexer.finalizeIndex(positionalStream.getPosition()); + IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict); index.write(idxStream); idxStream.close(); } catch (IOException e) { @@ -108,15 +118,27 @@ public abstract class IndexingVCFWriter implements VCFWriter { indexer.addFeature(vc, positionalStream.getPosition()); } - protected static final String writerName(File location, OutputStream stream) { + /** + * Returns a reasonable "name" for this writer, to display to the user if something goes wrong + * + * @param location + * @param stream + * @return + */ + protected static final String writerName(final File location, final OutputStream stream) { return location == null ? stream.toString() : location.getAbsolutePath(); } - protected static OutputStream openOutputStream(File location) { + /** + * Returns a output stream writing to location, or throws a UserException if this fails + * @param location + * @return + */ + protected static OutputStream openOutputStream(final File location) { try { return new FileOutputStream(location); } catch (FileNotFoundException e) { - throw new ReviewedStingException("Unable to create VCF file at location: " + location, e); + throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 7cba5fc3e..0da7a100f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; import org.broad.tribble.index.DynamicIndexCreator; @@ -62,21 +63,12 @@ public class StandardVCFWriter extends IndexingVCFWriter { * * @param location the file location to write to */ - public StandardVCFWriter(File location) { - this(location, openOutputStream(location), true, false); + public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) { + this(location, openOutputStream(location), refDict, true, false); } - public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) { - this(location, openOutputStream(location), enableOnTheFlyIndexing, false); - } - - /** - * create a VCF writer, given a stream to write to - * - * @param output the file location to write to - */ - public StandardVCFWriter(OutputStream output) { - this(output, false); + public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) { + this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false); } /** @@ -85,12 +77,12 @@ public class StandardVCFWriter extends IndexingVCFWriter { * @param output the file location to write to * @param doNotWriteGenotypes do not write genotypes */ - public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) { - this(null, output, false, doNotWriteGenotypes); + public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) { + this(null, output, refDict, false, doNotWriteGenotypes); } - public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - super(writerName(location, output), location, output, enableOnTheFlyIndexing); + public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size this.doNotWriteGenotypes = doNotWriteGenotypes; } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java index 7ff6e27a2..18fae18c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.gcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -52,8 +53,8 @@ public class GCFWriter extends IndexingVCFWriter { // // -------------------------------------------------------------------------------- - public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - super(writerName(location, null), location, null, enableOnTheFlyIndexing); + public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing); this.location = location; this.skipGenotypes = doNotWriteGenotypes; diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 386c17659..a1817e3c7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest { Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) { + if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", indexFromOutputFile.getProperties(), dynamicIndex.getProperties())); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index ae218e898..724c343e4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -45,7 +44,6 @@ import org.testng.annotations.Test; import java.io.*; import java.nio.channels.FileChannel; -import java.util.Map; /** @@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { try { Index idx = builder.loadIndex(vcfFile, new VCFCodec()); // catch any exception; this call should pass correctly - SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx); + SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 1809ab778..55bd4783b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -1,27 +1,45 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.*; import org.broad.tribble.iterators.CloseableTribbleIterator; import org.broad.tribble.source.BasicFeatureSource; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.testng.Assert; +import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; /** * tests out the various functions in the index factory class */ -public class IndexFactoryUnitTest { +public class IndexFactoryUnitTest extends BaseTest { File inputFile = new File("public/testdata/HiSeq.10000.vcf"); File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf"); File outputFileIndex = Tribble.indexFile(outputFile); + private SAMSequenceDictionary dict; + + @BeforeTest + public void setup() { + try { + dict = new CachingIndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary(); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(b37KGReference,ex); + } + } + // // test out scoring the indexes // @@ -37,7 +55,7 @@ public class IndexFactoryUnitTest { BasicFeatureSource source = new BasicFeatureSource(inputFile.getAbsolutePath(), indexFromInputFile, new VCFCodec()); int counter = 0; - VCFWriter writer = new StandardVCFWriter(outputFile); + VCFWriter writer = new StandardVCFWriter(outputFile, dict); writer.writeHeader((VCFHeader)source.getHeader()); CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index e3a926fb9..a8e6593b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -38,12 +38,13 @@ public class VCFWriterUnitTest extends BaseTest { private Set additionalColumns = new HashSet(); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; @BeforeClass public void beforeTests() { File referenceFile = new File(hg18Reference); try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); } catch(FileNotFoundException ex) { @@ -55,7 +56,7 @@ public class VCFWriterUnitTest extends BaseTest { @Test public void testBasicWriteAndRead() { VCFHeader header = createFakeHeader(metaData,additionalColumns); - VCFWriter writer = new StandardVCFWriter(fakeVCFFile); + VCFWriter writer = new StandardVCFWriter(fakeVCFFile, seq.getSequenceDictionary()); writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); From a1b4cafe7a63ecbeea9b06a94445f6bbf925d5e1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 20 Sep 2011 13:59:59 -0400 Subject: [PATCH 110/113] Bug fix for NPE when timer wasn't initialized --- .../broadinstitute/sting/gatk/traversals/TraversalEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 27fd173cb..c6321e2ad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -358,7 +358,7 @@ public abstract class TraversalEngine,Provide public void printOnTraversalDone() { printProgress(null, null, true); - final double elapsed = timer.getElapsedTime(); + final double elapsed = timer == null ? 0 : timer.getElapsedTime(); ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); From 827c942c8027d7888b29ddff8399834b7bcf94e4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 20 Sep 2011 14:01:14 -0400 Subject: [PATCH 111/113] Rev tribble --- .../{tribble-24.jar => tribble-25.jar} | Bin 299210 -> 305986 bytes .../{tribble-24.xml => tribble-25.xml} | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename settings/repository/org.broad/{tribble-24.jar => tribble-25.jar} (89%) rename settings/repository/org.broad/{tribble-24.xml => tribble-25.xml} (51%) diff --git a/settings/repository/org.broad/tribble-24.jar b/settings/repository/org.broad/tribble-25.jar similarity index 89% rename from settings/repository/org.broad/tribble-24.jar rename to settings/repository/org.broad/tribble-25.jar index b1c39e60a14ce895e515cadb07c356d6d428bbc4..6d467ba3fdd6e6285e8244a87aefa4a0d232cfe1 100644 GIT binary patch delta 11585 zcmai42Vhjy(w@2ZW_PpM^g;qD8$wM8J(N%qdJ_y%LrVe#5&|UjCRvas1PHK_qZBEE zfYKvb5Gjh~2_ond3rc(XY)=tTo=P$Q%(*waS>*r!g>&!RGjqO~GiT16bMML7Z6R-c z8seYPN7bwV7J!&RE!X-tP^0PAW`mltGj{EzIC=ZMF5jGSn9TR`CvmDGE8bQg%8Iu& zyR0bx&Qy%DCePAAmBenpP`&1)?RUNAXwWuYctmIyC7AHi_V>?tZifoMCDoZTGTeDO zqIUUr;jifpUW~f0H`o-tPOoWZU!fBx6FTZdWj$3Vd=m4_(zc>zZKMIOHXCY{gsJ_@ zI?REN`BJ33R`<^|DSNkfGbz%rPi~+j)($(R1h8ql=ae=+OWrwa61j7{QH)%)IQeg3`&f2W z3@iVwBoLh!lES#!X_{)!7%MU-_U%_CfdE=TYNJHJ$MbXZP zpQFofZM0aq59J3+rs+i8vVTgqi0JN0f+(D8yO>}yv5sl^l`X$ZjW@I=-8F;uNP|8YWO?~TYBH_ z8%x0vt2%?*M@k9j{!3BiewP~Squ&1Wv62u!#cK)9(u2Ov3t#GE^wpOi+oUarzQ5_q z1Px8Xx+5m%xjR-1+iq{avaI~dopy>(5okEY8FROlKi9vfMIN{4)^Z^j0U)6hz+M84EvW)fc-F(a8Tjn$!D zdkwT=6Ku*N&A6^PKepgYOBPPyODjKYjcxp}EvE9Nof$j$K@03?#x%aCTd@;%=1UhV zIyzxj^8W&0#BSWWyBT{}u%{I|U_&$Z;;PC@* za2!L8x59ADGh;qGZh{31*u@jAI0+{+R-pxpd?6pFSa2%erv(TdIJa31G`?Ni>+%pv+pHfK>6Z(yOrPt2-a@1$K)1G&o7)z>KB%Gc(_i% z4ClqlS!U`;z8Rkpw;L*MK)LgB?z_&IWu#byEYI+pth1 zQF>6{AtHDK)^a-T`mbMueuTeBbh?W18ompM#7+y=Grh-S_z&?B(SBsnSv+UM5AeK* zucx#YCoD>Y*!q!b5_7JrL9Se1{Km&I69Q=Vl1&uWzC6HmuL<*^7!VvpD`2AM1+dWl ztkCl_Ue6kb7RhQHQSwe1va7FqDCyr{}5ofD)LFRQ-a{?uj6;cB%hg zyA4LWtwCt-=+W1%TDw4Reg0D4M%YAJodEb>Fau>r?C$z22)kl{wwLTufw4wIq?i|i z?Hps^Ll_G`KrZ|T^5DOecxu1|jG}Z?A0}gGD8#NXRjRb>ScXbMh0X9HX{v{I*h1Y_ zWVeIB*Btc#nww#3Ty*2a4-i8w#4HQlj(xk zBM|2yBM*_VA}y{*(E%`b=b6g^ESc2a53JfqRhFqI8>$uOWB zwO0topLEzWlUxw*g4zy`7OUhgqVZfTks@^Is!oGH?2wVkm^%&3?j*yQR_atjV$u-^ z^xy@s(N;TSR2({5X}c0NWsgjZoN z3zOm|h5V}mlo0xU`1slOC8SD6O27^7>$! z+=FR?*NnhC0u_dl9mpzHdUq@23ch^w#ylKHL~cORoZ!Vsm2)0jw-AgakR zXo}D;`4}|g+?-5AbA;2-B2`O@(vCq(en?>ieriQe4*9sN@*?+6lXD=I7A=6wWawA) zf_jCL(N&0qYqU;Wha_64Tfo1d9WBaT;9KYi-$5aKPfOYlG(rC(z0i_6SV$A~IGiBF z;Us?&P7xY)+C2=VWXGE{Osa#{skbPzDR71ac%{*szD-Se%Gp49NLx%ZymJql!@E?Y zk<5D}Pn!k9r?FJWJ}pipr?uA8E#!uz)03lUWZDneRr$3iK01@n z0DdFy-K8}CUxp+5XU^Z!q14fmiA+=mSK3x-0a14hz39g9edlR^uU3Ug@#Hy=%I zSLQ=7oTZpm$uoX%jwp=+)(#&?uTihvPA!J>6qIQAO(h)~ZLsc^rDt)}wbw zp9*+_cu!P7Ul;T{)rdPt}f{`s!DHq4ce8XO@+9WacFPU%HUO!MC zK#a^N(cth*4dfRM-Fh5HwsT5Z$JP(v`0pS$ZqY6ncmyK#R8XJN_Mi$FOx~784^e=- zNlAAoLb9F}KBu;syWokw$6>g8C`QmQ=xs+jXgC_VVAQGqO`wWT8Jv)!v{Q$qTRTBd z7)AzmqK$2qygy6Vp0xMpyS_}xG=u}lX@lt7#$brRq4a8>2@P-l!|iTS*l-)#8NB2rxY$3+_Q;cQbsSWriQ9{Ah7ho8uZSAXS@b+M7wR5r-i=B**6jSTj{} zZs34S&bk~*M)B86GsU{bua)`?#JjWsrZ0|g{W#>)Y@5(mH}aA7 z_)N9+*jRua6g-p$5K?`jeBOmEQ9j3@0HKvhZDp=nqYa5;Fp|ahn<-0QYn1sKyT_1?a%;C^Snykpc(r3COA8CiZ8h zPq(}uoJXhk^>a0Xa#Ci!d@&Br;AhI1*Bo`ya+ zA7)`OEWrg7MGIjwE`l#{30%cx(x>CdTeD~;-g3Zga$OGX>3Nf2p$K-u9SSZJEs|^C zSE9`H=Ch3!)4Mbw8^U2)PI+4qPE)mNTXEeTaMx*}{eyxdW`FU>nRnKNkEz19W)44i#}o5~@Y?MAsD)!z>4QkXOX1?k9yuD z!Cdh5d@HBV&#q<(_>qtNC_d@@6J@D1M5&5+bG;fVze^pvj{&Z(4Y9pEs&Fkz!mtGX zYVF$44_o=m57Sr2eV4wSJiFa>WjOBszdk;SBct%4RJEYhmp%iiYXgwa2NJ}sR5Xi{ zR!X4j&nz4$4`^H&IXK8i>e@XH=lMyh-c}*$eXUDmge}tEOB$e|SCWc6A^$&PX2Oe=3zmUqrsBKC@*&Hp!hq8|T`%O}XPI zsWtnQSM|PjJg&rBIy9<Ry7Amx%pGoVe&Y~ zb$O2ZsXof(OVkaznHg);F*-`cdi6s+;FbyXrMl99aV&}$}g)O0;N;phwuKmn5JK`ML$ovepU?-uddJn#e=h|udDr!szbN*t z*>h8}Xxe0{&YZiE;$Vpu>YAWwh4S3U^_5xsK)3nTQ0=fj`LZLlhq|TNk=k9oze8iR zN}W5*u9fQXMQ2W}q@{IhYk%ls_WD|xJOXna>!5w4yJb~3?YypRWH0TlKsi#ktlbuc z(7fc*Tr3=`))eW9nkF*7QiEMtnObdK!{T9DTdCpnFs-jirE^*ll%uubQ##QnM@wU# zSe2tSk;JJSO-=^!dydvY*0dh24Uxp!(OPdwTpg`FDT$_Iw5KGoWQ>*}iOJBuV4CL?l~)4KZ^uGYne$M%JV_Pt zleAa~yn3QmOS!nyU<5d?m-vZSiZq*@PY;zjWq*~>fh-p~M0EdlEbM;JXb~=wCut#4 zv71dqP4`0({M;$}w7|()w3s(ZlLyOU#U#=?P_K24dO4&~MjuqdR7wyu9k>%0SFE4a zm0pLrG;(9n;rha*pE6debWg|01QT|ZE8@i5$ivDqE!bnV(%0}_c|n|F;4PoGAmG`Q%P3Rl=(YA%XcPX^4L=6m zu9$nDUZT0Ad7=2v)8&I2W4xG4@)D|aIzizs1o9G%78;RO308?_{UFPFnJfk6;!p zwp_Y+Np=5^j(Y1yLM^(6zQQp}3zLmr?{j#-hr#simP#kDMzOaaZFFlE*~6Mn2Hg5! zwjZ6(P-)}UsE;awi?uM3aSiQa@NC1O1+%GZQGq%u{SS;4uUD}yFzarZWYGq!CT+Jw z^c<>d9SsAQ1Hta5*(cr9RH0@H{+#45c z09*!miPjTA#k4M*o=^FE)O@YJoyCWL)?h&l$YrJPjt~=W++i&qC=}E<-K;(Uh;8QJo04@BXn>9`53=5l>S5U6OQvVk{Jt( zk(AM0_yhB6$xXM|T*cbUP4^nC?1fsmXnawz%OH@J)I4+a*avj(z@?>^C0B0hc+{xA z4B!pqiubEn zSBa$@?IO|_Yf*pOO00Nyq~9}?(zx{Sa>UQF#USafpI$mMs(m~Fm)2fVpY&qf7NX`d z!_bF|Nn36K*ESGE9n?T^v%hNko8AG5Tl|(#uIJ*ts`i?~ZDYm$;i|8AWr+cMY6-#C zm@7L`Hd>7lTe9hIAMDGF<>P9^ppkR3Pd%4Tdb@wy^AY4mF3r7c@P9>bV{d}t z;!2hLZ%d=gSJ6uWmwI0E(+;@heMO5^2L6c)4$P`cZ#PsLddYV^q|4vhpxVT<&l;WF z{4AOKvRSov@Oe`^sc9am<3-MLF-D`-^p}wGe4#(vGb=R#Cd=;vIxDhj<#8 z#q}GikNAOFNoW4Wh`qC_3%t=yvx)o5HJgNeKjU1FVtS9_;tgAKO!Zc`t`qF@r3UQ2 zQi8oU)gz$4RSSBnHi^GVjb*}r4M}yHZb+4`(afUv8iSX~y!q95$pOpgPfT)X=W}i@ HISu{?jA+l@ delta 11458 zcmahv34Baf_vgI#GBcUH?29C_Oax()NP<$aQ%VW3v}uZx7ENr4t#)IJonXjqZL!l$ z5i(Sj?u%0T@2X$R*Y;ETwW#i@&HtSHUS=}d-~T_{_s+X#z2}~L?s@O#m5(Cc*&Y#` z-$&K#05*U@4>VmH+)N!vx4SpW+lR^W_G=TBaPDNxH&-=zZV5-nz`aqd$wjYOGC zvs8Z{z29Kj8uy36vN>U$(Uap?ZU8T(b~1q4rm6u1IE$^)y0V6q#ctFYAbFZ=HE-IMJhJZNP`H ziyy18V$Bk*VRgUCt5`kr(6hlT!rlDTT~cPzsjL_YxIbE9*q!plRRajV{FKz|K6b6G z6kB!Q4|z%C%Ouqu-XT_8U#2y$?tHzO;hHfwM@qnb{^vGQp#7KkrL~@m3SJB3k+9vZ zY;0g!NcbdAP%ySv0%n4SvqYazY$J6X`F-J)b(sL`GDXouGCeFBBgG4$80tC~2v!UV zs5O85r1K@%h9OoA4a6`Ew_$`8BkdT4(Ts|*VFNqHVjMrj+aU!T+AzU}iFQoFWIMFs zX9puwY}knJsdh-_&c^)E#D-1nV8=9ma5b}HbH=7w(aECH?bMsWmo)CqSoYaC8;)l$Pq1Stma%||Hk=d)V{oz!r|^BM6(6_LJuL{QVOzfB zT5-A^Cg2Pk&a`wX1o^Y$tO%TK!#Q@Gi!PQn&x-TK#b8VlRo%2ko~Q`CSBZ%pHF0Xu zxXHaAn@~Dw)Zmiwqb5%&8eir<-!@*XjKWk5^&E)8eM=Lw0$K|*}hea?)R3)M_Y^9E?aJ3HQKx7rLMSPlw4_nr7?^@9@33JrtL^?4E zn?fxVVY~}LA5-*rQC3N5 zR?m`gqjcB-yZEw4$93q|aXoC&aRWT3V&~Yb_kX`7u;%;!$)|4F9&rzgwF^~zz}qC1M9^R?Xtm-CIv%G1!`H0rMb>=+ zc8Z`BY=SR|oD`fNbQn)E!zl!Z$d5-ye;sDREQEyK6Go1j?igJ<$sa zj* zUU{eZm3InZd8ZJhy8;w}BySZ!Ha)B+Mv6P5gPzxrg0)0gXFz*MC3qdU>AGJ0-W(gX z+(1$+^a#)nD(R|AI2;5#K~GOo3jigw3sUz+lP}Nd$Cvy6@oh5swmZ?$$)~SMPuEH4jFN}v@D04M{QjCL1mXjD z={O!{U@6Tdd&XgyU|LGXz*eXt1Dj$W*hbz`q_@JsZjHMQEv>LUIiZC!)9EA48Ki9)VD26evdkjhp6-upEH^=OIWs0=n}!Bo}(XaRg$0V8esNuE-HJ zr9cA;3=LWl)EdIzE@%L4Ad$kVF@;hV>C2@+@=a`eGT+5B%SP=ojp{)H6zU5mB`JmJ z9%$r&RG${RM6ROHSS=;R7{qmsfMD1ygPcX~F-7(y8Ww4%VK#QA(=dJVdW5hiDKIq( z#Xt%UHqQ`h_X@R>(8VN-h3++l3gTm--C3yU+5%gcw zXkrp5dK<$g$bik{r!CNpCeNp~yH{;@*hd_!_9-K3NnH<8%2O6b(WTSGYy+#d7c`Y3 z%S9IXB6@EpOv&;tY*6Kbr)XX?obFy$aT3(t9( z4TA!l;|~KqcL(?L|H!@nAGi;Lb|-W7KQccL2UxYZl!Y;AzVUkgk@z4SVq$mE(U_4w z+T;I7TW!$Z;SinIY<-8E{72@)MgY6#FK*ysMMj*yNeSyMNpFOd#XGkaIAX?W4R}mf zb|gIII7mm`Enk{z+E~6>LW$0*JXHqIt*kgc@+BJ8OVUr8~u= zc_)P#HQ>tRKCFQjeAmo-Kn=+L7?YOPOcsGeK~VNF$U*3rc??=2t&>awS|OZ**11}G zoOTTE;)gaYfuGveJ?*VM$Ai;k^ykvf2egr^F1SoDj^9uox&pD3U{m2+NT(#*3ciE( z@I7>g8_*AafKs?gyV8%8PkxqOXiWnwr8W8@yhI|05dBHg%1NTa>+%YC8D61TQ)z#D zmADmnjqto(8Jw>Zr;Asc4$7Y}1g-GKEocdEQjbPNZxI7kAf``~sE>U**GZ1N+sL$( z`D&mYB|px|1m^Nw8h4HF?Km&foycR@^OhYDMfaG9_H!fJZv&e%-Q$7|60;qeF=5Aq zMM~{fNMPoEe96_=>6vjfGaU=|)_&!Sk8b2MfWOFlwa|bPLoy$~gn0E6T;&dB&9J&_L98l%Z%>&)DJ$@B4P6(cuFp(eo$1aQDoE-&20f8A`6+566;Zn5(lE%YDah1hNc|V(9je_) zHt#>>)un)D=*_sNIDL^3y;HnRCV-;=vxDMgv+aeejez5GK2NjOJ~@L zVx=>LVHZfJs)@E!7X_J=>a`)$v_XdyxCocXY@Re5(1fL1;ooEpy&F(3+q8@B99->r zr1_k|XqwDSMbLm_Fc2Xx!^~6-2#}eoSHH~kV=zd8y|u2(zF>>t1&(^0u`4K?Qi2lynObGL5Y83GqI(7y9NMhoRo7 z7)H}z@IFklk>P>iC;tKOV&Pr_%HSj^PCI!}y0t6xqSOk|6^dbmyf39|FIZ0R#&z>5 z(-00Lrwyl%C?g;SAAv-C6q?~kN=2jSo5g7QR52cU-~{@XP)wg7N}w3WP%0V=OL4pl z)>2u_byX*Nsc1$;o#A0Sw1xl3Sno}#qct$nNf~%l zUnU%_Ny7BQ@Yqp^G|`MG_99Ar2o5h}i5uZH1yXS;G{?uG8%~2EE}UMs6xk4SRLyBW z`yQ@QtZV!}YOFx~nMz;!jtG~I!x&m^WAh9n@7RxDt+yY0H{f0}3*-jG1-t=OaRJ9* z96}q5+QvHi4w7Rq9wFBjXEWDKB13Y~Ovnw43#@_CjbK@1r7V3hqwp}4L|e;?jz9^+ zk40M;eyrh>(2?CyW|e-DC4orH#Rjg4(bgqnqb*BHkHTcrZ+mNVWs)$;f;N=l09|0i zdE}z`wE7l8XIw;qvzYd?CG?856v}Zq#moxWf~(*&`q=UXu9Y4fO}?2%>+T1t6at}$ zYIm*(Y!t3;svG7|bh-M_>+MejTIm&MI~5YY(1LGH_2jRzh=``$zOIP4$!3V$J& z8UT)4pwzTBCd>X%k5^$7nmJ$^jTL#o`xzg&M`747b97D456(%g1tP zF0XU4FZI*MJ(5J9iY)Y{$oDGK&K#@78+24j@PCp zwD7k{fkrN`zn^eX!P}@TDtNO)RH#5P(sxTjG1A7ekHS=+sq(D`>%Se@Oe^Kqgqo)D z2QDhqkiQMF3@UZ%7B;^?^DiV4tViMTYACW)L(;)M)P=JAly*ZCijxdbs6L`4N)^%! zy2sOC$54Pwr=qYmZC58Of%K_!6g~!lzGDvhxb3;s0nZ2UCveZ%Za7Px3V8bVz^GLD zf%~luUq63M2e_si-{$E9w?%X=rEim)it#T!oIj6GUf-edc5*GZR57vG_&nr!XDTlG zKR-_SybYx}${p)hd_dIg=;Hu5Q)jan$4Iex9ZL#?x=_Om7< z!nG+Z-~=U9ZfRm|ffg^;WNCKK;I`TsgR60_Rw23QxGYuVf1?K03pAei>+QAQ3{QO9 zMXQXFhAb@5&RZo^FkCxhn3VRY_KIQpcayZ?Ql#iNMQbO?mQT_0#nj^ESz1TggR`{)2@IL7-6w&4v$g&bxHVfFD1rWSw0sHdn4=Al zz%O%5;_h>`F|y}C9gyhKM#-LXmsTW!t1hir0)yviqa|=+o;FDWE$3?`64*Ll8zh0B z=4-tq(0PG2OafI4OfK6(ZLsVazECTaqd2}$>neL{7n(K?SVR``0Cz9a9+H4kt__jE zh;r?INqn^29CF19t)a+VtTmH~_-5@R)9GLjxTTBHi?!zNuzfan$etjPzeF<*V~5k> zsuEl2J{TRv2d!yRUfBlfytZ4cU8co|ilv$*Bfk$SQ7Xxy!*EVP{v#7 zCbt%T;@+!c3U_zwyYwAb-TQeeWE#18{=Su2ZjR3gi_z^vx zE@e(wr8SY1zUqU5+K+NoKS{}zi1^nk(^$Vk#E<-qAxzGvRw8q?Nj_LO{TE?cYD5G%qQ#{rry);8f5koeZ zzK}UKv?oi6;!;gKwZTk3se3-jT0?*T!p%P;-fNJH+VR++veJy(%Ui2DT_=Uy{NuLq z19j3PM0S~C5f4|=1oDu_SJLQ)6o~V5|Cm|llp#rT}#UVJODVw1@lypd4%zv@LL`?zK>s_hvs z$|5ReDnUNHG>$Re`E_hgfU zqj&$f`!4)KJltCP@#cKZq)o-PVAUq_HkIMe-KY+by;b{*Rx$cwNldVrn{7H2B8~TAJMa^T&t38T99( z)DUq!LbZsYdnuB6I7dGFu=X;=B)4q8;k@uyop}~HnUcI71)bg2{leh;uGgB2@a}-bkkO{G}USJUMf(X;*YK7Aj7H%WzRNHRIta35mn|Od?h`9)b#y!&EGV@OK%kT z;%#D*PWyu8xeC1a0CBP}Ijzw)Q|Q_DZGU^2Dg|!-3qNm!!89*Tvy0WU)kv|Nn559E zZ6tK`$a?sTb@JjCTv?ew#Sl0DEw*eD)91)^6L});)g`pSaP!|}&de}PT|@7=tWeUQ zZP&v^e`y8W?(&VCHts#;2K|~}=%q;-Nc)Ab?;r!auQ5#_S|XfBI03WL(@GzhK!q z3jjC&a_`KmhLr3yS{3;hDFA=oZcfY9k!>TMrVPr>zv?>hj=>amRSg%XzNUb!*lh;v z``;yJT%#S5o4;50T<|hA5by0Wm81;W_x)TdLb+x6jqmbRFMVKTQ!P+r>^8^PbvKRi z;7tR4K-XfWdO7^9j7rl4?$%O7{#TR;y~8Q^w0#E`%{4dw%f+`<{H>8pA1Cr#Y4j1Ng%%*T)-gSKtzyIxDu1~7*H?pL>M=c>O>$?I?Gb>dmFvBr?q$~;D)`&!U%F7H~)Ex zE;LAoKdq%oQm?TDBzYe=-A#MQ@Js1b&FWYD@fLgjY0_KmqevD_pV2}>%IP4hnqm-( I_~7V&0me&b8~^|S diff --git a/settings/repository/org.broad/tribble-24.xml b/settings/repository/org.broad/tribble-25.xml similarity index 51% rename from settings/repository/org.broad/tribble-24.xml rename to settings/repository/org.broad/tribble-25.xml index 9b2b967f8..ed7a1fd69 100644 --- a/settings/repository/org.broad/tribble-24.xml +++ b/settings/repository/org.broad/tribble-25.xml @@ -1,3 +1,3 @@ - + From bffd3cca6fe26fe31a6fa5a33745dcaee0139c56 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 20 Sep 2011 15:07:06 -0400 Subject: [PATCH 113/113] Bug fix for reduced read; only adds regular bases for calculation -- No longer passes on deletions for genotyping --- .../walkers/genotyper/DiploidSNPGenotypeLikelihoods.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 5f6865d04..ec180f0cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -276,8 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( elt.isReducedRead() ) { // reduced read representation byte qual = elt.getReducedQual(); - add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods - return elt.getReducedCount(); // we added nObs bases here + if ( BaseUtils.isRegularBase( elt.getBase() )) { + add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods + return elt.getReducedCount(); // we added nObs bases here + } else // odd bases or deletions => don't use them + return 0; } else { byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;