From c3ea96d85616a3ce0b4b90d40e2d141b1efd16cf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 08:42:01 -0400 Subject: [PATCH 01/32] Removing many unused functions of unquestionable purpose --- .../sting/utils/QualityUtils.java | 101 ++---------------- 1 file changed, 10 insertions(+), 91 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index fad2320fc..093da7dd6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double MIN_REASONABLE_ERROR = 0.0001; public final static byte MAX_REASONABLE_Q_SCORE = 40; public final static byte MIN_USABLE_Q_SCORE = 6; - public final static int MAPPING_QUALITY_UNAVAILABLE = 255; + private static double qualToErrorProbCache[] = new double[256]; + static { + for (byte i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i); + } + /** * Private constructor. No instantiating this class! */ @@ -33,10 +36,6 @@ public class QualityUtils { return 1.0 - qualToErrorProb(qual); } - static public double qualToProb(int qual) { - return qualToProb( (double)qual ); - } - static public double qualToProb(double qual) { return 1.0 - Math.pow(10.0, qual/(-10.0)); } @@ -48,10 +47,14 @@ public class QualityUtils { * @param qual a quality score (0-40) * @return a probability (0.0-1.0) */ - static public double qualToErrorProb(byte qual) { + static public double qualToErrorProbRaw(byte qual) { return Math.pow(10.0, ((double) qual)/-10.0); } + static public double qualToErrorProb(byte qual) { + return qualToErrorProbCache[qual]; + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. * @@ -110,88 +113,4 @@ public class QualityUtils { //return (byte) Math.min(qual, maxQual); return (byte) Math.max(Math.min(qual, maxQual), 1); } - - /** - * Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field. - * Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis. - * Another note: the probability encoded here gets rounded to the nearest 1%. - * - * @param baseIndex the base index - * @param prob the base probability - * @return a byte containing the index and the probability - */ - static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) { - byte compressedQual = 0; - - compressedQual = (byte) baseIndex; - - byte cprob = (byte) (100.0*prob); - byte qualmask = (byte) 252; - compressedQual += ((cprob << 2) & qualmask); - - return compressedQual; - } - - /** - * From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T) - * - * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality - * @return base index - */ - static public int compressedQualityToBaseIndex(byte compressedQual) { - return (int) (compressedQual & 0x3); - } - - /** - * From a compressed base, extract the base probability - * - * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality - * @return the probability - */ - static public double compressedQualityToProb(byte compressedQual) { - // Because java natives are signed, extra care must be taken to avoid - // shifting a 1 into the sign bit in the implicit promotion of 2 to an int. - int x2 = ((int) compressedQual) & 0xff; - x2 = (x2 >>> 2); - - return ((double) x2)/100.0; - } - - /** - * Return the complement of a compressed quality - * - * @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality) - * @return the complementary compressed quality - */ - static public byte complementCompressedQuality(byte compressedQual) { - int baseIndex = compressedQualityToBaseIndex(compressedQual); - double prob = compressedQualityToProb(compressedQual); - - return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob); - } - - /** - * Return the reverse complement of a byte array of compressed qualities - * - * @param compressedQuals a byte array of compressed quality scores - * @return the reverse complement of the byte array - */ - static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) { - byte[] rcCompressedQuals = new byte[compressedQuals.length]; - - for (int pos = 0; pos < compressedQuals.length; pos++) { - rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]); - } - - return rcCompressedQuals; - } - - /** - * Return the reverse of a byte array of qualities (compressed or otherwise) - * @param quals the array of bytes to be reversed - * @return the reverse of the quality array - */ - static public byte[] reverseQualityArray( byte[] quals ) { - return Utils.reverse(quals); // no sense in duplicating functionality - } } From c57198a1b998ba25b7facac526cafa04f9b8f77a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 08:46:17 -0400 Subject: [PATCH 02/32] Optimizations in VCFCodec -- Don't create an empty LinkedHashSet() for PASS fields. Just return Collections.emptySet() instead. -- For filter fields with actual values, returns an unmodifiableSet instead of one that can be changed --- .../broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index fa030ef5f..cd320b332 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -110,11 +110,8 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; - // empty set for passes filters - LinkedHashSet fFields = new LinkedHashSet(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return fFields; + return Collections.emptySet(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4"); if ( filterString.length() == 0 ) @@ -124,6 +121,8 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterHash.containsKey(filterString) ) return filterHash.get(filterString); + // empty set for passes filters + LinkedHashSet fFields = new LinkedHashSet(); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) fFields.add(filterString); @@ -132,7 +131,7 @@ public class VCFCodec extends AbstractVCFCodec { filterHash.put(filterString, fFields); - return fFields; + return Collections.unmodifiableSet(fFields); } From 82f213177730123def312b6a5b64878f7f665769 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 12:27:11 -0400 Subject: [PATCH 03/32] Simplied getAttributeAsX interfaces -- Removed versions getAttribriteAsX(key) that except on not having the value. -- Removed version that getAttributeAsXNoException(key) -- The only available assessors are now getAttributeAsX(key, default). -- This single accessors properly handle their argument types, so if the value is a double it is returned directly for getAttributeAsDouble(), or if it's a string it's converted to a double. If the key isn't found, default is returned. --- .../gatk/walkers/annotator/SBByDepth.java | 2 +- .../indels/HaplotypeIndelErrorModel.java | 2 +- .../gatk/walkers/phasing/PhasingRead.java | 2 +- .../walkers/phasing/RefSeqDataParser.java | 10 ++-- .../varianteval/evaluators/CountVariants.java | 6 +-- .../evaluators/GenotypePhasingEvaluator.java | 3 +- .../evaluators/SimpleMetricsByAC.java | 2 +- .../evaluators/TiTvVariantEvaluator.java | 2 +- .../evaluators/ValidationReport.java | 2 +- .../stratifications/AlleleCount.java | 2 +- .../stratifications/AlleleFrequency.java | 2 +- .../stratifications/Degeneracy.java | 10 ++-- .../stratifications/FunctionalClass.java | 4 +- .../VQSRCalibrationCurve.java | 4 +- .../walkers/variantutils/SelectVariants.java | 2 +- .../walkers/variantutils/VariantsToTable.java | 2 +- .../sting/utils/codecs/vcf/VCFCodec.java | 14 +++-- .../sting/utils/variantcontext/Genotype.java | 9 ---- .../InferredGeneticContext.java | 53 ++++++++++++------- .../utils/variantcontext/VariantContext.java | 10 ---- .../variantcontext/VariantContextUtils.java | 16 +++--- 21 files changed, 79 insertions(+), 80 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java index 180bed24d..d2c4d24ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java @@ -26,7 +26,7 @@ public class SBByDepth extends AnnotationByDepth { if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY)) return null; - double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY)); + double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1); final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index e68aa31e0..232e468f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel { baseMatchArray = new double[MAX_CACHED_QUAL+1]; baseMismatchArray = new double[MAX_CACHED_QUAL+1]; for (int k=1; k <= MAX_CACHED_QUAL; k++) { - double baseProb = QualityUtils.qualToProb(k); + double baseProb = QualityUtils.qualToProb((byte)k); baseMatchArray[k] = probToQual(baseProb); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index a56c9e21e..63fb33295 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray { public PhasingRead(int length, int mappingQual) { super(length); - this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual)); + this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual)); this.baseProbs = new PreciseNonNegativeDouble[length]; Arrays.fill(this.baseProbs, null); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java index 55da1c152..f94140814 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java @@ -44,12 +44,12 @@ public class RefSeqDataParser { String nameKeyToUseMultiplePrefix = nameKeyToUse + "_"; Map entriesToNames = new HashMap(); - Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY); - if (numRecords != null) { + int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1); + if (numRecords != -1) { boolean done = false; if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1": - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); done = true; @@ -59,14 +59,14 @@ public class RefSeqDataParser { if (!done) { for (int i = 1; i <= numRecords; i++) { String key = nameKeyToUseMultiplePrefix + i; - String name = vc.getAttributeAsStringNoException(key); + String name = vc.getAttributeAsString(key, null); if (name != null) entriesToNames.put(key, name); } } } else { // no entry with the # of records: - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 59ef3d992..fd379dfda 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -109,12 +109,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval { case SNP: nVariantLoci++; nSNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case MNP: nVariantLoci++; nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case INDEL: nVariantLoci++; @@ -136,7 +136,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { String refStr = vc1.getReference().getBaseString().toUpperCase(); - String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null; + String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null; // if (aaStr.equals(".")) { // aaStr = refStr; // } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index a476a2680..e69dbfb28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { } public static Double getPQ(Genotype gt) { - return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + return d == -1 ? null : d; } public static boolean topMatchesTop(AllelePair b1, AllelePair b2) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java index d466645ea..38cbf1c45 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java @@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval if ( eval.hasGenotypes() ) ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); + ac = eval.getAttributeAsInt("AC", -1); } if ( ac != -1 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index be957abd7..ee58012a0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -50,7 +50,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } String refStr = vc.getReference().getBaseString().toUpperCase(); - String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase(); + String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase(); if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) { BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 9c331b577..7fa56785b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -130,7 +130,7 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { //// System.out.printf(" ac = %d%n", ac); } else - ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY); + ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO; } else if ( vc.hasGenotypes() ) { return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 5cdea4e00..56b06d032 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -45,7 +45,7 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { - AC = eval.getAttributeAsInt("AC"); + AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index 96d9f30ec..ac1ee9e0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier { if (eval != null) { try { - relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3)))); + relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3)))); } catch (Exception e) { return relevantStates; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index cc878e975..06ac05ec8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -92,8 +92,8 @@ public class Degeneracy extends VariantStratifier { Integer frame = null; if (eval.hasAttribute("refseq.functionalClass")) { - aa = eval.getAttributeAsString("refseq.variantAA"); - frame = eval.getAttributeAsInt("refseq.frame"); + aa = eval.getAttributeAsString("refseq.variantAA", null); + frame = eval.getAttributeAsInt("refseq.frame", 0); } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -101,7 +101,7 @@ public class Degeneracy extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); + String newtype = eval.getAttributeAsString(key, null); if ( newtype != null && ( type == null || @@ -111,13 +111,13 @@ public class Degeneracy extends VariantStratifier { type = newtype; String aakey = String.format("refseq.variantAA_%d", annotationId); - aa = eval.getAttributeAsString(aakey); + aa = eval.getAttributeAsString(aakey, null); if (aa != null) { String framekey = String.format("refseq.frame_%d", annotationId); if (eval.hasAttribute(framekey)) { - frame = eval.getAttributeAsInt(framekey); + frame = eval.getAttributeAsInt(framekey, 0); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 0de871fe6..4af12fbd1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -32,7 +32,7 @@ public class FunctionalClass extends VariantStratifier { String type = null; if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); + type = eval.getAttributeAsString("refseq.functionalClass", null); } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -40,7 +40,7 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); + String newtype = eval.getAttributeAsString(key, null); if ( newtype != null && !newtype.equalsIgnoreCase("null") && ( type == null || diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java index bc7252ec2..04ba3ff14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -115,7 +115,7 @@ public class VQSRCalibrationCurve { if ( vc.isFiltered() ) return 0.0; else if ( vc.hasAttribute(VQSRQualKey) ) { - double qual = vc.getAttributeAsDouble(VQSRQualKey); + double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0); return probTrueVariant(qual); } else { throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc); @@ -143,7 +143,7 @@ public class VQSRCalibrationCurve { for ( int i = 0; i < log10Likelihoods.length; i++) { double p = Math.pow(10, log10Likelihoods[i]); double q = alpha * p + (1-alpha) * noInfoPr; - if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q); + if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q); updated[i] = Math.log10(q); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bb3cd82a1..ceafb0cf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -575,7 +575,7 @@ public class SelectVariants extends RodWalker { // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. // We then pick a variant with probablity AF*desiredFraction if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { - String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); + String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null); double af; double afBoost = 1.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..aafbe4db4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker { if ( getters.containsKey(field) ) { val = getters.get(field).get(vc); } else if ( vc.hasAttribute(field) ) { - val = vc.getAttributeAsString(field); + val = vc.getAttributeAsString(field, null); } else if ( isWildCard(field) ) { Set wildVals = new HashSet(); for ( Map.Entry elt : vc.getAttributes().entrySet()) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index cd320b332..94e40fc98 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -105,7 +105,10 @@ public class VCFCodec extends AbstractVCFCodec { * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) */ protected Set parseFilters(String filterString) { + return parseFilters(filterHash, lineNo, filterString); + } + public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; @@ -113,13 +116,13 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) return Collections.emptySet(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4"); + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status"); + generateException("The VCF specification requires a valid filter status", lineNo); // do we have the filter string cached? - if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); + if ( cache != null && cache.containsKey(filterString) ) + return Collections.unmodifiableSet(cache.get(filterString)); // empty set for passes filters LinkedHashSet fFields = new LinkedHashSet(); @@ -129,7 +132,8 @@ public class VCFCodec extends AbstractVCFCodec { else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - filterHash.put(filterString, fFields); + fFields = fFields; + if ( cache != null ) cache.put(filterString, fFields); return Collections.unmodifiableSet(fFields); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index fdf3d97db..85d752003 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -293,17 +293,8 @@ public class Genotype { return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - - public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java index 3d162adb0..4266fb4b5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java @@ -204,27 +204,40 @@ public final class InferredGeneticContext { return defaultValue; } -// public AttributedObject getAttributes(Collection keys) { -// AttributedObject selected = new AttributedObject(); -// -// for ( Object key : keys ) -// selected.putAttribute(key, this.getAttribute(key)); -// -// return selected; -// } + public String getAttributeAsString(String key, String defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof String ) return (String)x; + return String.valueOf(x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" - public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } - public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } - public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } + public int getAttributeAsInt(String key, int defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Integer ) return (Integer)x; + return Integer.valueOf((String)x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); } + public double getAttributeAsDouble(String key, double defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Double ) return (Double)x; + return Double.valueOf((String)x); // throws an exception if this isn't a string + } - public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } - public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } - public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } - public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } + public boolean getAttributeAsBoolean(String key, boolean defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Boolean ) return (Boolean)x; + return Boolean.valueOf((String)x); // throws an exception if this isn't a string + } + +// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" +// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } +// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } +// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } +// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } +// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } +// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } +// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 673fe4529..e6637a5d9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } - - // --------------------------------------------------------------------------------------------------------- // // Working with alleles diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 986d6305c..d5c541b19 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -565,11 +565,11 @@ public class VariantContextUtils { // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY)); + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (rsID == null && vc.hasID()) rsID = vc.getID(); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY); + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); @@ -1147,9 +1147,7 @@ public class VariantContextUtils { for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { - Boolean val = vc.getAttributeAsBooleanNoException(orAttrib); - if (val != null) - attribVal = (attribVal || val); + attribVal = vc.getAttributeAsBoolean(orAttrib, false); if (attribVal) // already true, so no reason to continue: break; } @@ -1159,7 +1157,7 @@ public class VariantContextUtils { // Merge ID fields: String iDVal = null; for (VariantContext vc : vcList) { - String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY); + String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { if (iDVal == null) iDVal = val; @@ -1239,8 +1237,10 @@ public class VariantContextUtils { public PhaseAndQuality(Genotype gt) { this.isPhased = gt.isPhased(); - if (this.isPhased) - this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } } } From 124ef6c4834d8a948629291762b7f9d7d9696d50 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:12:28 -0400 Subject: [PATCH 04/32] MISSING_VALUE now gets defaultValue in getAttribute functions --- .../sting/utils/variantcontext/InferredGeneticContext.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java index 4266fb4b5..bf16cd1cf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; + import java.util.*; @@ -213,7 +215,7 @@ public final class InferredGeneticContext { public int getAttributeAsInt(String key, int defaultValue) { Object x = getAttribute(key); - if ( x == null ) return defaultValue; + if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; if ( x instanceof Integer ) return (Integer)x; return Integer.valueOf((String)x); // throws an exception if this isn't a string } From 03aa04e37c8a2cf31d650cba6a8c703fbb033978 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:13:08 -0400 Subject: [PATCH 05/32] Simple refactoring to make formating functions public --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 2 +- .../sting/utils/codecs/vcf/StandardVCFWriter.java | 11 +++++++++-- .../sting/utils/codecs/vcf/VCFCodec.java | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index bb212e128..624d06a71 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, throw new UserException.MalformedVCF(message, lineNo); } - private static void generateException(String message, int lineNo) { + protected static void generateException(String message, int lineNo) { throw new UserException.MalformedVCF(message, lineNo); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index d3705813c..e28cd7598 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -275,7 +275,7 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); // FILTER - String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + String filters = getFilterString(vc, filtersWereAppliedToContext); mWriter.write(filters); mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -319,7 +319,14 @@ public class StandardVCFWriter implements VCFWriter { } catch (IOException e) { throw new RuntimeException("Unable to write the VCF object to " + locationString()); } + } + public static final String getFilterString(final VariantContext vc) { + return getFilterString(vc, false); + } + + public static final String getFilterString(final VariantContext vc, boolean forcePASS) { + return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); } private String getQualValue(double qual) { @@ -462,7 +469,7 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(encoding); } - private static String formatVCFField(Object val) { + public static String formatVCFField(Object val) { String result; if ( val == null ) result = VCFConstants.MISSING_VALUE_v4; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 94e40fc98..42ea05355 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -118,7 +118,7 @@ public class VCFCodec extends AbstractVCFCodec { if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status", lineNo); + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); // do we have the filter string cached? if ( cache != null && cache.containsKey(filterString) ) From 048202d18e42444e47e6237246b31d24c57dec9a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:13:28 -0400 Subject: [PATCH 06/32] Bugfix for cached quals --- .../java/src/org/broadinstitute/sting/utils/QualityUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 093da7dd6..19e03a19d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -17,7 +17,7 @@ public class QualityUtils { private static double qualToErrorProbCache[] = new double[256]; static { - for (byte i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i); + for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i); } /** From d471617c65f6d1f3885ea6628b7676ed6bbc6f8d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Sep 2011 21:15:19 -0400 Subject: [PATCH 07/32] GATK binary VCF (gvcf) prototype format for efficiency testing -- Very minimal working version that can read / write binary VCFs with genotypes -- Already 10x faster for sites, 5x for fully parsed genotypes, and 1000x for skipping genotypes when reading --- .../broadinstitute/sting/utils/gvcf/GVCF.java | 252 ++++++++++++++++++ .../sting/utils/gvcf/GVCFGenotype.java | 147 ++++++++++ .../sting/utils/gvcf/GVCFHeader.java | 180 +++++++++++++ .../sting/utils/gvcf/GVCFHeaderBuilder.java | 80 ++++++ 4 files changed, 659 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java new file mode 100644 index 000000000..8568c1aab --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.*; + +/** + * GATK binary VCF record + * + * @author Your Name + * @since Date created + */ +public class GVCF { + private final static int RECORD_TERMINATOR = 123456789; + private int chromOffset; + private int start, stop; + private String id; + private List alleleMap; + private int alleleOffsets[]; + private float qual; + private byte refPad; + private String info; + private int filterOffset; + + private List genotypes = Collections.emptyList(); + + public GVCF(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { + chromOffset = gvcfHeaderBuilder.encodeString(vc.getChr()); + start = vc.getStart(); + stop = vc.getEnd(); + refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0; + id = vc.getID(); + + // encode alleles + alleleMap = new ArrayList(vc.getNAlleles()); + alleleOffsets = new int[vc.getNAlleles()]; + alleleMap.add(vc.getReference()); + alleleOffsets[0] = gvcfHeaderBuilder.encodeAllele(vc.getReference()); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + alleleMap.add(vc.getAlternateAllele(i)); + alleleOffsets[i+1] = gvcfHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); + } + + qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); + info = infoFieldString(vc, gvcfHeaderBuilder); + filterOffset = gvcfHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); + + if ( ! skipGenotypes ) { + genotypes = encodeGenotypes(gvcfHeaderBuilder, vc); + } + } + + public GVCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + chromOffset = inputStream.readInt(); + start = inputStream.readInt(); + stop = inputStream.readInt(); + id = inputStream.readUTF(); + refPad = inputStream.readByte(); + alleleOffsets = readIntArray(inputStream); + qual = inputStream.readFloat(); + info = inputStream.readUTF(); + filterOffset = inputStream.readInt(); + + int nGenotypes = inputStream.readInt(); + int sizeOfGenotypes = inputStream.readInt(); + if ( skipGenotypes ) { + genotypes = Collections.emptyList(); + inputStream.skipBytes(sizeOfGenotypes); + } else { + genotypes = new ArrayList(nGenotypes); + for ( int i = 0; i < nGenotypes; i++ ) + genotypes.add(new GVCFGenotype(this, inputStream)); + } + + int recordDone = inputStream.readInt(); + if ( recordDone != RECORD_TERMINATOR ) + throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); + } + + public VariantContext decode(final String source, final GVCFHeader header) { + final String contig = header.getString(chromOffset); + alleleMap = header.getAlleles(alleleOffsets); + double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); + Set filters = header.getFilters(filterOffset); + Map attributes = new HashMap(); + attributes.put("INFO", info); + Byte refPadByte = refPad == 0 ? null : refPad; + Map genotypes = decodeGenotypes(header); + + return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); + } + + private Map decodeGenotypes(final GVCFHeader header) { + if ( genotypes.isEmpty() ) + return VariantContext.NO_GENOTYPES; + else { + Map map = new TreeMap(); + + for ( int i = 0; i < genotypes.size(); i++ ) { + final String sampleName = header.getSample(i); + final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap); + map.put(sampleName, g); + } + + return map; + } + } + + private List encodeGenotypes(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc) { + int nGenotypes = vc.getNSamples(); + if ( nGenotypes > 0 ) { + List genotypes = new ArrayList(nGenotypes); + for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); + + for ( Genotype g : vc.getGenotypes().values() ) { + int i = gvcfHeaderBuilder.encodeSample(g.getSampleName()); + genotypes.set(i, new GVCFGenotype(gvcfHeaderBuilder, alleleMap, g)); + } + + return genotypes; + } else { + return Collections.emptyList(); + } + } + + public int getNAlleles() { return alleleOffsets.length; } + + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeInt(chromOffset); + outputStream.writeInt(start); + outputStream.writeInt(stop); + outputStream.writeUTF(id); + outputStream.writeByte(refPad); + writeIntArray(alleleOffsets, outputStream, true); + outputStream.writeFloat(qual); + outputStream.writeUTF(info); + outputStream.writeInt(filterOffset); + + int nGenotypes = genotypes.size(); + int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; + outputStream.writeInt(nGenotypes); + outputStream.writeInt(expectedSizeOfGenotypes); + int obsSizeOfGenotypes = 0; + for ( GVCFGenotype g : genotypes ) + obsSizeOfGenotypes += g.write(outputStream); + if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) + throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); + + outputStream.writeInt(RECORD_TERMINATOR); + return outputStream.size() - startSize; + } + + private final String infoFieldString(VariantContext vc, final GVCFHeaderBuilder gvcfHeaderBuilder) { + StringBuilder s = new StringBuilder(); + + boolean first = true; + for ( Map.Entry field : vc.getAttributes().entrySet() ) { + String key = field.getKey(); + if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) + continue; + int stringIndex = gvcfHeaderBuilder.encodeString(key); + String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); + if ( outputValue != null ) { + if ( ! first ) s.append(";"); + s.append(stringIndex).append("=").append(outputValue); + first = false; + } + } + + return s.toString(); + } + + private final static int BUFFER_SIZE = 1048576; // 2**20 + public static DataOutputStream createOutputStream(final File file) throws FileNotFoundException { + return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + } + + public static DataInputStream createInputStream(final File file) throws FileNotFoundException { + return new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE)); + } + + protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException { + return readIntArray(inputStream, inputStream.readInt()); + } + + protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException { + int[] array = new int[size]; + for ( int i = 0; i < array.length; i++ ) + array[i] = inputStream.readInt(); + return array; + } + + protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException { + if ( writeSize ) outputStream.writeInt(array.length); + for ( int i : array ) + outputStream.writeInt(i); + } + + protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException { + return readByteArray(inputStream, inputStream.readInt()); + } + + protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException { + byte[] array = new byte[size]; + for ( int i = 0; i < array.length; i++ ) + array[i] = inputStream.readByte(); + return array; + } + + protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException { + if ( writeSize ) outputStream.writeInt(array.length); + for ( byte i : array ) + outputStream.writeByte(i); + } + + protected final static byte qualToByte(double phredScaledQual) { + return (byte)Math.round(Math.min(phredScaledQual, 255)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java new file mode 100644 index 000000000..2ef6d9b3a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.*; + +/** + * GATK binary VCF record + * + * @author Your Name + * @since Date created + */ +public class GVCFGenotype { + private byte gq; + private int gt; + private int dp; + private int ad[]; + private byte[] pl; + + // todo -- what to do about phasing? Perhaps we shouldn't support it + // todo -- is the FL field generic or just a flag? Should we even support per sample filtering? + + public GVCFGenotype(final GVCFHeaderBuilder gvcfHeaderBuilder, final List allAlleles, Genotype genotype) { + gq = GVCF.qualToByte(genotype.getPhredScaledQual()); + gt = encodeAlleles(genotype.getAlleles(), allAlleles); + + dp = genotype.getAttributeAsInt("DP", 0); + + int nAlleles = allAlleles.size(); + ad = new int[nAlleles]; + + int npls = nAllelesToNPls(nAlleles); + pl = new byte[npls]; + } + + private int nAllelesToNPls( int nAlleles ) { + return nAlleles*(nAlleles+1) / 2; + } + + public GVCFGenotype(GVCF gvcf, DataInputStream inputStream) throws IOException { + int gqInt = inputStream.readUnsignedByte(); + gq = (byte)gqInt; + gt = inputStream.readInt(); + dp = inputStream.readInt(); + ad = GVCF.readIntArray(inputStream, gvcf.getNAlleles()); + pl = GVCF.readByteArray(inputStream, nAllelesToNPls(gvcf.getNAlleles())); + } + + // 2 alleles => 1 + 8 + 8 + 3 => 20 + protected int sizeInBytes() { + return 1 // gq + + 4 * 2 // gt + dp + + 4 * ad.length // ad + + 1 * pl.length; // pl + } + + public Genotype decode(final String sampleName, final GVCFHeader header, GVCF gvcf, List alleleIndex) { + final List alleles = decodeAlleles(gt, alleleIndex); + final double negLog10PError = gq / 10.0; + final Set filters = Collections.emptySet(); + final Map attributes = new HashMap(); + attributes.put("DP", dp); + attributes.put("AD", ad); + attributes.put("PL", pl); + + return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false); + } + + private static int encodeAlleles(List gtList, List allAlleles) { + final int nAlleles = gtList.size(); + if ( nAlleles > 4 ) + throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList); + + int gtInt = 0; + for ( int i = 0; i < nAlleles ; i++ ) { + final int bitOffset = i * 8; + final int allelei = getAlleleIndex(gtList.get(i), allAlleles); + final int gti = (allelei + 1) << bitOffset; + gtInt = gtInt | gti; + } + + return gtInt; + } + + private static int getAlleleIndex(Allele q, List allAlleles) { + if ( q.isNoCall() ) + return 254; + for ( int i = 0; i < allAlleles.size(); i++ ) + if ( q.equals(allAlleles.get(i)) ) + return i; + throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles); + } + + private static List decodeAlleles(int gtInt, List alleleIndex) { + List alleles = new ArrayList(4); + + for ( int i = 0; i < 32; i += 8 ) { + final int gi = (gtInt & (0x000000FF << i)) >> i; + if ( gi != 0 ) { + final int allelei = gi - 1; + alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) ); + } else { + break; + } + } + + return alleles; + } + + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeByte(gq); + outputStream.writeInt(gt); + outputStream.writeInt(dp); + GVCF.writeIntArray(ad, outputStream, false); + GVCF.writeByteArray(pl, outputStream, false); + return outputStream.size() - startSize; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java new file mode 100644 index 000000000..c52c975bd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.*; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GVCFHeader { + final protected static Logger logger = Logger.getLogger(GVCFHeader.class); + + private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); + final List alleles; + final List strings; + final List samples; + final List> filters; + + public GVCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + this.alleles = linearize(allelesIn); + this.strings = linearize(stringIn); + this.samples = linearize(samplesIn); + this.filters = null; // not used with this constructor + } + + public GVCFHeader(DataInputStream inputStream) throws IOException { + byte[] headerTest = new byte[MAGIC_HEADER.length]; + inputStream.read(headerTest); + if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { + throw new UserException("Could not read GVCF file. MAGIC_HEADER missing. Saw " + headerTest); + } else { + alleles = stringsToAlleles(readStrings(inputStream)); + strings = readStrings(inputStream); + samples = readStrings(inputStream); + logger.info(String.format("Allele map of %d elements", alleles.size())); + logger.info(String.format("String map of %d elements", strings.size())); + logger.info(String.format("Sample map of %d elements", samples.size())); + filters = initializeFilterCache(); + } + } + + public int write(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.write(MAGIC_HEADER); + write(outputStream, allelesToStrings(alleles)); + write(outputStream, strings); + write(outputStream, samples); + return outputStream.size() - startBytes; + } + + public void write(DataOutputStream outputStream, List l) throws IOException { + outputStream.writeInt(l.size()); + for ( String elt : l ) outputStream.writeUTF(elt); + } + + private List allelesToStrings(List alleles) { + List strings = new ArrayList(alleles.size()); + for ( Allele allele : alleles ) strings.add(allele.toString()); + return strings; + } + + private List> initializeFilterCache() { + // required to allow offset -> set lookup + List> l = new ArrayList>(strings.size()); + for ( int i = 0; i < strings.size(); i++ ) l.add(null); + return l; + } + + private static List stringsToAlleles(final List strings) { + final List alleles = new ArrayList(strings.size()); + for ( String string : strings ) { + boolean isRef = string.endsWith("*"); + if ( isRef ) string = string.substring(0, string.length() - 1); + alleles.add(Allele.create(string, isRef)); + } + return alleles; + } + + private static List readStrings(final DataInputStream inputStream) throws IOException { + final int nStrings = inputStream.readInt(); + + final List strings = new ArrayList(nStrings); + for ( int i = 0; i < nStrings; i++ ) { + strings.add(inputStream.readUTF()); + } + + return strings; + } + + private static List linearize(final Map map) { + final ArrayList l = new ArrayList(map.size()); + for ( int i = 0; i < map.size(); i++ ) l.add(null); + for ( final Map.Entry elt : map.entrySet() ) + l.set(elt.getValue(), elt.getKey()); + return l; + } + + public String getSample(final int offset) { return samples.get(offset); } + public String getString(final int offset) { return strings.get(offset); } + public Allele getAllele(final int offset) { return alleles.get(offset); } + public List getAlleles(final int[] offsets) { + final List alleles = new ArrayList(offsets.length); + for ( int i : offsets ) alleles.add(getAllele(i)); + return alleles; + } + + public Set getFilters(final int offset) { + Set cached = filters.get(offset); + + if ( cached != null ) + return cached; + else { + final String filterString = getString(offset); + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; // UNFILTERED records are represented by null + else { + Set set = VCFCodec.parseFilters(null, -1, filterString); + filters.set(offset, set); // remember the result + return set; + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java new file mode 100644 index 000000000..2d045b8ea --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.HashMap; +import java.util.Map; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GVCFHeaderBuilder { + Map alleles = new HashMap(); + Map strings = new HashMap(); + Map samples = new HashMap(); + + public GVCFHeader createHeader() { + return new GVCFHeader(alleles, strings, samples); + } + + public int encodeString(final String chr) { return encode(strings, chr); } + public int encodeAllele(final Allele allele) { return encode(alleles, allele); } + public int encodeSample(final String sampleName) { return encode(samples, sampleName); } + + private int encode(Map map, T key) { + Integer v = map.get(key); + if ( v == null ) { + v = map.size(); + map.put(key, v); + } + return v; + } +} From 01b6177ce15c2d4270ac863da1a2e4e43e020411 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 17:10:56 -0400 Subject: [PATCH 08/32] Renaming GVCF -> GCF --- .../utils/{gvcf/GVCF.java => gcf/GCF.java} | 47 +++++++++---------- .../GCFGenotype.java} | 20 ++++---- .../GVCFHeader.java => gcf/GCFHeader.java} | 12 ++--- .../GCFHeaderBuilder.java} | 8 ++-- 4 files changed, 41 insertions(+), 46 deletions(-) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCF.java => gcf/GCF.java} (83%) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCFGenotype.java => gcf/GCFGenotype.java} (86%) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCFHeader.java => gcf/GCFHeader.java} (92%) rename public/java/src/org/broadinstitute/sting/utils/{gvcf/GVCFHeaderBuilder.java => gcf/GCFHeaderBuilder.java} (93%) diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java similarity index 83% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index 8568c1aab..5ab241ebf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -22,12 +22,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -42,7 +39,7 @@ import java.util.*; * @author Your Name * @since Date created */ -public class GVCF { +public class GCF { private final static int RECORD_TERMINATOR = 123456789; private int chromOffset; private int start, stop; @@ -54,10 +51,10 @@ public class GVCF { private String info; private int filterOffset; - private List genotypes = Collections.emptyList(); + private List genotypes = Collections.emptyList(); - public GVCF(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { - chromOffset = gvcfHeaderBuilder.encodeString(vc.getChr()); + public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { + chromOffset = GCFHeaderBuilder.encodeString(vc.getChr()); start = vc.getStart(); stop = vc.getEnd(); refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0; @@ -67,22 +64,22 @@ public class GVCF { alleleMap = new ArrayList(vc.getNAlleles()); alleleOffsets = new int[vc.getNAlleles()]; alleleMap.add(vc.getReference()); - alleleOffsets[0] = gvcfHeaderBuilder.encodeAllele(vc.getReference()); + alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference()); for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { alleleMap.add(vc.getAlternateAllele(i)); - alleleOffsets[i+1] = gvcfHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); + alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); } qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); - info = infoFieldString(vc, gvcfHeaderBuilder); - filterOffset = gvcfHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); + info = infoFieldString(vc, GCFHeaderBuilder); + filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); if ( ! skipGenotypes ) { - genotypes = encodeGenotypes(gvcfHeaderBuilder, vc); + genotypes = encodeGenotypes(GCFHeaderBuilder, vc); } } - public GVCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { chromOffset = inputStream.readInt(); start = inputStream.readInt(); stop = inputStream.readInt(); @@ -99,9 +96,9 @@ public class GVCF { genotypes = Collections.emptyList(); inputStream.skipBytes(sizeOfGenotypes); } else { - genotypes = new ArrayList(nGenotypes); + genotypes = new ArrayList(nGenotypes); for ( int i = 0; i < nGenotypes; i++ ) - genotypes.add(new GVCFGenotype(this, inputStream)); + genotypes.add(new GCFGenotype(this, inputStream)); } int recordDone = inputStream.readInt(); @@ -109,7 +106,7 @@ public class GVCF { throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); } - public VariantContext decode(final String source, final GVCFHeader header) { + public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); @@ -122,7 +119,7 @@ public class GVCF { return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); } - private Map decodeGenotypes(final GVCFHeader header) { + private Map decodeGenotypes(final GCFHeader header) { if ( genotypes.isEmpty() ) return VariantContext.NO_GENOTYPES; else { @@ -138,15 +135,15 @@ public class GVCF { } } - private List encodeGenotypes(final GVCFHeaderBuilder gvcfHeaderBuilder, final VariantContext vc) { + private List encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) { int nGenotypes = vc.getNSamples(); if ( nGenotypes > 0 ) { - List genotypes = new ArrayList(nGenotypes); + List genotypes = new ArrayList(nGenotypes); for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); for ( Genotype g : vc.getGenotypes().values() ) { - int i = gvcfHeaderBuilder.encodeSample(g.getSampleName()); - genotypes.set(i, new GVCFGenotype(gvcfHeaderBuilder, alleleMap, g)); + int i = GCFHeaderBuilder.encodeSample(g.getSampleName()); + genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g)); } return genotypes; @@ -174,7 +171,7 @@ public class GVCF { outputStream.writeInt(nGenotypes); outputStream.writeInt(expectedSizeOfGenotypes); int obsSizeOfGenotypes = 0; - for ( GVCFGenotype g : genotypes ) + for ( GCFGenotype g : genotypes ) obsSizeOfGenotypes += g.write(outputStream); if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); @@ -183,7 +180,7 @@ public class GVCF { return outputStream.size() - startSize; } - private final String infoFieldString(VariantContext vc, final GVCFHeaderBuilder gvcfHeaderBuilder) { + private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) { StringBuilder s = new StringBuilder(); boolean first = true; @@ -191,7 +188,7 @@ public class GVCF { String key = field.getKey(); if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) continue; - int stringIndex = gvcfHeaderBuilder.encodeString(key); + int stringIndex = GCFHeaderBuilder.encodeString(key); String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); if ( outputValue != null ) { if ( ! first ) s.append(";"); diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java similarity index 86% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java index 2ef6d9b3a..dd1fb091c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -38,7 +38,7 @@ import java.util.*; * @author Your Name * @since Date created */ -public class GVCFGenotype { +public class GCFGenotype { private byte gq; private int gt; private int dp; @@ -48,8 +48,8 @@ public class GVCFGenotype { // todo -- what to do about phasing? Perhaps we shouldn't support it // todo -- is the FL field generic or just a flag? Should we even support per sample filtering? - public GVCFGenotype(final GVCFHeaderBuilder gvcfHeaderBuilder, final List allAlleles, Genotype genotype) { - gq = GVCF.qualToByte(genotype.getPhredScaledQual()); + public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List allAlleles, Genotype genotype) { + gq = GCF.qualToByte(genotype.getPhredScaledQual()); gt = encodeAlleles(genotype.getAlleles(), allAlleles); dp = genotype.getAttributeAsInt("DP", 0); @@ -65,13 +65,13 @@ public class GVCFGenotype { return nAlleles*(nAlleles+1) / 2; } - public GVCFGenotype(GVCF gvcf, DataInputStream inputStream) throws IOException { + public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException { int gqInt = inputStream.readUnsignedByte(); gq = (byte)gqInt; gt = inputStream.readInt(); dp = inputStream.readInt(); - ad = GVCF.readIntArray(inputStream, gvcf.getNAlleles()); - pl = GVCF.readByteArray(inputStream, nAllelesToNPls(gvcf.getNAlleles())); + ad = GCF.readIntArray(inputStream, GCF.getNAlleles()); + pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles())); } // 2 alleles => 1 + 8 + 8 + 3 => 20 @@ -82,7 +82,7 @@ public class GVCFGenotype { + 1 * pl.length; // pl } - public Genotype decode(final String sampleName, final GVCFHeader header, GVCF gvcf, List alleleIndex) { + public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) { final List alleles = decodeAlleles(gt, alleleIndex); final double negLog10PError = gq / 10.0; final Set filters = Collections.emptySet(); @@ -140,8 +140,8 @@ public class GVCFGenotype { outputStream.writeByte(gq); outputStream.writeInt(gt); outputStream.writeInt(dp); - GVCF.writeIntArray(ad, outputStream, false); - GVCF.writeByteArray(pl, outputStream, false); + GCF.writeIntArray(ad, outputStream, false); + GCF.writeByteArray(pl, outputStream, false); return outputStream.size() - startSize; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java similarity index 92% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java index c52c975bd..d0c765cc4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java @@ -22,11 +22,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -64,8 +62,8 @@ import java.util.*; * @author Your Name * @since Date created */ -public class GVCFHeader { - final protected static Logger logger = Logger.getLogger(GVCFHeader.class); +public class GCFHeader { + final protected static Logger logger = Logger.getLogger(GCFHeader.class); private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); final List alleles; @@ -73,14 +71,14 @@ public class GVCFHeader { final List samples; final List> filters; - public GVCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { this.alleles = linearize(allelesIn); this.strings = linearize(stringIn); this.samples = linearize(samplesIn); this.filters = null; // not used with this constructor } - public GVCFHeader(DataInputStream inputStream) throws IOException { + public GCFHeader(DataInputStream inputStream) throws IOException { byte[] headerTest = new byte[MAGIC_HEADER.length]; inputStream.read(headerTest); if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java similarity index 93% rename from public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java rename to public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java index 2d045b8ea..40e01ec72 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gvcf/GVCFHeaderBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.gvcf; +package org.broadinstitute.sting.utils.gcf; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -56,13 +56,13 @@ import java.util.Map; * @author Your Name * @since Date created */ -public class GVCFHeaderBuilder { +public class GCFHeaderBuilder { Map alleles = new HashMap(); Map strings = new HashMap(); Map samples = new HashMap(); - public GVCFHeader createHeader() { - return new GVCFHeader(alleles, strings, samples); + public GCFHeader createHeader() { + return new GCFHeader(alleles, strings, samples); } public int encodeString(final String chr) { return encode(strings, chr); } From fe5724b6ea7c77f3f38f35b2d29d118860b6fc2a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 23:27:08 -0400 Subject: [PATCH 09/32] Refactored indexing part of StandardVCFWriter into superclass -- Now other implementations of the VCFWriter can easily share common functions, such as writing an index on the fly --- .../utils/codecs/vcf/IndexingVCFWriter.java | 116 ++++++++++++++++++ .../utils/codecs/vcf/StandardVCFWriter.java | 107 +++++----------- 2 files changed, 148 insertions(+), 75 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java new file mode 100644 index 000000000..632bf8ed3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.vcf; + +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; +import org.broad.tribble.index.DynamicIndexCreator; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * this class writes VCF files + */ +public abstract class IndexingVCFWriter implements VCFWriter { + final private File indexFile; + final private String name; + + private PositionalStream positionalStream; + private DynamicIndexCreator indexer; + private LittleEndianOutputStream idxStream; + + protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) { + this.name = name; + + if ( enableOnTheFlyIndexing ) { + indexFile = Tribble.indexFile(location); + try { + idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + //System.out.println("Creating index on the fly for " + location); + indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + indexer.initialize(location, indexer.defaultBinSize()); + positionalStream = new PositionalStream(output); + } catch ( IOException ex ) { + // No matter what we keep going, since we don't care if we can't create the index file + } + } else { + idxStream = null; + indexer = null; + positionalStream = null; + indexFile = null; + } + } + + public String getStreamName() { + return name; + } + + public abstract void writeHeader(VCFHeader header); + + /** + * attempt to close the VCF file + */ + public void close() { + // try to close the index stream (keep it separate to help debugging efforts) + if ( indexer != null ) { + try { + Index index = indexer.finalizeIndex(positionalStream.getPosition()); + index.write(idxStream); + idxStream.close(); + } catch (IOException e) { + throw new ReviewedStingException("Unable to close index for " + getStreamName(), e); + } + } + } + + /** + * add a record to the file + * + * @param vc the Variant Context object + */ + public void add(VariantContext vc) { + // if we are doing on the fly indexing, add the record ***before*** we write any bytes + if ( indexer != null ) + indexer.addFeature(vc, positionalStream.getPosition()); + } + + protected static final String writerName(File location, OutputStream stream) { + return location == null ? stream.toString() : location.getAbsolutePath(); + } + + protected static OutputStream openOutputStream(File location) { + try { + return new FileOutputStream(location); + } catch (FileNotFoundException e) { + throw new ReviewedStingException("Unable to create VCF file at location: " + location, e); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index e28cd7598..ebcba9635 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -44,26 +44,19 @@ import java.util.*; /** * this class writes VCF files */ -public class StandardVCFWriter implements VCFWriter { +public class StandardVCFWriter extends IndexingVCFWriter { + // the print stream we're writing to + final protected BufferedWriter mWriter; + + // should we write genotypes or just sites? + final protected boolean doNotWriteGenotypes; // the VCF header we're storing protected VCFHeader mHeader = null; - // the print stream we're writing to - protected BufferedWriter mWriter; - protected PositionalStream positionalStream = null; - // were filters applied? protected boolean filtersWereAppliedToContext = false; - // should we write genotypes or just sites? - protected boolean doNotWriteGenotypes = false; - - protected DynamicIndexCreator indexer = null; - protected File indexFile = null; - LittleEndianOutputStream idxStream = null; - File location = null; - /** * create a VCF writer, given a file to write to * @@ -93,32 +86,22 @@ public class StandardVCFWriter implements VCFWriter { * @param doNotWriteGenotypes do not write genotypes */ public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) { - mWriter = new BufferedWriter(new OutputStreamWriter(output)); - this.doNotWriteGenotypes = doNotWriteGenotypes; + this(null, output, false, doNotWriteGenotypes); } public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - this.location = location; - - if ( enableOnTheFlyIndexing ) { - indexFile = Tribble.indexFile(location); - try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - //System.out.println("Creating index on the fly for " + location); - indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - indexer.initialize(location, indexer.defaultBinSize()); - positionalStream = new PositionalStream(output); - output = positionalStream; - } catch ( IOException ex ) { - // No matter what we keep going, since we don't care if we can't create the index file - } - } - - //mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output))); - mWriter = new BufferedWriter(new OutputStreamWriter(output)); + super(writerName(location, output), location, output, enableOnTheFlyIndexing); + mWriter = new BufferedWriter(new OutputStreamWriter(output)); // todo -- fix buffer size this.doNotWriteGenotypes = doNotWriteGenotypes; } + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override public void writeHeader(VCFHeader header) { mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header; @@ -158,44 +141,24 @@ public class StandardVCFWriter implements VCFWriter { mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new TribbleException("IOException writing the VCF header to " + locationString(), e); + throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e); } } - private String locationString() { - return location == null ? mWriter.toString() : location.getAbsolutePath(); - } - /** * attempt to close the VCF file */ + @Override public void close() { // try to close the vcf stream try { mWriter.flush(); mWriter.close(); } catch (IOException e) { - throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage()); + throw new ReviewedStingException("Unable to close " + getStreamName(), e); } - // try to close the index stream (keep it separate to help debugging efforts) - if ( indexer != null ) { - try { - Index index = indexer.finalizeIndex(positionalStream.getPosition()); - index.write(idxStream); - idxStream.close(); - } catch (IOException e) { - throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage()); - } - } - } - - protected static OutputStream openOutputStream(File location) { - try { - return new FileOutputStream(location); - } catch (FileNotFoundException e) { - throw new TribbleException("Unable to create VCF file at location: " + location); - } + super.close(); } /** @@ -203,28 +166,17 @@ public class StandardVCFWriter implements VCFWriter { * * @param vc the Variant Context object */ + @Override public void add(VariantContext vc) { - add(vc, false); - } - - /** - * add a record to the file - * - * @param vc the Variant Context object - * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD) - */ - public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) { if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString()); + throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName()); if ( doNotWriteGenotypes ) vc = VariantContext.modifyGenotypes(vc, null); try { - vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles); - - // if we are doing on the fly indexing, add the record ***before*** we write any bytes - if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition()); + vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + super.add(vc); Map alleleMap = new HashMap(vc.getAlleles().size()); alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup @@ -317,10 +269,16 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to " + locationString()); + throw new RuntimeException("Unable to write the VCF object to " + getStreamName()); } } + // -------------------------------------------------------------------------------- + // + // implementation functions + // + // -------------------------------------------------------------------------------- + public static final String getFilterString(final VariantContext vc) { return getFilterString(vc, false); } @@ -531,12 +489,11 @@ public class StandardVCFWriter implements VCFWriter { } - public static int countOccurrences(char c, String s) { + private static int countOccurrences(char c, String s) { int count = 0; for (int i = 0; i < s.length(); i++) { count += s.charAt(i) == c ? 1 : 0; } return count; } - } From cd2c511c4ae8a7d13ca6fe3604308ca5fdea5c00 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 23:28:46 -0400 Subject: [PATCH 10/32] GCF improvements -- Support for streaming VCF writing via the VCFWriter interface -- GCF now has a header and a footer. The header is minimal, and contains a forward pointer to the position of the footer in the file. -- Readers now read the header, and then jump to the footer to get the rest of the "header" information -- Version now a field in GCF --- .../broadinstitute/sting/utils/gcf/GCF.java | 69 +++++----- .../sting/utils/gcf/GCFHeader.java | 49 +++++-- .../sting/utils/gcf/GCFWriter.java | 122 ++++++++++++++++++ 3 files changed, 198 insertions(+), 42 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index 5ab241ebf..ef0d9ca42 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -79,8 +79,13 @@ public class GCF { } } - public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException { chromOffset = inputStream.readInt(); + + // have we reached the footer? + if ( chromOffset == GCFHeader.FOOTER_START_MARKER ) + throw new EOFException(); + start = inputStream.readInt(); stop = inputStream.readInt(); id = inputStream.readUTF(); @@ -106,6 +111,32 @@ public class GCF { throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); } + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeInt(chromOffset); + outputStream.writeInt(start); + outputStream.writeInt(stop); + outputStream.writeUTF(id); + outputStream.writeByte(refPad); + writeIntArray(alleleOffsets, outputStream, true); + outputStream.writeFloat(qual); + outputStream.writeUTF(info); + outputStream.writeInt(filterOffset); + + int nGenotypes = genotypes.size(); + int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; + outputStream.writeInt(nGenotypes); + outputStream.writeInt(expectedSizeOfGenotypes); + int obsSizeOfGenotypes = 0; + for ( GCFGenotype g : genotypes ) + obsSizeOfGenotypes += g.write(outputStream); + if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) + throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); + + outputStream.writeInt(RECORD_TERMINATOR); + return outputStream.size() - startSize; + } + public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); @@ -154,31 +185,6 @@ public class GCF { public int getNAlleles() { return alleleOffsets.length; } - public int write(DataOutputStream outputStream) throws IOException { - int startSize = outputStream.size(); - outputStream.writeInt(chromOffset); - outputStream.writeInt(start); - outputStream.writeInt(stop); - outputStream.writeUTF(id); - outputStream.writeByte(refPad); - writeIntArray(alleleOffsets, outputStream, true); - outputStream.writeFloat(qual); - outputStream.writeUTF(info); - outputStream.writeInt(filterOffset); - - int nGenotypes = genotypes.size(); - int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; - outputStream.writeInt(nGenotypes); - outputStream.writeInt(expectedSizeOfGenotypes); - int obsSizeOfGenotypes = 0; - for ( GCFGenotype g : genotypes ) - obsSizeOfGenotypes += g.write(outputStream); - if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) - throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); - - outputStream.writeInt(RECORD_TERMINATOR); - return outputStream.size() - startSize; - } private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) { StringBuilder s = new StringBuilder(); @@ -200,13 +206,14 @@ public class GCF { return s.toString(); } - private final static int BUFFER_SIZE = 1048576; // 2**20 - public static DataOutputStream createOutputStream(final File file) throws FileNotFoundException { - return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + protected final static int BUFFER_SIZE = 1048576; // 2**20 + + public static DataInputStream createDataInputStream(final InputStream stream) { + return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE)); } - public static DataInputStream createInputStream(final File file) throws FileNotFoundException { - return new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE)); + public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException { + return new FileInputStream(file); } protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException { diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java index d0c765cc4..6d96eda56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java @@ -30,9 +30,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; +import java.io.*; import java.util.*; /** @@ -65,25 +63,45 @@ import java.util.*; public class GCFHeader { final protected static Logger logger = Logger.getLogger(GCFHeader.class); - private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); + public final static int GCF_VERSION = 1; + public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes(); + public final static int FOOTER_START_MARKER = -1; + public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version + + final int version; + long footerPosition; final List alleles; final List strings; final List samples; final List> filters; public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + version = GCF_VERSION; + footerPosition = 0; this.alleles = linearize(allelesIn); this.strings = linearize(stringIn); this.samples = linearize(samplesIn); this.filters = null; // not used with this constructor } - public GCFHeader(DataInputStream inputStream) throws IOException { - byte[] headerTest = new byte[MAGIC_HEADER.length]; + public GCFHeader(FileInputStream fileInputStream) throws IOException { + DataInputStream inputStream = new DataInputStream(fileInputStream); + byte[] headerTest = new byte[GCF_FILE_START_MARKER.length]; inputStream.read(headerTest); - if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { - throw new UserException("Could not read GVCF file. MAGIC_HEADER missing. Saw " + headerTest); + if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) { + throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest)); } else { + version = inputStream.readInt(); + logger.info("Read GCF version " + version); + footerPosition = inputStream.readLong(); + logger.info("Read footer position of " + footerPosition); + long lastPos = fileInputStream.getChannel().position(); + logger.info(" Last position is " + lastPos); + + // seek to the footer + fileInputStream.getChannel().position(footerPosition); + if ( inputStream.readInt() != FOOTER_START_MARKER ) + throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker"); alleles = stringsToAlleles(readStrings(inputStream)); strings = readStrings(inputStream); samples = readStrings(inputStream); @@ -91,19 +109,28 @@ public class GCFHeader { logger.info(String.format("String map of %d elements", strings.size())); logger.info(String.format("Sample map of %d elements", samples.size())); filters = initializeFilterCache(); + fileInputStream.getChannel().position(lastPos); } } - public int write(final DataOutputStream outputStream) throws IOException { + public static int writeHeader(final DataOutputStream outputStream) throws IOException { int startBytes = outputStream.size(); - outputStream.write(MAGIC_HEADER); + outputStream.write(GCF_FILE_START_MARKER); + outputStream.writeInt(GCF_VERSION); + outputStream.writeLong(0); + return outputStream.size() - startBytes; + } + + public int writeFooter(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding write(outputStream, allelesToStrings(alleles)); write(outputStream, strings); write(outputStream, samples); return outputStream.size() - startBytes; } - public void write(DataOutputStream outputStream, List l) throws IOException { + private void write(DataOutputStream outputStream, List l) throws IOException { outputStream.writeInt(l.size()); for ( String elt : l ) outputStream.writeUTF(elt); } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java new file mode 100644 index 000000000..7ff6e27a2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * GCFWriter implementing the VCFWriter interface + * @author Your Name + * @since Date created + */ +public class GCFWriter extends IndexingVCFWriter { + final boolean skipGenotypes; + final FileOutputStream fileOutputStream; + final DataOutputStream dataOutputStream; + final GCFHeaderBuilder gcfHeaderBuilder; + int nbytes = 0; + VCFHeader header = null; + File location; + + // -------------------------------------------------------------------------------- + // + // Constructors + // + // -------------------------------------------------------------------------------- + + public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, enableOnTheFlyIndexing); + this.location = location; + this.skipGenotypes = doNotWriteGenotypes; + + // write the output + try { + fileOutputStream = new FileOutputStream(location); + dataOutputStream = createDataOutputStream(fileOutputStream); + gcfHeaderBuilder = new GCFHeaderBuilder(); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(location, e); + } + } + + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override + public void writeHeader(VCFHeader header) { + this.header = header; + try { + nbytes += GCFHeader.writeHeader(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e); + } + } + + @Override + public void add(VariantContext vc) { + super.add(vc); + GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes); + try { + nbytes += gcf.write(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e); + } + } + + @Override + public void close() { + // todo -- write out VCF header lines + GCFHeader gcfHeader = gcfHeaderBuilder.createHeader(); + try { + long headerPosition = nbytes; + nbytes += gcfHeader.writeFooter(dataOutputStream); + dataOutputStream.close(); + //System.out.println("Writing forward reference to " + headerPosition); + + RandomAccessFile raFile = new RandomAccessFile(location, "rw"); + raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET); + raFile.writeLong(headerPosition); + raFile.close(); + } catch ( IOException e ) { + throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e); + } + + super.close(); + } + + private static final DataOutputStream createDataOutputStream(final OutputStream stream) { + return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE)); + } + +} From 48461b34afc6af2a545f961ac3563b7b0a602725 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 8 Sep 2011 15:01:13 -0400 Subject: [PATCH 11/32] Added TYPE argument to print out VariantType --- .../sting/gatk/walkers/variantutils/VariantsToTable.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..bf9ff35de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker { getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } }); getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); From 06cb20f2a5fd2681a95613ae0b8b8a53c6002f4b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 12:56:45 -0400 Subject: [PATCH 13/32] Intermediate commit cleaning up scatter intervals -- Adding unit tests to ensure uniformity of intervals --- .../sting/utils/interval/IntervalUtils.java | 57 +- .../utils/interval/IntervalUtilsUnitTest.java | 1032 +++++++++-------- .../queue/extensions/gatk/GATKIntervals.scala | 130 +-- .../gatk/IntervalScatterFunction.scala | 4 +- .../gatk/GATKIntervalsUnitTest.scala | 10 +- 5 files changed, 658 insertions(+), 575 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f551e1368..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -334,24 +334,44 @@ public class IntervalUtils { } /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. + * Splits an interval list into multiple sublists. * @param locs The genome locs to split. * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. + * @return A list of lists of genome locs, split according to splits */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - int fileIndex = 0; + public static List> splitIntervalsToSubLists(List locs, List splits) { int locIndex = 1; int start = 0; + List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { - IntervalList intervalList = new IntervalList(fileHeader); + List curList = new ArrayList(); for (int i = start; i < stop; i++) - intervalList.add(toInterval(locs.get(i), locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); + curList.add(locs.get(i)); start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); } } @@ -361,17 +381,15 @@ public class IntervalUtils { * @param numParts Number of parts to split the locs into. * @return The stop points to split the genome locs. */ - public static List splitFixedIntervals(List locs, int numParts) { + public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - long locsSize = 0; - for (GenomeLoc loc: locs) - locsSize += loc.size(); - List splitPoints = new ArrayList(); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList(); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); Collections.sort(splitPoints); splitPoints.add(locs.size()); - return splitPoints; + return splitIntervalsToSubLists(locs, splitPoints); } private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { @@ -441,4 +459,11 @@ public class IntervalUtils { return merged; } } + + public static final long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bb892eec8..bd6bf9591 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest { private SAMFileHeader hg19Header; private GenomeLocParser hg19GenomeLocParser; private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return locs; + } @BeforeClass public void init() { @@ -54,511 +68,555 @@ public class IntervalUtilsUnitTest extends BaseTest { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); hg19GenomeLocParser = new GenomeLocParser(seq); hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false)); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(hg19Ref,ex); } } - @Test(expectedExceptions=UserException.class) - public void testMergeListsBySetOperatorNoOverlap() { - // a couple of lists we'll use for the testing - List listEveryTwoFromOne = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - else - listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 100); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 0); - } - - @Test - public void testMergeListsBySetOperatorAllOverlap() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 150); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 50); - } - - @Test - public void testMergeListsBySetOperator() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 5 == 0) { - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - } - - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 40); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 20); - } - - @Test - public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); - Assert.assertEquals((long)lengths.get("chr1"), 247249719); - Assert.assertEquals((long)lengths.get("chr2"), 242951149); - Assert.assertEquals((long)lengths.get("chr3"), 199501827); - Assert.assertEquals((long)lengths.get("chr20"), 62435964); - Assert.assertEquals((long)lengths.get("chrX"), 154913754); - } - - private List getLocs(String... intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList(); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; - } - - @Test - public void testParseIntervalArguments() { - Assert.assertEquals(getLocs().size(), 45); - Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); - Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); - } - - @Test - public void testIsIntervalFile() { - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); - - List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); - for (String extension: extensions) { - Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); } } - @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) - public void testMissingIntervalFile() { - IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { +// new IntervalSlicingTest(1, 0); +// new IntervalSlicingTest(2, 0.1); + new IntervalSlicingTest(5, 0.1); +// new IntervalSlicingTest(10, 0.1); +// new IntervalSlicingTest(67, 0.1); +// new IntervalSlicingTest(100, 0.1); +// new IntervalSlicingTest(500, 0.1); +// new IntervalSlicingTest(1000, 0.1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } - @Test - public void testFixedScatterIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + @Test(dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); - List files = testFiles("basic.", 3, ".intervals"); + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; - List locs = getLocs("chr1", "chr2", "chr3"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterFixedIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("less.", 3, ".intervals"); - - List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testSplitFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - IntervalUtils.splitFixedIntervals(locs, files.size()); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - } - @Test - public void testScatterFixedIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs2.get(0), chr1b); - Assert.assertEquals(locs3.get(0), chr2); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs3.get(0), chr2b); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3a); - Assert.assertEquals(locs3.get(0), chr3b); - } - - @Test - public void testScatterFixedIntervalsFile() { - List files = testFiles("sg.", 20, ".intervals"); - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - - int[] counts = { - 125, 138, 287, 291, 312, 105, 155, 324, - 295, 298, 141, 121, 285, 302, 282, 88, - 116, 274, 282, 248 -// 5169, 5573, 10017, 10567, 10551, -// 5087, 4908, 10120, 10435, 10399, -// 5391, 4735, 10621, 10352, 10654, -// 5227, 5256, 10151, 9649, 9825 - }; - - //String splitCounts = ""; - for (int lastIndex = 0, i = 0; i < splits.size(); i++) { - int splitIndex = splits.get(i); - int splitCount = (splitIndex - lastIndex); - //splitCounts += ", " + splitCount; - lastIndex = splitIndex; - Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); } - //System.out.println(splitCounts.substring(2)); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - int locIndex = 0; - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); - for (GenomeLoc parsedLoc: parsedLocs) - Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); - } - Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } - @Test - public void testScatterFixedIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - @Test - public void testScatterContigIntervalsOrder() { - List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr2); - Assert.assertEquals(locs2.get(0), chr1); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - - List files = testFiles("contig_basic.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("contig_less.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterContigIntervalsMoreFiles() { - List files = testFiles("contig_more.", 3, ".intervals"); - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); - } - - @Test - public void testScatterContigIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_start.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs1.get(1), chr1b); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_middle.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 2); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs2.get(1), chr2b); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("contig_split_end.", 3 ,".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3a); - Assert.assertEquals(locs3.get(1), chr3b); - } - - @Test - public void testScatterContigIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - private List testFiles(String prefix, int count, String suffix) { - ArrayList files = new ArrayList(); - for (int i = 1; i <= count; i++) { - files.add(createTempFile(prefix + i, suffix)); - } - return files; - } - - @DataProvider(name="unmergedIntervals") - public Object[][] getUnmergedIntervals() { - return new Object[][] { - new Object[] {"small_unmerged_picard_intervals.list"}, - new Object[] {"small_unmerged_gatk_intervals.list"} - }; - } - - @Test(dataProvider="unmergedIntervals") - public void testUnmergedIntervals(String unmergedIntervals) { - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); - Assert.assertEquals(locs.size(), 2); - - List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); - Assert.assertEquals(merged.size(), 1); - } +// @Test(expectedExceptions=UserException.class) +// public void testMergeListsBySetOperatorNoOverlap() { +// // a couple of lists we'll use for the testing +// List listEveryTwoFromOne = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 2 == 0) +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// else +// listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 100); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 0); +// } +// +// @Test +// public void testMergeListsBySetOperatorAllOverlap() { +// // a couple of lists we'll use for the testing +// List allSites = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 2 == 0) +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 150); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 50); +// } +// +// @Test +// public void testMergeListsBySetOperator() { +// // a couple of lists we'll use for the testing +// List allSites = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 5 == 0) { +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 40); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 20); +// } +// +// @Test +// public void testGetContigLengths() { +// Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); +// Assert.assertEquals((long)lengths.get("chr1"), 247249719); +// Assert.assertEquals((long)lengths.get("chr2"), 242951149); +// Assert.assertEquals((long)lengths.get("chr3"), 199501827); +// Assert.assertEquals((long)lengths.get("chr20"), 62435964); +// Assert.assertEquals((long)lengths.get("chrX"), 154913754); +// } +// +// @Test +// public void testParseIntervalArguments() { +// Assert.assertEquals(getLocs().size(), 45); +// Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); +// Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); +// } +// +// @Test +// public void testIsIntervalFile() { +// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); +// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); +// +// List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); +// for (String extension: extensions) { +// Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); +// } +// } +// +// @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) +// public void testMissingIntervalFile() { +// IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); +// } +// +// @Test +// public void testFixedScatterIntervalsBasic() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// +// List files = testFiles("basic.", 3, ".intervals"); +// +// List locs = getLocs("chr1", "chr2", "chr3"); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsLessFiles() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); +// +// List files = testFiles("less.", 3, ".intervals"); +// +// List locs = getLocs("chr1", "chr2", "chr3", "chr4"); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// Assert.assertEquals(locs3.get(1), chr4); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testSplitFixedIntervalsMoreFiles() { +// List files = testFiles("more.", 3, ".intervals"); +// List locs = getLocs("chr1", "chr2"); +// IntervalUtils.splitFixedIntervals(locs, files.size()); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testScatterFixedIntervalsMoreFiles() { +// List files = testFiles("more.", 3, ".intervals"); +// List locs = getLocs("chr1", "chr2"); +// List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// } +// @Test +// public void testScatterFixedIntervalsStart() { +// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); +// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); +// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1a); +// Assert.assertEquals(locs2.get(0), chr1b); +// Assert.assertEquals(locs3.get(0), chr2); +// Assert.assertEquals(locs3.get(1), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsMiddle() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); +// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2a); +// Assert.assertEquals(locs3.get(0), chr2b); +// Assert.assertEquals(locs3.get(1), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsEnd() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); +// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); +// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 2); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs1.get(1), chr2); +// Assert.assertEquals(locs2.get(0), chr3a); +// Assert.assertEquals(locs3.get(0), chr3b); +// } +// +// @Test +// public void testScatterFixedIntervalsFile() { +// List files = testFiles("sg.", 20, ".intervals"); +// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// +// int[] counts = { +// 125, 138, 287, 291, 312, 105, 155, 324, +// 295, 298, 141, 121, 285, 302, 282, 88, +// 116, 274, 282, 248 +//// 5169, 5573, 10017, 10567, 10551, +//// 5087, 4908, 10120, 10435, 10399, +//// 5391, 4735, 10621, 10352, 10654, +//// 5227, 5256, 10151, 9649, 9825 +// }; +// +// //String splitCounts = ""; +// for (int lastIndex = 0, i = 0; i < splits.size(); i++) { +// int splitIndex = splits.get(i); +// int splitCount = (splitIndex - lastIndex); +// //splitCounts += ", " + splitCount; +// lastIndex = splitIndex; +// Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); +// } +// //System.out.println(splitCounts.substring(2)); +// +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// int locIndex = 0; +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); +// for (GenomeLoc parsedLoc: parsedLocs) +// Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); +// } +// Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); +// } +// +// @Test +// public void testScatterFixedIntervalsMax() { +// List files = testFiles("sg.", 85, ".intervals"); +// List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); +// +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); +// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); +// } +// } +// +// @Test +// public void testScatterContigIntervalsOrder() { +// List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr2); +// Assert.assertEquals(locs2.get(0), chr1); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsBasic() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// +// List files = testFiles("contig_basic.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsLessFiles() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); +// +// List files = testFiles("contig_less.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// Assert.assertEquals(locs3.get(1), chr4); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testScatterContigIntervalsMoreFiles() { +// List files = testFiles("contig_more.", 3, ".intervals"); +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); +// } +// +// @Test +// public void testScatterContigIntervalsStart() { +// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); +// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); +// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("contig_split_start.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 2); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1a); +// Assert.assertEquals(locs1.get(1), chr1b); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsMiddle() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); +// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("contig_split_middle.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 2); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2a); +// Assert.assertEquals(locs2.get(1), chr2b); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsEnd() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); +// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); +// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); +// +// List files = testFiles("contig_split_end.", 3 ,".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3a); +// Assert.assertEquals(locs3.get(1), chr3b); +// } +// +// @Test +// public void testScatterContigIntervalsMax() { +// List files = testFiles("sg.", 85, ".intervals"); +// IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); +// +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); +// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); +// } +// } +// +// private List testFiles(String prefix, int count, String suffix) { +// ArrayList files = new ArrayList(); +// for (int i = 1; i <= count; i++) { +// files.add(createTempFile(prefix + i, suffix)); +// } +// return files; +// } +// +// @DataProvider(name="unmergedIntervals") +// public Object[][] getUnmergedIntervals() { +// return new Object[][] { +// new Object[] {"small_unmerged_picard_intervals.list"}, +// new Object[] {"small_unmerged_gatk_intervals.list"} +// }; +// } +// +// @Test(dataProvider="unmergedIntervals") +// public void testUnmergedIntervals(String unmergedIntervals) { +// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); +// Assert.assertEquals(locs.size(), 2); +// +// List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); +// Assert.assertEquals(merged.size(), 1); +// } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index aae5e438c..0fb997f43 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -1,65 +1,65 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.extensions.gatk - -import java.io.File -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource -import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} - -case class GATKIntervals(reference: File, intervals: List[String]) { - private lazy val referenceDataSource = new ReferenceDataSource(reference) - private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] - - lazy val samFileHeader = { - val header = new SAMFileHeader - header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) - header - } - - lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals, false) - Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) - } - - lazy val contigs = locs.map(_.getContig).distinct.toList - - def getSplits(size: Int) = { - splitsBySize.getOrElse(size, { - val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) - splitsBySize += size -> splits - splits - }) - } -} +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import java.io.File +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource +import net.sf.samtools.SAMFileHeader +import java.util.Collections +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} + +case class GATKIntervals(reference: File, intervals: List[String]) { + private lazy val referenceDataSource = new ReferenceDataSource(reference) +// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] + + lazy val samFileHeader = { + val header = new SAMFileHeader + header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) + header + } + + lazy val locs: java.util.List[GenomeLoc] = { + val parser = new GenomeLocParser(referenceDataSource.getReference) + val parsedLocs = + if (intervals.isEmpty) + GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList + else + IntervalUtils.parseIntervalArguments(parser, intervals, false) + Collections.sort(parsedLocs) + Collections.unmodifiableList(parsedLocs) + } + + lazy val contigs = locs.map(_.getContig).distinct.toList + +// def getSplits(size: Int) = { +// splitsBySize.getOrElse(size, { +// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) +// splitsBySize += size -> splits +// splits +// }) +// } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index d88d272b9..f65d5ab29 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, - gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) + val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b3a2d23ae..38abe24ef 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -53,8 +53,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) - Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) - Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) +// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) +// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } @Test(timeOut = 30000) @@ -65,7 +65,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. - Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -74,8 +74,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) - Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) - Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) +// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) +// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } @Test From 87dc5cfb24a8065a07f0fdec3d09a0943210e4ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 14:23:13 -0400 Subject: [PATCH 14/32] Whitespace cleanup --- public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b96923589..b66198713 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome @Override public int hashCode() { - return (int)( start << 16 + stop << 4 + contigIndex ); + return start << 16 | stop << 4 | contigIndex; } From c6436ee5f0f3359912e8210f99828a33680c745c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 14:24:29 -0400 Subject: [PATCH 15/32] Whitespace cleanup --- public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b66198713..ba4919175 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -307,6 +307,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome @Override public int hashCode() { return start << 16 | stop << 4 | contigIndex; + } From 3c8445b934c127581919d6be960ebc372be21342 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 14:25:37 -0400 Subject: [PATCH 16/32] Performance bugfix for GenomeLoc.hashcode -- old version overflowed so most GenomeLocs had 0 hashcode. Now uses or not plus to combine --- public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index ba4919175..b66198713 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -307,7 +307,6 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome @Override public int hashCode() { return start << 16 | stop << 4 | contigIndex; - } From 72536e5d6db56f495d560f1bcf2536c6896a49c0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 15:44:47 -0400 Subject: [PATCH 17/32] Done --- build.xml | 4 +- .../sting/utils/interval/IntervalUtils.java | 70 +- .../utils/interval/IntervalUtilsUnitTest.java | 1000 +++++++++-------- 3 files changed, 522 insertions(+), 552 deletions(-) diff --git a/build.xml b/build.xml index beca6bce0..efefdd438 100644 --- a/build.xml +++ b/build.xml @@ -855,8 +855,8 @@ - - + + diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 41cbbe59f..2cfcc19a9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -333,28 +333,6 @@ public class IntervalUtils { throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); } - /** - * Splits an interval list into multiple sublists. - * @param locs The genome locs to split. - * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @return A list of lists of genome locs, split according to splits - */ - public static List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; - int start = 0; - List> sublists = new ArrayList>(splits.size()); - for (Integer stop: splits) { - List curList = new ArrayList(); - for (int i = start; i < stop; i++) - curList.add(locs.get(i)); - start = stop; - sublists.add(curList); - } - - return sublists; - } - - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. @@ -384,39 +362,27 @@ public class IntervalUtils { public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); + final long locsSize = intervalSize(locs); - final List splitPoints = new ArrayList(); - addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); - Collections.sort(splitPoints); - splitPoints.add(locs.size()); - return splitIntervalsToSubLists(locs, splitPoints); - } + final double idealSplitSize = locsSize / numParts; + final List> splits = new ArrayList>(numParts); + final LinkedList remainingLocs = new LinkedList(locs); - private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { - if (numParts < 2) - return; - int halfParts = (numParts + 1) / 2; - Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts); - int splitIndex = splitPoint.first; - long splitSize = splitPoint.second; - splitPoints.add(splitIndex); - addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); - addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); - } + for ( int i = 0; i < numParts; i++ ) { + long splitSize = 0; + List split = new ArrayList(); + while ( ! remainingLocs.isEmpty() ) { + final GenomeLoc toAdd = remainingLocs.pop(); + splitSize += toAdd.size(); + split.add(toAdd); + final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size(); + if ( splitSize + (i % 2 == 0 ? 0 : nextEltSize) > idealSplitSize ) + break; + } + splits.add(split); + } - private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) { - int splitIndex = startIndex; - long splitSize = 0; - for (int i = 0; i < minLocs; i++) { - splitSize += locs.get(splitIndex).size(); - splitIndex++; - } - long halfSize = locsSize / 2; - while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) { - splitSize += locs.get(splitIndex).size(); - splitIndex++; - } - return new Pair(splitIndex, splitSize); + return splits; } /** diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bd6bf9591..4809f1b5c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -99,19 +100,26 @@ public class IntervalUtilsUnitTest extends BaseTest { @DataProvider(name = "intervalslicingdata") public Object[][] createTrees() { -// new IntervalSlicingTest(1, 0); -// new IntervalSlicingTest(2, 0.1); - new IntervalSlicingTest(5, 0.1); -// new IntervalSlicingTest(10, 0.1); -// new IntervalSlicingTest(67, 0.1); -// new IntervalSlicingTest(100, 0.1); -// new IntervalSlicingTest(500, 0.1); -// new IntervalSlicingTest(1000, 0.1); + new IntervalSlicingTest(1, 0); + new IntervalSlicingTest(2, 0.1); + new IntervalSlicingTest(3, 0.1); + new IntervalSlicingTest(7, 0.1); + new IntervalSlicingTest(10, 0.1); + new IntervalSlicingTest(31, 0.1); + new IntervalSlicingTest(67, 0.1); + new IntervalSlicingTest(100, 0.1); + new IntervalSlicingTest(127, 0.1); + // starts to become a bit less efficiency with larger cuts + new IntervalSlicingTest(500, 0.5); + new IntervalSlicingTest(1000, 1); + new IntervalSlicingTest(10000, 10); return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } @Test(dataProvider = "intervalslicingdata") public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + Set locsSet = new HashSet(hg19exomeIntervals); + Set notFoundSet = new HashSet(hg19exomeIntervals); List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); @@ -122,501 +130,497 @@ public class IntervalUtilsUnitTest extends BaseTest { for ( final List split : splits ) { long splitSize = IntervalUtils.intervalSize(split); double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); - logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); counter++; sumOfSplitSizes += splitSize; Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); + + for ( final GenomeLoc loc : split ) { + Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs"); + notFoundSet.remove(loc); + } } - Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); + Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals"); + Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in the split set"); } -// @Test(expectedExceptions=UserException.class) -// public void testMergeListsBySetOperatorNoOverlap() { -// // a couple of lists we'll use for the testing -// List listEveryTwoFromOne = new ArrayList(); -// List listEveryTwoFromTwo = new ArrayList(); -// -// // create the two lists we'll use -// for (int x = 1; x < 101; x++) { -// if (x % 2 == 0) -// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// else -// listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// } -// -// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); -// Assert.assertEquals(ret.size(), 100); -// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); -// Assert.assertEquals(ret.size(), 0); -// } -// -// @Test -// public void testMergeListsBySetOperatorAllOverlap() { -// // a couple of lists we'll use for the testing -// List allSites = new ArrayList(); -// List listEveryTwoFromTwo = new ArrayList(); -// -// // create the two lists we'll use -// for (int x = 1; x < 101; x++) { -// if (x % 2 == 0) -// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// } -// -// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); -// Assert.assertEquals(ret.size(), 150); -// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); -// Assert.assertEquals(ret.size(), 50); -// } -// -// @Test -// public void testMergeListsBySetOperator() { -// // a couple of lists we'll use for the testing -// List allSites = new ArrayList(); -// List listEveryTwoFromTwo = new ArrayList(); -// -// // create the two lists we'll use -// for (int x = 1; x < 101; x++) { -// if (x % 5 == 0) { -// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// } -// } -// -// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); -// Assert.assertEquals(ret.size(), 40); -// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); -// Assert.assertEquals(ret.size(), 20); -// } -// -// @Test -// public void testGetContigLengths() { -// Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); -// Assert.assertEquals((long)lengths.get("chr1"), 247249719); -// Assert.assertEquals((long)lengths.get("chr2"), 242951149); -// Assert.assertEquals((long)lengths.get("chr3"), 199501827); -// Assert.assertEquals((long)lengths.get("chr20"), 62435964); -// Assert.assertEquals((long)lengths.get("chrX"), 154913754); -// } -// -// @Test -// public void testParseIntervalArguments() { -// Assert.assertEquals(getLocs().size(), 45); -// Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); -// Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); -// } -// -// @Test -// public void testIsIntervalFile() { -// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); -// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); -// -// List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); -// for (String extension: extensions) { -// Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); -// } -// } -// -// @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) -// public void testMissingIntervalFile() { -// IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); -// } -// -// @Test -// public void testFixedScatterIntervalsBasic() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// -// List files = testFiles("basic.", 3, ".intervals"); -// -// List locs = getLocs("chr1", "chr2", "chr3"); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterFixedIntervalsLessFiles() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); -// -// List files = testFiles("less.", 3, ".intervals"); -// -// List locs = getLocs("chr1", "chr2", "chr3", "chr4"); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// Assert.assertEquals(locs3.get(1), chr4); -// } -// -// @Test(expectedExceptions=UserException.BadArgumentValue.class) -// public void testSplitFixedIntervalsMoreFiles() { -// List files = testFiles("more.", 3, ".intervals"); -// List locs = getLocs("chr1", "chr2"); -// IntervalUtils.splitFixedIntervals(locs, files.size()); -// } -// -// @Test(expectedExceptions=UserException.BadArgumentValue.class) -// public void testScatterFixedIntervalsMoreFiles() { -// List files = testFiles("more.", 3, ".intervals"); -// List locs = getLocs("chr1", "chr2"); -// List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// } -// @Test -// public void testScatterFixedIntervalsStart() { -// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); -// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); -// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// List locs = getLocs(intervals); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1a); -// Assert.assertEquals(locs2.get(0), chr1b); -// Assert.assertEquals(locs3.get(0), chr2); -// Assert.assertEquals(locs3.get(1), chr3); -// } -// -// @Test -// public void testScatterFixedIntervalsMiddle() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); -// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// List locs = getLocs(intervals); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2a); -// Assert.assertEquals(locs3.get(0), chr2b); -// Assert.assertEquals(locs3.get(1), chr3); -// } -// -// @Test -// public void testScatterFixedIntervalsEnd() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); -// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); -// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// List locs = getLocs(intervals); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 2); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs1.get(1), chr2); -// Assert.assertEquals(locs2.get(0), chr3a); -// Assert.assertEquals(locs3.get(0), chr3b); -// } -// -// @Test -// public void testScatterFixedIntervalsFile() { -// List files = testFiles("sg.", 20, ".intervals"); -// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// -// int[] counts = { -// 125, 138, 287, 291, 312, 105, 155, 324, -// 295, 298, 141, 121, 285, 302, 282, 88, -// 116, 274, 282, 248 -//// 5169, 5573, 10017, 10567, 10551, -//// 5087, 4908, 10120, 10435, 10399, -//// 5391, 4735, 10621, 10352, 10654, -//// 5227, 5256, 10151, 9649, 9825 -// }; -// -// //String splitCounts = ""; -// for (int lastIndex = 0, i = 0; i < splits.size(); i++) { -// int splitIndex = splits.get(i); -// int splitCount = (splitIndex - lastIndex); -// //splitCounts += ", " + splitCount; -// lastIndex = splitIndex; -// Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); -// } -// //System.out.println(splitCounts.substring(2)); -// -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// int locIndex = 0; -// for (int i = 0; i < files.size(); i++) { -// String file = files.get(i).toString(); -// List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); -// Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); -// for (GenomeLoc parsedLoc: parsedLocs) -// Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); -// } -// Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); -// } -// -// @Test -// public void testScatterFixedIntervalsMax() { -// List files = testFiles("sg.", 85, ".intervals"); -// List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); -// -// for (int i = 0; i < files.size(); i++) { -// String file = files.get(i).toString(); -// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); -// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); -// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); -// } -// } -// -// @Test -// public void testScatterContigIntervalsOrder() { -// List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr2); -// Assert.assertEquals(locs2.get(0), chr1); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsBasic() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// -// List files = testFiles("contig_basic.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsLessFiles() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); -// -// List files = testFiles("contig_less.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// Assert.assertEquals(locs3.get(1), chr4); -// } -// -// @Test(expectedExceptions=UserException.BadArgumentValue.class) -// public void testScatterContigIntervalsMoreFiles() { -// List files = testFiles("contig_more.", 3, ".intervals"); -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); -// } -// -// @Test -// public void testScatterContigIntervalsStart() { -// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); -// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); -// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("contig_split_start.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 2); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1a); -// Assert.assertEquals(locs1.get(1), chr1b); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsMiddle() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); -// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("contig_split_middle.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 2); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2a); -// Assert.assertEquals(locs2.get(1), chr2b); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsEnd() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); -// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); -// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); -// -// List files = testFiles("contig_split_end.", 3 ,".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3a); -// Assert.assertEquals(locs3.get(1), chr3b); -// } -// -// @Test -// public void testScatterContigIntervalsMax() { -// List files = testFiles("sg.", 85, ".intervals"); -// IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); -// -// for (int i = 0; i < files.size(); i++) { -// String file = files.get(i).toString(); -// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); -// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); -// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); -// } -// } -// -// private List testFiles(String prefix, int count, String suffix) { -// ArrayList files = new ArrayList(); -// for (int i = 1; i <= count; i++) { -// files.add(createTempFile(prefix + i, suffix)); -// } -// return files; -// } -// -// @DataProvider(name="unmergedIntervals") -// public Object[][] getUnmergedIntervals() { -// return new Object[][] { -// new Object[] {"small_unmerged_picard_intervals.list"}, -// new Object[] {"small_unmerged_gatk_intervals.list"} -// }; -// } -// -// @Test(dataProvider="unmergedIntervals") -// public void testUnmergedIntervals(String unmergedIntervals) { -// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); -// Assert.assertEquals(locs.size(), 2); -// -// List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); -// Assert.assertEquals(merged.size(), 1); -// } + @Test(expectedExceptions=UserException.class) + public void testMergeListsBySetOperatorNoOverlap() { + // a couple of lists we'll use for the testing + List listEveryTwoFromOne = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + else + listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 100); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 0); + } + + @Test + public void testMergeListsBySetOperatorAllOverlap() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 150); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 50); + } + + @Test + public void testMergeListsBySetOperator() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 5 == 0) { + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 40); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 20); + } + + @Test + public void testGetContigLengths() { + Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); + Assert.assertEquals((long)lengths.get("chr1"), 247249719); + Assert.assertEquals((long)lengths.get("chr2"), 242951149); + Assert.assertEquals((long)lengths.get("chr3"), 199501827); + Assert.assertEquals((long)lengths.get("chr20"), 62435964); + Assert.assertEquals((long)lengths.get("chrX"), 154913754); + } + + @Test + public void testParseIntervalArguments() { + Assert.assertEquals(getLocs().size(), 45); + Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); + Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); + } + + @Test + public void testIsIntervalFile() { + Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); + Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); + + List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); + for (String extension: extensions) { + Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); + } + } + + @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) + public void testMissingIntervalFile() { + IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); + } + + @Test + public void testFixedScatterIntervalsBasic() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + + List files = testFiles("basic.", 3, ".intervals"); + + List locs = getLocs("chr1", "chr2", "chr3"); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterFixedIntervalsLessFiles() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); + + List files = testFiles("less.", 3, ".intervals"); + + List locs = getLocs("chr1", "chr2", "chr3", "chr4"); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3); + Assert.assertEquals(locs3.get(0), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testSplitFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.splitFixedIntervals(locs, files.size()); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files); + } + @Test + public void testScatterFixedIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs2.get(0), chr1b); + Assert.assertEquals(locs3.get(0), chr2); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs3.get(0), chr2b); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3a); + Assert.assertEquals(locs3.get(0), chr3b); + } + + @Test + public void testScatterFixedIntervalsFile() { + List files = testFiles("sg.", 20, ".intervals"); + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + + int[] counts = { + 125, 138, 287, 291, 312, 105, 155, 324, + 295, 298, 141, 121, 285, 302, 282, 88, + 116, 274, 282, 248 +// 5169, 5573, 10017, 10567, 10551, +// 5087, 4908, 10120, 10435, 10399, +// 5391, 4735, 10621, 10352, 10654, +// 5227, 5256, 10151, 9649, 9825 + }; + + //String splitCounts = ""; + for (int i = 0; i < splits.size(); i++) { + long splitCount = splits.get(i).size(); + Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + } + //System.out.println(splitCounts.substring(2)); + + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + int locIndex = 0; + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); + for (GenomeLoc parsedLoc: parsedLocs) + Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); + } + Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + } + + @Test + public void testScatterFixedIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + + @Test + public void testScatterContigIntervalsOrder() { + List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr2); + Assert.assertEquals(locs2.get(0), chr1); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsBasic() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + + List files = testFiles("contig_basic.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsLessFiles() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); + + List files = testFiles("contig_less.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterContigIntervalsMoreFiles() { + List files = testFiles("contig_more.", 3, ".intervals"); + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); + } + + @Test + public void testScatterContigIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("contig_split_start.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs1.get(1), chr1b); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("contig_split_middle.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 2); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs2.get(1), chr2b); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); + + List files = testFiles("contig_split_end.", 3 ,".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3a); + Assert.assertEquals(locs3.get(1), chr3b); + } + + @Test + public void testScatterContigIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + + private List testFiles(String prefix, int count, String suffix) { + ArrayList files = new ArrayList(); + for (int i = 1; i <= count; i++) { + files.add(createTempFile(prefix + i, suffix)); + } + return files; + } + + @DataProvider(name="unmergedIntervals") + public Object[][] getUnmergedIntervals() { + return new Object[][] { + new Object[] {"small_unmerged_picard_intervals.list"}, + new Object[] {"small_unmerged_gatk_intervals.list"} + }; + } + + @Test(dataProvider="unmergedIntervals") + public void testUnmergedIntervals(String unmergedIntervals) { + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); + Assert.assertEquals(locs.size(), 2); + + List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + Assert.assertEquals(merged.size(), 1); + } } From 2316b6aad3e81cc0cd88980acd73d716fd4cdb2d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 12 Sep 2011 22:02:42 -0400 Subject: [PATCH 18/32] Trying to fix problems with S3 uploading behind firewalls -- Cannot reproduce the very long waits reported by some users. -- Fixed problem that exception might result in an undeleted file, which is now fixed with deleteOnExit() --- .../sting/gatk/phonehome/GATKRunReport.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 4d94130a8..70307380b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -293,15 +293,16 @@ public class GATKRunReport { * That is, postReport() is guarenteed not to fail for any reason. */ private File postReportToLocalDisk(File rootDir) { + String filename = getID() + ".report.xml.gz"; + File file = new File(rootDir, filename); try { - String filename = getID() + ".report.xml.gz"; - File file = new File(rootDir, filename); postReportToFile(file); logger.debug("Wrote report to " + file); return file; } catch ( Exception e ) { // we catch everything, and no matter what eat the error exceptDuringRunReport("Couldn't read report file", e); + file.delete(); return null; } } @@ -312,6 +313,7 @@ public class GATKRunReport { File localFile = postReportToLocalDisk(new File("./")); logger.debug("Generating GATK report to AWS S3 based on local file " + localFile); if ( localFile != null ) { // we succeeded in creating the local file + localFile.deleteOnExit(); try { // stop us from printing the annoying, and meaningless, mime types warning Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); @@ -342,8 +344,6 @@ public class GATKRunReport { exceptDuringRunReport("Couldn't calculate MD5", e); } catch ( IOException e ) { exceptDuringRunReport("Couldn't read report file", e); - } finally { - localFile.delete(); } } } From edf29d0616c576ece9a99af23cd42a54feb83e87 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 12 Sep 2011 22:16:52 -0400 Subject: [PATCH 20/32] Explicit info message about uploading S3 log --- .../org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 70307380b..5a7658031 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -338,6 +338,7 @@ public class GATKRunReport { //logger.info("Uploading " + localFile + " to AWS bucket"); S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); logger.debug("Uploaded to AWS: " + s3Object); + logger.info("Uploaded run statistics report to AWS S3"); } catch ( S3ServiceException e ) { exceptDuringRunReport("S3 exception occurred", e); } catch ( NoSuchAlgorithmException e ) { From bed78b47e090e19274273a1a552e0e40c82e0161 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 18 Sep 2011 20:18:18 -0400 Subject: [PATCH 21/32] Marginally better formating, with hours the default time --- public/R/queueJobReport.R | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R index a24d269c9..9f37aa038 100644 --- a/public/R/queueJobReport.R +++ b/public/R/queueJobReport.R @@ -12,14 +12,14 @@ if ( onCMDLine ) { inputFileName = args[1] outputPDF = args[2] } else { - #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" - inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" + inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt" + #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt" outputPDF = NA } -RUNTIME_UNITS = "(sec)" -ORIGINAL_UNITS_TO_SECONDS = 1/1000 +RUNTIME_UNITS = "(hours)" +ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60 # # Helper function to aggregate all of the jobs in the report across all tables @@ -33,7 +33,7 @@ allJobsFromReport <- function(report) { # # Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job # -plotJobsGantt <- function(gatkReport, sortOverall) { +plotJobsGantt <- function(gatkReport, sortOverall, includeText) { allJobs = allJobsFromReport(gatkReport) if ( sortOverall ) { title = "All jobs, by analysis, by start time" @@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) { } allJobs$index = 1:nrow(allJobs) minTime = min(allJobs$startTime) - allJobs$relStartTime = allJobs$startTime - minTime - allJobs$relDoneTime = allJobs$doneTime - minTime + allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS + allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts) maxRelTime = max(allJobs$relDoneTime) p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName)) - p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm"))) - p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) + p <- p + theme_bw() + p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm"))) + if ( includeText ) + p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) p <- p + xlim(0, maxRelTime * 1.1) p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS)) - p <- p + ylab("Job") + p <- p + ylab("Job number") p <- p + opts(title=title) print(p) } @@ -155,8 +157,8 @@ if ( ! is.na(outputPDF) ) { pdf(outputPDF, height=8.5, width=11) } -plotJobsGantt(gatkReportData, T) -plotJobsGantt(gatkReportData, F) +plotJobsGantt(gatkReportData, T, F) +plotJobsGantt(gatkReportData, F, F) plotProgressByTime(gatkReportData) for ( group in gatkReportData ) { plotGroup(group) From 4ad330008ddb29e163089afa2c264f62dccd4c3f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 19 Sep 2011 10:19:10 -0400 Subject: [PATCH 22/32] Final intervals cleanup -- No functional changes (my algorithm wouldn't work) -- Major structural cleanup (returning more basic data structures that allow us to development new algorithm) -- Unit tests for the efficiency of interval partitioning --- build.xml | 4 +- .../sting/utils/interval/IntervalUtils.java | 70 ++++++++++++++----- .../utils/interval/IntervalUtilsUnitTest.java | 63 ++++++++--------- 3 files changed, 82 insertions(+), 55 deletions(-) diff --git a/build.xml b/build.xml index 1196f32dc..e5ad9daf0 100644 --- a/build.xml +++ b/build.xml @@ -852,8 +852,8 @@ - - + + diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 2cfcc19a9..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -333,6 +333,28 @@ public class IntervalUtils { throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); } + /** + * Splits an interval list into multiple sublists. + * @param locs The genome locs to split. + * @param splits The stop points for the genome locs returned by splitFixedIntervals. + * @return A list of lists of genome locs, split according to splits + */ + public static List> splitIntervalsToSubLists(List locs, List splits) { + int locIndex = 1; + int start = 0; + List> sublists = new ArrayList>(splits.size()); + for (Integer stop: splits) { + List curList = new ArrayList(); + for (int i = start; i < stop; i++) + curList.add(locs.get(i)); + start = stop; + sublists.add(curList); + } + + return sublists; + } + + /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. @@ -362,27 +384,39 @@ public class IntervalUtils { public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - final long locsSize = intervalSize(locs); - final double idealSplitSize = locsSize / numParts; - final List> splits = new ArrayList>(numParts); - final LinkedList remainingLocs = new LinkedList(locs); + final List splitPoints = new ArrayList(); + addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); + Collections.sort(splitPoints); + splitPoints.add(locs.size()); + return splitIntervalsToSubLists(locs, splitPoints); + } - for ( int i = 0; i < numParts; i++ ) { - long splitSize = 0; - List split = new ArrayList(); - while ( ! remainingLocs.isEmpty() ) { - final GenomeLoc toAdd = remainingLocs.pop(); - splitSize += toAdd.size(); - split.add(toAdd); - final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size(); - if ( splitSize + (i % 2 == 0 ? 0 : nextEltSize) > idealSplitSize ) - break; - } - splits.add(split); + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { + if (numParts < 2) + return; + int halfParts = (numParts + 1) / 2; + Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts); + int splitIndex = splitPoint.first; + long splitSize = splitPoint.second; + splitPoints.add(splitIndex); + addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); + addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); + } + + private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) { + int splitIndex = startIndex; + long splitSize = 0; + for (int i = 0; i < minLocs; i++) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; } - - return splits; + long halfSize = locsSize / 2; + while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + return new Pair(splitIndex, splitSize); } /** diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 4809f1b5c..98b878d23 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -101,25 +100,18 @@ public class IntervalUtilsUnitTest extends BaseTest { @DataProvider(name = "intervalslicingdata") public Object[][] createTrees() { new IntervalSlicingTest(1, 0); - new IntervalSlicingTest(2, 0.1); - new IntervalSlicingTest(3, 0.1); - new IntervalSlicingTest(7, 0.1); - new IntervalSlicingTest(10, 0.1); - new IntervalSlicingTest(31, 0.1); - new IntervalSlicingTest(67, 0.1); - new IntervalSlicingTest(100, 0.1); - new IntervalSlicingTest(127, 0.1); - // starts to become a bit less efficiency with larger cuts - new IntervalSlicingTest(500, 0.5); + new IntervalSlicingTest(2, 1); + new IntervalSlicingTest(5, 1); + new IntervalSlicingTest(10, 1); + new IntervalSlicingTest(67, 1); + new IntervalSlicingTest(100, 1); + new IntervalSlicingTest(500, 1); new IntervalSlicingTest(1000, 1); - new IntervalSlicingTest(10000, 10); return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } - @Test(dataProvider = "intervalslicingdata") + @Test(enabled = true, dataProvider = "intervalslicingdata") public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { - Set locsSet = new HashSet(hg19exomeIntervals); - Set notFoundSet = new HashSet(hg19exomeIntervals); List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); @@ -134,15 +126,9 @@ public class IntervalUtilsUnitTest extends BaseTest { counter++; sumOfSplitSizes += splitSize; Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); - - for ( final GenomeLoc loc : split ) { - Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs"); - notFoundSet.remove(loc); - } } - Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals"); - Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in the split set"); + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } @Test(expectedExceptions=UserException.class) @@ -246,7 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("basic.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3"); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -271,20 +258,21 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); + Assert.assertEquals(locs3.size(), 2); Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3); - Assert.assertEquals(locs3.get(0), chr4); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); } @Test(expectedExceptions=UserException.BadArgumentValue.class) @@ -298,7 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsMoreFiles() { List files = testFiles("more.", 3, ".intervals"); List locs = getLocs("chr1", "chr2"); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); } @Test public void testScatterFixedIntervalsStart() { @@ -311,7 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -338,7 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -365,7 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -399,7 +391,7 @@ public class IntervalUtilsUnitTest extends BaseTest { //String splitCounts = ""; for (int i = 0; i < splits.size(); i++) { - long splitCount = splits.get(i).size(); + int splitCount = splits.get(i).size(); Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); } //System.out.println(splitCounts.substring(2)); @@ -420,7 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsMax() { List files = testFiles("sg.", 85, ".intervals"); - IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); + IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString(); From ca1b30e4a4822672803421278eb8301b14cff417 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 19 Sep 2011 10:29:06 -0400 Subject: [PATCH 23/32] Fix the -T argument in the DepthOfCoverage docs Add documentation for the RefSeqCodec, pointing users to the wiki page describing how to create the file --- .../coverage/DepthOfCoverageWalker.java | 9 ++++--- .../utils/codecs/refseq/RefSeqCodec.java | 24 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 86f97a36c..664c319ab 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,9 +63,12 @@ import java.util.*; *

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics - * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- * + *

+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) + *

*

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -93,7 +96,7 @@ import java.util.*; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T VariantEval \
+ *   -T DepthOfCoverage \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index d94d9ff84..f142fa5aa 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;
 
 /**
- * TODO FOR CHRIS HARTL
+ * Allows for reading in RefSeq information
  *
  * 

- * Codec Description + * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, + * strandedness of transcription. *

* *

- * See also: link to file specification + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here + * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq *

+ *

Usage

+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example + *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ * 
+ * + * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

+ * If you want to define your own file for use, the format is (tab delimited): + * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) + * and exon frames, for example: + *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ * 
+ * for more information see here *

- * A BAM file containing exactly one sample. + * *

* * @author Mark DePristo From 3e93f246f7b8849a3126fab5e0757cfdee22e661 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 19 Sep 2011 11:40:45 -0400 Subject: [PATCH 24/32] Support for sample sets in AssignSomaticStatus -- Also cleaned up SampleUtils.getSamplesFromCommandLine() to return a set, not a list, and trim the sample names. --- .../sting/utils/SampleUtils.java | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index f9997bfd8..1b4703e4a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -190,11 +190,21 @@ public class SampleUtils { } - public static List getSamplesFromCommandLineInput(Collection sampleArgs) { + /** + * Returns a new set of samples, containing a final list of samples expanded from sampleArgs + * + * Each element E of sampleArgs can either be a literal sample name or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique sample names. + * + * @param sampleArgs + * @return + */ + public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our // sample list set, and treat the entries as if they had been specified on the command line. - List samplesFromFiles = new ArrayList(); + Set samplesFromFiles = new HashSet(); for (String SAMPLE_EXPRESSION : sampleArgs) { File sampleFile = new File(SAMPLE_EXPRESSION); @@ -203,7 +213,7 @@ public class SampleUtils { List lines = reader.readLines(); for (String line : lines) { - samplesFromFiles.add(line); + samplesFromFiles.add(line.trim()); } } catch (FileNotFoundException e) { samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample @@ -212,7 +222,8 @@ public class SampleUtils { return samplesFromFiles; } - return new ArrayList(); + + return new HashSet(); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { From 034b8685889a879eda0e7c1a001358a26845755a Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 19 Sep 2011 12:16:07 -0400 Subject: [PATCH 25/32] Revert "Fix the -T argument in the DepthOfCoverage docs" This reverts commit 0994efda998cf3a41b1a43696dbc852a441d5316. --- .../coverage/DepthOfCoverageWalker.java | 9 +++---- .../utils/codecs/refseq/RefSeqCodec.java | 24 ++++--------------- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 664c319ab..86f97a36c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,12 +63,9 @@ import java.util.*; *

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics + * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- *

- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *

- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) - *

+ * *

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -96,7 +93,7 @@ import java.util.*; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T DepthOfCoverage \
+ *   -T VariantEval \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index f142fa5aa..d94d9ff84 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,35 +12,19 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;
 
 /**
- * Allows for reading in RefSeq information
+ * TODO FOR CHRIS HARTL
  *
  * 

- * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, - * strandedness of transcription. + * Codec Description *

* *

- * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here - * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq + * See also: link to file specification *

- *

Usage

- * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example - *
- * -refSeqBinding:REFSEQ /path/to/refSeq.txt
- * 
- * - * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

- * If you want to define your own file for use, the format is (tab delimited): - * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) - * and exon frames, for example: - *
- * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
- * 
- * for more information see here *

- * + * A BAM file containing exactly one sample. *

* * @author Mark DePristo From 85626e7a5dbae8a263b8e2ff2e64bd25656d6e9c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Sep 2011 12:24:05 -0400 Subject: [PATCH 26/32] We no longer want people to use the August 2010 Dindel calls for indel realignment but instead Guillermo's new whole genome bi-allelic indel calls; updating the bundle accordingly. Also, there was some confusion by the 1000G data processing folks as to exactly what these indel files are, so I've renamed them so that it's clear. Wiki updated too. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 59c00b8cd..e8b8258c1 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -131,11 +131,11 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", "hapmap_3.3", b37, true, true)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf", - "1000G_indels_for_realignment", b37, true, false)) + addResource(new Resource("/humgen/1kg/processing/official_release/phase1/ALL.wgs.VQSR_consensus_biallelic.20101123.indels.sites.vcf", + "1000G_biallelic.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf", - "indels_mills_devine", b37, true, true)) + "Mills_Devine_2hit.indels", b37, true, true)) // // example call set for wiki tutorial From 8143def292a49844ab3ff302fbb00f5c866299f7 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 19 Sep 2011 10:29:06 -0400 Subject: [PATCH 27/32] Fix the -T argument in the DepthOfCoverage docs Add documentation for the RefSeqCodec, pointing users to the wiki page describing how to create the file --- .../coverage/DepthOfCoverageWalker.java | 9 ++++--- .../utils/codecs/refseq/RefSeqCodec.java | 24 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 86f97a36c..664c319ab 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,9 +63,12 @@ import java.util.*; *

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics - * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- * + *

+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) + *

*

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -93,7 +96,7 @@ import java.util.*; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T VariantEval \
+ *   -T DepthOfCoverage \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index d94d9ff84..f142fa5aa 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;
 
 /**
- * TODO FOR CHRIS HARTL
+ * Allows for reading in RefSeq information
  *
  * 

- * Codec Description + * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, + * strandedness of transcription. *

* *

- * See also: link to file specification + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here + * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq *

+ *

Usage

+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example + *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ * 
+ * + * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

+ * If you want to define your own file for use, the format is (tab delimited): + * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) + * and exon frames, for example: + *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ * 
+ * for more information see here *

- * A BAM file containing exactly one sample. + * *

* * @author Mark DePristo From 5e832254a4e024378f7fdee252abf7df9e289c6a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Sep 2011 13:28:41 -0400 Subject: [PATCH 28/32] Fixing ReadAndInterval overlap comments. --- .../sting/utils/sam/ReadUtils.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 62bbb0307..18fcdabf2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -118,31 +118,40 @@ public class ReadUtils { /** * This enum represents all the different ways in which a read can overlap an interval. * - * NO_OVERLAP: + * NO_OVERLAP_CONTIG: + * read and interval are in different contigs. + * + * NO_OVERLAP_LEFT: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * NO_OVERLAP_RIGHT: * the read does not overlap the interval. * * |----------------| (interval) * <----------------> (read) * - * LEFT_OVERLAP: + * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it * * |----------------| (interval) * <----------------> (read) * - * RIGHT_OVERLAP: + * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it * * |----------------| (interval) * <----------------> (read) * - * FULL_OVERLAP: + * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval * * |-----------| (interval) * <-------------------> (read) * - * CONTAINED: + * OVERLAP_CONTAINED: * the read starts and ends inside the interval * * |----------------| (interval) From ba150570f3d7747256f634a2828ab673a98953f7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Sep 2011 13:30:32 -0400 Subject: [PATCH 29/32] Updating to use new rod system syntax plus name change for CountRODs --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index e8b8258c1..036a77b58 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -300,9 +300,9 @@ class GATKResourcesBundle extends QScript { bamFile = bamIn } - class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRod with UNIVERSAL_GATK_ARGS { + class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRODs with UNIVERSAL_GATK_ARGS { //@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx") - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + this.rod :+= vcf this.reference_sequence = ref } @@ -313,7 +313,7 @@ class GATKResourcesBundle extends QScript { } class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", dbsnp) + this.variant = dbsnp this.select ++= List("\"dbSNPBuildID <= 129\"") this.reference_sequence = ref this.out = dbsnp129 From 080c9575470c505e10f7b09d59fa22fcb668867d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Sep 2011 13:53:08 -0400 Subject: [PATCH 30/32] Fixing contracts for SoftUnclippedEnd utils Now accepts reads that are entirely contained inside an insertion. --- .../broadinstitute/sting/utils/sam/ReadUtils.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 18fcdabf2..2de17db14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -667,7 +667,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(SAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -679,7 +679,7 @@ public class ReadUtils { return start; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { int stop = read.getUnclippedStart(); int shift = 0; @@ -695,6 +695,14 @@ public class ReadUtils { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + private static boolean readIsEntirelyInsertion(SAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + /** * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before * the alignment start of the read. From 56106d54ed620965aea0b39052de43c81671c817 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Sep 2011 14:00:00 -0400 Subject: [PATCH 31/32] Changing ReadUtils behavior to comply with GenomeLocParser Now the functions getRefCoordSoftUnclippedStart and getRefCoordSoftUnclippedEnd will return getUnclippedStart if the read is all contained within an insertion. Updated the contracts accordingly. This should give the same behavior as the GenomeLocParser now. --- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 2de17db14..5d3ef3086 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -667,7 +667,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(SAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -679,9 +679,13 @@ public class ReadUtils { return start; } - @Ensures({"(result >= read.getUnclippedStart() && result <= read.getUnclippedEnd()) || readIsEntirelyInsertion(read)"}) + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { int stop = read.getUnclippedStart(); + + if (readIsEntirelyInsertion(read)) + return stop; + int shift = 0; CigarOperator lastOperator = null; for (CigarElement cigarElement : read.getCigar().getCigarElements()) { From 61b89e236ab13b073a3572e983b6c730efd5331e Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Tue, 20 Sep 2011 00:14:35 -0400 Subject: [PATCH 32/32] To work around potential problem with invalid javax.mail 1.4.1 in ivy cache, added explicit javax.mail 1.4.4 along with build.xml code to remove 1.4.1. --- build.xml | 8 ++++++++ ivy.xml | 6 ++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/build.xml b/build.xml index e5ad9daf0..1f26e7b7a 100644 --- a/build.xml +++ b/build.xml @@ -163,6 +163,14 @@ + + + + diff --git a/ivy.xml b/ivy.xml index 115f4062a..f90b9a010 100644 --- a/ivy.xml +++ b/ivy.xml @@ -15,10 +15,8 @@ - - - - + +