diff --git a/.gitignore b/.gitignore index 9a20b68ca..65f111587 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ /*.bam /*.bai /*.bed -*.idx *~ /*.vcf /*.txt diff --git a/build.xml b/build.xml index bb02c1ff1..12ebfa18f 100644 --- a/build.xml +++ b/build.xml @@ -91,9 +91,8 @@ - - - + + @@ -675,8 +674,9 @@ - + + @@ -865,14 +865,18 @@ - - + + + + + + @@ -921,12 +925,17 @@ + + - + + + + @@ -1104,7 +1113,7 @@ - + @@ -1114,7 +1123,7 @@ - + @@ -1194,8 +1203,8 @@ - - + + @@ -1244,7 +1253,7 @@ listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter"> - + @@ -1287,7 +1296,7 @@ - + @@ -1331,9 +1340,9 @@ - + - + @@ -1442,4 +1451,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ivy.xml b/ivy.xml index 4bd6ad7b8..ed13af1c2 100644 --- a/ivy.xml +++ b/ivy.xml @@ -41,6 +41,8 @@ + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index a47e417c4..5016526c0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -55,7 +55,9 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Map; /** @@ -74,6 +76,12 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + /** + * This argument informs the prior 
probability of having an indel at a site. + */ + @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) + public double INDEL_HETEROZYGOSITY = 1.0/8000; + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; @@ -112,6 +120,29 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 6; + /** + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, + * see e.g. Watterson (1975) or Tajima (1996). + * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for k=1:N + * + * There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate, + * as for example when the ancestral status of the reference allele is not known. + * By using this argument, user can manually specify priors to be used for calling as a vector of doubles, with the following restrictions: + * a) User must specify 2N values, where N is the number of samples. + * b) Only diploid calls supported. + * c) Probability values are specified in double format, in linear space. + * d) No negative values allowed. + * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. + * f) If user-defined values add to more than one, an error will be produced. 
+ * + * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. + * -inputPrior 0.33 -inputPrior 0.33 + * for the single-sample diploid case. + */ + @Advanced + @Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false) + public List inputPrior = Collections.emptyList(); + /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we @@ -155,10 +186,6 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); - @Hidden - @Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false) - public PrintStream contaminationLog = null; - @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; @@ -170,15 +197,16 @@ public class StandardCallerArgumentCollection { this.alleles = SCAC.alleles; this.GenotypingMode = SCAC.GenotypingMode; this.heterozygosity = SCAC.heterozygosity; + this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY; this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION; this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE; - this.contaminationLog = SCAC.contaminationLog; this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; 
this.AFmodel = SCAC.AFmodel; + this.inputPrior = SCAC.inputPrior; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 43e929ac0..a3a9e50e9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -58,8 +59,12 @@ import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele). - * Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities + * + *

This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities(ref bases vs. bases of the alternate allele).

+ * + *

Caveat

+ *

The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

*/ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } @@ -86,13 +91,13 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot } for (Map el : alleleLikelihoodMap.getLikelihoodMapValues()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) - refQuals.add(-10.0*(double)el.get(a)); - else if (allAlleles.contains(a)) - altQuals.add(-10.0*(double)el.get(a)); + if (a.getMostLikelyAllele().isReference()) + refQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele())); + else if (allAlleles.contains(a.getMostLikelyAllele())) + altQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele())); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index eb3dc6959..64d45df02 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -65,9 +65,15 @@ import java.util.*; /** - * Allele count in genotypes, for each ALT allele, in the same order as listed; - * allele Frequency, for each ALT allele, in the same order as listed; total number - * of alleles in called genotypes. + * Allele counts and frequency for each ALT allele and total number of alleles in called genotypes + * + *

This annotation tool outputs the following: + * + *

    + *
  • Allele count in genotypes, for each ALT allele, in the same order as listed
  • + *
  • Allele Frequency, for each ALT allele, in the same order as listed
  • + *
  • Total number of alleles in called genotypes
  • + *

*/ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index dbb977ebf..366512119 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -57,14 +58,15 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; /** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 6/28/12 - */ - -/** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele) - * Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases + * + *

This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele).

+ * + *

Caveat

+ *

The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ * + * @author rpoplin + * @since 6/28/12 */ public class ClippingRankSumTest extends RankSumTest { @@ -83,12 +85,12 @@ public class ClippingRankSumTest extends RankSumTest { for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java index 5138ac9af..5c48417ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java @@ -70,10 +70,11 @@ import java.util.Map; /** * Total (unfiltered) depth over all samples. * - * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's + *

While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for * N samples with -dcov D is N * D + *

*/ public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 5acea12f6..1cf91f181 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; @@ -72,11 +73,11 @@ import java.util.Map; /** - * The depth of coverage of each VCF allele in this sample. + * The depth of coverage of each allele per sample * - * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this + *

The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the - * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of + * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of * REF and ALT fields) is the unfiltered count of all reads that carried with them the * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the * power I have to determine the genotype of the sample at this site, while the AD tells me how many times @@ -86,10 +87,12 @@ import java.util.Map; * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are - * many non-informatice reads. - * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation, + * many non-informative reads.

+ * + *

Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation, * one should not base assumptions about the underlying genotype based on it; - * instead, the genotype likelihoods (PLs) are what determine the genotype calls. + * instead, the genotype likelihoods (PLs) are what determine the genotype calls.

+ * */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { @@ -139,12 +142,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa } for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative() ) continue; // read is non-informative - if (!vc.getAlleles().contains(a)) + if (!vc.getAlleles().contains(a.getMostLikelyAllele())) continue; // sanity check - shouldn't be needed - alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); + alleleCounts.put(a.getMostLikelyAllele(), alleleCounts.get(a.getMostLikelyAllele()) + (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 14c785678..957eb1aba 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import cern.jet.math.Arithmetic; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -54,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -68,12 +70,19 @@ import java.util.*; /** - * Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation - * being seen on only the forward or only the reverse strand) in the reads? More bias is - * indicative of false positive calls. Note that the fisher strand test may not be - * calculated for certain complex indel cases or for multi-allelic sites. 
+ * Phred-scaled p-value using Fisher's Exact Test to detect strand bias + * + *

Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation + * being seen on only the forward or only the reverse strand) in the reads. More bias is + * indicative of false positive calls. + *

+ * + *

Caveat

+ *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

*/ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(FisherStrand.class); + private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; @@ -95,6 +104,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); +// logger.info("VC " + vc); +// printTable(table, 0.0); return pValueForBestTable(table, null); } else @@ -131,9 +142,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private Map annotationForOneTable(final double pValue) { final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs return Collections.singletonMap(FS, value); -// Map map = new HashMap(); -// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); -// return map; } public List getKeyNames() { @@ -192,7 +200,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private static void printTable(int[][] table, double pValue) { - System.out.printf("%d %d; %d %d : %f\n", table[0][0], table[0][1], table[1][0], table[1][1], pValue); + logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); } private static boolean rotateTable(int[][] table) { @@ -266,10 +274,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final Allele mostLikelyAllele = 
PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount); + updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); } } @@ -306,22 +314,31 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( read.isReducedRead() ) - return; final boolean matchesRef = allele.equals(ref, true); final boolean matchesAlt = allele.equals(alt, true); if ( matchesRef || matchesAlt ) { + final int row = matchesRef ? 0 : 1; - final boolean isFW = !read.getReadNegativeStrandFlag(); + if ( read.isStrandless() ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; + // ignore strandless reduced reads because they are always on the forward strand! 
+ if ( !read.isReducedRead() ) { - table[row][column] += representativeCount; + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + final int toAdd = Math.max(representativeCount / 2, 1); + table[row][0] += toAdd; + table[row][1] += toAdd; + } + } else { + // a normal read with an actual strand + final boolean isFW = !read.getReadNegativeStrandFlag(); + final int column = isFW ? 0 : 1; + table[row][column] += representativeCount; + } } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 48b3593c5..827e39c11 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -68,10 +68,16 @@ import java.util.Map; /** - * The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site + * GC content of the reference around the given site + * + *

The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.

+ * + *

Caveat

+ *

The window size used to calculate the GC content around the site is set by the tool used for annotation + * (currently UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Technical Document for each tool + * to find out what window size they use.

*/ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class GCContent extends InfoFieldAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -87,7 +93,7 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota public List getKeyNames() { return Arrays.asList("GC"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content within 20 bp +/- the variant")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content around the variant (see docs for window size details)")); } public boolean useZeroQualityReads() { return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index 703810025..43ec537a4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; @@ -68,9 +69,16 @@ import java.util.Map; /** - * 
Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium + * Hardy-Weinberg test for disequilibrium + * + *

This annotation calculates the Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium.

+ * + *

Caveats

+ *

This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

+ *

Right now we just ignore genotypes that are not confident, but this throws off our HW ratios. + * More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is het or hom var.

*/ -public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation { +public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation { private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index c25cb6820..4039241ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -50,6 +50,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.GenomeLoc; @@ -63,9 +64,16 @@ import java.util.List; import java.util.Map; /** - * Largest contiguous homopolymer run of the variant allele in either direction on the reference. Computed only for bi-allelic sites. + * Largest contiguous homopolymer run of the variant allele + * + *

Calculates the length of the largest contiguous homopolymer run of the variant allele in either direction on the reference.

+ * + *

Caveats

+ *

This can only be computed for bi-allelic sites.

+ *

This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

+ *

This needs to be computed in a more accurate manner. We currently look only at direct runs of the alternate allele adjacent to this position.

*/ -public class HomopolymerRun extends InfoFieldAnnotation { +public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalAnnotation { private boolean ANNOTATE_INDELS = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index 19f32bae0..ad974a083 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -65,13 +65,20 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** - * Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation - * versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is - * diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than - * the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios. + * Likelihood of being a Mendelian Violation + * + *

Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation + * versus the likelihood of the site transmitting according to mendelian rules.

+ * + *

Note that this annotation requires a valid ped file.

+ * + *

Caveat

+ *

This tool assumes that the organism is diploid. When multiple trios are present, the annotation is simply the maximum + * of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain + * sites and many trios.

*/ -public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { +public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; public static final String MVLR_KEY = "MVLR"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 8c401eecd..3873138a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -59,8 +60,12 @@ import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) - * Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities + * + *

This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele).

+ * + *

Caveat

+ *

The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

*/ public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation { @@ -88,13 +93,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn return; } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); // BUGBUG: There needs to be a comparable isUsableBase check here - if (a.isNoCall()) + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)el.getKey().getMappingQuality()); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)el.getKey().getMappingQuality()); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 80bbfc2e4..a3fbcc439 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -54,16 +55,14 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. @@ -72,6 +71,7 @@ import java.util.Map; * reads associated with the samples with polymorphic genotypes. */ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +// private final static Logger logger = Logger.getLogger(QualByDepth.class); public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -113,13 +113,37 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( depth == 0 ) return null; - double altAlleleLength = AverageAltAlleleLength.getMeanAltAlleleLength(vc); + final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); + QD = fixTooHighQD(QD); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } + /** + * The haplotype caller generates very high quality scores when multiple events are on the + * same haplotype. This causes some very good variants to have unusually high QD values, + * and VQSR will filter these out. 
This code looks at the QD value, and if it is above + * threshold we map it down to the mean high QD value, with some jittering + * + * // TODO -- remove me when HaplotypeCaller bubble caller is live + * + * @param QD the raw QD score + * @return a QD value + */ + private double fixTooHighQD(final double QD) { + if ( QD < MAX_QD_BEFORE_FIXING ) { + return QD; + } else { + return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + } + } + + private final static double MAX_QD_BEFORE_FIXING = 35; + private final static double IDEAL_HIGH_QD = 30; + private final static double JITTER_SIGMA = 3; + public List getKeyNames() { return Arrays.asList("QD"); } public List getDescriptions() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ec107512a..ef456824e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -183,6 +183,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR * @param headerLines */ public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { - useDithering = ! toolkit.getArguments().disableRandomization; + useDithering = ! 
toolkit.getArguments().disableDithering; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index ae0d2a87b..6ce4aab49 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -51,6 +51,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -65,8 +66,12 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). - * Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele + * + *

This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error.

+ * + *

Caveat

+ *

The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

*/ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { @@ -103,8 +108,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio } for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); @@ -119,9 +124,9 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio if (readPos > numAlignedBases / 2) readPos = numAlignedBases - (readPos + 1); - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)readPos); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)readPos); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index cede1e5ee..dd57c8ac6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -65,7 +66,9 @@ 
import java.util.Map; /** - * Fraction of reads containing spanning deletions at this site. + * Fraction of reads containing spanning deletions at this site + * + *

Note that this annotation is currently not compatible with HaplotypeCaller.

*/ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { @@ -86,10 +89,12 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn int deletions = 0; int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - deletions += pileup.getNumberOfDeletions(); - depth += pileup.getNumberOfElements(); + for ( final PileupElement p : sample.getValue().getBasePileup() ) { + final int actualSampleDepth = p.getRepresentativeCount(); + depth += actualSampleDepth; + if ( p.isDeletion() ) + deletions += actualSampleDepth; + } } Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index 2e0e759c2..332d18341 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -65,7 +65,14 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - +/** + * Annotates variants that are composed of tandem repeats + * + *

This tool outputs the number of times the tandem repeat unit is repeated, for each allele (including reference).

+ * + *

Caveat

+ *

This annotation is currently not compatible with HaplotypeCaller.

+ */ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation { private static final String STR_PRESENT = "STR"; private static final String REPEAT_UNIT_KEY = "RU"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index b3f5728a2..f8efd7c3f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -65,12 +65,21 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** - * Created by IntelliJ IDEA. - * User: rpoplin, lfran, ebanks - * Date: 11/14/11 + * Wittkowski transmission disequilibrium test + * + *

Test statistic from Wittkowski transmission disequilibrium test. + * The calculation is based on the following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT

+ * + *

Note that this annotation requires a valid ped file.

+ * + *

Caveat

+ *

This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).

+ * + * @author rpoplin, lfran, ebanks + * @since 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation { private Set trios = null; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java index 89b0bcf96..555c75deb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java @@ -50,7 +50,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.IndelUtils; @@ -62,8 +61,11 @@ import java.util.*; /** * Assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.) + * + *

This tool assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.). + * It also specifies whether the variant is multiallelic (>2 alleles).

*/ -public class VariantType extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class VariantType extends InfoFieldAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index 82d08da41..ad97dc008 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -65,7 +65,7 @@ import java.util.List; public class BQSRGatherer extends Gatherer { - private static final String EMPTY_INPUT_LIST = "list of inputs files is empty"; + private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; private static final String MISSING_OUTPUT_FILE = "missing output file name"; @Override @@ -80,6 +80,8 @@ public class BQSRGatherer extends Gatherer { RecalibrationReport generalReport = null; for (File input : inputs) { final RecalibrationReport inputReport = new RecalibrationReport(input); + if( inputReport.isEmpty() ) { continue; } + if (generalReport == null) generalReport = inputReport; else diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e1972334b..dde49b7db 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -95,14 +95,14 @@ import java.util.List; * *

* - *

Input

+ *

Input

*

* The input read data whose base quality scores need to be assessed. *

* A database of known polymorphic sites to skip over. *

* - *

Output

+ *

Output

*

* A GATK Report file with many tables: *

    @@ -116,7 +116,7 @@ import java.util.List; * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. *

    * - *

    Examples

    + *

    Examples

    *
      * java -Xmx4g -jar GenomeAnalysisTK.jar \
      *   -T BaseRecalibrator \
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
    index 5ab296a5f..0a4899f1c 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
    @@ -61,7 +61,7 @@ import java.util.List;
      * User: rpoplin
      * Date: Nov 27, 2009
      *
    - * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
    + * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker.
      * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
      */
     
    @@ -91,7 +91,7 @@ public class RecalibrationArgumentCollection {
          * If not provided, then no plots will be generated (useful for queue scatter/gathering).
          * However, we *highly* recommend that users generate these plots whenever possible for QC checking.
          */
    -    @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false)
    +    @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false, defaultToStdout = false)
         public File RECAL_PDF_FILE = null;
     
         /**
    @@ -131,14 +131,14 @@ public class RecalibrationArgumentCollection {
         public boolean RUN_WITHOUT_DBSNP = false;
     
         /**
    -     * CountCovariates and TableRecalibration accept a --solid_recal_mode  flag which governs how the recalibrator handles the
    +     * BaseRecalibrator accepts a --solid_recal_mode  flag which governs how the recalibrator handles the
          * reads which have had the reference inserted because of color space inconsistencies.
          */
         @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
         public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
     
         /**
    -     * CountCovariates and TableRecalibration accept a --solid_nocall_strategy  flag which governs how the recalibrator handles
    +     * BaseRecalibrator accepts a --solid_nocall_strategy  flag which governs how the recalibrator handles
          * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
          * their color space tag can not be recalibrated.
          */
    @@ -146,38 +146,38 @@ public class RecalibrationArgumentCollection {
         public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
     
         /**
    -     * The context covariate will use a context of this size to calculate it's covariate value for base mismatches
    +     * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
          */
    -    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
    +    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false)
         public int MISMATCHES_CONTEXT_SIZE = 2;
     
         /**
    -     * The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
    +     * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
          */
    -    @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
    +    @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false)
         public int INDELS_CONTEXT_SIZE = 3;
     
         /**
          * The cycle covariate will generate an error if it encounters a cycle greater than this value.
          * This argument is ignored if the Cycle covariate is not used.
          */
    -    @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
    +    @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false)
         public int MAXIMUM_CYCLE_VALUE = 500;
     
         /**
    -     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
    +     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off]
          */
         @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
         public byte MISMATCHES_DEFAULT_QUALITY = -1;
     
         /**
    -     * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
    +     * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on]
          */
         @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
         public byte INSERTIONS_DEFAULT_QUALITY = 45;
     
         /**
    -     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
    +     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on]
          */
         @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
         public byte DELETIONS_DEFAULT_QUALITY = 45;
    @@ -220,7 +220,7 @@ public class RecalibrationArgumentCollection {
         public String FORCE_PLATFORM = null;
     
         @Hidden
    -    @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
    +    @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
         public PrintStream RECAL_TABLE_UPDATE_LOG = null;
     
         /**
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
    index 5e6e2a8d9..9f33234cf 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
    @@ -178,7 +178,7 @@ public class RecalibrationEngine {
             final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable();
     
             // iterate over all values in the qual table
    -        for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) {
    +        for ( final NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) {
                 final int rgKey = leaf.keys[0];
                 final int eventIndex = leaf.keys[2];
                 final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex);
    @@ -206,7 +206,9 @@ public class RecalibrationEngine {
          */
         @Requires("! finalized")
         private RecalibrationTables mergeThreadLocalRecalibrationTables() {
    -        if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty");
    +        if ( recalibrationTablesList.isEmpty() ) {
    +            recalibrationTablesList.add( new RecalibrationTables(covariates, numReadGroups, maybeLogStream) );
    +        }
     
             RecalibrationTables merged = null;
             for ( final RecalibrationTables table : recalibrationTablesList ) {
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
    index fb11f6249..271617059 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
    @@ -47,6 +47,7 @@
     package org.broadinstitute.sting.gatk.walkers.bqsr;
     
     import org.broadinstitute.sting.commandline.*;
    +import org.broadinstitute.sting.gatk.CommandLineGATK;
     import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
     import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
     import org.broadinstitute.sting.gatk.filters.*;
    @@ -55,18 +56,27 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
     import org.broadinstitute.sting.gatk.report.GATKReportTable;
     import org.broadinstitute.sting.gatk.walkers.*;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
    +import org.broadinstitute.sting.utils.help.HelpConstants;
     import org.broadinstitute.sting.utils.recalibration.*;
     
     import java.io.*;
     
     /**
    + * Evaluate the performance of the base recalibration process
    + *
    + * 

    This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.

    + * + *

    Caveat

    + *

    This tool is currently experimental. We do not provide documentation nor support for its operation.

    + * */ - +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @PartitionBy(PartitionType.READ) public class RecalibrationPerformance extends RodWalker implements NanoSchedulable { - @Output(doc="Write output to this file", required = true) + @Output public PrintStream out; @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file") diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index 7f8b0dded..28a48c212 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -53,39 +53,155 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; * @since 6/15/12 */ public class BaseAndQualsCounts extends BaseCounts { - private final long[] sumInsertionQuals; - private final long[] sumDeletionQuals; - public BaseAndQualsCounts() { - super(); - this.sumInsertionQuals = new long[BaseIndex.values().length]; - this.sumDeletionQuals = new long[BaseIndex.values().length]; - // Java primitive arrays comes zero-filled, so no need to do it explicitly. 
+ private long sumInsertionQual_A = 0; + private long sumDeletionQual_A = 0; + private long sumInsertionQual_C = 0; + private long sumDeletionQual_C = 0; + private long sumInsertionQual_G = 0; + private long sumDeletionQual_G = 0; + private long sumInsertionQual_T = 0; + private long sumDeletionQual_T = 0; + private long sumInsertionQual_D = 0; + private long sumDeletionQual_D = 0; + private long sumInsertionQual_I = 0; + private long sumDeletionQual_I = 0; + private long sumInsertionQual_N = 0; + private long sumDeletionQual_N = 0; + + /* + * Increments the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + */ + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); } - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { + /* + * Increments the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + * @param isSoftClip true if is soft-clipped + */ + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { + // if we already have high quality bases, ignore low quality ones + if ( isLowQualBase && !isLowQuality() ) + return; + + // if this is a high quality base then remove any low quality bases and start from scratch + if ( !isLowQualBase && isLowQuality() ) { + if ( totalCount() > 0 ) + clear(); + setLowQuality(false); + 
} + final BaseIndex i = BaseIndex.byteToBase(base); - super.incr(i, baseQual); - sumInsertionQuals[i.index] += insQual; - sumDeletionQuals[i.index] += delQual; + super.incr(i, baseQual, baseMappingQual, isSoftClip); + switch (i) { + case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; + case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; + case G: sumInsertionQual_G += insQual; sumDeletionQual_G += delQual; break; + case T: sumInsertionQual_T += insQual; sumDeletionQual_T += delQual; break; + case D: sumInsertionQual_D += insQual; sumDeletionQual_D += delQual; break; + case I: sumInsertionQual_I += insQual; sumDeletionQual_I += delQual; break; + case N: sumInsertionQual_N += insQual; sumDeletionQual_N += delQual; break; + } } - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { + /* + * Decrements the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + */ + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); + } + + /* + * Decrements the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + * @param isSoftClip true if is soft-clipped + */ + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { + // if this is not the right type of base, ignore it + if ( isLowQualBase 
!= isLowQuality() ) + return; + final BaseIndex i = BaseIndex.byteToBase(base); - super.decr(i, baseQual); - sumInsertionQuals[i.index] -= insQual; - sumDeletionQuals[i.index] -= delQual; + super.decr(i, baseQual, baseMappingQual, isSoftClip); + switch (i) { + case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; + case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; + case G: sumInsertionQual_G -= insQual; sumDeletionQual_G -= delQual; break; + case T: sumInsertionQual_T -= insQual; sumDeletionQual_T -= delQual; break; + case D: sumInsertionQual_D -= insQual; sumDeletionQual_D -= delQual; break; + case I: sumInsertionQual_I -= insQual; sumDeletionQual_I -= delQual; break; + case N: sumInsertionQual_N -= insQual; sumDeletionQual_N -= delQual; break; + } } public byte averageInsertionQualsOfBase(final BaseIndex base) { - return getGenericAverageQualOfBase(base, sumInsertionQuals); + return (byte) (getInsertionQual(base) / countOfBase(base)); } public byte averageDeletionQualsOfBase(final BaseIndex base) { - return getGenericAverageQualOfBase(base, sumDeletionQuals); + return (byte) (getDeletionQual(base) / countOfBase(base)); } - private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) { - return (byte) (sumQuals[base.index] / countOfBase(base)); + private long getInsertionQual(final BaseIndex base) { + switch (base) { + case A: return sumInsertionQual_A; + case C: return sumInsertionQual_C; + case G: return sumInsertionQual_G; + case T: return sumInsertionQual_T; + case D: return sumInsertionQual_D; + case I: return sumInsertionQual_I; + case N: return sumInsertionQual_N; + default: throw new IllegalArgumentException(base.name()); + } + } + + private long getDeletionQual(final BaseIndex base) { + switch (base) { + case A: return sumDeletionQual_A; + case C: return sumDeletionQual_C; + case G: return sumDeletionQual_G; + case T: return sumDeletionQual_T; + case D: return sumDeletionQual_D; + 
case I: return sumDeletionQual_I; + case N: return sumDeletionQual_N; + default: throw new IllegalArgumentException(base.name()); + } + } + + /** + * Clears out all stored data in this object + */ + public void clear() { + super.clear(); + sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0; + sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 399cbd2a5..e1329db3b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import org.broadinstitute.sting.utils.MathUtils; /** @@ -62,70 +64,118 @@ import com.google.java.contract.Requires; public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - private final int[] counts; // keeps track of the base counts - private final long[] sumQuals; // keeps track of the quals of each base - private int totalCount = 0; // keeps track of total count since this is requested so often - public BaseCounts() { - counts = new int[BaseIndex.values().length]; - sumQuals = new long[BaseIndex.values().length]; - // Java primitive arrays comes zero-filled, so no need to do it explicitly. 
- } + private int count_A = 0; // keeps track of the base counts + private int sumQual_A = 0; // keeps track of the quals of each base + private int count_C = 0; + private int sumQual_C = 0; + private int count_G = 0; + private int sumQual_G = 0; + private int count_T = 0; + private int sumQual_T = 0; + private int count_D = 0; + private int sumQual_D = 0; + private int count_I = 0; + private int sumQual_I = 0; + private int count_N = 0; + private int sumQual_N = 0; + private int totalCount = 0; // keeps track of total count since this is requested so often + private int nSoftClippedBases = 0; + private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this + private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise + public static BaseCounts createWithCounts(int[] countsACGT) { BaseCounts baseCounts = new BaseCounts(); - baseCounts.counts[BaseIndex.A.index] = countsACGT[0]; - baseCounts.counts[BaseIndex.C.index] = countsACGT[1]; - baseCounts.counts[BaseIndex.G.index] = countsACGT[2]; - baseCounts.counts[BaseIndex.T.index] = countsACGT[3]; + baseCounts.count_A = countsACGT[0]; + baseCounts.count_C = countsACGT[1]; + baseCounts.count_G = countsACGT[2]; + baseCounts.count_T = countsACGT[3]; baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; return baseCounts; } @Requires("other != null") public void add(final BaseCounts other) { - for (final BaseIndex i : BaseIndex.values()) { - final int otherCount = other.counts[i.index]; - counts[i.index] += otherCount; - totalCount += otherCount; - } + this.count_A += other.count_A; + this.count_C += other.count_C; + this.count_G += other.count_G; + this.count_T += other.count_T; + this.count_D += other.count_D; + this.count_I += other.count_I; + this.count_N += other.count_N; + this.totalCount += other.totalCount; + this.nSoftClippedBases = other.nSoftClippedBases; + 
this.mappingQualities.addAll(other.mappingQualities); } @Requires("other != null") public void sub(final BaseCounts other) { - for (final BaseIndex i : BaseIndex.values()) { - final int otherCount = other.counts[i.index]; - counts[i.index] -= otherCount; - totalCount -= otherCount; - } + this.count_A -= other.count_A; + this.count_C -= other.count_C; + this.count_G -= other.count_G; + this.count_T -= other.count_T; + this.count_D -= other.count_D; + this.count_I -= other.count_I; + this.count_N -= other.count_N; + this.totalCount -= other.totalCount; + this.nSoftClippedBases -= other.nSoftClippedBases; + this.mappingQualities.removeAll(other.mappingQualities); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(final byte base) { - final BaseIndex i = BaseIndex.byteToBase(base); - counts[i.index]++; - totalCount++; + add(BaseIndex.byteToBase(base), 1); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final BaseIndex base, final byte qual) { - counts[base.index]++; - totalCount++; - sumQuals[base.index] += qual; + public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { + switch (base) { + case A: ++count_A; sumQual_A += qual; break; + case C: ++count_C; sumQual_C += qual; break; + case G: ++count_G; sumQual_G += qual; break; + case T: ++count_T; sumQual_T += qual; break; + case D: ++count_D; sumQual_D += qual; break; + case I: ++count_I; sumQual_I += qual; break; + case N: ++count_N; sumQual_N += qual; break; + } + ++totalCount; + nSoftClippedBases += isSoftclip ? 
1 : 0; + mappingQualities.add(mappingQuality); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(final byte base) { - final BaseIndex i = BaseIndex.byteToBase(base); - counts[i.index]--; - totalCount--; + add(BaseIndex.byteToBase(base), -1); + } + + private void add(final BaseIndex base, int amount) { + switch(base) { + case A: count_A += amount; break; + case C: count_C += amount; break; + case G: count_G += amount; break; + case T: count_T += amount; break; + case D: count_D += amount; break; + case I: count_I += amount; break; + case N: count_N += amount; break; + } + totalCount += amount; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final BaseIndex base, final byte qual) { - counts[base.index]--; - totalCount--; - sumQuals[base.index] -= qual; + public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { + switch (base) { + case A: --count_A; sumQual_A -= qual; break; + case C: --count_C; sumQual_C -= qual; break; + case G: --count_G; sumQual_G -= qual; break; + case T: --count_T; sumQual_T -= qual; break; + case D: --count_D; sumQual_D -= qual; break; + case I: --count_I; sumQual_I -= qual; break; + case N: --count_N; sumQual_N -= qual; break; + } + --totalCount; + nSoftClippedBases -= isSoftclip ? 
1 : 0; + mappingQualities.remove((Integer) mappingQuality); } @Ensures("result >= 0") @@ -135,7 +185,16 @@ import com.google.java.contract.Requires; @Ensures("result >= 0") public long getSumQuals(final BaseIndex base) { - return sumQuals[base.index]; + switch (base) { + case A: return sumQual_A; + case C: return sumQual_C; + case G: return sumQual_G; + case T: return sumQual_T; + case D: return sumQual_D; + case I: return sumQual_I; + case N: return sumQual_N; + default: throw new IllegalArgumentException(base.name()); + } } @Ensures("result >= 0") @@ -155,12 +214,21 @@ import com.google.java.contract.Requires; @Ensures("result >= 0") public int countOfBase(final BaseIndex base) { - return counts[base.index]; + switch (base) { + case A: return count_A; + case C: return count_C; + case G: return count_G; + case T: return count_T; + case D: return count_D; + case I: return count_I; + case N: return count_N; + default: throw new IllegalArgumentException(base.name()); + } } @Ensures("result >= 0") public long sumQualsOfBase(final BaseIndex base) { - return sumQuals[base.index]; + return getSumQuals(base); } @Ensures("result >= 0") @@ -168,12 +236,25 @@ import com.google.java.contract.Requires; return (byte) (sumQualsOfBase(base) / countOfBase(base)); } + @Ensures("result >= 0") + public int nSoftclips() { + return nSoftClippedBases; + } @Ensures("result >= 0") public int totalCount() { return totalCount; } + /** + * The RMS of the mapping qualities of all reads that contributed to this object + * + * @return the RMS of the mapping qualities of all reads that contributed to this object + */ + public double getRMS() { + return MathUtils.rms(mappingQualities); + } + /** * Given a base , it returns the proportional count of this base compared to all other bases * @@ -193,14 +274,14 @@ import com.google.java.contract.Requires; */ @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportion(final BaseIndex baseIndex) { - return (totalCount == 0) ? 
0.0 : (double)counts[baseIndex.index] / (double)totalCount; + return (totalCount == 0) ? 0.0 : (double)countOfBase(baseIndex) / (double)totalCount; } @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); for (final BaseIndex i : BaseIndex.values()) { - b.append(i.toString()).append("=").append(counts[i.index]).append(","); + b.append(i.toString()).append("=").append(countOfBase(i)).append(","); } return b.toString(); } @@ -209,22 +290,42 @@ import com.google.java.contract.Requires; return baseIndexWithMostCounts().getByte(); } + /** + * @return the base index for which the count is highest, including indel indexes + */ @Ensures("result != null") public BaseIndex baseIndexWithMostCounts() { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (final BaseIndex i : BaseIndex.values()) { - if (counts[i.index] > counts[maxI.index]) - maxI = i; - } - return maxI; + return baseIndexWithMostCounts(true); } + /** + * @return the base index for which the count is highest, excluding indel indexes + */ @Ensures("result != null") public BaseIndex baseIndexWithMostCountsWithoutIndels() { + return baseIndexWithMostCounts(false); + } + + /** + * Finds the base index with the most counts + * + * @param allowIndels should we allow base indexes representing indels? 
+ * @return non-null base index + */ + @Ensures("result != null") + protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + int maxCount = countOfBase(maxI); + for (final BaseIndex i : BaseIndex.values()) { - if (i.isNucleotide() && counts[i.index] > counts[maxI.index]) + if ( !allowIndels && !i.isNucleotide() ) + continue; + + final int myCount = countOfBase(i); + if (myCount > maxCount) { maxI = i; + maxCount = myCount; + } } return maxI; } @@ -235,27 +336,41 @@ import com.google.java.contract.Requires; @Ensures("result != null") public BaseIndex baseIndexWithMostProbability() { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (final BaseIndex i : BaseIndex.values()) { - if (sumQuals[i.index] > sumQuals[maxI.index]) - maxI = i; - } - return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts()); + return baseIndexWithMostProbability(true); } @Ensures("result != null") public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { + return baseIndexWithMostProbability(false); + } + + /** + * Finds the base index with the most probability + * + * @param allowIndels should we allow base indexes representing indels? + * @return non-null base index + */ + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + long maxSum = getSumQuals(maxI); + for (final BaseIndex i : BaseIndex.values()) { - if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index]) + if ( !allowIndels && !i.isNucleotide() ) + continue; + + final long mySum = getSumQuals(i); + if (mySum > maxSum) { maxI = i; + maxSum = mySum; + } } - return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); + return (maxSum > 0L ? 
maxI : baseIndexWithMostCounts(allowIndels)); } @Ensures("result >=0") public int totalCountWithoutIndels() { - return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index]; + return totalCount - countOfBase(BaseIndex.D) - countOfBase(BaseIndex.I); } /** @@ -268,10 +383,29 @@ import com.google.java.contract.Requires; @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportionWithoutIndels(final BaseIndex base) { final int total = totalCountWithoutIndels(); - return (total == 0) ? 0.0 : (double)counts[base.index] / (double)total; + return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total; } - public int[] countsArray() { - return counts.clone(); + /** + * @return true if this instance represents low quality bases + */ + public boolean isLowQuality() { return isLowQuality; } + + /** + * Sets the low quality value + * + * @param value true if this instance represents low quality bases false otherwise + */ + public void setLowQuality(final boolean value) { isLowQuality = value; } + + /** + * Clears out all stored data in this object + */ + public void clear() { + count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0; + sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0; + totalCount = 0; + nSoftClippedBases = 0; + mappingQualities.clear(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java index e41878a0b..665e3e7ce 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java @@ -121,7 +121,7 @@ public enum BaseIndex { * * @return whether or not it is a nucleotide, given the definition above */ - public boolean isNucleotide() { + public final boolean isNucleotide() { 
return !isIndel(); } @@ -130,7 +130,7 @@ public enum BaseIndex { * * @return true for I or D, false otherwise */ - public boolean isIndel() { + public final boolean isIndel() { return this == D || this == I; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java index a8a765ddc..36da92b4f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java @@ -69,15 +69,15 @@ import java.util.Map; *

    * This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files. *

    - *

    Input

    + *

    Input

    *

    * Two BAM files (using -I) with different read group IDs *

    - *

    Output

    + *

    Output

    *

    * [Output description] *

    - *

    Examples

    + *

    Examples

    *
      *    java
      *      -jar GenomeAnalysisTK.jar
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java
    index bd7bdfe89..22ea78521 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java
    @@ -46,10 +46,12 @@
     
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
    +import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet;
    +import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
     import org.broadinstitute.sting.utils.*;
     
     import java.util.Collection;
    -import java.util.TreeSet;
    +
     
     /**
      * A stash of regions that must be kept uncompressed in all samples
    @@ -61,7 +63,7 @@ import java.util.TreeSet;
      * Date: 10/15/12
      * Time: 4:08 PM
      */
    -public class CompressionStash extends TreeSet {
    +public class CompressionStash extends ObjectAVLTreeSet {
         public CompressionStash() {
             super();
         }
    @@ -75,7 +77,7 @@ public class CompressionStash extends TreeSet {
          */
         @Override
         public boolean add(final FinishedGenomeLoc insertLoc) {
    -        TreeSet removedLocs = new TreeSet();
    +        ObjectSortedSet removedLocs = new ObjectAVLTreeSet();
             for (FinishedGenomeLoc existingLoc : this) {
                 if (existingLoc.isPast(insertLoc)) {
                     break;                                          // if we're past the loc we're done looking for overlaps.
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
    index 83efaa254..38b9e957b 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
    @@ -46,10 +46,10 @@
     
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
    +import it.unimi.dsi.fastutil.objects.ObjectArrayList;
     import org.broadinstitute.sting.utils.MathUtils;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     
    -import java.util.LinkedList;
     
     /**
      * The element that describes the header of the sliding window.
    @@ -62,9 +62,9 @@ public class HeaderElement {
         private BaseAndQualsCounts consensusBaseCounts;                                                                     // How many A,C,G,T (and D's) are in this site.
         private BaseAndQualsCounts filteredBaseCounts;                                                                      // How many A,C,G,T (and D's) were filtered out in this site.
         private int insertionsToTheRight;                                                                                   // How many reads in this site had insertions to the immediate right
    -    private int nSoftClippedBases;                                                                                      // How many bases in this site came from soft clipped bases
         private int location;                                                                                               // Genome location of this site (the sliding window knows which contig we're at
    -    private LinkedList mappingQuality;                                                                         // keeps the mapping quality of each read that contributed to this element (site)
    +
    +    protected static final int MIN_COUNT_FOR_USING_PVALUE = 2;
     
         public int getLocation() {
             return location;
    @@ -85,7 +85,7 @@ public class HeaderElement {
          * @param location the reference location for the new element
          */
         public HeaderElement(final int location) {
    -        this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList());
    +        this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location);
         }
     
         /**
    @@ -95,7 +95,7 @@ public class HeaderElement {
          * @param location the reference location for the new element
          */
         public HeaderElement(final int location, final int insertionsToTheRight) {
    -        this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new LinkedList());
    +        this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location);
         }
     
         /**
    @@ -104,55 +104,67 @@ public class HeaderElement {
          * @param consensusBaseCounts  the BaseCounts object for the running consensus synthetic read
          * @param filteredBaseCounts   the BaseCounts object for the filtered data synthetic read
          * @param insertionsToTheRight number of insertions to the right of this HeaderElement
    -     * @param nSoftClippedBases    number of softclipped bases of this HeaderElement
          * @param location             the reference location of this reference element
    -     * @param mappingQuality       the list of mapping quality values of all reads that contributed to this
          *                             HeaderElement
          */
    -    public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList mappingQuality) {
    +    public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int location) {
             this.consensusBaseCounts = consensusBaseCounts;
             this.filteredBaseCounts = filteredBaseCounts;
             this.insertionsToTheRight = insertionsToTheRight;
    -        this.nSoftClippedBases = nSoftClippedBases;
             this.location = location;
    -        this.mappingQuality = mappingQuality;
         }
     
         /**
          * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
          * by insertion, deletion and mismatches.
          *
    +     * @param minVariantPvalue       min p-value for deciding that a position is or is not variable due to mismatches
    +     * @param minVariantProportion   min proportion for deciding that a position is or is not variable due to mismatches
    +     * @param minIndelProportion     min proportion for deciding that a position is or is not variable due to indels
          * @return true if site is variant by any definition. False otherwise.
          */
    -    public boolean isVariant(double minVariantProportion, double minIndelProportion) {
    -        return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
    +    public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) {
    +        return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
         }
     
         /**
          * Adds a new base to the HeaderElement updating all counts accordingly
          *
    -     * @param base           the base to add
    +     * @param base               the base to add
          * @param baseQual           the base quality
    +     * @param insQual            the base insertion quality
    +     * @param delQual            the base deletion quality
          * @param baseMappingQuality the mapping quality of the read this base belongs to
    +     * @param minBaseQual        the minimum base qual allowed to be a good base
    +     * @param minMappingQual     the minimum mapping qual allowed to be a good read
    +     * @param isSoftClipped      true if the base is soft-clipped in the original read
          */
         public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
    -        if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
    -            consensusBaseCounts.incr(base, baseQual, insQual, delQual);                                                 // If the base passes filters, it is included in the consensus base counts
    +        // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
    +        if ( baseMappingQuality >= minMappingQual )
    +            consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
             else
    -            filteredBaseCounts.incr(base, baseQual, insQual, delQual);                                                  // If the base fails filters, it is included with the filtered data base counts
    -
    -        this.mappingQuality.add(baseMappingQuality);                                                                    // Filtered or not, the RMS mapping quality includes all bases in this site
    -        nSoftClippedBases += isSoftClipped ? 1 : 0;                                                                     // if this base is softclipped, add the counter
    +            filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
         }
     
    +    /**
     +     * Removes a base from the HeaderElement updating all counts accordingly
    +     *
    +     * @param base               the base to add
    +     * @param baseQual           the base quality
    +     * @param insQual            the base insertion quality
    +     * @param delQual            the base deletion quality
    +     * @param baseMappingQuality the mapping quality of the read this base belongs to
    +     * @param minBaseQual        the minimum base qual allowed to be a good base
    +     * @param minMappingQual     the minimum mapping qual allowed to be a good read
    +     * @param isSoftClipped      true if the base is soft-clipped in the original read
    +     */
         public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
    -        if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
    -            consensusBaseCounts.decr(base, baseQual, insQual, delQual);                                                 // If the base passes filters, it is included in the consensus base counts
     +        // If the base passed the MQ filter it was counted in the consensus base counts, otherwise in the filtered counts
    +        if ( baseMappingQuality >= minMappingQual )
    +            consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
             else
    -            filteredBaseCounts.decr(base, baseQual, insQual, delQual);                                                  // If the base fails filters, it is included with the filtered data base counts
    -
    -        this.mappingQuality.remove((Integer) baseMappingQuality);                                                       // Filtered or not, the RMS mapping quality includes all bases in this site
    -        nSoftClippedBases -= isSoftClipped ? 1 : 0;                                                                     // if this base is softclipped, add the counter
    +            filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
         }
         /**
          * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions
    @@ -189,15 +201,6 @@ public class HeaderElement {
             return (!hasFilteredData() && !hasConsensusData());
         }
     
    -    /**
    -     * The RMS of the mapping qualities of all reads that contributed to this HeaderElement
    -     *
    -     * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
    -     */
    -    public double getRMS() {
    -        return MathUtils.rms(mappingQuality);
    -    }
    -
         /**
          * removes an insertion from this element (if you removed a read that had an insertion)
          */
    @@ -232,7 +235,7 @@ public class HeaderElement {
         /**
          * Whether or not the HeaderElement is variant due to excess deletions
          *
    -     * @return whether or not the HeaderElement is variant due to excess insertions
    +     * @return whether or not the HeaderElement is variant due to excess deletions
          */
         private boolean isVariantFromDeletions(double minIndelProportion) {
             return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
    @@ -241,12 +244,15 @@ public class HeaderElement {
         /**
          * Whether or not the HeaderElement is variant due to excess mismatches
          *
    -     * @return whether or not the HeaderElement is variant due to excess insertions
    +     * @param minVariantPvalue     the minimum pvalue to call a site variant (used with low coverage).
    +     * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage).
    +     * @return whether or not the HeaderElement is variant due to excess mismatches
          */
    -    protected boolean isVariantFromMismatches(double minVariantProportion) {
    -        BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
    -        double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
    -        return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
    +    protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) {
    +        final int totalCount = consensusBaseCounts.totalCountWithoutIndels();
    +        final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
    +        final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon);
    +        return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion);
         }
     
         /**
    @@ -256,37 +262,88 @@ public class HeaderElement {
          * @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
          */
         protected boolean isVariantFromSoftClips() {
    +        final int nSoftClippedBases = consensusBaseCounts.nSoftclips();
             return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
         }
     
    -    protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
    -        return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
    +    /**
    +     * Calculates the number of alleles necessary to represent this site.
    +     *
    +     * @param minVariantPvalue     the minimum pvalue to call a site variant.
    +     * @param minVariantProportion the minimum proportion to call a site variant.
    +     * @return the number of alleles necessary to represent this site or -1 if there are too many indels
    +     */
    +    public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) {
    +        final ObjectArrayList alleles = getAlleles(minVariantPvalue, minVariantProportion);
    +        return alleles == null ? -1 : alleles.size();
         }
     
         /**
    -     * Calculates the number of haplotypes necessary to represent this site.
    +     * Calculates the alleles necessary to represent this site.
          *
    +     * @param minVariantPvalue     the minimum pvalue to call a site variant.
          * @param minVariantProportion the minimum proportion to call a site variant.
    -     * @return the number of alleles necessary to represent this site.
    +     * @return the list of alleles necessary to represent this site or null if there are too many indels
          */
    -    public int getNumberOfAlleles(final double minVariantProportion) {
    +    public ObjectArrayList getAlleles(final double minVariantPvalue, final double minVariantProportion) {
    +        // make sure we have bases at all
             final int totalBaseCount = consensusBaseCounts.totalCount();
    -        if (totalBaseCount == 0)
    -            return 0;
    +        if ( totalBaseCount == 0 )
    +            return new ObjectArrayList(0);
     
    -        final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount);
    +        // next, check for insertions; technically, the insertion count can be greater than totalBaseCount
    +        // (because of the way insertions are counted), so we need to account for that
    +        if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) )
    +            return null;
     
    -        int nAlleles = 0;
    -        for ( BaseIndex base : BaseIndex.values() ) {
    +        // finally, check for the bases themselves (including deletions)
    +        final ObjectArrayList alleles = new ObjectArrayList(4);
    +        for ( final BaseIndex base : BaseIndex.values() ) {
                 final int baseCount = consensusBaseCounts.countOfBase(base);
    -
    -            // don't consider this allele if the count is 0
                 if ( baseCount == 0 )
                     continue;
     
    -            if ( baseCount >= minBaseCountForRelevantAlleles )
    -                nAlleles++;
    +            if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) {
    +                if ( base == BaseIndex.D )
    +                    return null;
    +                alleles.add(base);
    +            }
             }
    -        return nAlleles;
    +        return alleles;
    +    }
    +
    +    /*
    +     * Checks whether there are a significant number of softclips.
    +     *
    +     * @param minVariantPvalue     the minimum pvalue to call a site variant.
    +     * @param minVariantProportion the minimum proportion to call a site variant.
    +     * @return true if there are significant softclips, false otherwise
    +     */
    +    public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) {
    +        return hasSignificantCount(consensusBaseCounts.nSoftclips(), consensusBaseCounts.totalCount(), minVariantPvalue, minVariantProportion);
    +    }
    +
    +    /*
     +     * Checks whether the given count is significant relative to the total.
    +     *
    +     * @param count                the count (k) to test against
    +     * @param total                the total (n) to test against
    +     * @param minVariantPvalue     the minimum pvalue to call a site variant.
    +     * @param minVariantProportion the minimum proportion to call a site variant.
    +     * @return true if there is a significant count given the provided pvalue, false otherwise
    +     */
    +    private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) {
    +        if ( count == 0 || total == 0 )
    +            return false;
    +
    +        // use p-values for low counts of k
    +        if ( count <= MIN_COUNT_FOR_USING_PVALUE ) {
    +            final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count);
    +            return pvalue > minVariantPvalue;
    +        }
    +
    +        // otherwise, use straight proportions
    +        final int minBaseCountForSignificance = (int)(minVariantProportion * total);
    +        return count >= minBaseCountForSignificance;
         }
     }
    \ No newline at end of file
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
    index d45efeb65..bdd407fba 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
    @@ -46,18 +46,17 @@
     
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
    +import com.google.java.contract.Ensures;
    +import it.unimi.dsi.fastutil.objects.*;
     import net.sf.samtools.SAMFileHeader;
     import org.apache.log4j.Logger;
    +import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.SampleUtils;
     import org.broadinstitute.sting.utils.collections.Pair;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
     import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     
    -import java.util.HashMap;
    -import java.util.Map;
    -import java.util.Set;
    -import java.util.TreeSet;
     
     /*
      * Copyright (c) 2009 The Broad Institute
    @@ -91,52 +90,72 @@ import java.util.TreeSet;
     public class MultiSampleCompressor {
         protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
     
    -    protected Map compressorsPerSample = new HashMap();
    +    protected Object2ObjectMap compressorsPerSample = new Object2ObjectOpenHashMap();
     
         public MultiSampleCompressor(SAMFileHeader header,
                                      final int contextSize,
                                      final int downsampleCoverage,
                                      final int minMappingQuality,
    +                                 final double minAltPValueToTriggerVariant,
                                      final double minAltProportionToTriggerVariant,
                                      final double minIndelProportionToTriggerVariant,
                                      final int minBaseQual,
    -                                 final ReduceReads.DownsampleStrategy downsampleStrategy,
    -                                 final boolean allowPolyploidReduction) {
    +                                 final ReduceReads.DownsampleStrategy downsampleStrategy) {
             for ( String name : SampleUtils.getSAMFileSamples(header) ) {
                 compressorsPerSample.put(name,
                         new SingleSampleCompressor(contextSize, downsampleCoverage,
    -                                    minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction));
    +                                    minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
             }
         }
     
    -    public Set addAlignment(GATKSAMRecord read) {
    +    /**
    +     * Add an alignment to the compressor
    +     *
    +     * @param read                  the read to be added
    +     * @param knownSnpPositions     the set of known SNP positions
    +     * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
    +     */
    +    public ObjectSet addAlignment(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) {
             String sampleName = read.getReadGroup().getSample();
             SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
             if ( compressor == null )
                 throw new ReviewedStingException("No compressor for sample " + sampleName);
    -        Pair, CompressionStash> readsAndStash = compressor.addAlignment(read);
    -        Set reads = readsAndStash.getFirst();
    +        Pair, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions);
    +        ObjectSet reads = readsAndStash.getFirst();
             CompressionStash regions = readsAndStash.getSecond();
     
    -        reads.addAll(closeVariantRegionsInAllSamples(regions));
    +        reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions));
     
             return reads;
         }
     
    -    public Set close() {
    -        Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
    +    /**
    +     * Properly closes the compressor.
    +     *
    +     * @param knownSnpPositions  the set of known SNP positions
    +     * @return A non-null set/list of all reads generated
    +     */
    +    @Ensures("result != null")
    +    public ObjectSet close(final ObjectSortedSet knownSnpPositions) {
    +        ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
             for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
    -            Pair, CompressionStash> readsAndStash = sample.close();
    -            reads = readsAndStash.getFirst();
    +            Pair, CompressionStash> readsAndStash = sample.close(knownSnpPositions);
    +            reads.addAll(readsAndStash.getFirst());
             }
             return reads;
         }
     
    -    private Set closeVariantRegionsInAllSamples(CompressionStash regions) {
    -        Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
    +    /**
    +     * Finalizes current variant regions.
    +     *
    +     * @param knownSnpPositions  the set of known SNP positions
    +     * @return A non-null set/list of all reads generated
    +     */
    +    private ObjectSet closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) {
    +        ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
             if (!regions.isEmpty()) {
                 for (SingleSampleCompressor sample : compressorsPerSample.values()) {
    -                reads.addAll(sample.closeVariantRegions(regions));
    +                reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions));
                 }
             }
             return reads;
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
    index 8e45f6db1..71910e566 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
    @@ -46,13 +46,15 @@
     
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
    +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
    +import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet;
    +import it.unimi.dsi.fastutil.objects.ObjectArrayList;
    +import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
     import net.sf.samtools.SAMFileHeader;
     import net.sf.samtools.SAMFileWriter;
     import net.sf.samtools.SAMProgramRecord;
     import net.sf.samtools.util.SequenceUtil;
    -import org.broadinstitute.sting.commandline.Argument;
    -import org.broadinstitute.sting.commandline.Hidden;
    -import org.broadinstitute.sting.commandline.Output;
    +import org.broadinstitute.sting.commandline.*;
     import org.broadinstitute.sting.gatk.CommandLineGATK;
     import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
     import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
    @@ -65,13 +67,17 @@ import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.Utils;
     import org.broadinstitute.sting.utils.clipping.ReadClipper;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    +import org.broadinstitute.sting.utils.exceptions.UserException;
     import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
     import org.broadinstitute.sting.utils.help.HelpConstants;
     import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
     import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     import org.broadinstitute.sting.utils.sam.ReadUtils;
    +import org.broadinstitute.variant.variantcontext.VariantContext;
    +
    +import java.util.Collections;
    +import java.util.List;
     
    -import java.util.*;
     
     /**
      * Reduces the BAM file using read based compression that keeps only essential information for variant calling
    @@ -83,17 +89,17 @@ import java.util.*;
      * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
      * savings in file size and performance of the downstream tools.
      *
    - * 

    Input

    + *

    Input

    *

    * The BAM file to be compressed *

    * - *

    Output

    + *

    Output

    *

    * The compressed (reduced) BAM file. * *

    - *

    Examples

    + *

    Examples

    *
      * java -Xmx4g -jar GenomeAnalysisTK.jar \
      *   -R ref.fasta \
    @@ -107,9 +113,9 @@ import java.util.*;
     @PartitionBy(PartitionType.CONTIG)
     @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
     @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40)
     -public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
     +public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, ReduceReadsStash> {
     
    -    @Output
    +    @Output(required = false, defaultToStdout = false)
         private StingSAMFileWriter out = null;
         private SAMFileWriter writerToUse = null;
     
    @@ -117,7 +123,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * The number of bases to keep around mismatches (potential variation)
          */
         @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
    -    private int contextSize = 10;
    +    public int contextSize = 10;
     
         /**
          * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
    @@ -125,7 +131,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * towards variable regions.
          */
         @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
    -    private int minMappingQuality = 20;
    +    public int minMappingQuality = 20;
     
         /**
          * The minimum base quality to be considered for the consensus synthetic read. Reads that have
    @@ -133,41 +139,45 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * towards variable regions.
          */
         @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
    -    private byte minBaseQual = 20;
    +    public byte minBaseQual = 15;
     
         /**
    -     * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
    -     * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
    +     * Reads have notoriously low quality bases on the tails (left and right).  Consecutive bases at the tails with
    +     * quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
          */
         @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
    -    private byte minTailQuality = 2;
    +    public byte minTailQuality = 2;
     
         /**
    -     * Allow the experimental polyploid-based reduction capabilities of this tool
    +     * Any number of VCF files representing known SNPs to be used for the polyploid-based reduction.
    +     * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls.  Non-SNP variants in these files will be ignored.
    +     * If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present
    +     * in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will
    +     * be triggered anywhere there is a single SNP present in a consensus window.
          */
    -    @Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
    -    private boolean USE_POLYPLOID_REDUCTION = false;
    +    @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false)
     +    public List<RodBinding<VariantContext>> known = Collections.emptyList();
     
         /**
          * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
          * and read group).
          */
         @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
    -    private boolean DONT_SIMPLIFY_READS = false;
    +    public boolean DONT_SIMPLIFY_READS = false;
     
         /**
          * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
          * The program will behave correctly in those cases.
          */
         @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
    -    private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
    +    public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
     
         /**
          * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
          * quality.
          */
         @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
    -    private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
    +    public boolean DONT_CLIP_LOW_QUAL_TAILS = false;
     
         /**
          * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
    @@ -175,7 +185,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
          */
         @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
    -    private boolean DONT_USE_SOFTCLIPPED_BASES = false;
    +    public boolean DONT_USE_SOFTCLIPPED_BASES = false;
     
         /**
          * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee 
    @@ -183,55 +193,68 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. 
          */
         @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
    -    private boolean DONT_COMPRESS_READ_NAMES = false;
    +    public boolean DONT_COMPRESS_READ_NAMES = false;
     
         /**
          * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
          * border.
          */
         @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
    -    private boolean HARD_CLIP_TO_INTERVAL = false;
    +    public boolean HARD_CLIP_TO_INTERVAL = false;
     
         /**
          * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
    -     * considered consensus.
    +     * considered consensus and reduced (otherwise we will try to trigger polyploid compression).  Note that
     +     * this value is used only in regions with high coverage.
          */
    +    @Advanced
         @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
    -    private double minAltProportionToTriggerVariant = 0.05;
    +    public double minAltProportionToTriggerVariant = 0.05;
    +
    +    /**
    +     * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region.
    +     * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to
     +     * trigger polyploid compression).  Note that this value is used only in regions with low coverage.
    +     */
    +    @Advanced
    +    @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false)
    +    public double minAltPValueToTriggerVariant = 0.01;
     
         /**
          * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
          * considered consensus.
          */
         @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
    -    private double minIndelProportionToTriggerVariant = 0.05;
    +    public double minIndelProportionToTriggerVariant = 0.05;
     
         /**
    -     * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
    +     * The number of reads emitted per sample in a variant region can be downsampled for better compression.
    +     * This level of downsampling only happens after the region has been evaluated, therefore it can
    +     * be combined with the engine level downsampling.
          * A value of 0 turns downsampling off.
          */
         @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
    -    private int downsampleCoverage = 250;
    +    public int downsampleCoverage = 250;
     
         @Hidden
         @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
    -    private boolean nwayout = false;
    +    public boolean nwayout = false;
     
         @Hidden
         @Argument(fullName = "", shortName = "dl", doc = "", required = false)
    -    private int debugLevel = 0;
    +    public int debugLevel = 0;
     
         @Hidden
         @Argument(fullName = "", shortName = "dr", doc = "", required = false)
    -    private String debugRead = "";
    +    public String debugRead = "";
     
         @Hidden
         @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
    -    private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
    +    public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
         
         @Hidden 
         @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
    -    private boolean NO_PG_TAG = false;
    +    public boolean NO_PG_TAG = false;
     
         public enum DownsampleStrategy {
             Normal,
    @@ -240,10 +263,12 @@ public class ReduceReads extends ReadWalker, ReduceRea
         
         int nCompressedReads = 0;
     
    -    HashMap readNameHash;                                     // This hash will keep the name of the original read the new compressed name (a number).
     +    Object2LongOpenHashMap readNameHash;                         // This hash maps the name of the original read to its new compressed name (a number).
         Long nextReadNumber = 1L;                                               // The next number to use for the compressed read name.
     
    -    SortedSet intervalList;
    +    ObjectSortedSet intervalList;
    +
    +    ObjectSortedSet knownSnpPositions;
     
         // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
         public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads";   // The name that will go in the @PG tag
    @@ -256,17 +281,33 @@ public class ReduceReads extends ReadWalker, ReduceRea
         @Override
         public void initialize() {
             super.initialize();
    +
    +        if ( !nwayout && out == null )
    +            throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes");
    +
    +        if ( nwayout && out != null )
    +            throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other");
    +
    +        if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 )
    +            throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
    +
    +        if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
    +            throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
    +
    +        if ( known.isEmpty() )
    +            knownSnpPositions = null;
    +        else
    +            knownSnpPositions = new ObjectAVLTreeSet();
    +
             GenomeAnalysisEngine toolkit = getToolkit();
    -        readNameHash = new HashMap();           // prepare the read name hash to keep track of what reads have had their read names compressed
    -        intervalList = new TreeSet();              // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
    +        readNameHash = new Object2LongOpenHashMap(100000);     // prepare the read name hash to keep track of what reads have had their read names compressed
    +        intervalList = new ObjectAVLTreeSet();                          // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
     
             if (toolkit.getIntervals() != null)
                 intervalList.addAll(toolkit.getIntervals());
     
    -
             final boolean preSorted = true;
             final boolean indexOnTheFly = true;
    -        final boolean keep_records = true;
             final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
             if (nwayout) {
                 SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
    @@ -276,7 +317,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
                 writerToUse = out;
                 out.setPresorted(false);
                 if (!NO_PG_TAG) {
    -                Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
    +                Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
                 }
             }
         }
    @@ -295,8 +336,8 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * @return a linked list with all the reads produced by the clipping operations
          */
         @Override
    -    public LinkedList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
    -        LinkedList mappedReads;
    +    public ObjectArrayList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
    +        ObjectArrayList mappedReads;
             if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
                     System.out.println("Found debug read!");
     
    @@ -325,18 +366,18 @@ public class ReduceReads extends ReadWalker, ReduceRea
                 if (HARD_CLIP_TO_INTERVAL)
                     mappedReads = hardClipReadToInterval(read);                                                             // Hard clip the remainder of the read to the desired interval
                 else {
    -                mappedReads = new LinkedList();
    +                mappedReads = new ObjectArrayList();
                     mappedReads.add(read);
                 }
             }
             else {
    -            mappedReads = new LinkedList();
    +            mappedReads = new ObjectArrayList();
                 if (!read.isEmpty())
                     mappedReads.add(read);
             }
     
             if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
    -            LinkedList tempList = new LinkedList();
    +            ObjectArrayList tempList = new ObjectArrayList();
                 for (GATKSAMRecord mRead : mappedReads) {
                     GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
                     if (!clippedRead.isEmpty())
    @@ -349,8 +390,22 @@ public class ReduceReads extends ReadWalker, ReduceRea
                 for (GATKSAMRecord mappedRead : mappedReads)
                     System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
     
    -        return mappedReads;
    +        // add the SNPs to the list of known positions
    +        populateKnownSNPs(metaDataTracker);
     
    +        return mappedReads;
    +    }
    +
    +    /*
    +     * Add the positions of known SNPs to the set so that we can keep track of it
    +     *
    +     * @param metaDataTracker   the ref meta data tracker
    +     */
    +    protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) {
    +        for ( final VariantContext vc : metaDataTracker.getValues(known) ) {
    +            if ( vc.isSNP() )
    +                knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc));
    +        }
         }
     
         /**
    @@ -363,7 +418,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
          */
         @Override
         public ReduceReadsStash reduceInit() {
    -        return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION));
    +        return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
         }
     
         /**
    @@ -375,7 +430,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * @param stash       the stash that keeps the reads in order for processing
          * @return the stash with all reads that have not been processed yet
          */
    -    public ReduceReadsStash reduce(LinkedList mappedReads, ReduceReadsStash stash) {
    +    public ReduceReadsStash reduce(ObjectArrayList mappedReads, ReduceReadsStash stash) {
             if (debugLevel == 1)
                 stash.print();
     
    @@ -387,7 +442,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
                     throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
     
                 if (originalRead) {
    -                List readsReady = new LinkedList();
    +                ObjectArrayList readsReady = new ObjectArrayList();
                     readsReady.addAll(stash.getAllReadsBefore(read));
                     readsReady.add(read);
     
    @@ -395,9 +450,16 @@ public class ReduceReads extends ReadWalker, ReduceRea
                         if (debugLevel == 1)
                             System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
     
    -                    for (GATKSAMRecord compressedRead : stash.compress(readReady))
    +                    for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions))
                             outputRead(compressedRead);
     
    +                    // We only care about maintaining the link between read pairs if they are in the same variant
    +                    // region.  Since an entire variant region's worth of reads is returned in a single call to
    +                    // stash.compress(), the readNameHash can be cleared after the for() loop above.
    +                    // The advantage of clearing the hash is that otherwise it holds all reads that have been encountered,
    +                    // which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory.
    +                    readNameHash.clear();
    +
                     }
                 } else
                     stash.add(read);
    @@ -405,6 +467,10 @@ public class ReduceReads extends ReadWalker, ReduceRea
                 firstRead = false;
             }
     
    +        // reduce memory requirements by removing old positions
    +        if ( !mappedReads.isEmpty() )
    +            clearStaleKnownPositions(mappedReads.get(0));
    +
             return stash;
         }
     
    @@ -417,13 +483,38 @@ public class ReduceReads extends ReadWalker, ReduceRea
         public void onTraversalDone(ReduceReadsStash stash) {
     
             // output any remaining reads in the compressor
    -        for (GATKSAMRecord read : stash.close())
    +        for (GATKSAMRecord read : stash.close(knownSnpPositions))
                 outputRead(read);
     
             if (nwayout)
                 writerToUse.close();
         }
     
    +    /**
    +     * Removes known positions that are no longer relevant for use with het compression.
    +     *
    +     * @param read    the current read, used for checking whether there are stale positions we can remove
    +     */
    +    protected void clearStaleKnownPositions(final GATKSAMRecord read) {
    +        // nothing to clear if not used or empty
    +        if ( knownSnpPositions == null || knownSnpPositions.isEmpty() )
    +            return;
    +
    +        // not ready to be cleared until we encounter a read from a different contig
    +        final int contigIndexOfRead = read.getReferenceIndex();
    +        if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead )
    +            return;
    +
    +        // because we expect most elements to be stale, it's not going to be efficient to remove them one at a time
    +        final ObjectAVLTreeSet goodLocs = new ObjectAVLTreeSet();
    +        for ( final GenomeLoc loc : knownSnpPositions ) {
    +            if ( loc.getContigIndex() == contigIndexOfRead )
    +                goodLocs.add(loc);
    +        }
    +        knownSnpPositions.clear();
    +        knownSnpPositions.addAll(goodLocs);
    +    }
    +
         /**
          * Hard clips away all parts of the read that doesn't agree with the intervals selected.
          *
    @@ -433,8 +524,8 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * @param read the read to be hard clipped to the interval.
          * @return a shallow copy of the read hard clipped to the interval
          */
    -    private LinkedList hardClipReadToInterval(GATKSAMRecord read) {
    -        LinkedList clippedReads = new LinkedList();
    +    private ObjectArrayList hardClipReadToInterval(GATKSAMRecord read) {
    +        ObjectArrayList clippedReads = new ObjectArrayList();
     
             GenomeLoc intervalOverlapped = null;       // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)
     
    @@ -588,7 +679,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
                 System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());
     
             if (!DONT_COMPRESS_READ_NAMES)
    -            compressReadName(read);
    +            nextReadNumber = compressReadName(readNameHash, read, nextReadNumber);
     
             writerToUse.addAlignment(read);
         }
    @@ -623,21 +714,28 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * Compresses the read name using the readNameHash if we have already compressed
          * this read name before.
          *
    -     * @param read any read
    +     * @param hash           the hash table containing the read name to compressed read name map
    +     * @param read           any read
    +     * @param nextReadNumber the number to use in the compressed read name in case this is a new read name
    +     * @return the next number to use in the compressed read name
          */
    -    private void compressReadName(GATKSAMRecord read) {
    -        String name = read.getReadName();
    -        String compressedName = read.isReducedRead() ? "C" : "";
    -        final Long readNumber = readNameHash.get(name);
    -        if (readNumber != null) {
    -            compressedName += readNumber.toString();
    -        } else {
    -            readNameHash.put(name, nextReadNumber);
    -            compressedName += nextReadNumber.toString();
    -            nextReadNumber++;
    +    protected static long compressReadName(final Object2LongOpenHashMap hash, final GATKSAMRecord read, final long nextReadNumber) {
    +        final String name = read.getReadName();
    +        final StringBuilder compressedName = new StringBuilder();
    +        long result = nextReadNumber;
    +        if (read.isReducedRead()) {
    +            compressedName.append("C");
             }
    -
    -        read.setReadName(compressedName);
    +        final Long readNumber = hash.get(name);
    +        if (readNumber != null) {
    +            compressedName.append(readNumber);
    +        } else {
    +            hash.put(name, nextReadNumber);
    +            compressedName.append(nextReadNumber);
    +            result++;
    +        }
    +        read.setReadName(compressedName.toString());
    +        return result;
         }
     
         /**
    @@ -649,8 +747,8 @@ public class ReduceReads extends ReadWalker, ReduceRea
          * @param read the read
          * @return Returns true if the read is the original read that went through map().
          */
    -    private boolean isOriginalRead(LinkedList list, GATKSAMRecord read) {
    -        return isWholeGenome() || list.getFirst().equals(read);
    +    private boolean isOriginalRead(ObjectArrayList list, GATKSAMRecord read) {
    +        return isWholeGenome() || list.get(0).equals(read);
         }
     
         /**
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
    index 0a446bab7..52c5f0903 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
    @@ -46,6 +46,8 @@
     
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
    +import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
    +import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
     import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     import org.broadinstitute.sting.utils.sam.ReadUtils;
    @@ -106,11 +108,12 @@ public class ReduceReadsStash {
         /**
          * sends the read to the MultiSampleCompressor
          *
    -     * @param read the read to be compressed
    +     * @param read                  the read to be compressed
    +     * @param knownSnpPositions     the set of known SNP positions
          * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
          */
    -    public Iterable compress(GATKSAMRecord read) {
    -        return compressor.addAlignment(read);
    +    public Iterable compress(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) {
    +        return compressor.addAlignment(read, knownSnpPositions);
         }
     
         /**
    @@ -125,18 +128,19 @@ public class ReduceReadsStash {
         /**
          * Close the stash, processing all remaining reads in order
          *
    +     * @param knownSnpPositions  the set of known SNP positions
          * @return a list of all the reads produced by the SlidingWindow machinery)
          */
    -    public Iterable close() {
    +    public Iterable close(final ObjectSortedSet knownSnpPositions) {
             LinkedList result = new LinkedList();
     
             // compress all the stashed reads (in order)
             for (GATKSAMRecord read : outOfOrderReads)
    -            for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
    +            for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions))
                     result.add(compressedRead);
     
             // output any remaining reads from the compressor
    -        for (GATKSAMRecord read : compressor.close())
    +        for (GATKSAMRecord read : compressor.close(knownSnpPositions))
                 result.add(read);
     
             return result;
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
    index b4de1f0cb..61c34b6a0 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
    @@ -46,14 +46,13 @@
     
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
    +import com.google.java.contract.Ensures;
    +import it.unimi.dsi.fastutil.objects.*;
    +import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.collections.Pair;
     import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
     import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     
    -import java.util.Collections;
    -import java.util.Set;
    -import java.util.TreeSet;
    -
     /**
      *
      * @author carneiro, depristo
    @@ -63,38 +62,45 @@ public class SingleSampleCompressor {
         final private int contextSize;
         final private int downsampleCoverage;
         final private int minMappingQuality;
    +    final private double minAltPValueToTriggerVariant;
         final private double minAltProportionToTriggerVariant;
         final private double minIndelProportionToTriggerVariant;
         final private int minBaseQual;
         final private ReduceReads.DownsampleStrategy downsampleStrategy;
    -    final private boolean allowPolyploidReduction;
     
         private SlidingWindow slidingWindow;
         private int slidingWindowCounter;
     
    -    public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new TreeSet(), new CompressionStash());
    +    public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new ObjectAVLTreeSet(), new CompressionStash());
     
         public SingleSampleCompressor(final int contextSize,
                                       final int downsampleCoverage,
                                       final int minMappingQuality,
    +                                  final double minAltPValueToTriggerVariant,
                                       final double minAltProportionToTriggerVariant,
                                       final double minIndelProportionToTriggerVariant,
                                       final int minBaseQual,
    -                                  final ReduceReads.DownsampleStrategy downsampleStrategy,
    -                                  final boolean allowPolyploidReduction) {
    +                                  final ReduceReads.DownsampleStrategy downsampleStrategy) {
             this.contextSize = contextSize;
             this.downsampleCoverage = downsampleCoverage;
             this.minMappingQuality = minMappingQuality;
             this.slidingWindowCounter = 0;
    +        this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant;
             this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
             this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
             this.minBaseQual = minBaseQual;
             this.downsampleStrategy = downsampleStrategy;
    -        this.allowPolyploidReduction = allowPolyploidReduction;
         }
     
    -    public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) {
    -        Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
    +    /**
    +     * Add an alignment to the compressor
    +     *
    +     * @param read                  the read to be added
    +     * @param knownSnpPositions     the set of known SNP positions
    +     * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
    +     */
    +    public Pair, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions ) {
    +        ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
             CompressionStash stash = new CompressionStash();
             int readOriginalStart = read.getUnclippedStart();
     
    @@ -104,27 +110,43 @@ public class SingleSampleCompressor {
                   (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) {  // this read is too far away from the end of the current sliding window
     
                 // close the current sliding window
    -            Pair, CompressionStash> readsAndStash = slidingWindow.close();
    +            Pair, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions);
                 reads = readsAndStash.getFirst();
                 stash = readsAndStash.getSecond();
                 slidingWindow = null;                                                      // so we create a new one on the next if
             }
     
             if ( slidingWindow == null) {                                                  // this is the first read
    -            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction);
    +            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(),
    +                    slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant,
    +                    minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
                 slidingWindowCounter++;
             }
     
             stash.addAll(slidingWindow.addRead(read));
    -        return new Pair, CompressionStash>(reads, stash);
    +        return new Pair, CompressionStash>(reads, stash);
         }
     
    -    public Pair, CompressionStash> close() {
    -        return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
    +    /**
    +     * Properly closes the compressor.
    +     *
    +     * @param knownSnpPositions  the set of known SNP positions
    +     * @return A non-null set/list of all reads generated
    +     */
    +    @Ensures("result != null")
    +    public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) {
    +        return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair;
         }
     
    -    public Set closeVariantRegions(CompressionStash regions) {
    -        return slidingWindow == null ? Collections.emptySet() : slidingWindow.closeVariantRegions(regions);
    +    /**
    +     * Finalizes current variant regions.
    +     *
    +     * @param knownSnpPositions  the set of known SNP positions
    +     * @return A non-null set/list of all reads generated
    +     */
    +    @Ensures("result != null")
    +    public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) {
    +        return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions);
         }
     
     }
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
    index 680489042..d3ca037be 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
    @@ -48,15 +48,18 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
     import com.google.java.contract.Ensures;
     import com.google.java.contract.Requires;
    -import net.sf.samtools.Cigar;
    +import it.unimi.dsi.fastutil.bytes.Byte2IntArrayMap;
    +import it.unimi.dsi.fastutil.bytes.Byte2IntMap;
    +import it.unimi.dsi.fastutil.objects.*;
     import net.sf.samtools.CigarElement;
     import net.sf.samtools.CigarOperator;
     import net.sf.samtools.SAMFileHeader;
     import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
    +import org.broadinstitute.sting.utils.BaseUtils;
     import org.broadinstitute.sting.utils.GenomeLoc;
    +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
     import org.broadinstitute.sting.utils.collections.Pair;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    -import org.broadinstitute.sting.utils.recalibration.EventType;
     import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
     import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
     import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
    @@ -64,6 +67,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
     
     import java.util.*;
     
    +
     /**
      * Created by IntelliJ IDEA.
      * User: roger
    @@ -73,8 +77,8 @@ import java.util.*;
     public class SlidingWindow {
     
         // Sliding Window data
    -    final private TreeSet readsInWindow;
    -    final private LinkedList windowHeader;
    +    final protected PriorityQueue readsInWindow;
    +    final protected LinkedList windowHeader;
         protected int contextSize;                                                                                          // the largest context size (between mismatches and indels)
         protected String contig;
         protected int contigIndex;
    @@ -92,9 +96,9 @@ public class SlidingWindow {
         protected int filteredDataConsensusCounter;
         protected String filteredDataReadName;
     
    -
         // Additional parameters
    -    protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT;                                                        // proportion has to be greater than this value to trigger variant region due to mismatches
    +    protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT;                                                                 // pvalue has to be greater than this value to trigger variant region due to mismatches
    +    protected double MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT;                                                             // proportion has to be greater than this value to trigger variant region due to mismatches
         protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT;                                                      // proportion has to be greater than this value to trigger variant region due to deletions
         protected int MIN_BASE_QUAL_TO_COUNT;                                                                               // qual has to be greater than or equal to this value
         protected int MIN_MAPPING_QUALITY;
    @@ -102,8 +106,6 @@ public class SlidingWindow {
         protected ReduceReads.DownsampleStrategy downsampleStrategy;
         private boolean hasIndelQualities;
     
    -    private boolean allowPolyploidReductionInGeneral;
    -
         private static CompressionStash emptyRegions = new CompressionStash();
     
         /**
    @@ -119,8 +121,8 @@ public class SlidingWindow {
             return getStopLocation(windowHeader);
         }
     
    -    private int getStopLocation(LinkedList header) {
    -        return getStartLocation(header) + header.size() - 1;
    +    private int getStopLocation(final LinkedList header) {
    +        return header.isEmpty() ? -1 : header.peekLast().getLocation();
         }
     
         public String getContig() {
    @@ -131,7 +133,7 @@ public class SlidingWindow {
             return contigIndex;
         }
     
    -    public int getStartLocation(LinkedList header) {
    +    public int getStartLocation(final LinkedList header) {
             return header.isEmpty() ? -1 : header.peek().getLocation();
         }
     
    @@ -144,24 +146,33 @@ public class SlidingWindow {
     
             this.windowHeader = new LinkedList();
             windowHeader.addFirst(new HeaderElement(startLocation));
    -        this.readsInWindow = new TreeSet();
    +        this.readsInWindow = new PriorityQueue(100, new Comparator() {
    +            @Override
    +            public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
    +                return read1.getSoftEnd() - read2.getSoftEnd();
    +            }
    +        });
         }
     
    -    public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) {
    +    public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader,
    +                         final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber,
    +                         final double minAltPValueToTriggerVariant, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant,
    +                         final int minBaseQual, final int minMappingQuality, final int downsampleCoverage,
    +                         final ReduceReads.DownsampleStrategy downsampleStrategy, final boolean hasIndelQualities) {
             this.contextSize = contextSize;
             this.downsampleCoverage = downsampleCoverage;
     
    -        this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
    +        this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant;
    +        this.MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
             this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
             this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
             this.MIN_MAPPING_QUALITY = minMappingQuality;
     
             this.windowHeader = new LinkedList();
    -        this.readsInWindow = new TreeSet(new Comparator() {
    +        this.readsInWindow = new PriorityQueue(1000, new Comparator() {
                 @Override
                 public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
    -                final int difference = read1.getSoftEnd() - read2.getSoftEnd();
    -                return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
    +                return read1.getSoftEnd() - read2.getSoftEnd();
                 }
             });
     
    @@ -181,8 +192,6 @@ public class SlidingWindow {
             
             this.downsampleStrategy = downsampleStrategy;
             this.hasIndelQualities = hasIndelQualities;
    -
    -        this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
         }
     
         /**
    @@ -286,8 +295,8 @@ public class SlidingWindow {
                 regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
             }
     
    -        while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
    -                readsInWindow.pollFirst();
    +        while (!readsInWindow.isEmpty() && readsInWindow.peek().getSoftEnd() < windowHeaderStartLocation) {
    +                readsInWindow.poll();
             }
     
             return regions;
    @@ -340,10 +349,16 @@ public class SlidingWindow {
         private final MarkedSites markedSites = new MarkedSites();
     
         /**
    -     * returns an array marked with variant and non-variant regions (it uses
    -     * markVariantRegions to make the marks)
    +     * returns the MarkedSites object so that it can be tested after adding data to the Sliding Window
          *
    -     * @param stop check the window from start to stop (not-inclusive)
    +     * @return the Marked Sites object used by this Sliding Window
    +     */
    +    protected MarkedSites getMarkedSitesForTesting() { return markedSites; }
    +
    +    /**
    +     * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks)
    +     *
    +     * @param stop check the window from start to stop (not-inclusive); given in global coordinates
          */
         protected void markSites(final int stop) {
     
    @@ -353,22 +368,17 @@ public class SlidingWindow {
             // copy over as many bits as we can from the previous calculation.  Note that we can't trust the
             // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there.
             final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1;
    -        final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize);
    +        final int locationToProcess = Math.max(windowHeaderStartLocation, Math.min(lastPositionMarked, stop - contextSize));
     
    -        // update the iterator to the correct position
    -        Iterator headerElementIterator = windowHeader.iterator();
    -        for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
    -            if (headerElementIterator.hasNext())
    -                headerElementIterator.next();
    -        }
    +        final ListIterator headerElementIterator = windowHeader.listIterator(locationToProcess - windowHeaderStartLocation);
     
             // process a contextSize worth of region from scratch in case there's a variant there
             for (int i = locationToProcess; i < stop; i++) {
                 if (headerElementIterator.hasNext()) {
                     HeaderElement headerElement = headerElementIterator.next();
     
    -                if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
    -                    markVariantRegion(markedSites, i - windowHeaderStartLocation);
    +                if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
    +                    markVariantRegion(i - windowHeaderStartLocation);
     
                 } else
                     break;
    @@ -378,33 +388,44 @@ public class SlidingWindow {
         /**
          * Marks the sites around the variant site (as true)
          *
    -     * @param markedSites         the boolean array to bear the marks
          * @param variantSiteLocation the location where a variant site was found
          */
    -    protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
    +    protected void markVariantRegion(final int variantSiteLocation) {
             int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
    -        int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
    -        for (int i = from; i < to; i++)
    -            markedSites.getVariantSiteBitSet()[i] = true;
    +        int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize;
    +        markRegionAs(from, to, true);
         }
     
         /**
    -     * Adds bases to the running consensus or filtered data accordingly
     +     * Marks the sites in the given index range with the provided variant status
    +     *
    +     * @param from              the start index (inclusive) to mark
    +     * @param to                the end index (inclusive) to mark
    +     * @param isVariant         mark the region with this boolean value
    +     */
    +    private void markRegionAs(final int from, final int to, final boolean isVariant) {
    +        for (int i = from; i <= to; i++)
    +            markedSites.getVariantSiteBitSet()[i] = isVariant;
    +    }
    +
    +    /**
    +     * Adds bases to the running consensus
          * 
          * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
          *
          * @param header  the window header
          * @param start   the first header index to add to consensus
          * @param end     the first header index NOT TO add to consensus
    -     * @param isNegativeStrand  should the synthetic read be represented as being on the negative strand?
    +     * @param strandType  the strandedness that the synthetic read should be represented as having
          * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated.
          */
         @Requires({"start >= 0 && (end >= start || end == 0)"})
         @Ensures("result != null")
    -    protected List addToSyntheticReads(LinkedList header, int start, int end, boolean isNegativeStrand) {
    -        LinkedList reads = new LinkedList();
    -        if (start < end) {
    -            ListIterator headerElementIterator = header.listIterator(start);
    +    protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) {
    +        final ObjectArrayList reads = new ObjectArrayList();
    +
    +        if ( start < end ) {
    +            final ListIterator headerElementIterator = header.listIterator(start);
     
                 if (!headerElementIterator.hasNext())
                     throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d  - %d / %d", start, header.size(), end));
    @@ -412,37 +433,29 @@ public class SlidingWindow {
                 HeaderElement headerElement = headerElementIterator.next();
     
                 if (headerElement.hasConsensusData()) {
    -                reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
    -
    -                int endOfConsensus = findNextNonConsensusElement(header, start, end);
    -                addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand);
     
    +                // find the end of the consecutive consensus data in the window
    +                final int endOfConsensus = findNextNonConsensusElement(header, start, end);
                     if (endOfConsensus <= start)
                         throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
     
    -                reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand));
    -            } else if (headerElement.hasFilteredData()) {
    +                // add to running consensus and recurse
    +                addToRunningConsensus(header, start, endOfConsensus, strandType);
    +                reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType));
    +
    +            } else {
    +
    +                // add any outstanding consensus data
                     reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
     
    -                int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
    -                reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
    -
    -                if (endOfFilteredData <= start)
    -                    throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
    -
    -                reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand));
    -            } else if (headerElement.isEmpty()) {
    -                reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
    -
    -                int endOfEmptyData = findNextNonEmptyElement(header, start, end);
    -
    +                // find the end of the consecutive empty data in the window
    +                final int endOfEmptyData = findNextConsensusElement(header, start, end);
                     if (endOfEmptyData <= start)
                         throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
     
    -                reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand));
    -            } else
    -                throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
    -
    +                // recurse out of the empty region
    +                reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType));
    +            }
             }
     
             return reads;
    @@ -454,24 +467,21 @@ public class SlidingWindow {
          * @param type the synthetic reads you want to close
          * @return a possibly null list of GATKSAMRecords generated by finalizing the synthetic reads
          */
    -    private List finalizeAndAdd(ConsensusType type) {
    -        GATKSAMRecord read = null;
    -        List list = new LinkedList();
    +    private ObjectArrayList finalizeAndAdd(final ConsensusType type) {
     
    -        switch (type) {
    -            case CONSENSUS:
    -                read = finalizeRunningConsensus();
    -                break;
    -            case FILTERED:
    -                read = finalizeFilteredDataConsensus();
    -                break;
    -            case BOTH:
    -                read = finalizeRunningConsensus();
    -                if (read != null) list.add(read);
    -                read = finalizeFilteredDataConsensus();
    +        final ObjectArrayList list = new ObjectArrayList();
    +
    +        if ( type == ConsensusType.CONSENSUS || type == ConsensusType.BOTH ) {
    +            final GATKSAMRecord read = finalizeRunningConsensus();
    +            if ( read != null )
    +                list.add(read);
    +        }
    +
    +        if ( type == ConsensusType.FILTERED || type == ConsensusType.BOTH ) {
    +            final GATKSAMRecord read = finalizeFilteredDataConsensus();
    +            if ( read != null )
    +                list.add(read);
             }
    -        if (read != null)
    -            list.add(read);
     
             return list;
         }
    @@ -479,19 +489,145 @@ public class SlidingWindow {
         /**
          * Looks for the next position without consensus data
          *
    -     * @param start beginning of the filtered region
    -     * @param upTo  limit to search for another consensus element
    +     * @param header the header to check
    +     * @param start  beginning of the filtered region
    +     * @param upTo   limit to search for another consensus element
          * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
          */
    -    private int findNextNonConsensusElement(LinkedList header, int start, int upTo) {
    -        Iterator headerElementIterator = header.listIterator(start);
    +    private int findNextNonConsensusElement(final LinkedList header, final int start, final int upTo) {
    +        final Iterator headerElementIterator = header.listIterator(start);
             int index = start;
             while (index < upTo) {
                 if (!headerElementIterator.hasNext())
                     throw new ReviewedStingException("There are no more header elements in this window");
     
    -            HeaderElement headerElement = headerElementIterator.next();
    +            if (!headerElementIterator.next().hasConsensusData())
    +                break;
    +            index++;
    +        }
    +        return index;
    +    }
    +
    +    /**
     +     * Looks for the next position with consensus data
    +     *
    +     * @param header the header to check
    +     * @param start  beginning of the filtered region
    +     * @param upTo   limit to search for another consensus element
    +     * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
    +     */
    +    private int findNextConsensusElement(final LinkedList header, final int start, final int upTo) {
    +        final Iterator headerElementIterator = header.listIterator(start);
    +        int index = start;
    +        while (index < upTo) {
    +            if (!headerElementIterator.hasNext())
    +                throw new ReviewedStingException("There are no more header elements in this window");
    +
    +            if (headerElementIterator.next().hasConsensusData())
    +                break;
    +            index++;
    +        }
    +        return index;
    +    }
    +
    +    /**
     +     * Adds bases to the running consensus synthetic read.
     +     *
     +     * This method assumes a contiguous sequence of consensus bases; it fails if any element in the
     +     * range lacks consensus data.
    +     *
    +     * @param header  the window header
    +     * @param start the first header index to add to consensus
    +     * @param end   the first header index NOT TO add to consensus
    +     * @param strandType  the strandedness that the synthetic read should be represented as having
    +     */
    +    @Requires({"start >= 0 && (end >= start || end == 0)"})
    +    private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) {
    +        if (runningConsensus == null)
    +            runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType);
    +
    +        final Iterator headerElementIterator = header.listIterator(start);
    +
    +        for (int index = start; index < end; index++) {
    +            if (!headerElementIterator.hasNext())
    +                throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
    +
    +            final HeaderElement headerElement = headerElementIterator.next();
                 if (!headerElement.hasConsensusData())
    +                throw new ReviewedStingException("No CONSENSUS data in " + index);
    +
    +            genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts());
    +        }
    +    }
    +
    +    /**
    +     * Adds bases to the running filtered data accordingly
    +     *
    +     * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
    +     *
    +     * @param header  the window header
    +     * @param start   the first header index to add to consensus
    +     * @param end     the first header index NOT TO add to consensus
    +     * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated.
    +     */
    +    @Requires({"start >= 0 && (end >= start || end == 0)"})
    +    @Ensures("result != null")
    +    protected ObjectArrayList addToFilteredReads(final LinkedList header, final int start, final int end) {
    +        final ObjectArrayList reads = new ObjectArrayList();
    +
    +        if ( start < end ) {
    +            final ListIterator headerElementIterator = header.listIterator(start);
    +
    +            if (!headerElementIterator.hasNext())
    +                throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d  - %d / %d", start, header.size(), end));
    +
    +            HeaderElement headerElement = headerElementIterator.next();
    +
    +            if (headerElement.hasFilteredData()) {
    +
    +                // find the end of the consecutive filtered data in the window
    +                final int endOfFiltered = findNextNonFilteredElement(header, start, end);
    +                if (endOfFiltered <= start)
    +                    throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFiltered, start));
    +
    +                // add to running filtered consensus and recurse
    +                addToFilteredData(header, start, endOfFiltered);
    +                reads.addAll(addToFilteredReads(header, endOfFiltered, end));
    +
    +            } else {
    +
    +                // add any outstanding filtered data
    +                reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
    +
    +                // find the end of the consecutive empty data in the window
    +                final int endOfEmptyData = findNextFilteredElement(header, start, end);
    +                if (endOfEmptyData <= start)
    +                    throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
    +
    +                // recurse out of the empty region
    +                reads.addAll(addToFilteredReads(header, endOfEmptyData, end));
    +            }
    +        }
    +
    +        return reads;
    +    }
    +
    +    /**
     +     * Looks for the next position without filtered data
     +     *
     +     * @param header the header to check
     +     * @param start  beginning of the filtered region
     +     * @param upTo   limit to search for a non-filtered element
     +     * @return next position in local coordinates (relative to the windowHeader) without filtered data; otherwise, the start position
    +     */
    +    private int findNextNonFilteredElement(final LinkedList header, final int start, final int upTo) {
    +        final Iterator headerElementIterator = header.listIterator(start);
    +        int index = start;
    +        while (index < upTo) {
    +            if (!headerElementIterator.hasNext())
    +                throw new ReviewedStingException("There are no more header elements in this window");
    +
    +            if (!headerElementIterator.next().hasFilteredData())
                     break;
                 index++;
             }
    @@ -499,43 +635,21 @@ public class SlidingWindow {
         }
     
         /**
    -     * Looks for the next position without filtered data
     +     * Looks for the next position with filtered data
          *
    -     * @param start beginning of the region
    -     * @param upTo  limit to search for
    -     * @return next position in local coordinates (relative to the windowHeader) with no filtered data; otherwise, the start position
     +     * @param header the header to check
     +     * @param start  beginning of the region to search
     +     * @param upTo   limit to search for a filtered element
     +     * @return next position in local coordinates (relative to the windowHeader) with filtered data; otherwise, the start position
          */
    -    private int findNextNonFilteredDataElement(LinkedList header, int start, int upTo) {
    -        Iterator headerElementIterator = header.listIterator(start);
    +    private int findNextFilteredElement(final LinkedList header, final int start, final int upTo) {
    +        final Iterator headerElementIterator = header.listIterator(start);
             int index = start;
             while (index < upTo) {
                 if (!headerElementIterator.hasNext())
                     throw new ReviewedStingException("There are no more header elements in this window");
     
    -            HeaderElement headerElement = headerElementIterator.next();
    -            if (!headerElement.hasFilteredData() || headerElement.hasConsensusData())
    -                break;
    -            index++;
    -        }
    -        return index;
    -    }
    -
    -    /**
    -     * Looks for the next non-empty header element
    -     *
    -     * @param start beginning of the region
    -     * @param upTo  limit to search for
    -     * @return next position in local coordinates (relative to the windowHeader) with non-empty element; otherwise, the start position
    -     */
    -    private int findNextNonEmptyElement(LinkedList header, int start, int upTo) {
    -        ListIterator headerElementIterator = header.listIterator(start);
    -        int index = start;
    -        while (index < upTo) {
    -            if (!headerElementIterator.hasNext())
    -                throw new ReviewedStingException("There are no more header elements in this window");
    -
    -            HeaderElement headerElement = headerElementIterator.next();
    -            if (!headerElement.isEmpty())
    +            if (headerElementIterator.next().hasFilteredData())
                     break;
                 index++;
             }
    @@ -551,66 +665,25 @@ public class SlidingWindow {
          * @param header  the window header
          * @param start   the first header index to add to consensus
          * @param end     the first header index NOT TO add to consensus
    -     * @param isNegativeStrand  should the synthetic read be represented as being on the negative strand?
    -     * @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated.
          */
         @Requires({"start >= 0 && (end >= start || end == 0)"})
         @Ensures("result != null")
    -    private List addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) {
    -        List result = new ArrayList(0);
    +    private void addToFilteredData(final LinkedList header, final int start, final int end) {
     
             if (filteredDataConsensus == null)
    -            filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
    +            filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, SyntheticRead.StrandType.STRANDLESS);
     
             ListIterator headerElementIterator = header.listIterator(start);
             for (int index = start; index < end; index++) {
                 if (!headerElementIterator.hasNext())
                     throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
     
    -            HeaderElement headerElement = headerElementIterator.next();
    -            if (headerElement.hasConsensusData())
    -                throw new ReviewedStingException("Found consensus data inside region to add to filtered data.");
    +            final HeaderElement headerElement = headerElementIterator.next();
     
                 if (!headerElement.hasFilteredData())
                     throw new ReviewedStingException("No filtered data in " + index);
     
    -            if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
    -                result.add(finalizeFilteredDataConsensus());
    -                filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
    -            }
    -
    -            genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
    -        }
    -
    -        return result;
    -    }
    -
    -    /**
    -     * Adds bases to the filtered data synthetic read.
    -     * 
    -     * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
    -     * bases.
    -     *
    -     * @param header  the window header
    -     * @param start the first header index to add to consensus
    -     * @param end   the first header index NOT TO add to consensus
    -     * @param isNegativeStrand  should the synthetic read be represented as being on the negative strand?
    -     */
    -    @Requires({"start >= 0 && (end >= start || end == 0)"})
    -    private void addToRunningConsensus(LinkedList header, int start, int end, boolean isNegativeStrand) {
    -        if (runningConsensus == null)
    -            runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
    -
    -        Iterator headerElementIterator = header.listIterator(start);
    -        for (int index = start; index < end; index++) {
    -            if (!headerElementIterator.hasNext())
    -                throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
    -
    -            HeaderElement headerElement = headerElementIterator.next();
    -            if (!headerElement.hasConsensusData())
    -                throw new ReviewedStingException("No CONSENSUS data in " + index);
    -
    -            genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
    +            genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts());
             }
         }
     
    @@ -619,15 +692,14 @@ public class SlidingWindow {
          *
          * @param syntheticRead the synthetic read to add to
          * @param baseCounts    the base counts object in the header element
    -     * @param rms           the rms mapping quality in the header element
          */
    -    private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
    +    private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) {
             final BaseIndex base = baseCounts.baseIndexWithMostProbability();
             byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
             byte qual = baseCounts.averageQualsOfBase(base);
             byte insQual = baseCounts.averageInsertionQualsOfBase(base);
             byte delQual = baseCounts.averageDeletionQualsOfBase(base);
    -        syntheticRead.add(base, count, qual, insQual, delQual, rms);
    +        syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS());
         }
     
         /**
    @@ -635,117 +707,219 @@ public class SlidingWindow {
          *
          * @param start   the first window header index in the variant region (inclusive)
          * @param stop    the last window header index of the variant region (inclusive)
    -     * @param disallowPolyploidReductionAtThisPosition       should we disallow polyploid (het) compression here?
    -     * @return a non-null list of all reads contained in the variant region
    +     * @param knownSnpPositions  the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
    +     * @return a non-null object representing all reads contained in the variant region
          */
         @Requires({"start >= 0 && (stop >= start || stop == 0)"})
         @Ensures("result != null")
    -    protected List compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
    -        List allReads = new LinkedList();
    +    protected CloseVariantRegionResult compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) {
    +        final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop);
     
             // Try to compress into a polyploid consensus
    -        int nVariantPositions = 0;
    -        int hetRefPosition = -1;
    -        boolean canCompress = true;
    -        Object[] header = windowHeader.toArray();
    +        // Optimization: don't bother if there are no known SNPs here
    +        final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop);
     
    -        // foundEvent will remain false if we don't allow polyploid reduction
    -        if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
    -            for (int i = start; i<=stop; i++) {
    -
    -                int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
    -
    -                // we will only work on diploid cases because we just don't want to handle/test other scenarios
    -                if ( nAlleles > 2 ) {
    -                    canCompress = false;
    -                    break;
    -                } else if ( nAlleles == 2 ) {
    -                    nVariantPositions++;
    -
    -                    // make sure that there is only 1 site in the variant region that contains more than one allele
    -                    if ( nVariantPositions == 1 ) {
    -                        hetRefPosition = i;
    -                    } else if ( nVariantPositions > 1 ) {
    -                        canCompress = false;
    -                        break;
    -                    }
    -                }
    -            }
    +        // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by
    +        //   insertions (which we don't want because we can't confirm that they represent the same allele).
    +        // Also, we only allow polyploid consensus creation at known sites if provided.
    +        if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) {
    +            // try to create the polyploid consensus
    +            allReads.reads.addAll(createPolyploidConsensus(hetRefPosition));
    +            allReads.stopPerformed = hetRefPosition;  // we stopped at the het position
             }
    -
    -        // Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress
    -        // variant regions that are created by insertions (since we can't confirm here that they represent the same allele)
    -        if ( canCompress && hetRefPosition != -1 ) {
    -            allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation());
    -        }
    -
    -        // Return all reads that overlap the variant region and remove them from the window header entirely
    -        // also remove all reads preceding the variant region (since they will be output as consensus right after compression
    +        // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them
    +        // from the window header entirely; also remove all reads preceding the variant region (since they will be output
    +        // as consensus right after compression)
             else {
                 final int refStart = windowHeader.get(start).getLocation();
                 final int refStop = windowHeader.get(stop).getLocation();
     
    -            LinkedList toRemove = new LinkedList();
    -            for (GATKSAMRecord read : readsInWindow) {
    -                if (read.getSoftStart() <= refStop) {
    -                    if (read.getAlignmentEnd() >= refStart) {
    -                        allReads.add(read);
    +            final ObjectList toRemove = new ObjectArrayList();
    +            for ( final GATKSAMRecord read : readsInWindow ) {
    +                if ( read.getSoftStart() <= refStop ) {
    +                    if ( read.getAlignmentEnd() >= refStart ) {
    +                        allReads.reads.add(read);
                             removeFromHeader(windowHeader, read);
                         }
                         toRemove.add(read);
                     }
                 }
    -            removeReadsFromWindow(toRemove);
    +
    +            // remove all used reads
    +            for ( final GATKSAMRecord read : toRemove )
    +                readsInWindow.remove(read);
             }
    +
             return allReads;
         }
     
    +    /**
     +     * Determines whether the given position matches one of the known sites
    +     *
    +     * @param targetPosition     the position of the het site
    +     * @param knownSnpPositions  the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
    +     * @return true if the targetPosition matches a known SNP position, false otherwise
    +     */
    +    @Requires({"targetPosition >= 1 && knownSnpPositions != null"})
    +    protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) {
    +        final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition);
    +        return knownSnpPositions == null || knownSnpPositions.contains(targetLoc);
    +    }
    +
    +    /*
    +     * Finds the het variant position located within start and stop (inclusive) if one exists.
    +     *
    +     * @param start   the first header index in the region to check (inclusive)
    +     * @param stop    the last header index of the region to check (inclusive)
    +     * @return the window header index of the single het position or -1 if either none or more than one exists
    +     */
    +    @Requires("start >= 0 && (stop >= start || stop == 0)")
    +    protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) {
    +        int hetRefPosition = -1;
    +
    +        for ( int i = start; i <= stop; i++ ) {
    +
    +            final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT);
    +
    +            // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios
    +            if ( nAlleles > 2 || nAlleles == -1 )
    +                return -1;
    +
    +            if ( nAlleles == 2 ) {
    +
    +                // make sure that there is only 1 site in the region that contains more than one allele
    +                if ( hetRefPosition != -1 )
    +                    return -1;
    +
    +                hetRefPosition = i;
    +            }
    +        }
    +
    +        return hetRefPosition;
    +    }
    +
    +    /*
    +     * Checks whether there's a position in the header with a significant number of softclips or a variant.
    +     *
    +     * @param header          the window header to examine
    +     * @param positionToSkip  the global position to skip in the examination (use negative number if you don't want to make use of this argument)
    +     * @return true if there exists a position with significant softclips, false otherwise
    +     */
    +    @Requires("header != null")
    +    protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List<HeaderElement> header, final int positionToSkip) {
    +
    +        for ( final HeaderElement headerElement : header ) {
    +
    +            if ( headerElement.getLocation() == positionToSkip )
    +                continue;
    +
    +            if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ||
    +                 headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) > 1 )
    +                return true;
    +        }
    +
    +        return false;
    +    }
    +
         /**
          * Finalizes a variant region, any adjacent synthetic reads.
          *
          * @param start   the first window header index in the variant region (inclusive)
          * @param stop    the last window header index of the variant region (inclusive)
    -     * @param disallowPolyploidReductionAtThisPosition       should we disallow polyploid (het) compression here?
    -     * @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads
    +     * @param knownSnpPositions  the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
    +     * @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads
          */
         @Requires({"start >= 0 && (stop >= start || stop == 0)"})
         @Ensures("result != null")
    -    protected List closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
    -        List allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
    +    protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
    +        final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions);
     
    -        List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
    -        result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
    -        result.addAll(finalizeAndAdd(ConsensusType.BOTH));
    +        final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed);
    +        result.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(allReads.reads) : allReads.reads);
    +        result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS));
    +        result.reads.addAll(addToFilteredReads(windowHeader, 0, allReads.stopPerformed + 1));
    +        result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
     
             return result; // finalized reads will be downsampled if necessary
         }
     
    -    public Set closeVariantRegions(CompressionStash regions) {
    -        TreeSet allReads = new TreeSet(new AlignmentStartWithNoTiesComparator());
    -        if (!regions.isEmpty()) {
    -            int lastStop = -1;
    -            int windowHeaderStart = getStartLocation(windowHeader);
    +    /*
    +     * @see #closeVariantRegions(CompressionStash, ObjectSortedSet, boolean) with forceCloseFullRegions set to false
    +     */
    +    public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
    +        return closeVariantRegions(regions, knownSnpPositions, false);
    +    }
     
    -            for (GenomeLoc region : regions) {
    -                if (((FinishedGenomeLoc)region).isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) {
    -                    int start = region.getStart() - windowHeaderStart;
    +    private static final class CloseVariantRegionResult {
    +        final private ObjectList<GATKSAMRecord> reads = new ObjectArrayList<GATKSAMRecord>();
    +        private int stopPerformed;
    +
    +        public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; }
    +    }
    +
    +    /*
    +     * Finalizes the list of regions requested (and any regions preceding them)
    +     *
    +     * @param regions            the list of regions to finalize
    +     * @param knownSnpPositions  the set of known SNP positions; can be null (to allow polyploid consensus anywhere)
    +     * @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. during het compression)
    +     * @return a non-null set of reduced reads representing the finalized regions
    +     */
    +    public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions, final boolean forceCloseFullRegions) {
    +        final ObjectAVLTreeSet<GATKSAMRecord> allReads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
    +        if ( !regions.isEmpty() ) {
    +
    +            int windowHeaderStart = getStartLocation(windowHeader);
    +            HeaderElement lastCleanedElement = null;
    +
    +            for ( final GenomeLoc region : regions ) {
    +                if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) {
    +                    final int start = region.getStart() - windowHeaderStart;
                         int stop = region.getStop() - windowHeaderStart;
     
    -                    allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
    -                    lastStop = stop;
    +                    CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
    +                    allReads.addAll(closeVariantRegionResult.reads);
    +
    +                    // check whether we didn't close the whole region that was requested
    +                    if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) {
    +                        // we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant"
    +                        markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false);
    +
    +                        // if the calling method said that it didn't care then we are okay so update the stop
    +                        if ( !forceCloseFullRegions ) {
    +                            stop = closeVariantRegionResult.stopPerformed;
    +                        }
    +                        // otherwise, we need to forcibly push the stop that we originally requested
    +                        else {
    +                            while ( closeVariantRegionResult.stopPerformed < stop ) {
    +                                // first clean up used header elements so they don't get reused
    +                                for ( int i = 0; i <= closeVariantRegionResult.stopPerformed; i++ )
    +                                    windowHeader.remove();
    +                                stop -= (closeVariantRegionResult.stopPerformed + 1);
    +
    +                                closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions);
    +                                allReads.addAll(closeVariantRegionResult.reads);
    +                            }
    +                        }
    +                    }
    +
    +                    // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions.
    +                    // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads
    +                    //  (in the case where there are multiple regions to close we'd reuse the reads for each region).
    +                    if ( stop >= 0 ) {
    +                        for ( int i = 0; i < stop; i++ )
    +                            windowHeader.remove();
    +                        lastCleanedElement = windowHeader.remove();
    +                        windowHeaderStart = getStartLocation(windowHeader);
    +                    }
                     }
                 }
     
    -            // clean up the window header elements up until the end of the variant region.
    -            // note that we keep the last element of the region in the event that the following element has a read that starts with insertion.
    -            if ( lastStop >= 0 ) {
    -                for (int i = 0; i < lastStop; i++)
    -                    windowHeader.remove();
    -                final HeaderElement lastOfRegion = windowHeader.remove();
    -                if ( lastOfRegion.hasInsertionToTheRight() )
    -                    windowHeader.addFirst(new HeaderElement(lastOfRegion.getLocation(), lastOfRegion.numInsertionsToTheRight()));
    -            }
    +            // we need to keep the last element of the last cleaned region in the event that the following element has a read that starts with an insertion.
    +            if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() )
    +                windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight()));
             }
    +
             return allReads;
         }
     
    @@ -759,7 +933,7 @@ public class SlidingWindow {
          */
         @Requires({"allReads != null"})
         @Ensures("result != null")
    -    protected List<GATKSAMRecord> downsampleVariantRegion(final List<GATKSAMRecord> allReads) {
    +    protected ObjectList<GATKSAMRecord> downsampleVariantRegion(final ObjectList<GATKSAMRecord> allReads) {
             int nReads = allReads.size();
             if (nReads == 0)
                 return allReads;
    @@ -769,7 +943,7 @@ public class SlidingWindow {
     
        ReservoirDownsampler<GATKSAMRecord> downsampler = new ReservoirDownsampler<GATKSAMRecord>(downsampleCoverage);
             downsampler.submit(allReads);
    -        return downsampler.consumeFinalizedItems();
    +        return new ObjectArrayList<GATKSAMRecord>(downsampler.consumeFinalizedItems());
         }
     
     
    @@ -778,27 +952,28 @@ public class SlidingWindow {
          * regions that still exist regardless of being able to fulfill the
          * context size requirement in the end.
          *
    +     * @param knownSnpPositions  the set of known SNP positions; can be null (to allow polyploid consensus anywhere)
          * @return A non-null set/list of all reads generated
          */
         @Ensures("result != null")
    -    public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
    +    public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
             // mark variant regions
    -        Set<GATKSAMRecord> finalizedReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
    +        ObjectSet<GATKSAMRecord> finalizedReads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
             CompressionStash regions = new CompressionStash();
    -        boolean forceCloseUnfinishedRegions = true;
     
             if (!windowHeader.isEmpty()) {
                 markSites(getStopLocation(windowHeader) + 1);
    -            regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
    -            finalizedReads = closeVariantRegions(regions);
    +            regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true);
    +            finalizedReads = closeVariantRegions(regions, knownSnpPositions, true);
     
                 if (!windowHeader.isEmpty()) {
    -                finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
    +                finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS));
    +                finalizedReads.addAll(addToFilteredReads(windowHeader, 0, windowHeader.size()));
                     finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH));                                              // if it ended in running consensus, finish it up
                 }
             }
     
    -        return new Pair<Set<GATKSAMRecord>, CompressionStash>(finalizedReads, regions);
    +        return new Pair<ObjectSet<GATKSAMRecord>, CompressionStash>(finalizedReads, regions);
         }
     
         /**
    @@ -837,86 +1012,120 @@ public class SlidingWindow {
             return finalizedRead;
         }
     
    +    // define this so that we can use Java generics below
    +    private final static class HeaderElementList extends LinkedList<HeaderElement> {}
    +
    +    private final static class SingleStrandConsensusData {
    +        final HeaderElementList consensus = new HeaderElementList();
    +        final ObjectList<GATKSAMRecord> reads = new ObjectArrayList<GATKSAMRecord>();
    +    }
    +
         /**
    -     * Finalizes a variant region, any adjacent synthetic reads.
    +     * Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not
    +     * supported) with polyploid compression.
          *
    -     * @param start   the first window header index in the variant region (inclusive)
    -     * @param stop    the last window header index of the variant region (inclusive)
    -     * @param hetRefPosition    reference position (in global coordinates) of the het site
    +     * @param hetRefPosition    window header index of the het site; MUST NOT BE AN INDEL SITE!
          * @return a non-null list of all reads contained in the variant region as a polyploid consensus
          */
         @Requires({"start >= 0 && (stop >= start || stop == 0)"})
    -    @Ensures("result != null")
    -    private List createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) {
    -        // we will create two (positive strand, negative strand) headers for each contig
    -        List<LinkedList<HeaderElement>> headersPosStrand = new ArrayList<LinkedList<HeaderElement>>();
    -        List<LinkedList<HeaderElement>> headersNegStrand = new ArrayList<LinkedList<HeaderElement>>();
    -        List<GATKSAMRecord> hetReads = new LinkedList<GATKSAMRecord>();
    -        Map<Byte, Integer> haplotypeHeaderMap = new HashMap<Byte, Integer>(2);
    -        int currentHaplotype = 0;
    -        int refStart = windowHeader.get(start).getLocation();
    -        int refStop = windowHeader.get(stop).getLocation();
    -        List toRemove = new LinkedList();
    -        for (GATKSAMRecord read : readsInWindow) {
    -            int haplotype;
    +    @Ensures({"result != null"})
    +    protected ObjectList<GATKSAMRecord> createPolyploidConsensus(final int hetRefPosition) {
    +        // we will create two (positive strand, negative strand) headers for each haplotype
    +        final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2];
    +        final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2];
     
    -            // check if the read is either before or inside the variant region
    -            if (read.getSoftStart() <= refStop) {
    -                // check if the read is inside the variant region
    -                if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) {
    -                    // check if the read contains the het site
    -                    if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) {
    -                        int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
    -                        // TODO -- THIS IS A HUGE BUG AS IT WILL NOT WORK FOR DELETIONS; see commented out unit test
    -                        byte base = read.getReadBases()[readPos];
    -                        byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos];
    +        final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation();
     
    -                        // check if base passes the filters!
    -                        if (qual >= MIN_BASE_QUAL_TO_COUNT) {
    -                            // check which haplotype this read represents and take the index of it from the list of headers
    -                            if (haplotypeHeaderMap.containsKey(base)) {
    -                                haplotype = haplotypeHeaderMap.get(base);
    -                            }
    -                            // create new lists if this haplotype has not been seen yet
    -                            else {
    -                                haplotype = currentHaplotype;
    -                                haplotypeHeaderMap.put(base, currentHaplotype);
    -                                headersPosStrand.add(new LinkedList());
    -                                headersNegStrand.add(new LinkedList());
    -                                currentHaplotype++;
    -                            }
    -                            LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype);
    -                            // add to the polyploid header
    -                            addToHeader(header, read);
    -                            // remove from the standard header so that we don't double count it
    -                            removeFromHeader(windowHeader, read);
    -                        }
    -                    }
    -                }
    +        // initialize the mapping from base (allele) to header
    +        final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2);
    +        for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ) {
    +            final int currentIndex = alleleHeaderMap.size();
    +            if ( currentIndex > 1 )
    +                throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus");
     
    -                // we remove all reads before and inside the variant region from the window
    -                toRemove.add(read);
    +            alleleHeaderMap.put(allele.b, currentIndex);
    +            headersPosStrand[currentIndex] = new SingleStrandConsensusData();
    +            headersNegStrand[currentIndex] = new SingleStrandConsensusData();
    +        }
    +
    +        // sanity check that we saw 2 alleles
    +        if ( alleleHeaderMap.size() != 2 )
    +            throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size());
    +
    +        final ObjectList<GATKSAMRecord> readsToRemove = new ObjectArrayList<GATKSAMRecord>();
    +
    +        for ( final GATKSAMRecord read : readsInWindow ) {
    +
    +            // if the read falls after the het position, just skip it for now (we'll get to it later)
    +            if ( read.getSoftStart() > globalHetRefPosition )
    +                continue;
    +
    +            // remove all other reads from the read cache since we're going to use them here
    +            readsToRemove.add(read);
    +
    +            // if the read falls before the het position or has low MQ, we don't need to look at it
    +            if ( read.getSoftEnd() < globalHetRefPosition || read.getMappingQuality() < MIN_MAPPING_QUALITY)
    +                continue;
    +
    +            // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead
    +            removeFromHeader(windowHeader, read);
    +
    +            // where on the read is the het position?
    +            final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
    +
    +            // this is safe because indels are not supported
    +            final byte base = read.getReadBases()[readPosOfHet];
    +
    +            // check which allele this read represents
    +            final Integer allele = alleleHeaderMap.get(base);
    +
    +            // ignore the read if it represents a base that's not part of the consensus
    +            if ( allele != null ) {
    +                // add to the appropriate polyploid header
    +                final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele];
    +                header.reads.add(read);
    +                addToHeader(header.consensus, read);
                 }
             }
     
    -        for (LinkedList header : headersPosStrand) {
    -            if (header.size() > 0)
    -                hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false));
    -            if (runningConsensus != null)
    -                hetReads.add(finalizeRunningConsensus());
    -        }
    -        for (LinkedList header : headersNegStrand) {
    -            if (header.size() > 0)
    -                hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true));
    -            if (runningConsensus != null)
    -                hetReads.add(finalizeRunningConsensus());
    -        }
    +        for ( final GATKSAMRecord read : readsToRemove )
    +            readsInWindow.remove(read);
     
    -        removeReadsFromWindow(toRemove);
    +        // create the polyploid synthetic reads if we can
    +        final ObjectList<GATKSAMRecord> hetReads = new ObjectArrayList<GATKSAMRecord>();
    +
    +        // sanity check that no new "variant region" exists on just a single consensus strand due to softclips
    +        // or multi-allelic sites now that we've broken everything out into their component parts.  if one does
    +        // exist then we need to back out the consensus for that strand only.
    +        for ( final SingleStrandConsensusData header : headersPosStrand ) {
    +            if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) )
    +                hetReads.addAll(header.reads);
    +            else
    +                finalizeHetConsensus(header.consensus, false, hetReads);
    +        }
    +        for ( final SingleStrandConsensusData header : headersNegStrand ) {
    +            if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) )
    +                hetReads.addAll(header.reads);
    +            else
    +                finalizeHetConsensus(header.consensus, true, hetReads);
    +        }
     
             return hetReads;
         }
     
    +    /*
    +     * Finalizes a particular het consensus for the given header representation
    +     *
    +     * @param header            the list of header elements representing the header for the consensus
    +     * @param isNegativeStrand  does this header represent reads on the negative strand?
    +     * @param result            list in which to store results
    +     */
    +    protected void finalizeHetConsensus(final LinkedList<HeaderElement> header, final boolean isNegativeStrand, final ObjectList<GATKSAMRecord> result) {
    +        if ( header.size() > 0 )
    +            result.addAll(addToSyntheticReads(header, 0, header.size(), isNegativeStrand ? SyntheticRead.StrandType.NEGATIVE : SyntheticRead.StrandType.POSITIVE));
    +        if ( runningConsensus != null )
    +            result.add(finalizeRunningConsensus());
    +    }
     
        private void addToHeader(LinkedList<HeaderElement> header, GATKSAMRecord read) {
             updateHeaderCounts(header, read, false);
    @@ -926,115 +1135,158 @@ public class SlidingWindow {
             updateHeaderCounts(header, read, true);
         }
     
    -
         /**
          * Updates the sliding window's header counts with the incoming read bases, insertions
          * and deletions.
          *
    -     * @param header the sliding window header to use
    -     * @param read the incoming read to be added to the sliding window
    -     * @param removeRead if we are removing the read from the header or adding
    +     * @param header      the sliding window header to use
    +     * @param read        the incoming read to be added to the sliding window
    +     * @param removeRead  if we are removing the read from the header or adding
          */
    -    private void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) {
    -        byte[] bases = read.getReadBases();
    -        byte[] quals = read.getBaseQualities();
    -        byte[] insQuals = read.getExistingBaseInsertionQualities();
    -        byte[] delQuals = read.getExistingBaseDeletionQualities();
    -        int readStart = read.getSoftStart();
    -        int readEnd = read.getSoftEnd();
    -        Cigar cigar = read.getCigar();
    +    protected void updateHeaderCounts(final LinkedList<HeaderElement> header, final GATKSAMRecord read, final boolean removeRead) {
    +        final int readStart = read.getSoftStart();
    +        final int headerStart = getStartLocation(header);
    +        int locationIndex = headerStart < 0 ? 0 : readStart - headerStart;
     
    -        int readBaseIndex = 0;
    -        int startLocation = getStartLocation(header);
    -        int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
    -        int stopLocation = getStopLocation(header);
    +        if ( removeRead && locationIndex < 0 )
    +            throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header));
     
    -        if (removeRead && locationIndex < 0)
    -            throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
    +        // we only need to create new header elements if we are adding the read, not when we're removing it
    +        if ( !removeRead )
    +            locationIndex = createNewHeaderElements(header, read, locationIndex);
     
    -        if (!removeRead) {                                                                                              // we only need to create new header elements if we are adding the read, not when we're removing it
    -            if (locationIndex < 0) {                                                                                    // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
    -                for (int i = 1; i <= -locationIndex; i++)
    -                    header.addFirst(new HeaderElement(startLocation - i));
    +        actuallyUpdateHeaderForRead(header, read, removeRead, locationIndex);
    +    }
     
    -                startLocation = readStart;                                                               // update start location accordingly
    -                locationIndex = 0;
    -            }
    +    /*
    +     * Creates new header elements if needed for the given read.
    +     *
    +     * @param header        the sliding window header to use
    +     * @param read          the incoming read to be added to the sliding window
    +     * @param startIndex    the start location index into the header for this read
    +     *
    +     * @return an updated index into the modified header
    +     */
    +    @Requires("header != null && read != null")
    +    protected int createNewHeaderElements(final LinkedList<HeaderElement> header, final GATKSAMRecord read, final int startIndex) {
     
    -            if (stopLocation < readEnd) {                                                                // Do we need to add extra elements to the header?
    -                int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
    -                while (elementsToAdd-- > 0)
    -                    header.addLast(new HeaderElement(readEnd - elementsToAdd));
    -            }
    +        int headerStart = getStartLocation(header);
    +        int locationIndex = startIndex;
     
    -            // Special case for leading insertions before the beginning of the sliding read
    -            if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
    -                header.addFirst(new HeaderElement(readStart - 1));                                 // create a new first element to the window header with no bases added
    -                locationIndex = 1;                                                                                      // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
    -            }
    +        // Do we need to add extra elements before the start of the header?  This could happen if the previous read was
    +        // clipped and this alignment starts before the beginning of the window
    +        final int readStart = read.getSoftStart();
    +        if ( startIndex < 0 ) {
    +            for ( int i = 1; i <= -startIndex; i++ )
    +                header.addFirst(new HeaderElement(headerStart - i));
    +
    +            // update the start location accordingly
    +            headerStart = readStart;
    +            locationIndex = 0;
             }
     
    -        Iterator headerElementIterator = header.listIterator(locationIndex);
    +        // Do we need to add extra elements to the end of the header?
    +        final int headerStop = getStopLocation(header);
    +        final int readEnd = read.getSoftEnd();
    +        if ( headerStop < readEnd ) {
    +            final int elementsToAdd = (headerStop < 0) ? readEnd - readStart + 1 : readEnd - headerStop;
    +            for ( int i = elementsToAdd - 1; i >= 0; i-- )
    +                header.addLast(new HeaderElement(readEnd - i));
    +        }
    +
    +        // Special case for leading insertions before the beginning of the sliding read
    +        if ( ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == headerStart || headerStart < 0) ) {
    +            // create a new first element to the window header with no bases added
    +            header.addFirst(new HeaderElement(readStart - 1));
    +            // this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing
    +            locationIndex = 1;
    +        }
    +
    +        return locationIndex;
    +    }
    +
    +    /*
    +     * Actually updates the sliding window's header counts with the incoming read bases and quals (including insertion and deletion quals).
    +     *
    +     * @param header        the sliding window header to use
    +     * @param read          the incoming read to be added to the sliding window
    +     * @param removeRead    if we are removing the read from the header or adding
    +     * @param startIndex    the start location index into the header for this read
    +     */
    +    @Requires("header != null && read != null && startIndex >= 0")
    +    protected void actuallyUpdateHeaderForRead(final LinkedList<HeaderElement> header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) {
    +
    +        final Iterator<HeaderElement> headerElementIterator = header.listIterator(startIndex);
    +        final byte mappingQuality = (byte) read.getMappingQuality();
    +
    +        // iterator variables
    +        int locationIndex = startIndex;
    +        int readBaseIndex = 0;
             HeaderElement headerElement;
    -        for (CigarElement cigarElement : cigar.getCigarElements()) {
    -            switch (cigarElement.getOperator()) {
    +
    +        for ( final CigarElement cigarElement : read.getCigar().getCigarElements() ) {
    +            switch ( cigarElement.getOperator() ) {
                     case H:
                         break;
                     case I:
    -                    if (removeRead && locationIndex == 0) {                                                             // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
    -                        break;
    -                    }
    -
    -                    headerElement = header.get(locationIndex - 1);                                                // insertions are added to the base to the left (previous element)
    -
    -                    if (removeRead) {
    -                        headerElement.removeInsertionToTheRight();
    -                    }
    -                    else {
    -                        headerElement.addInsertionToTheRight();
    -                    }
                         readBaseIndex += cigarElement.getLength();
    -                    break;                                                                                              // just ignore the insertions at the beginning of the read
    -                case D:
    -                    int nDeletions = cigarElement.getLength();
    -                    while (nDeletions-- > 0) {                                                                          // deletions are added to the baseCounts with the read mapping quality as it's quality score
    -                        headerElement = headerElementIterator.next();
    -                        byte mq = (byte) read.getMappingQuality();
    -                        if (removeRead)
    -                            headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
    -                        else
    -                            headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
     
    -                        locationIndex++;
    +                    // Special case: if we no longer have the previous header element, don't worry about it.
    +                    if ( locationIndex == 0 )
    +                        break;
    +
    +                    // insertions are added to the base to the left (previous element)
    +                    headerElement = header.get(locationIndex - 1);
    +
    +                    if ( removeRead )
    +                        headerElement.removeInsertionToTheRight();
    +                    else
    +                        headerElement.addInsertionToTheRight();
    +
    +                    break;
    +                case D:
    +                    // deletions are added to the baseCounts with the read mapping quality as its quality score
    +                    final int nDeletionBases = cigarElement.getLength();
    +                    for ( int i = 0; i < nDeletionBases; i++ ) {
    +                        headerElement = headerElementIterator.next();
    +                        if (removeRead)
    +                            headerElement.removeBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
    +                        else
    +                            headerElement.addBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
                         }
    +                    locationIndex += nDeletionBases;
                         break;
                     case S:
                     case M:
                     case P:
                     case EQ:
                     case X:
    -                    int nBasesToAdd = cigarElement.getLength();
    -                    while (nBasesToAdd-- > 0) {
    +                    final int nBasesToAdd = cigarElement.getLength();
    +                    final boolean isSoftClip = cigarElement.getOperator() == CigarOperator.S;
    +                    final byte[] readBases = read.getReadBases();
    +                    final byte[] readQuals = read.getBaseQualities();
    +                    final boolean readHasIndelQuals = read.hasBaseIndelQualities();
    +                    final byte[] insertionQuals = readHasIndelQuals ? read.getBaseInsertionQualities() : null;
    +                    final byte[] deletionQuals = readHasIndelQuals ? read.getBaseDeletionQualities() : null;
    +
    +                    for ( int i = 0; i < nBasesToAdd; i++ ) {
                             headerElement = headerElementIterator.next();
    -                        byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex];                        // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything)
    -                        byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex];
    -                        if (removeRead)
    -                            headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
    +                        final byte insertionQuality = readHasIndelQuals ? insertionQuals[readBaseIndex] : -1;
    +                        final byte deletionQuality = readHasIndelQuals ? deletionQuals[readBaseIndex] : -1;
    +
    +                        if ( removeRead )
    +                            headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip);
                             else
    -                            headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
    +                            headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip);
     
                             readBaseIndex++;
    -                        locationIndex++;
                         }
    +                    locationIndex += nBasesToAdd;
    +                    break;
    +                default:
                         break;
                 }
             }
         }
    -
    -    private void removeReadsFromWindow (List readsToRemove) {
    -        for (GATKSAMRecord read : readsToRemove) {
    -            readsInWindow.remove(read);
    -        }
    -    }
     }
     
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
    index 631e099a9..9d16ea06f 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
    @@ -47,20 +47,18 @@
     package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
     
     import com.google.java.contract.Requires;
    +import it.unimi.dsi.fastutil.objects.ObjectArrayList;
     import net.sf.samtools.Cigar;
     import net.sf.samtools.CigarElement;
     import net.sf.samtools.CigarOperator;
     import net.sf.samtools.SAMFileHeader;
    -import org.broadinstitute.sting.utils.MathUtils;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     import org.broadinstitute.sting.utils.recalibration.EventType;
     import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
     import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     
    -import java.util.ArrayList;
     import java.util.Iterator;
    -import java.util.LinkedList;
    -import java.util.List;
    +
     
     /**
      * Running Consensus is a read that is compressed as a sliding window travels over the reads
    @@ -76,17 +74,25 @@ import java.util.List;
      * @since 8/26/11
      */
     public class SyntheticRead {
    -    // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce
    -    // memory footprint.
    -    // TODO: better name
    +
    +    /**
    +     * The types of strandedness for synthetic reads
    +     */
    +    public enum StrandType {
    +        POSITIVE,
    +        NEGATIVE,
    +        STRANDLESS
    +    }
    +
    +    // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint.
         private static class SingleBaseInfo {
             byte baseIndexOrdinal; // enum BaseIndex.ordinal
    -        byte count;
    +        int count;
             byte qual;
             byte insertionQual;
             byte deletionQual;
     
    -        SingleBaseInfo(byte baseIndexOrdinal, byte count, byte qual, byte insertionQual, byte deletionQual) {
    +        SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) {
                 this.baseIndexOrdinal = baseIndexOrdinal;
                 this.count = count;
                 this.qual = qual;
    @@ -123,9 +129,8 @@ public class SyntheticRead {
         }
         
         
    -    private final List basesCountsQuals;
    -    private double mappingQuality;                                                                                      // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
    -    private String readTag;
    +    private final ObjectArrayList basesCountsQuals;
    +    private double mappingQuality;
     
         // Information to produce a GATKSAMRecord
         private SAMFileHeader header;
    @@ -135,7 +140,7 @@ public class SyntheticRead {
         private String readName;
         private int refStart;
         private boolean hasIndelQualities = false;
    -    private boolean isNegativeStrand = false;
    +    private StrandType strandType = StrandType.STRANDLESS;
     
         /**
          * Full initialization of the running consensus if you have all the information and are ready to
    @@ -147,14 +152,12 @@ public class SyntheticRead {
          * @param contigIndex     the read's contig index
          * @param readName        the read's name
          * @param refStart        the alignment start (reference based)
    -     * @param readTag         the reduce reads tag for the synthetic read
          */
    -    public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
    +    public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) {
             final int initialCapacity = 10000;
    -        basesCountsQuals = new ArrayList(initialCapacity);
    +        basesCountsQuals = new ObjectArrayList(initialCapacity);
             mappingQuality = 0.0;
     
    -        this.readTag = readTag;
             this.header = header;
             this.readGroupRecord = readGroupRecord;
             this.contig = contig;
    @@ -162,24 +165,7 @@ public class SyntheticRead {
             this.readName = readName;
             this.refStart = refStart;
             this.hasIndelQualities = hasIndelQualities;
    -        this.isNegativeStrand = isNegativeRead;
    -    }
    -
    -    public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
    -        basesCountsQuals = new ArrayList(bases.size());
    -        for (int i = 0; i < bases.size(); ++i) {
    -            basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i)));
    -        }
    -        this.mappingQuality = mappingQuality;
    -        this.readTag = readTag;
    -        this.header = header;
    -        this.readGroupRecord = readGroupRecord;
    -        this.contig = contig;
    -        this.contigIndex = contigIndex;
    -        this.readName = readName;
    -        this.refStart = refStart;
    -        this.hasIndelQualities = hasIndelQualities;
    -        this.isNegativeStrand = isNegativeRead;
    +        this.strandType = strandType;
         }
     
         /**
    @@ -190,7 +176,7 @@ public class SyntheticRead {
          * @param count  number of reads with this base
          */
         @Requires("count <= Byte.MAX_VALUE")
    -    public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
    +    public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) {
             basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual));
             this.mappingQuality += mappingQuality;
         }
    @@ -220,15 +206,18 @@ public class SyntheticRead {
             read.setReferenceIndex(contigIndex);
             read.setReadPairedFlag(false);
             read.setReadUnmappedFlag(false);
    -        read.setReadNegativeStrandFlag(isNegativeStrand);
    -        read.setCigar(buildCigar());                                        // the alignment start may change while building the cigar (leading deletions)
    +        if ( strandType != StrandType.STRANDLESS ) {
    +            read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1');  // must come before next line
    +            read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE);
    +        }
    +        read.setCigar(buildCigar());           // the alignment start may change while building the cigar (leading deletions)
             read.setAlignmentStart(refStart);
             read.setReadName(readName);
             read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
             read.setReadBases(convertReadBases());
             read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size()));
             read.setReadGroup(readGroupRecord);
    -        read.setAttribute(readTag, convertBaseCounts());
    +        read.setReducedReadCountsTag(convertBaseCounts());
     
             if (hasIndelQualities) {
                 read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
    @@ -278,22 +267,14 @@ public class SyntheticRead {
                 });
         }
     
    -    protected byte [] convertBaseCounts() {
    -        byte[] countsArray = convertVariableGivenBases(new SingleBaseInfoIterator() {
    -                public Byte next() {
    -                    return it.next().count;
    -                }
    -            });
    -
    -        if (countsArray.length == 0)
    -            throw new ReviewedStingException("Reduced read has counts array of length 0");
    -
    -        byte[] compressedCountsArray = new byte [countsArray.length];
    -        compressedCountsArray[0] = countsArray[0];
    -        for (int i = 1; i < countsArray.length; i++)
    -            compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);
    -
    -        return compressedCountsArray;
    +    protected int[] convertBaseCounts() {
    +        int[] variableArray = new int[getReadLengthWithNoDeletions()];
    +        int i = 0;
    +        for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
    +            if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte())
    +                variableArray[i++] = singleBaseInfo.count;
    +        }
    +        return variableArray;
         }
     
         private byte [] convertReadBases() {
    @@ -316,7 +297,7 @@ public class SyntheticRead {
          * @return the cigar string for the synthetic read
          */
         private Cigar buildCigar() {
    -        LinkedList cigarElements = new LinkedList();
    +        ObjectArrayList cigarElements = new ObjectArrayList();
             CigarOperator cigarOperator = null;
             int length = 0;
             for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
    @@ -369,7 +350,6 @@ public class SyntheticRead {
                     variableArray[i++] = count;
             }
             return variableArray;
    -
         }
     
         /**
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
    similarity index 97%
    rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
    rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
    index 37e82a90c..417da9d79 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
    @@ -44,10 +44,11 @@
     *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
     */
     
    -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
    +package org.broadinstitute.sting.gatk.walkers.diagnostics;
     
     import org.broadinstitute.sting.commandline.Argument;
     import org.broadinstitute.sting.commandline.Output;
    +import org.broadinstitute.sting.gatk.CommandLineGATK;
     import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
     import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
     import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
    @@ -55,6 +56,8 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
     import org.broadinstitute.sting.gatk.walkers.LocusWalker;
     import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.GenomeLocParser;
    +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
    +import org.broadinstitute.sting.utils.help.HelpConstants;
     
     import java.io.PrintStream;
     import java.util.ArrayList;
    @@ -63,25 +66,25 @@ import java.util.LinkedList;
     import java.util.Map;
     
     /**
    - * Simple walker to plot the coverage distribution per base.
    + * Simple walker to plot the coverage distribution per base
      *
      * 

    * Features of this walker: - *

  1. includes a smart counting of uncovered bases without visiting the uncovered loci.
  2. + *
  3. includes a smart counting of uncovered bases without visiting the uncovered loci
  4. *
  5. includes reads with deletions in the loci (optionally can be turned off)
  6. *

    * - *

    Input

    + *

    Input

    *

    * The BAM file and an optional interval list (works for WGS as well) *

    * - *

    Output

    + *

    Output

    *

    * A GATK Report with the coverage distribution per base * *

    - *

    Examples

    + *

    Examples

    *
      * java -Xmx4g -jar GenomeAnalysisTK.jar \
      *   -R ref.fasta \
    @@ -91,15 +94,16 @@ import java.util.Map;
      *   -fd \
      *   -o report.grp
      * 
    - * User: carneiro - * Date: 1/27/13 - * Time: 11:16 AM + * + * @author carneiro + * @since 1/27/13 */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class BaseCoverageDistribution extends LocusWalker, Map>> { /** * The output GATK Report table */ - @Output(required = true, doc = "The output GATK Report table") + @Output(doc = "The output GATK Report table") private PrintStream out; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java similarity index 93% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java index b1a26b7a2..ad6023579 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; @@ -63,11 +63,36 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import java.io.PrintStream; +/** + * Outputs a list of intervals that are covered above a given threshold. + * + *

    The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.

    + * + *

    Input

    + *

    + * One or more BAM files. + *

    + * + *

    Output

    + *

    + * List of covered (or uncovered) intervals. + *

    + * + *

    Example

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -T FindCoveredIntervals \
    + *   -R ref.fasta \
    + *   -I my_file.bam \
    + *   -o output.list
    + * 
    + * + */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.CONTIG) @ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000) public class FindCoveredIntervals extends ActiveRegionWalker { - @Output(required = true) + @Output private PrintStream out; @Argument(fullName = "uncovered", shortName = "u", required = false, doc = "output intervals that fail the coverage threshold instead") @@ -80,7 +105,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { // Look to see if the region has sufficient coverage public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); + int depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage(); // note the linear probability scale return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java new file mode 100644 index 000000000..dca83af44 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java @@ -0,0 +1,150 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * Generic code for Diagnose Target Statistics + * + * @author Mauricio Carneiro + * @since 4/23/13 + */ +abstract class AbstractStratification { + + private long preComputedTotalCoverage = -1; + private Map statusTally = null; + protected ThresHolder thresholds; + + /** + * Calculates the average "good" coverage of this sample. Good means "passes the base and + * mapping quality requirements. + * + * @return the average "good" coverage + */ + public double averageCoverage(final int size) { + if (preComputedTotalCoverage < 0) + preComputedTotalCoverage = calculateTotalCoverage(getElements()); + return (double) preComputedTotalCoverage / size; + } + + /** + * Calculates the total "good" coverage of this sample. Good means "passes the base and + * mapping quality requirements. 
+ * + * @return the total "good" coverage across the interval for this sample + */ + public long getCoverage() { + if (preComputedTotalCoverage < 0) + preComputedTotalCoverage = calculateTotalCoverage(getElements()); + return preComputedTotalCoverage; + } + + + /** + * This is how the extending class will calculate it's own total coverage + * + * @return the total coverage + */ + private long calculateTotalCoverage(Iterable elements) { + long cov = 0; + for (AbstractStratification element : elements) { + cov += element.getCoverage(); + } + return cov; + } + + /** + * What are the list of elements in your class? For example: + * + * IntervalStatistics => List + * SampleStatistics => List + * + * @return the corresponding list of elements of the extending class + */ + public abstract Iterable getElements(); + + /** + * Calculates the Callable statuses for the statistic as a whole (interval, sample or locus) + * + * @return the callable status(es) for the whole object + */ + public abstract Iterable callableStatuses(); + + + /** + * Tally up all the callable status of all the loci in this sample. + * + * @return a map of callable status and counts + */ + public Map getStatusTally() { + if (statusTally == null) { + statusTally = new HashMap(CallableStatus.values().length); + for (AbstractStratification stats : getElements()) { + for (CallableStatus status : stats.callableStatuses()) { + statusTally.put(status, !statusTally.containsKey(status) ? 
1 : statusTally.get(status) + 1); + } + } + } + return statusTally; + } + + public static List queryStatus(List statList, AbstractStratification stratification) { + List output = new LinkedList(); + for (Metric stat : statList) { + final CallableStatus status = stat.status(stratification); + if (status != null) { + output.add(status); + } + } + return output; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java index 4bc318b02..d38736f4f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * Short one line description of the walker. @@ -52,9 +52,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; * @author Mauricio Carneiro * @since 2/1/12 */ -public enum CallableStatus { - - REF_N("the reference base was an N, which is not considered callable the GATK"), +enum CallableStatus { PASS("the base satisfied the min. 
depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"), @@ -68,12 +66,7 @@ public enum CallableStatus { BAD_MATE("the reads are not properly mated, suggesting mapping errors"), - NO_READS("there are no reads contained in the interval"), - - // - // Interval-level statuses - // - LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples"); + NO_READS("there are no reads contained in the interval"); public final String description; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java similarity index 61% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 8b9b37c18..32f87b973 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -44,10 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -56,13 +56,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; import java.util.*; @@ -75,7 +76,7 @@ import java.util.*; *

    *

    *

    - *

    Input

    + *

    Input

    *

    *

      *
    • A reference file
    • @@ -84,12 +85,12 @@ import java.util.*; *
    *

    *

    - *

    Output

    + *

    Output

    *

    * A modified VCF detailing each interval by sample *

    *

    - *

    Examples

    + *

    Examples

    *
      *    java
      *      -jar GenomeAnalysisTK.jar
    @@ -110,79 +111,52 @@ import java.util.*;
     @PartitionBy(PartitionType.INTERVAL)
     public class DiagnoseTargets extends LocusWalker {
     
    -    @Output(doc = "File to which variants should be written", required = true)
    +    private static final String AVG_INTERVAL_DP_KEY = "IDP";
    +
    +    @Output(doc = "File to which interval statistics should be written")
         private VariantContextWriter vcfWriter = null;
     
    -    @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false)
    -    private int minimumBaseQuality = 20;
    +    @ArgumentCollection
    +    private ThresHolder thresholds = new ThresHolder();
     
    -    @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false)
    -    private int minimumMappingQuality = 20;
    +    private Map intervalMap = null;              // maps each interval => statistics
    +    private PeekableIterator intervalListIterator;                   // an iterator to go over all the intervals provided as we traverse the genome
    +    private Set samples = null;                                         // all the samples being processed
    +    private static final Allele SYMBOLIC_ALLELE = Allele.create("
    ", false); // avoid creating the symbolic allele multiple times + private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times - @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false) - private int minimumCoverage = 5; - - @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) - private int maximumCoverage = 700; - - @Argument(fullName = "minimum_median_depth", shortName = "med", doc = "The minimum allowable median coverage, used for calling LOW_MEDIAN_DEPTH", required = false) - private int minMedianDepth = 10; - - @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false) - private int maxInsertSize = 500; - - @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed percentage of samples containing a call for the interval to adopt the call ", required = false) - private double votePercentage = 0.50; - - @Argument(fullName = "low_median_depth_status_threshold", shortName = "stMED", doc = "The percentage of the loci needed for calling LOW_MEDIAN_DEPTH", required = false) - private double lowMedianDepthPercentage = 0.20; - - @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The percentage of the loci needed for calling BAD_MATE", required = false) - private double badMateStatusThreshold = 0.50; - - @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The percentage of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) - private double coverageStatusThreshold = 0.20; - - @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The percentage of the loci needed for 
calling EXCESSIVE_COVERAGE", required = false) - private double excessiveCoverageThreshold = 0.20; - - @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The percentage of the loci needed for calling POOR_QUALITY", required = false) - private double qualityStatusThreshold = 0.50; - - @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false) - private boolean debug = false; - - private HashMap intervalMap = null; // maps each interval => statistics - private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome - private Set samples = null; // all the samples being processed - private final Allele SYMBOLIC_ALLELE = Allele.create("
    ", false); // avoid creating the symbolic allele multiple times - private ThresHolder thresholds = null; + private static final int INITIAL_HASH_SIZE = 500000; @Override public void initialize() { super.initialize(); - if (getToolkit().getIntervals() == null) - throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )"); + if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) + throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); - thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); - - intervalMap = new HashMap(); + intervalMap = new HashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header - vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header + // get all of the unique sample names for the VCF Header + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); + + // pre load all the statistics classes because it is costly to operate on the JVM and we only want to do it once. 
+ loadAllPlugins(thresholds); } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore - addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus + // process and remove any intervals in the map that are don't overlap the current locus anymore + // and add all new intervals that may overlap this reference locus + outputFinishedIntervals(refLocus, ref.getBase()); + addNewOverlappingIntervals(refLocus); - for (IntervalStatistics intervalStatistics : intervalMap.values()) - intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats + // at this point, all intervals in intervalMap overlap with this locus, so update all of them + for (IntervalStratification intervalStratification : intervalMap.values()) + intervalStratification.addLocus(context); return 1L; } @@ -212,53 +186,40 @@ public class DiagnoseTargets extends LocusWalker { @Override public void onTraversalDone(Long result) { for (GenomeLoc interval : intervalMap.keySet()) - outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true)); - } + outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); - private GenomeLoc getIntervalMapSpan() { - GenomeLoc loc = null; - for (GenomeLoc interval : intervalMap.keySet()) { - if (loc == null) { - loc = interval; - } else - loc = interval.union(loc); + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null) { + outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE); + intervalListIterator.next(); + interval = intervalListIterator.peek(); } - - return loc; - } - - private GenomeLoc getFinishedIntervalSpan(GenomeLoc pos) { - GenomeLoc loc = null; - for (GenomeLoc interval : intervalMap.keySet()) { - if (interval.isBefore(pos)) { - 
if (loc == null) - loc = interval; - else - loc = interval.union(loc); - } - } - - return loc; } /** - * Removes all intervals that are behind the current reference locus from the intervalMap + * Outputs all intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ - private void removePastIntervals(GenomeLoc refLocus, byte refBase) { - // if there are statistics to output/ check to see that we can output them in order - if (getFinishedIntervalSpan(refLocus) != null && - getIntervalMapSpan().getStart() == getFinishedIntervalSpan(refLocus).getStart()) { + private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { + GenomeLoc interval = intervalListIterator.peek(); - for (GenomeLoc interval : intervalMap.keySet()) { - if (interval.isBefore(refLocus)) { - outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true)); - intervalMap.remove(interval); - } + // output empty statistics for uncovered intervals + while (interval != null && interval.isBefore(refLocus)) { + final IntervalStratification stats = intervalMap.get(interval); + outputStatsToVCF(stats != null ? 
stats : createIntervalStatistic(interval), UNCOVERED_ALLELE); + if (stats != null) intervalMap.remove(interval); + intervalListIterator.next(); + interval = intervalListIterator.peek(); + } + + // remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals) + for (GenomeLoc key : intervalMap.keySet()) { + if (key.isBefore(refLocus)) { + outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true)); + intervalMap.remove(key); } - } } @@ -269,17 +230,9 @@ public class DiagnoseTargets extends LocusWalker { */ private void addNewOverlappingIntervals(GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); - - // skip any intervals with no coverage that we have passed - while (interval != null && interval.isBefore(refLocus)) { - intervalListIterator.next(); // discard the interval (we've already added it to the map) - interval = intervalListIterator.peek(); - } - - // add any intervals that overlap this one while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); - intervalListIterator.next(); // discard the interval (we've already added it to the map) + intervalListIterator.next(); interval = intervalListIterator.peek(); } } @@ -290,7 +243,7 @@ public class DiagnoseTargets extends LocusWalker { * @param stats The statistics of the interval * @param refAllele the reference allele */ - private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) { + private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) { GenomeLoc interval = stats.getInterval(); @@ -302,37 +255,26 @@ public class DiagnoseTargets extends LocusWalker { alleles.add(SYMBOLIC_ALLELE); VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); - vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF - 
vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(thresholds), true))); + vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); + vcb.filters(new LinkedHashSet(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(ThresHolder.AVG_INTERVAL_DP_KEY, stats.averageCoverage()); + attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); vcb = vcb.attributes(attributes); - if (debug) { - System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage()); - } for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); - SampleStatistics sampleStat = stats.getSample(sample); - gb.attribute(ThresHolder.AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); - gb.attribute("Q1", sampleStat.getQuantileDepth(0.25)); - gb.attribute("MED", sampleStat.getQuantileDepth(0.50)); - gb.attribute("Q3", sampleStat.getQuantileDepth(0.75)); + SampleStratification sampleStat = stats.getSampleStatistics(sample); + gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size())); - if (debug) { - System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads()); - } - gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds), false)); + gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); genotypes.add(gb.make()); } vcb = vcb.genotypes(genotypes); - vcfWriter.add(vcb.make()); - } /** @@ -341,17 +283,74 @@ public class DiagnoseTargets extends LocusWalker { * @param statuses the set of statuses to be converted * @return a matching set of strings */ - private List statusesToStrings(Set statuses, final boolean includePASS) { - List output = new ArrayList(statuses.size()); + private List statusToStrings(Iterable statuses, final boolean isInfoField) { + List output = new LinkedList(); for (CallableStatus 
status : statuses) - if ( includePASS || status != CallableStatus.PASS ) // adding pass => results in a filter for genotypes + if ( isInfoField || status != CallableStatus.PASS ) output.add(status.name()); return output; } - private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { - return new IntervalStatistics(samples, interval); + private IntervalStratification createIntervalStatistic(GenomeLoc interval) { + return new IntervalStratification(samples, interval, thresholds); } + + protected static void loadAllPlugins(final ThresHolder thresholds) { + for (Class stat : new PluginManager(LocusMetric.class).getPlugins()) { + try { + final LocusMetric stats = (LocusMetric) stat.newInstance(); + stats.initialize(thresholds); + thresholds.locusMetricList.add(stats); + } catch (Exception e) { + throw new DynamicClassResolutionException(stat, e); + } + } + + for (Class stat : new PluginManager(SampleMetric.class).getPlugins()) { + try { + final SampleMetric stats = (SampleMetric) stat.newInstance(); + stats.initialize(thresholds); + thresholds.sampleMetricList.add(stats); + } catch (Exception e) { + throw new DynamicClassResolutionException(stat, e); + } + } + + for (Class stat : new PluginManager(IntervalMetric.class).getPlugins()) { + try { + final IntervalMetric stats = (IntervalMetric) stat.newInstance(); + stats.initialize(thresholds); + thresholds.intervalMetricList.add(stats); + } catch (Exception e) { + throw new DynamicClassResolutionException(stat, e); + } + } + } + + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a loci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); + headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size.")); + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java new file mode 100644 index 000000000..50470a744 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java @@ -0,0 +1,57 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 4/20/13 + * Time: 11:30 PM + * To change this template use File | Settings | File Templates. + */ +interface IntervalMetric extends Metric { +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java new file mode 100644 index 000000000..6c20403d1 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -0,0 +1,132 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.*; + +final class IntervalStratification extends AbstractStratification { + private final Map samples; + private final GenomeLoc interval; + private final ThresHolder thresholds; + + public IntervalStratification(Set samples, GenomeLoc interval, ThresHolder thresholds) { + this.interval = interval; + this.thresholds = thresholds; + this.samples = new HashMap(samples.size()); + for (String sample : samples) + this.samples.put(sample, new SampleStratification(interval, thresholds)); + } + + public SampleStratification getSampleStatistics(String sample) { + return (SampleStratification) samples.get(sample); + } + + public GenomeLoc getInterval() { + return interval; + } + + public int getNSamples() { + return samples.size(); + } + + /** + * The function to populate data into the Statistics from the walker. 
+ * This takes the input and manages passing the data to the SampleStatistics and Locus Statistics + * + * @param context The alignment context given from the walker + */ + public void addLocus(AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + + Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); + + for (Map.Entry entry : samplePileups.entrySet()) { + String sample = entry.getKey(); + ReadBackedPileup samplePileup = entry.getValue(); + SampleStratification sampleStratification = (SampleStratification) samples.get(sample); + + if (sampleStratification == null) + throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); + + sampleStratification.addLocus(context.getLocation(), samplePileup); + } + + } + + /** + * {@inheritDoc} + */ + @Override + public Iterable getElements() { + return samples.values(); + } + + /** + * {@inheritDoc} + */ + @Override + public Iterable callableStatuses() { + final List output = new LinkedList(); + + // check if any of the votes pass the threshold + final int nSamples = getNSamples(); + for (Map.Entry entry : getStatusTally().entrySet()) { + if ((double) entry.getValue() / nSamples > thresholds.votePercentageThreshold) { + output.add(entry.getKey()); + } + } + + output.addAll(queryStatus(thresholds.intervalMetricList, this)); + + return output; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java new file mode 100644 index 000000000..9950b4e2d --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java @@ -0,0 +1,58 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL 
RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 4/20/13 + * Time: 11:29 PM + * To change this template use File | Settings | File Templates. + */ +interface LocusMetric extends Metric { + public CallableStatus sampleStatus (SampleStratification sampleStratification); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java similarity index 86% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java index 1390b0ee9..0973fef1e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java @@ -44,55 +44,30 @@ * 7.7 Governing Law. 
This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; - -import java.util.Arrays; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Mar 23, 2011 + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM */ -// simple node class for storing kmer sequences -@Invariant("kmer > 0") -public class DeBruijnVertex { +final class LocusMetricCoverageGap implements LocusMetric { + private double threshold; + private static final CallableStatus CALL = CallableStatus.COVERAGE_GAPS; - protected final byte[] sequence; - public final int kmer; - - public DeBruijnVertex( final byte[] sequence, final int kmer ) { - this.sequence = sequence.clone(); - this.kmer = kmer; + @Override + public void initialize(ThresHolder thresholds) { + threshold = thresholds.coverageStatusThreshold; } @Override - public boolean equals( Object v ) { - return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence); + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + return locusStratification.getRawCoverage() == 0 ? 
CALL : null; } @Override - public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect - return Arrays.hashCode(sequence); - } - - public String toString() { - return new String(sequence); - } - - public String getSuffixString() { - return new String(getSuffix()); - } - - @Ensures("result != null") - public byte[] getSequence() { - return sequence.clone(); - } - - @Ensures("result != null") - public byte[] getSuffix() { - return Arrays.copyOfRange( sequence, kmer - 1, sequence.length ); + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java new file mode 100644 index 000000000..fbedc5404 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java @@ -0,0 +1,75 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +final class LocusMetricExcessiveCoverage implements LocusMetric { + private int excessiveCoverage; + private double threshold; + private static final CallableStatus CALL = CallableStatus.EXCESSIVE_COVERAGE ; + + @Override + public void initialize(ThresHolder thresholds) { + this.excessiveCoverage = thresholds.maximumCoverage; + this.threshold = thresholds.coverageStatusThreshold; + } + + @Override + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + return locusStratification.getCoverage() > excessiveCoverage ? 
CALL : null; + } + + @Override + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java new file mode 100644 index 000000000..5b5015beb --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java @@ -0,0 +1,76 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +final class LocusMetricLowCoverage implements LocusMetric { + private int minCoverage; + private double threshold; + private static final CallableStatus CALL = CallableStatus.LOW_COVERAGE ; + + @Override + public void initialize(ThresHolder thresholds) { + this.minCoverage = thresholds.minimumCoverage; + this.threshold = thresholds.coverageStatusThreshold; + } + + @Override + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + final long raw = locusStratification.getRawCoverage(); + return raw > 0 && raw < minCoverage ? CALL: null; + } + + @Override + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java new file mode 100644 index 000000000..53c07d421 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java @@ -0,0 +1,75 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +final class LocusMetricPoorQuality implements LocusMetric { + private int minCoverage; + private double threshold; + private static final CallableStatus CALL = CallableStatus.POOR_QUALITY ; + + @Override + public void initialize(ThresHolder thresholds) { + this.minCoverage = thresholds.minimumCoverage; + this.threshold = thresholds.coverageStatusThreshold; + } + + @Override + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + return locusStratification.getCoverage() < minCoverage && locusStratification.getRawCoverage() >= minCoverage ? 
CALL: null; + } + + @Override + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java similarity index 86% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java index 5ec1a1608..d6acaf850 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java @@ -44,61 +44,54 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; -import java.util.HashSet; -import java.util.Set; +import java.util.LinkedList; +import java.util.List; -class LocusStatistics { - private final int coverage; - private final int rawCoverage; +final class LocusStratification extends AbstractStratification { + private long coverage; + private long rawCoverage; + private final List locusStatisticsList; - public LocusStatistics() { - this.coverage = 0; - this.rawCoverage = 0; + public LocusStratification(ThresHolder thresholds) { + this(0,0,thresholds); } - public LocusStatistics(int coverage, int rawCoverage) { + protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) { this.coverage = coverage; this.rawCoverage = rawCoverage; + this.locusStatisticsList = thresholds.locusMetricList; } - public int getCoverage() { - return coverage; - } + @Override + public long getCoverage() {return coverage;} + public long getRawCoverage() {return rawCoverage;} - public int getRawCoverage() { - return rawCoverage; + public void addLocus(final int coverage, final int rawCoverage) { + this.coverage = coverage; + this.rawCoverage = rawCoverage; } /** * Generates all applicable statuses from the coverages in this locus * - * @param thresholds the class contains the statistical threshold for making calls * @return a set of all statuses that apply */ - public Set callableStatuses(ThresHolder thresholds) { - Set output = new HashSet(); - - // if too much coverage - if (getCoverage() > thresholds.getMaximumCoverage()) - output.add(CallableStatus.EXCESSIVE_COVERAGE); - - // if not enough coverage - if (getCoverage() < thresholds.getMinimumCoverage()) { - // was there a lot of low Qual coverage? - if (getRawCoverage() >= thresholds.getMinimumCoverage()) - output.add(CallableStatus.POOR_QUALITY); - // no? - else { - // is there any coverage? 
- if (getRawCoverage() > 0) - output.add(CallableStatus.LOW_COVERAGE); - else - output.add(CallableStatus.COVERAGE_GAPS); + public List callableStatuses() { + List output = new LinkedList(); + for (Metric stats : locusStatisticsList) { + CallableStatus status = stats.status(this); + if (status != null) { + output.add(status); } } - return output; } + + @Override + public Iterable getElements() { + return null; + } + } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java new file mode 100644 index 000000000..6f13b9cac --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java @@ -0,0 +1,57 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * + * @author Mauricio Carneiro + * @since 4/23/13 + */ +interface Metric { + public void initialize(ThresHolder thresholds); + public CallableStatus status (AbstractStratification statistic); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java new file mode 100644 index 000000000..1085e8cac --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java @@ -0,0 +1,63 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +import java.util.Map; + +/** + * User: carneiro + * Date: 4/21/13 + * Time: 11:23 AM + */ +final class PluginUtils { + public static CallableStatus genericSampleStatus (final SampleStratification sampleStratification, final CallableStatus CALL, final double threshold) { + final Map totals = sampleStratification.getStatusTally(); + final int size = sampleStratification.getIntervalSize(); + final int statusCount = totals.containsKey(CALL) ? 
totals.get(CALL) : 0; + return ( (double) statusCount / size) >= threshold ? CALL: null; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java new file mode 100644 index 000000000..8de33b269 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java @@ -0,0 +1,57 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 4/20/13 + * Time: 11:30 PM + * To change this template use File | Settings | File Templates. 
/**
 * Marker interface for metrics that are evaluated at the per-sample
 * stratification level of DiagnoseTargets (see {@code Metric} for the
 * contract shared by all metrics).
 *
 * User: carneiro
 * Date: 4/20/13
 * Time: 11:30 PM
 */
interface SampleMetric extends Metric {
}
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +final class SampleMetricBadMates implements SampleMetric { + private static final CallableStatus CALL = CallableStatus.NO_READS ; + + private double threshold; + private double votingThreshold; + + @Override + public void initialize(ThresHolder thresholds) { + threshold = thresholds.badMateStatusThreshold; + votingThreshold = thresholds.votePercentageThreshold; + } + + @Override + public CallableStatus status(AbstractStratification statistics) { + final SampleStratification sampleStratification = (SampleStratification) statistics; + final int nReads = sampleStratification.getnReads(); + return nReads > 0 && (double) sampleStratification.getnBadMates() / nReads > threshold ? CALL : null; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java new file mode 100644 index 000000000..bf9e7420d --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java @@ -0,0 +1,66 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +final class SampleMetricNoReads implements SampleMetric { + private static final CallableStatus CALL = CallableStatus.NO_READS; +@Override + public void initialize(ThresHolder thresholds) { + } + + @Override + public CallableStatus status(AbstractStratification statistics) { + final SampleStratification sampleStratification = (SampleStratification) statistics; + return sampleStratification.getnReads() == 0 ? CALL : null; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java new file mode 100644 index 000000000..b9ae1f3cf --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java @@ -0,0 +1,162 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * The statistics calculator for a specific sample given the interval + */ +final class SampleStratification extends AbstractStratification { + private final GenomeLoc interval; + private final ArrayList loci; + private final ThresHolder thresholds; + + private int nReads = -1; + private int nBadMates = -1; + + public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) { + this.interval = interval; + this.loci = new ArrayList(interval.size()); + this.thresholds = thresholds; + nReads = 0; + nBadMates = 0; + + // Initialize every loci (this way we don't have to worry about non-existent loci in the object + for (int i = 0; i < interval.size(); i++) + this.loci.add(new LocusStratification(thresholds)); + } + + /** + * Simple Getters + */ + public int getIntervalSize() {return interval.size();} + public int getnReads() {return nReads;} + public 
int getnBadMates() {return nBadMates;} + + /** + * Adds a locus to the interval wide stats + * + * @param locus The locus given as a GenomeLoc + * @param pileup The pileup of that locus, this exclusively contains the sample + */ + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { + if (!interval.containsP(locus)) + throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval)); + + // a null pileup means there is nothing to add + if (pileup != null) { + final int locusIndex = locus.getStart() - interval.getStart(); + final int rawCoverage = pileup.depthOfCoverage(); + final int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.minimumBaseQuality, thresholds.minimumMappingQuality).depthOfCoverage(); + final LocusStratification locusData = (LocusStratification) loci.get(locusIndex); + locusData.addLocus(coverage, rawCoverage); + + // process all the reads in this pileup (tallying number of reads and bad mates) + for (GATKSAMRecord read : pileup.getReads()) + processRead(read); + } + } + + @Override + public Iterable getElements() { + return loci; + } + + /** + * {@inheritDoc} + */ + @Override + public Iterable callableStatuses() { + final List output = new LinkedList(); + + // get the tally of all the locus callable statuses + for (Metric locusStat : thresholds.locusMetricList) { + final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this); + if (status != null) { + output.add(status); + } + } + + // get the sample specific statistics statuses + for (Metric sampleStat : thresholds.sampleMetricList) { + final CallableStatus status = sampleStat.status(this); + if (status != null) { + output.add(status); + } + } + + // special case, if there are no reads, then there is no sense reporting coverage gaps. 
+ if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS)) + output.remove(CallableStatus.COVERAGE_GAPS); + + return output; + } + + + /** + * Account for the read and check it for any statistics necessary. Reads are marked in the temporary + * attribute "seen" to make sure they're not counted twice. + * + * @param read the read + */ + private void processRead(GATKSAMRecord read) { + if (read.getTemporaryAttribute("seen") == null) { + nReads++; + if (read.getReadPairedFlag() && !read.getProperPairFlag()) + nBadMates++; + read.setTemporaryAttribute("seen", true); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java similarity index 66% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index fc4954f3b..b0c999460 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -44,131 +44,101 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.commandline.Argument; -import java.util.HashSet; -import java.util.Set; +import java.util.LinkedList; +import java.util.List; -class ThresHolder { - public static final String AVG_INTERVAL_DP_KEY = "AVG_INTERVAL_DP"; - public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); +final class ThresHolder { - private final int minimumBaseQuality; - private final int minimumMappingQuality; + /** + * Only bases with quality greater than this will be considered in the coverage metrics. + */ + @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false) + public int minimumBaseQuality = 20; - private final int minimumCoverage; - private final int maximumCoverage; - private final int minimumMedianDepth; + /** + * Only reads with mapping quality greater than this will be considered in the coverage metrics. 
+ */ + @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false) + public int minimumMappingQuality = 20; - private final int maximumInsertSize; + /** + * If at any locus, a sample has less coverage than this, it will be reported as LOW_COVERAGE + */ + @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false) + public int minimumCoverage = 5; - private final double votePercentageThreshold; - private final double lowMedianDepthThreshold; - private final double badMateStatusThreshold; - private final double coverageStatusThreshold; - private final double excessiveCoverageThreshold; - private final double qualityStatusThreshold; + /** + * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE + */ + @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) + public int maximumCoverage = 700; - public ThresHolder(int minimumBaseQuality, - int minimumMappingQuality, - int minimumCoverage, - int maximumCoverage, - int minimumMedianDepth, - int maximumInsertSize, - double votePercentageThreshold, - double lowMedianDepthThreshold, - double badMateStatusThreshold, - double coverageStatusThreshold, - double excessiveCoverageThreshold, - double qualityStatusThreshold) { + /** + * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE + */ + @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false) + public int maximumInsertSize = 500; + + /** + * The proportion of samples that must have a status for it to filter the entire interval. 
Example: 8 out of 10 samples have low coverage status on the interval, + * with a threshold higher than 0.2, this interval will be filtered as LOW_COVERAGE. + */ + @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed proportion of samples containing a call for the interval to adopt the call ", required = false) + public double votePercentageThreshold = 0.50; + + /** + * The proportion of reads in the loci that must have bad mates for the sample to be reported as BAD_MATE + */ + @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The proportion of the loci needed for calling BAD_MATE", required = false) + public double badMateStatusThreshold = 0.50; + + /** + * The proportion of loci in a sample that must fall under the LOW_COVERAGE or COVERAGE_GAPS category for the sample to be reported as either (or both) + */ + @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The proportion of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) + public double coverageStatusThreshold = 0.20; + + /** + * The proportion of loci in a sample that must fall under the EXCESSIVE_COVERAGE category for the sample to be reported as EXCESSIVE_COVERAGE + */ + @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The proportion of the loci needed for calling EXCESSIVE_COVERAGE", required = false) + public double excessiveCoverageThreshold = 0.20; + + /** + * The proportion of loci in a sample that must fall under the LOW_QUALITY category for the sample to be reported as LOW_QUALITY + */ + @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) + public double qualityStatusThreshold = 0.50; + + public final List locusMetricList = new LinkedList(); + public final List sampleMetricList = new LinkedList(); + public final List intervalMetricList = new 
LinkedList(); + + public ThresHolder() {} + + public ThresHolder(final int minimumBaseQuality, + final int minimumMappingQuality, + final int minimumCoverage, + final int maximumCoverage, + final int maximumInsertSize, + final double votePercentageThreshold, + final double badMateStatusThreshold, + final double coverageStatusThreshold, + final double excessiveCoverageThreshold, + final double qualityStatusThreshold) { this.minimumBaseQuality = minimumBaseQuality; this.minimumMappingQuality = minimumMappingQuality; this.minimumCoverage = minimumCoverage; this.maximumCoverage = maximumCoverage; - this.minimumMedianDepth = minimumMedianDepth; this.maximumInsertSize = maximumInsertSize; this.votePercentageThreshold = votePercentageThreshold; - this.lowMedianDepthThreshold = lowMedianDepthThreshold; this.badMateStatusThreshold = badMateStatusThreshold; this.coverageStatusThreshold = coverageStatusThreshold; this.excessiveCoverageThreshold = excessiveCoverageThreshold; this.qualityStatusThreshold = qualityStatusThreshold; } - - public int getMinimumCoverage() { - return minimumCoverage; - } - - public int getMaximumCoverage() { - return maximumCoverage; - } - - public int getMinimumMedianDepth() { - return minimumMedianDepth; - } - - public int getMaximumInsertSize() { - return maximumInsertSize; - } - - public double getVotePercentageThreshold() { - return votePercentageThreshold; - } - - public double getLowMedianDepthThreshold() { - return lowMedianDepthThreshold; - } - - public double getBadMateStatusThreshold() { - return badMateStatusThreshold; - } - - public double getCoverageStatusThreshold() { - return coverageStatusThreshold; - } - - public double getExcessiveCoverageThreshold() { - return excessiveCoverageThreshold; - } - - public double getQualityStatusThreshold() { - return qualityStatusThreshold; - } - - public int getFilteredCoverage(ReadBackedPileup pileup) { - return pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, 
minimumMappingQuality).depthOfCoverage(); - } - - /** - * Gets the header lines for the VCF writer - * - * @return A set of VCF header lines - */ - public static Set getHeaderInfo() { - Set headerLines = new HashSet(); - - // INFO fields for overall data - headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); - - // FORMAT fields for each genotype - // todo -- find the appropriate VCF constants - headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); - headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); - headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); - headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); - headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); - - - // FILTER fields - for (CallableStatus stat : CallableStatus.values()) - headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); - - return headerLines; - } - } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java deleted file mode 100644 index ad9f287d2..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ /dev/null @@ -1,320 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: 
-* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * The statistics calculator for a specific sample given the interval - */ -class SampleStatistics { - private final GenomeLoc interval; - private final ArrayList loci; - - private int[] preSortedDepths = null; - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - - private int nReads = -1; - private int nBadMates = -1; - - private SampleStatistics(GenomeLoc interval, ArrayList loci) { - this.interval = interval; - this.loci = loci; - nReads = 0; - nBadMates = 0; - } - - public SampleStatistics(GenomeLoc interval) { - this(interval, new ArrayList(interval.size())); - - // Initialize every loci (this way we don't have to worry about non-existent loci in the object - for (int i = 0; i < interval.size(); i++) - this.loci.add(new LocusStatistics()); - - } - - public long totalCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return 
preComputedTotalCoverage; - } - - public double averageCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return (double) preComputedTotalCoverage / loci.size(); - } - - /** - * Calculates the callable statuses of the entire sample - * - * @param thresholds the class contains the statistical threshold for making calls - * @return the callable statuses of the entire sample - */ - public Set getCallableStatuses(ThresHolder thresholds) { - // We check if reads are present ot prevent div / 0 exceptions - if (nReads == 0) { - return Collections.singleton(CallableStatus.NO_READS); - } - - Set output = new HashSet(); - Map totals = new HashMap(CallableStatus.values().length); - - // initialize map - for (CallableStatus status : CallableStatus.values()) - totals.put(status, 0.0); - - // sum up all the callable statuses for each locus - for (int i = 0; i < interval.size(); i++) { - for (CallableStatus status : callableStatus(i, thresholds)) { - double count = totals.get(status); - - totals.put(status, count + 1); - } - } - - double intervalSize = interval.size(); - - if (((double) nBadMates / nReads) >= thresholds.getBadMateStatusThreshold()) - output.add(CallableStatus.BAD_MATE); - - if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) >= thresholds.getCoverageStatusThreshold()) - output.add(CallableStatus.COVERAGE_GAPS); - - if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) >= thresholds.getCoverageStatusThreshold()) - output.add(CallableStatus.LOW_COVERAGE); - - if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) >= thresholds.getExcessiveCoverageThreshold()) - output.add(CallableStatus.EXCESSIVE_COVERAGE); - - if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold()) - output.add(CallableStatus.POOR_QUALITY); - - if (totals.get(CallableStatus.REF_N) > 0) - output.add(CallableStatus.REF_N); - - - if (output.isEmpty()) { - output.add(CallableStatus.PASS); - } - - 
return output; - } - - /** - * Adds a locus to the interval wide stats - * - * @param locus The locus given as a GenomeLoc - * @param pileup The pileup of that locus, this exclusively contains the sample - * @param thresholds the class contains the statistical threshold for making calls - */ - public void addLocus(GenomeLoc locus, ReadBackedPileup pileup, ThresHolder thresholds) { - if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval)); - - // a null pileup means there nothing ot add - if (pileup != null) { - - int locusIndex = locus.getStart() - interval.getStart(); - - int rawCoverage = pileup.depthOfCoverage(); - int coverage = thresholds.getFilteredCoverage(pileup); - - LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); - - loci.set(locusIndex, locusData); - - for (GATKSAMRecord read : pileup.getReads()) - processRead(read, thresholds); - } - } - - private void processRead(GATKSAMRecord read, ThresHolder thresholds) { - // Was this read already processed? - if (read.getTemporaryAttribute("checkedBadMate") == null) { - nReads++; - if (!hasValidMate(read, thresholds)) - nBadMates++; - read.setTemporaryAttribute("checkedBadMate", true); - } - } - - /** - * returns the callable status of a given locus without taking the reference base into account. 
- * - * @param locusIndex location in the genome to inquire (only one locus) - * @param thresholds the class contains the statistical threshold for making calls - * @return the callable status of a locus - */ - private Set callableStatus(int locusIndex, ThresHolder thresholds) { - LocusStatistics locus = loci.get(locusIndex); - - return locus.callableStatuses(thresholds); - } - - private void calculateTotalCoverage() { - preComputedTotalCoverage = 0; - for (LocusStatistics locus : loci) - preComputedTotalCoverage += locus.getCoverage(); - } - - public double getQuantileDepth(double percentage) { - if (preSortedDepths == null) - getDepthsAsSortedArray(); - - return getQuartile(preSortedDepths, percentage); - } - - static double getQuartile(int[] data, double percentage) { - int size = data.length; - if (size == 1) - return (double) data[0]; - - if (percentage == 0.5) { - return getMedian(data); - } - - double position = (size - 1.0) / 2; - if (percentage == 0.25) { - // if the position is a whole number - return getMedian(Arrays.copyOfRange(data, 0, (int) position + 1)); - - } - if (percentage == 0.75) { - if (position % 1 == 0) { - return getMedian(Arrays.copyOfRange(data, (int) position, size)); - } else { - return getMedian(Arrays.copyOfRange(data, (int) position + 1, size)); - } - } - return -1; - } - - // Assumes data is sorted - private static double getMedian(int[] data) { - double size = (double) data.length; - if (size == 1) - return (double) data[0]; - - double position = (size - 1.0) / 2; - - if (position % 1 == 0) - return (double) data[(int) position]; - - else { - double high = (double) data[(int) Math.ceil(position)]; - double low = (double) data[(int) Math.floor(position)]; - - return (high + low) / 2; - - } - - } - - private void getDepthsAsSortedArray() { - preSortedDepths = new int[loci.size()]; - - for (int i = 0; i < loci.size(); i++) - preSortedDepths[i] = loci.get(i).getCoverage(); - - Arrays.sort(preSortedDepths); - } - - boolean 
hasValidMate(GATKSAMRecord read, ThresHolder thresholds) { - /** Check the following - * Does it have a pair? - * reasonable insert size? - * inverted? - * same orientation? - * same contig? - * is pair mapped? - * todo - is forced mate? - * - */ - - // has NO pair - if (!read.getReadPairedFlag()) - return false; - - // different contigs - if (!read.getMateReferenceIndex().equals(read.getReferenceIndex())) - return false; - - // unmapped - if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag()) - return false; - - // same orientation - if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag()) - return false; - - // inverted - if (read.getReadNegativeStrandFlag() == - read.getAlignmentStart() < read.getMateAlignmentStart()) - return false; - - // TODO note: IGV uses a different algorithm for insert size, there should be a common util class that does this for you - // mates are too far apart - if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize()) - return false; - - return true; - } - - public int getnReads() { - return nReads; - } - - public int getnBadMates() { - return nBadMates; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 49494ebb0..7ce736b0c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import 
org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index c957bb9db..2f2a93fa4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index bd25fb6c5..9c4694955 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import 
org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.*; import java.util.*; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index 14bffbc34..f19057f29 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -227,7 +227,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi * @param capBaseQualsAtMappingQual Cap base at mapping qual * @param minBaseQual Minimum base quality to consider * @param errorModel Site error model - * @return Number of bases added + * @return Number of bases added - only good bases actually added to GLs are counted. 
*/ private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) { // Number of [A C G T]'s in pileup, in that order @@ -235,28 +235,29 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi for (byte b: BaseUtils.BASES) numSeenBases.add(0); - if (hasReferenceSampleData) { - // count number of elements in pileup - for (PileupElement elt : pileup) { - byte obsBase = elt.getBase(); - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - if ( qual == 0 ) - continue; - - int idx = 0; - - for (byte base:BaseUtils.BASES) { - int cnt = numSeenBases.get(idx); - numSeenBases.set(idx++,cnt + (base == obsBase?1:0)); - - } - + int nGoodBases = 0; + // count number of elements in pileup + for (PileupElement elt : pileup) { + byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + continue; + + int idx = 0; + + for (byte base:BaseUtils.BASES) { + int cnt = numSeenBases.get(idx); + numSeenBases.set(idx++,cnt + (base == obsBase?1:0)); + } - if (VERBOSE) - System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3)); + nGoodBases++; } + + if (VERBOSE) + System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3)); + computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup); - return pileup.getNumberOfElements(); + return nGoodBases; } /** @@ -281,7 +282,8 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi double p1 = 0.0; if (!hasReferenceSampleData) { - // no error model: loop throught pileup to compute likalihoods just on base qualities + // no error model: loop through pileup to compute likelihoods just on base qualities + // In this case, vector numObservations is not used directly for GL 
computation for (final PileupElement elt : pileup) { final byte obsBase = elt.getBase(); final byte qual = qualToUse(elt, true, true, mbq); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 858a3370b..c6e9ea379 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -55,7 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey())); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); 
genotypes.add(b.make()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 7d2f794ec..ce5f94478 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); final Double contamination = UAC.getSampleContamination().get(sample.getKey()); if( contamination > 0.0 ) //no need to enter if no contamination reduction - pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog); + pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination); if ( useBAQedPileup ) pileup = createBAQedPileup(pileup); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 14d827747..e346b10b7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -113,12 +113,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) public double 
MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; - /** - * This argument informs the prior probability of having an indel at a site. - */ - @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) - public double INDEL_HETEROZYGOSITY = 1.0/8000; - @Advanced @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty, as Phred-scaled probability. I.e., 30 => 10^-30/10", required = false) public byte INDEL_GAP_CONTINUATION_PENALTY = 10; @@ -238,7 +232,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; - this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 137a1cfa5..54fcad1df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -86,17 +86,17 @@ import java.util.*; * both single sample data and multi-sample data. *

    * - *

    Input

    + *

    Input

    *

    * The read data from which to make variant calls. *

    * - *

    Output

    + *

    Output

    *

    * A raw, unfiltered, highly sensitive callset in VCF format. *

    * - *

    Example generic command for multi-sample SNP calling

    + *

    Example generic command for multi-sample SNP calling

    *
      * java -jar GenomeAnalysisTK.jar \
      *   -R resources/Homo_sapiens_assembly18.fasta \
    @@ -117,7 +117,7 @@ import java.util.*;
      * argument descriptions below.
      * 

    * - *

    Example command for generating calls at all sites

    + *

    Example command for generating calls at all sites

    *
      * java -jar /path/to/GenomeAnalysisTK.jar \
      *   -l INFO \
    @@ -128,7 +128,7 @@ import java.util.*;
      *   --output_mode EMIT_ALL_SITES
      * 
    * - *

    Caveats

    + *

    Caveats

    *
      *
    • The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and * file formats are likely to change.
    • @@ -167,7 +167,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * Records that are filtered in the comp track will be ignored. * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -180,7 +180,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * A raw, unfiltered, highly sensitive callset in VCF format. */ //@Gather(className = "org.broadinstitute.sting.queue.extensions.gatk.CatVariantsGatherer") - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter writer = null; @Hidden @@ -205,7 +205,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif protected List annotationsToExclude = new ArrayList(); /** - * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups. + * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs. 
*/ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected String[] annotationClassesToUse = { "Standard" }; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index ede0741ff..3380efcc9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine { this.N = samples.size() * ploidy; log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); filter.add(LOW_QUAL_FILTER_NAME); @@ -385,11 +385,23 @@ public class UnifiedGenotyperEngine { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + // TODO TODO TODO TODO + // REFACTOR THIS FUNCTION, TOO UNWIELDY!! 
+ // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } + // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call + if (!canVCbeGenotyped(vc)) { + if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) + return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); + else + return null; + + } + // estimate our confidence in a reference call and return if ( vc.getNSamples() == 0 ) { if ( limitedContext ) @@ -544,6 +556,23 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); } + /** + * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. + * @param vc Input VC + * @return Status check + */ + @Requires("vc != null") + protected boolean canVCbeGenotyped(final VariantContext vc) { + // protect against too many alternate alleles that we can't even run AF on: + if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { + logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + + " alleles. 
Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); + return false; + } + else return true; + + } + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { if ( !BaseUtils.isRegularBase(refContext.getBase()) ) @@ -570,9 +599,9 @@ public class UnifiedGenotyperEngine { int numDeletions = 0; for ( final PileupElement p : rawContext.getBasePileup() ) { if ( p.isDeletion() ) - numDeletions++; + numDeletions += p.getRepresentativeCount(); } - if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { return null; } } @@ -581,20 +610,8 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - private final static double[] binomialProbabilityDepthCache = new double[10000]; - private final static double REF_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); - - static { - for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { - binomialProbabilityDepthCache[i] = MathUtils.log10BinomialProbability(i, 0, REF_BINOMIAL_PROB_LOG10_0_5); - } - } - private final double getRefBinomialProbLog10(final int depth) { - if ( depth < binomialProbabilityDepthCache.length ) - return binomialProbabilityDepthCache[depth]; - else - return MathUtils.log10BinomialProbability(depth, 0, REF_BINOMIAL_PROB_LOG10_0_5); + return MathUtils.log10BinomialProbability(depth, 0); } private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { @@ -722,17 +739,45 @@ public class UnifiedGenotyperEngine { return GGAmodel; } - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { + /** + * Function that fills vector with 
allele frequency priors. By default, infinite-sites, neutral variation prior is used, + * where Pr(AC=i) = theta/i where theta is heterozygosity + * @param N Number of chromosomes + * @param priors (output) array to be filled with priors + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) + */ + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - final double value = theta / (double)i; - priors[i] = Math.log10(value); - sum += value; + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } } + // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) + if (sum > 1.0) { + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); + } // null frequency for AF=0 is (1 - sum(all other frequencies)) priors[0] = Math.log10(1.0 - sum); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index d4bb3cab3..6dffa8a6d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -79,7 +79,7 @@ public class AFCalcFactory { /** original biallelic exact model, for testing only */ EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), - /** implementation that supports any sample ploidy */ + /** implementation that supports any sample ploidy. Currently not available for the HaplotypeCaller */ EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1); /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index a66a5580c..042e04767 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; @@ -111,7 +112,7 @@ public class AFCalcTestBuilder { return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); return humanPriors; default: throw new 
RuntimeException("Unexpected type " + priorType); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 0a552c0a1..12a4841bf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -52,18 +52,22 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.io.PrintStream; +import java.io.File; import java.util.*; /** @@ -73,30 +77,38 @@ import java.util.*; */ public class DeBruijnAssembler extends LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - private 
static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; - private static final byte MIN_QUALITY = (byte) 16; + + // TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should + // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where + // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases + private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; private static final int GRAPH_KMER_STEP = 6; - // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode - private static final double SW_MATCH = 5.0; // 1.0; - private static final double SW_MISMATCH = -10.0; //-1.0/3.0; - private static final double SW_GAP = -22.0; //-1.0-1.0/3.0; - private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; + private final boolean debug; + private final boolean debugGraphTransformations; + private final int minKmer; + private final boolean allowCyclesInKmerGraphToGeneratePaths; - private final boolean DEBUG; - private final PrintStream GRAPH_WRITER; - private final List graphs = new ArrayList(); - private final int MIN_KMER; + private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - private int PRUNE_FACTOR = 2; - - public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) { + + protected DeBruijnAssembler() { + this(false, -1, 11, false); + } + + public DeBruijnAssembler(final boolean debug, + final int debugGraphTransformations, + final int minKmer, + final boolean allowCyclesInKmerGraphToGeneratePaths) { super(); - DEBUG = debug; - GRAPH_WRITER = graphWriter; - MIN_KMER = minKmer; + this.debug = debug; + this.debugGraphTransformations = debugGraphTransformations > 0; + this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; + this.minKmer = minKmer; + this.allowCyclesInKmerGraphToGeneratePaths = 
allowCyclesInKmerGraphToGeneratePaths; } /** @@ -105,150 +117,119 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @param refHaplotype reference haplotype object * @param fullReferenceWithPadding byte array holding the reference sequence with padding * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode * @return a non-empty list of all the haplotypes that are produced during assembly */ @Ensures({"result.contains(refHaplotype)"}) - public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List activeAllelesToGenotype ) { + public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) { if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } - if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } - - // set the pruning factor for this run of the assembly engine - this.PRUNE_FACTOR = PRUNE_FACTOR; + if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } // create the graphs - createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); + final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); // print the graphs if the appropriate debug option has 
been turned on - if( GRAPH_WRITER != null ) { - printGraphs(); + if( graphWriter != null ) { + printGraphs(graphs); } // find the best paths in the graphs and return them as haplotypes - return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); } @Requires({"reads != null", "refHaplotype != null"}) - protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { - graphs.clear(); + protected List createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { + final List graphs = new LinkedList(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs - + if( maxKmer < minKmer) { + // Reads are too small for assembly so don't try to create any assembly graphs + return Collections.emptyList(); + } // create the graph for each possible kmer - for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) { - final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); + for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { + if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) + continue; + + if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); + DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready - pruneGraph(graph, PRUNE_FACTOR); - cleanNonRefPaths(graph); - 
mergeNodes(graph); - if( graph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference - sanityCheckReferenceGraph(graph, refHaplotype); - graphs.add(graph); + if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); + + if ( shouldErrorCorrectKmers() ) { + throw new UserException("Error correction no longer supported because of the " + + "incredibly naive way this was implemented. The command line argument remains because some" + + " future subsystem will actually go and error correct the reads"); + } + + final SeqGraph seqGraph = toSeqGraph(graph); + + if ( seqGraph != null ) { // if the graph contains interesting variation from the reference + sanityCheckReferenceGraph(seqGraph, refHaplotype); + graphs.add(seqGraph); + + if ( debugGraphTransformations ) // we only want to use one graph size + break; } } + } + + return graphs; } - @Requires({"graph != null"}) - protected static void mergeNodes( final DeBruijnAssemblyGraph graph ) { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; + private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { + final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); - for( final DeBruijnEdge e : graph.edgeSet() ) { - final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); - final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 && - graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { - final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); - final Set inEdges = graph.incomingEdgesOf(incomingVertex); - if( inEdges.size() == 1 && 
outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } + // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm + // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect + // TODO -- to anything from one that's actually has good support along the chain but just happens + // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately + // TODO -- the pruning algorithm really should be an error correction algorithm that knows more + // TODO -- about the structure of the data and can differentiate between an infrequent path but + // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) + // TODO -- from a error with lots of weight going along another similar path + // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive + seqGraph.zipLinearChains(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); - final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer ); - graph.addVertex(addedVertex); - for( final DeBruijnEdge edge : outEdges ) { - graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); - } - for( final DeBruijnEdge edge : inEdges ) 
{ - graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); - } + // now go through and prune the graph, removing vertices no longer connected to the reference chain + // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight + // edges to maintain graph connectivity. + seqGraph.pruneGraph(pruneFactor); + seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); - graph.removeVertex( incomingVertex ); - graph.removeVertex( outgoingVertex ); - foundNodesToMerge = true; - break; - } - } + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); + seqGraph.simplifyGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); + + // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can + // happen in cases where for example the reference somehow manages to acquire a cycle, or + // where the entire assembly collapses back into the reference sequence. + if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) + return null; + + seqGraph.removePathsNotConnectedToRef(); + seqGraph.simplifyGraph(); + if ( seqGraph.vertexSet().size() == 1 ) { + // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop + // the code from blowing up. 
+ // TODO -- ref properties should really be on the vertices, not the graph itself + final SeqVertex complete = seqGraph.vertexSet().iterator().next(); + final SeqVertex dummy = new SeqVertex(""); + seqGraph.addVertex(dummy); + seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); } + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); + + return seqGraph; } - protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) { - if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) { - return; - } - // Remove non-ref edges connected before and after the reference path - final Set edgesToCheck = new HashSet(); - edgesToCheck.addAll(graph.incomingEdgesOf(graph.getReferenceSourceVertex())); - while( !edgesToCheck.isEmpty() ) { - final DeBruijnEdge e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( graph.incomingEdgesOf(graph.getEdgeSource(e)) ); - graph.removeEdge(e); - } - edgesToCheck.remove(e); - } - edgesToCheck.addAll(graph.outgoingEdgesOf(graph.getReferenceSinkVertex())); - while( !edgesToCheck.isEmpty() ) { - final DeBruijnEdge e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( graph.outgoingEdgesOf(graph.getEdgeTarget(e)) ); - graph.removeEdge(e); - } - edgesToCheck.remove(e); - } - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - graph.removeAllVertices(verticesToRemove); - } - - protected static void pruneGraph( final DeBruijnAssemblyGraph graph, final int pruneFactor ) { - final List edgesToRemove = new ArrayList(); - for( final DeBruijnEdge e : graph.edgeSet() ) { - if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal 
to the pruning factor - edgesToRemove.add(e); - } - } - graph.removeAllEdges(edgesToRemove); - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - graph.removeAllVertices(verticesToRemove); - } - - protected static void sanityCheckReferenceGraph(final DeBruijnAssemblyGraph graph, final Haplotype refHaplotype) { + protected void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) { if( graph.getReferenceSourceVertex() == null ) { throw new IllegalStateException("All reference graphs must have a reference source vertex."); } @@ -263,86 +244,131 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) + protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) { + final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); + final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph); // First pull kmers from the reference haplotype and add them to the graph - final byte[] refSequence = refHaplotype.getBases(); - if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) { - if( DEBUG ) { - 
System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); - } - return null; - } - } - } + if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) ) + // something went wrong, so abort right now with a null graph + return null; + + // now go through the graph already seeded with the reference sequence and add the read kmers to it + if ( ! addReadKmersToGraph(builder, reads) ) + // some problem was detected adding the reads to the graph, return null to indicate we failed + return null; + + graph.cleanNonRefPaths(); + return graph; + } + + /** + * Add the high-quality kmers from the reads to the graph + * + * @param builder a debruijn graph builder to add the read kmers to + * @param reads a non-null list of reads whose kmers we want to add to the graph + * @return true if we successfully added the read kmers to the graph without corrupting it in some way + */ + protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List reads) { + final int kmerLength = builder.getKmerSize(); // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); - final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - // if the qualities of all the bases in the kmers are high enough - boolean badKmer = false; - for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { - if( qualities[jjj] < MIN_QUALITY ) { - badKmer = true; - break; - } - } - if( !badKmer ) { + final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced + if ( sequence.length > kmerLength + KMER_OVERLAP ) { + int lastGood = -1; // the index of the last 
good base we've seen + for( int end = 0; end < sequence.length; end++ ) { + if ( qualities[end] < minBaseQualityToUseInAssembly ) { + lastGood = -1; // reset the last good base + } else if ( lastGood == -1 ) { + lastGood = end; // we're at a good base, the last good one is us + } else if ( end - kmerLength >= lastGood ) { + // end - kmerLength (the start) is after the lastGood base, so that kmer is good + final int start = end - kmerLength; + // how many observations of this kmer have we seen? A normal read counts for 1, but + // a reduced read might imply a higher multiplicity for our the edge int countNumber = 1; - if( read.isReducedRead() ) { + if ( read.isReducedRead() ) { // compute mean number of reduced read counts in current kmer span // precise rounding can make a difference with low consensus counts - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH)); + // TODO -- optimization: should extend arrayMax function to take start stop values + countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end)); } - final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH); - final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); - - for( int kkk=0; kkk < countNumber; kkk++ ) { - graph.addKmersToGraph(kmer1, kmer2, false); - } + builder.addKmerPairFromSeqToGraph(sequence, start, countNumber); } } } } - return graph; + + builder.flushKmersToGraph(false); + + // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way + return true; } - protected void printGraphs() { - GRAPH_WRITER.println("digraph assemblyGraphs {"); - for( final DeBruijnAssemblyGraph graph : graphs ) { - for( final DeBruijnEdge edge : graph.edgeSet() ) { - if( edge.getMultiplicity() > PRUNE_FACTOR ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + 
(edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];"); - } - if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); - } - if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } - } - for( final DeBruijnVertex v : graph.vertexSet() ) { - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); - } + /** + * Add the kmers from the reference sequence to the DeBruijnGraph + * + * @param builder the graph to add the reference kmers to. Must be empty + * @param refSequence the reference sequence from which we'll get our kmers + * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise + */ + protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) { + if ( builder == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( builder.getGraph().vertexSet().size() != 0 ) + throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph()); + if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null"); + + final int kmerLength = builder.getKmerSize(); + if( refSequence.length < kmerLength + KMER_OVERLAP ) { + // not enough reference sequence to build a kmer graph of this length, return null + return false; } - GRAPH_WRITER.println("}"); + + final int kmersInSequence = refSequence.length - kmerLength + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + builder.addKmerPairFromSeqToGraph(refSequence, iii, 1); + } + builder.flushKmersToGraph(true); + + // we expect that every kmer in the sequence is unique, so that 
the graph has exactly kmersInSequence vertices + if ( builder.getGraph().vertexSet().size() != kmersInSequence ) { + if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping"); + return false; + } + + return true; + } + + protected void printGraphs(final List graphs) { + final int writeFirstGraphWithSizeSmallerThan = 50; + + graphWriter.println("digraph assemblyGraphs {"); + for( final SeqGraph graph : graphs ) { + if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); + continue; + } + + graph.printGraph(graphWriter, false, pruneFactor); + + if ( debugGraphTransformations ) + break; + } + + graphWriter.println("}"); } @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @Ensures({"result.contains(refHaplotype)"}) - private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { + private List findBestPaths( final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes + // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm final List returnHaplotypes = new ArrayList(); refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); final Cigar c = new Cigar(); @@ -361,8 +387,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - for( final DeBruijnAssemblyGraph graph : graphs ) { - for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + for( final 
SeqGraph graph : graphs ) { + final SeqVertex source = graph.getReferenceSourceVertex(); + final SeqVertex sink = graph.getReferenceSinkVertex(); + if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); + + final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths); + for ( final Path path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) { +// logger.info("Found path " + path); Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); @@ -383,12 +415,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } if( !returnHaplotypes.contains(h) ) { h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setCigar( leftAlignedCigar ); + h.setCigar(leftAlignedCigar); + h.setScore(path.getScore()); returnHaplotypes.add(h); + if ( debug ) + logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); + // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { - final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place + final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); @@ -409,17 +445,24 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - if( DEBUG ) { + // add genome locs to the haplotypes + for ( 
final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow); + + if ( returnHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + + if( debug ) { if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against."); + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { - System.out.println("Found only the reference haplotype in the assembly graph."); + logger.info("Found only the reference haplotype in the assembly graph."); } for( final Haplotype h : returnHaplotypes ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() ); + logger.info( h.toString() ); + logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } + return returnHaplotypes; } @@ -430,7 +473,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @param refWithPadding the full reference byte array with padding which encompasses the active region * @return a haplotype fully extended to encompass the active region */ - @Requires({"haplotype != null", "activeRegionStart > 0", "refWithPadding != null", "refWithPadding.length > 0"}) + @Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"}) @Ensures({"result != null", "result.getCigar() != null"}) private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) { final Cigar cigar = haplotype.getCigar(); @@ -438,7 +481,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { byte[] 
newHaplotypeBases = haplotype.getBases(); int refPos = activeRegionStart; int hapPos = 0; - for( CigarElement ce : cigar.getCigarElements() ) { + for( int iii = 0; iii < cigar.getCigarElements().size(); iii++ ) { + final CigarElement ce = cigar.getCigarElement(iii); switch (ce.getOperator()) { case M: refPos += ce.getLength(); @@ -450,16 +494,17 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { newCigar.add(ce); break; case D: - refPos += ce.getLength(); - newCigar.add(ce); - break; - case X: - newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos), - ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()), - Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length))); - refPos += ce.getLength(); - hapPos += ce.getLength(); - newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M)); + if( iii == 0 || iii == cigar.getCigarElements().size() - 1 ) { + newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos), + ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()), + Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length))); + hapPos += ce.getLength(); + refPos += ce.getLength(); + newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M)); + } else { + refPos += ce.getLength(); + newCigar.add(ce); + } break; default: throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator()); @@ -496,7 +541,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @return the left-aligned cigar */ @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) - protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int 
readIndex) { final Cigar cigarToReturn = new Cigar(); Cigar cigarToAlign = new Cigar(); for (int i = 0; i < cigar.numCigarElements(); i++) { @@ -537,7 +582,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { if( haplotype == null ) { return false; } - final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS ); haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments @@ -566,7 +611,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } final Haplotype h = new Haplotype( newHaplotypeBases ); - final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS ); h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); if ( haplotype.isArtificialHaplotype() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java deleted file mode 100644 index 6a95049d1..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ /dev/null @@ -1,321 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms 
of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.jgrapht.graph.DefaultDirectedGraph; - -import java.io.PrintStream; -import java.util.Arrays; - -/** - * Created with IntelliJ IDEA. 
- * User: rpoplin - * Date: 2/6/13 - */ - -public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { - - public DeBruijnAssemblyGraph() { - super(DeBruijnEdge.class); - } - - /** - * @param v the vertex to test - * @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph) - */ - public boolean isReferenceNode( final DeBruijnVertex v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - for( final DeBruijnEdge e : edgesOf(v) ) { - if( e.isRef() ) { return true; } - } - return false; - } - - /** - * @param v the vertex to test - * @return true if this vertex is a source node - */ - public boolean isSource( final DeBruijnVertex v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - return inDegreeOf(v) == 0; - } - - /** - * Pull out the additional sequence implied by traversing this node in the graph - * @param v the vertex from which to pull out the additional base sequence - * @return non-null byte array - */ - @Ensures({"result != null"}) - public byte[] getAdditionalSequence( final DeBruijnVertex v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); } - return ( isSource(v) ? 
v.getSequence() : v.getSuffix() ); - } - - /** - * @param e the edge to test - * @return true if this edge is a reference source edge - */ - public boolean isRefSource( final DeBruijnEdge e ) { - if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); } - for( final DeBruijnEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * @param v the vertex to test - * @return true if this vertex is a reference source - */ - public boolean isRefSource( final DeBruijnVertex v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * @param e the edge to test - * @return true if this edge is a reference sink edge - */ - public boolean isRefSink( final DeBruijnEdge e ) { - if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); } - for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * @param v the vertex to test - * @return true if this vertex is a reference sink - */ - public boolean isRefSink( final DeBruijnVertex v ) { - if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph - */ - public DeBruijnVertex getReferenceSourceVertex( ) { - for( final DeBruijnVertex v : vertexSet() ) { - if( isReferenceNode(v) && isRefSource(v) ) { - return v; - } - } - return null; - } - - /** - * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the 
graph - */ - public DeBruijnVertex getReferenceSinkVertex( ) { - for( final DeBruijnVertex v : vertexSet() ) { - if( isReferenceNode(v) && isRefSink(v) ) { - return v; - } - } - return null; - } - - /** - * Traverse the graph and get the next reference vertex if it exists - * @param v the current vertex, can be null - * @return the next reference vertex if it exists - */ - public DeBruijnVertex getNextReferenceVertex( final DeBruijnVertex v ) { - if( v == null ) { return null; } - for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { - return getEdgeTarget(edgeToTest); - } - } - return null; - } - - /** - * Traverse the graph and get the previous reference vertex if it exists - * @param v the current vertex, can be null - * @return the previous reference vertex if it exists - */ - public DeBruijnVertex getPrevReferenceVertex( final DeBruijnVertex v ) { - if( v == null ) { return null; } - for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) { - if( isReferenceNode(getEdgeSource(edgeToTest)) ) { - return getEdgeSource(edgeToTest); - } - } - return null; - } - - /** - * Does a reference path exist between the two vertices? 
- * @param fromVertex from this vertex, can be null - * @param toVertex to this vertex, can be null - * @return true if a reference path exists in the graph between the two vertices - */ - public boolean referencePathExists(final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex) { - DeBruijnVertex v = fromVertex; - if( v == null ) { - return false; - } - v = getNextReferenceVertex(v); - if( v == null ) { - return false; - } - while( !v.equals(toVertex) ) { - v = getNextReferenceVertex(v); - if( v == null ) { - return false; - } - } - return true; - } - - /** - * Walk along the reference path in the graph and pull out the corresponding bases - * @param fromVertex starting vertex - * @param toVertex ending vertex - * @param includeStart should the starting vertex be included in the path - * @param includeStop should the ending vertex be included in the path - * @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example) - */ - public byte[] getReferenceBytes( final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop ) { - if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); } - if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); } - - byte[] bytes = null; - DeBruijnVertex v = fromVertex; - if( includeStart ) { - bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); - } - v = getNextReferenceVertex(v); // advance along the reference path - while( v != null && !v.equals(toVertex) ) { - bytes = ArrayUtils.addAll( bytes, getAdditionalSequence(v) ); - v = getNextReferenceVertex(v); // advance along the reference path - } - if( includeStop && v != null && v.equals(toVertex)) { - bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v)); - } - return bytes; - } - - /** - * Pull kmers out of 
the given long sequence and throw them on in the graph - * @param sequence byte array holding the sequence with which to build the assembly graph - * @param KMER_LENGTH the desired kmer length to use - * @param isRef if true the kmers added to the graph will have reference edges linking them - */ - public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) { - if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); } - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef); - } - } - - /** - * Add edge to assembly graph connecting the two kmers - * @param kmer1 the source kmer for the edge - * @param kmer2 the target kmer for the edge - * @param isRef true if the added edge is a reference edge - * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph - */ - public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef ) { - if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } - if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } - if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } - - final int numVertexBefore = vertexSet().size(); - final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length ); - addVertex(v1); - final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length ); - addVertex(v2); - if( isRef && vertexSet().size() == numVertexBefore ) { return false; } - - final DeBruijnEdge targetEdge = getEdge(v1, v2); - if ( targetEdge == null ) { - 
addEdge(v1, v2, new DeBruijnEdge( isRef )); - } else { - if( isRef ) { - targetEdge.setIsRef( true ); - } - targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1); - } - return true; - } - - /** - * Print out the graph in the dot language for visualization - * @param GRAPH_WRITER PrintStream to write to - */ - public void printGraph( final PrintStream GRAPH_WRITER ) { - if( GRAPH_WRITER == null ) { throw new IllegalArgumentException("PrintStream cannot be null."); } - - GRAPH_WRITER.println("digraph assembly {"); - for( final DeBruijnEdge edge : edgeSet() ) { - GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + "label=\""+ edge.getMultiplicity() +"\"" + "];"); - if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); - } - } - for( final DeBruijnVertex v : vertexSet() ) { - final String label = ( inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() ); - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]"); - } - GRAPH_WRITER.println("}"); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java new file mode 100644 index 000000000..0f66082c6 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java @@ -0,0 +1,150 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; + +/** + * Fast approach to building a DeBruijnGraph + * + * Follows the model: + * + * for each X that has bases for the final graph: + * addKmer pair (single kmer with kmer size + 1 spanning the pair) + * + * flushKmersToGraph + * + * User: depristo + * Date: 4/7/13 + * Time: 4:14 PM + */ +public class DeBruijnGraphBuilder { + /** The size of the kmer graph we want to build */ + private final int kmerSize; + + /** The graph we're going to add kmers to */ + private final DeBruijnGraph graph; + + /** keeps counts of all kmer pairs added since the last flush */ + private final KMerCounter counter; + + /** + * Create a new builder that will write out kmers to graph + * + * @param graph a non-null graph that can contain already added kmers + */ + public DeBruijnGraphBuilder(final DeBruijnGraph graph) { + if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null"); + this.kmerSize = graph.getKmerSize(); + this.graph = graph; + this.counter = new KMerCounter(kmerSize + 1); + } + + /** + * The graph we're building + * @return a non-null graph + */ + public DeBruijnGraph getGraph() { + return graph; + } + + /** + * 
The kmer size of our graph + * @return positive integer + */ + public int getKmerSize() { + return kmerSize; + } + + /** + * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this + * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The + * edge between them is added with isRef and multiplicity + * + * @param sequence a sequence of bases from which we want to extract a pair of kmers + * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() + * @param multiplicity what's the multiplicity of the edge between these two kmers + */ + public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); + if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); + final Kmer kmerPair = new Kmer(sequence, start, getKmerSize() + 1); + addKmerPair(kmerPair, multiplicity); + } + + /** + * Add a single kmer pair to this builder + * @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1 + * will have an edge added to this + * @param multiplicity the desired multiplicity of this edge + */ + public void addKmerPair(final Kmer kmerPair, final int multiplicity) { + if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length()); + counter.addKmer(kmerPair, multiplicity); + } + + /** + * Flushes the currently added kmers to the graph + * + * After this function is called the builder is reset to an empty state + * 
+ * This flushing is expensive, so many kmers should be added to the builder before flushing. The most + * efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases) + * then and do one flush when completed + * + * @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges? + */ + public void flushKmersToGraph(final boolean addRefEdges) { + for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { + final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases(); + final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases(); + graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount()); + } + counter.clear(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index bef0cd96c..419ea378f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -48,44 +48,84 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.variantcontext.*; -import java.io.PrintStream; import java.util.*; public class GenotypingEngine { + private final static Logger logger = Logger.getLogger(GenotypingEngine.class); private final boolean DEBUG; private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied - private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); private final VariantAnnotatorEngine annotationEngine; + private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; - public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) { + public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, + final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, + final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { this.DEBUG = DEBUG; this.annotationEngine = annotationEngine; this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; noCall.add(Allele.NO_CALL); + this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; + } + + /** + * Carries the result of a call to #assignGenotypeLikelihoods + */ + public static class CalledHaplotypes { + private final List calls; + private final Set calledHaplotypes; + + protected CalledHaplotypes(final List calls, 
final Set calledHaplotypes) { + if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) + throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); + this.calls = calls; + this.calledHaplotypes = calledHaplotypes; + } + + /** + * Get the list of calls made at this location + * @return a non-null (but potentially empty) list of calls + */ + public List getCalls() { + return calls; + } + + /** + * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). + * @return a non-null set of haplotypes + */ + public Set getCalledHaplotypes() { + return calledHaplotypes; + } } /** * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * + * The list of samples we're working with is obtained from the haplotypeReadMap + * * @param UG_engine UG Engine with basic input parameters * @param haplotypes Haplotypes to assign likelihoods to - * @param samples Samples to genotype * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) * @param perSampleFilteredReadList * @param ref Reference bytes at active region @@ -93,113 +133,40 @@ public class GenotypingEngine { * @param activeRegionWindow Active window * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype - * @return List of VC's with genotyped events + * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes */ @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Ensures("result != null") // TODO - can this be 
refactored? this is hard to follow! - public List assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final List samples, - final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final List activeAllelesToGenotype ) { + public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { // sanity check input arguments - if (UG_engine == null) - throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); - if (haplotypes == null || haplotypes.isEmpty()) - throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); - if (samples == null || samples.isEmpty()) - throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples); - if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) - throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); - if (ref == null || ref.length == 0 ) - throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) - throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); - if (activeRegionWindow == null ) - throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) - throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - 
if (genomeLocParser == null ) - throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); + if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); + if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); + if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); + if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); + if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); + if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - final List returnCalls = new ArrayList(); - final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } - if( DEBUG ) { - 
System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - System.out.println( ">> Events = " + h.getEventMap()); - } - } - - cleanUpSymbolicUnassembledEvents( haplotypes ); - if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); - cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events - } - if( in_GGA_mode ) { - for( final VariantContext compVC : activeAllelesToGenotype ) { - startPosKeySet.add( compVC.getStart() ); - } - } + // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference + // that carry events among the haplotypes + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); // Walk along each position in the key set and create each event to be outputted + final Set calledHaplotypes = new HashSet(); + final List returnCalls = new ArrayList(); for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view - final List priorityList = new ArrayList(); // used to merge overlapping events into common reference view - - if( !in_GGA_mode ) { - for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - priorityList.add(vc.getSource()); - } - } - } else { // we are in GGA mode! 
- int compCount = 0; - for( final VariantContext compVC : activeAllelesToGenotype ) { - if( compVC.getStart() == loc ) { - int alleleCount = 0; - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - List alleleSet = new ArrayList(2); - alleleSet.add(compVC.getReference()); - alleleSet.add(compAltAllele); - final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; - // check if this event is already in the list of events due to a repeat in the input alleles track - final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); - boolean alreadyExists = false; - for( final VariantContext eventToTest : eventsAtThisLoc ) { - if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { - alreadyExists = true; - } - } - if( !alreadyExists ) { - priorityList.add(vcSourceName); - eventsAtThisLoc.add(candidateEventToAdd); - } - alleleCount++; - } - } - compCount++; - } - } + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); if( eventsAtThisLoc.isEmpty() ) { continue; } @@ -207,7 +174,7 @@ public class GenotypingEngine { final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); // Sanity check the priority list for mistakes - validatePriorityList( priorityList, eventsAtThisLoc ); + final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); @@ -225,75 +192,157 @@ public class GenotypingEngine { final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - 
//System.out.println("Event/haplotype allele mapping = " + alleleMapper); + logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); } - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION ); - final GenotypesContext genotypes = calculateGLsForThisEvent( samples, alleleReadMap, mergedVC ); - final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); + final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); + final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : - convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) ); + convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); - VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); + VariantContext annotatedCall = call; if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); } + annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, annotatedCall); + + // maintain the set of all called haplotypes + for ( final Allele calledAllele : call.getAlleles() ) + calledHaplotypes.addAll(alleleMapper.get(calledAllele)); + returnCalls.add( annotatedCall ); } } } - return returnCalls; + return new CalledHaplotypes(returnCalls, calledHaplotypes); + } + + /** + * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts + * + * @param haplotypes the list of haplotypes we're working with + * @param haplotypeReadMap map from samples -> the per read allele likelihoods + * @param ref the reference bases (over the same interval as the haplotypes) + * @param refLoc the span of the reference bases + * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) + * @return + */ + private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, + final Map haplotypeReadMap, + final byte[] ref, + final GenomeLoc refLoc, + final List activeAllelesToGenotype) { + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + + if ( in_GGA_mode ) startPosKeySet.clear(); + + cleanUpSymbolicUnassembledEvents( haplotypes ); + if ( !in_GGA_mode ) { + // run the event merger if we're not in GGA mode + final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); + if ( mergedAnything ) + cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events + } + + if ( in_GGA_mode ) { + for( final VariantContext compVC : activeAllelesToGenotype ) { + 
startPosKeySet.add( compVC.getStart() ); + } + } + + return startPosKeySet; + } + + /** + * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view + * @param vcs a list of variant contexts + * @return the list of the sources of vcs in the same order + */ + private List makePriorityList(final List vcs) { + final List priorityList = new LinkedList(); + for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); + return priorityList; + } + + private List getVCsAtThisLocation(final List haplotypes, + final int loc, + final List activeAllelesToGenotype) { + // the overlapping events to merge into a common reference view + final List eventsAtThisLoc = new ArrayList(); + + if( activeAllelesToGenotype.isEmpty() ) { + for( final Haplotype h : haplotypes ) { + final EventMap eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + } else { // we are in GGA mode! 
+ int compCount = 0; + for( final VariantContext compVC : activeAllelesToGenotype ) { + if( compVC.getStart() == loc ) { + int alleleCount = 0; + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + List alleleSet = new ArrayList(2); + alleleSet.add(compVC.getReference()); + alleleSet.add(compAltAllele); + final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; + // check if this event is already in the list of events due to a repeat in the input alleles track + final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); + boolean alreadyExists = false; + for( final VariantContext eventToTest : eventsAtThisLoc ) { + if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { + alreadyExists = true; + } + } + if( !alreadyExists ) { + eventsAtThisLoc.add(candidateEventToAdd); + } + alleleCount++; + } + } + compCount++; + } + } + + return eventsAtThisLoc; } /** * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele - * @param samples List of samples to genotype * @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods * @param mergedVC Input VC with event to genotype * @return GenotypesContext object wrapping genotype objects with PLs */ - @Requires({"samples != null","alleleReadMap!= null", "mergedVC != null"}) + @Requires({"alleleReadMap!= null", "mergedVC != null"}) @Ensures("result != null") - private GenotypesContext calculateGLsForThisEvent( final List samples, final Map alleleReadMap, final VariantContext mergedVC ) { - final GenotypesContext genotypes = GenotypesContext.create(samples.size()); + private GenotypesContext calculateGLsForThisEvent( final Map alleleReadMap, final VariantContext mergedVC ) { + final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); // Grab the genotype likelihoods 
from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - for( final String sample : samples ) { + for( final String sample : alleleReadMap.keySet() ) { final int numHaplotypes = mergedVC.getAlleles().size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles()); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC } } - genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); + genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); } return genotypes; } - private void validatePriorityList( final List priorityList, final List eventsAtThisLoc ) { - for( final VariantContext vc : eventsAtThisLoc ) { - if( !priorityList.contains(vc.getSource()) ) { - throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); - } - } - for( final String name : priorityList ) { - boolean found = false; - for( final VariantContext vc : eventsAtThisLoc ) { - if(vc.getSource().equals(name)) { found = true; break; } - } - if( !found ) { - throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. 
Something went wrong in the merging of alleles."); - } - } - } - private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, final Map perSampleReadMap, final Map> perSampleFilteredReadList, @@ -337,10 +386,10 @@ public class GenotypingEngine { protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final List haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { - for( final VariantContext vc : h.getEventMap().values() ) { + for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { if( vc.isSymbolic() ) { for( final Haplotype h2 : haplotypes ) { - for( final VariantContext vc2 : h2.getEventMap().values() ) { + for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events haplotypesToRemove.add(h); break; @@ -356,8 +405,7 @@ public class GenotypingEngine { // BUGBUG: ugh, too complicated protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, final Map> alleleMapper, - final double downsamplingFraction, - final PrintStream downsamplingLog ) { + final double downsamplingFraction ) { final Map alleleReadMap = new LinkedHashMap(); for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample @@ -374,165 +422,13 @@ public class GenotypingEngine { perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); } } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); } return alleleReadMap; } - /** - * TODO - comment me, clean me, refactor me! 
- * @param haplotypes - * @param samples - * @param haplotypeReadMap - * @param startPosKeySet - * @param ref - * @param refLoc - */ - protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, - final List samples, - final Map haplotypeReadMap, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - - final int MAX_SIZE_TO_COMBINE = 15; - final double MERGE_EVENTS_R2_THRESHOLD = 0.95; - if( startPosKeySet.size() <= 1 ) { return; } - - boolean mapWasUpdated = true; - while( mapWasUpdated ) { - mapWasUpdated = false; - - // loop over the set of start locations and consider pairs that start near each other - final Iterator iter = startPosKeySet.iterator(); - int thisStart = iter.next(); - while( iter.hasNext() ) { - final int nextStart = iter.next(); - if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) { - boolean isBiallelic = true; - VariantContext thisVC = null; - VariantContext nextVC = null; - double x11 = Double.NEGATIVE_INFINITY; - double x12 = Double.NEGATIVE_INFINITY; - double x21 = Double.NEGATIVE_INFINITY; - double x22 = Double.NEGATIVE_INFINITY; - - for( final Haplotype h : haplotypes ) { - // only make complex substitutions out of consecutive biallelic sites - final VariantContext thisHapVC = h.getEventMap().get(thisStart); - if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype - if( thisVC == null ) { - thisVC = thisHapVC; - } else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) { - isBiallelic = false; - break; - } - } - final VariantContext nextHapVC = h.getEventMap().get(nextStart); - if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype - if( nextVC == null ) { - nextVC = nextHapVC; - } else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) { - isBiallelic = false; - break; - } - } - // count up the co-occurrences of the events for the R^2 calculation - for( final String sample : samples ) { - final double 
haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( Collections.singleton(sample), haplotypeReadMap, Collections.singletonList(Allele.create(h, true)) )[0][0]; - if( thisHapVC == null ) { - if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } - else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } - } else { - if( nextHapVC == null ) { x21 = MathUtils.approximateLog10SumLog10(x21, haplotypeLikelihood); } - else { x22 = MathUtils.approximateLog10SumLog10(x22, haplotypeLikelihood); } - } - } - } - if( thisVC == null || nextVC == null ) { - continue; - } - if( isBiallelic ) { - final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) ); - if( DEBUG ) { - System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); - System.out.println("-- " + thisVC); - System.out.println("-- " + nextVC); - } - if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { - - final VariantContext mergedVC = createMergedVariantContext(thisVC, nextVC, ref, refLoc); - - // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event - for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) { - eventMap.remove(thisStart); - eventMap.remove(nextStart); - eventMap.put(mergedVC.getStart(), mergedVC); - } - } - startPosKeySet.add(mergedVC.getStart()); - boolean containsStart = false; - boolean containsNext = false; - for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - if( eventMap.containsKey(thisStart) ) { containsStart = true; } - if( eventMap.containsKey(nextStart) ) { containsNext = true; } - } - if(!containsStart) { startPosKeySet.remove(thisStart); } - if(!containsNext) { startPosKeySet.remove(nextStart); } - - if( DEBUG ) { 
System.out.println("====> " + mergedVC); } - mapWasUpdated = true; - break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events - } - } - } - thisStart = nextStart; - } - } - } - - // BUGBUG: make this merge function more general - protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { - final int thisStart = thisVC.getStart(); - final int nextStart = nextVC.getStart(); - byte[] refBases = new byte[]{}; - byte[] altBases = new byte[]{}; - refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); - altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); - int locus; - for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) { - final byte refByte = ref[locus - refLoc.getStart()]; - refBases = ArrayUtils.add(refBases, refByte); - altBases = ArrayUtils.add(altBases, refByte); - } - refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 
1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel - altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); - - int iii = 0; - if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele - while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } - } - final List mergedAlleles = new ArrayList(); - mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) ); - mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) ); - return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make(); - } - - protected static double calculateR2LD( final double x11, final double x12, final double x21, final double x22 ) { - final double total = x11 + x12 + x21 + x22; - final double pa1b1 = x11 / total; - final double pa1b2 = x12 / total; - final double pa2b1 = x21 / total; - final double pa1 = pa1b1 + pa1b2; - final double pb1 = pa1b1 + pa2b1; - return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) ); - } - protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { final Map> alleleMapper = new LinkedHashMap>(); for( final Map.Entry entry : mergeMap.entrySet() ) { @@ -559,8 +455,8 @@ public class GenotypingEngine { alleles.add(h.getArtificialRefAllele()); alleles.add(h.getArtificialAltAllele()); final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype") - .alleles(alleles) - .loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() ); + .alleles(alleles) + .loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() ); if( 
eventMapper.containsKey(artificialVC) ) { eventMapper.get(artificialVC).add(h); } @@ -588,6 +484,10 @@ public class GenotypingEngine { if( eventToTest.getKey().equals(new Event(null)) ) continue; + // only try to disambiguate for alleles that have had haplotypes previously assigned above + if( eventToTest.getValue().isEmpty() ) + continue; + final Haplotype artificialHaplotype = eventToTest.getValue().get(0); if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { matchingEvent = eventToTest.getKey(); @@ -648,6 +548,11 @@ public class GenotypingEngine { return eventAllelesForSample; } + @Deprecated + protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); + } + protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { for( final VariantContext vc : list ) { if( vc.hasSameAllelesAs(vcToTest) ) { @@ -657,91 +562,7 @@ public class GenotypingEngine { return false; } - protected static Map generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { - final Map vcs = new LinkedHashMap(); - - int refPos = alignmentStartHapwrtRef; - if( refPos < 0 ) { return null; } // Protection against SW failures - int alignmentPos = 0; - - for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { - final CigarElement ce = cigar.getCigarElement(cigarIndex); - final int elementLength = ce.getLength(); - switch( ce.getOperator() ) { - case I: - { - final List insertionAlleles = new ArrayList(); - final int insertionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) ) { - insertionAlleles.add( Allele.create(refByte, true) ); - } - if( 
cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele - insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); - } else { - byte[] insertionBases = new byte[]{}; - insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base - insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); - if( BaseUtils.isAllRegularBases(insertionBases) ) { - insertionAlleles.add( Allele.create(insertionBases, false) ); - } - } - if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele - vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); - } - alignmentPos += elementLength; - break; - } - case S: - { - alignmentPos += elementLength; - break; - } - case D: - { - final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base - final List deletionAlleles = new ArrayList(); - final int deletionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { - deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(refByte, false) ); - vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); - } - refPos += elementLength; - break; - } - case M: - case EQ: - case X: - { - for( int iii = 0; iii < elementLength; iii++ ) { - final byte refByte = ref[refPos]; - final byte altByte = alignment[alignmentPos]; - if( refByte != altByte ) { // SNP! 
- if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { - final List snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refByte, true ) ); - snpAlleles.add( Allele.create( altByte, false ) ); - vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); - } - } - refPos++; - alignmentPos++; - } - break; - } - case N: - case H: - case P: - default: - throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); - } - } - return vcs; - } - - private static class Event { + protected static class Event { public VariantContext vc; public Event( final VariantContext vc ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 64c762e97..6ea543f25 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -56,7 +55,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; +import org.broadinstitute.sting.gatk.filters.*; 
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -67,27 +67,30 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalcul import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.*; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; import java.io.FileNotFoundException; import java.io.PrintStream; @@ -96,17 +99,17 @@ import java.util.*; /** * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. * - *

      Input

      + *

      Input

      *

      * Input bam file(s) from which to make calls *

      * - *

      Output

      + *

      Output

      *

      * VCF file with raw, unrecalibrated SNP and indel calls. *

      * - *

      Examples

      + *

      Examples

      *
        *   java
        *     -jar GenomeAnalysisTK.jar
      @@ -120,7 +123,7 @@ import java.util.*;
        *     -o output.raw.snps.indels.vcf
        * 
      * - *

      Caveats

      + *

      Caveats

      *
        *
      • The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.
      • *
      @@ -132,33 +135,60 @@ import java.util.*; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionTraversalParameters(extension=85, maxRegion=300) +@ActiveRegionTraversalParameters(extension=200, maxRegion=300) +@ReadFilters({HCMappingQualityFilter.class}) @Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { - /** * A raw, unfiltered, highly sensitive callset in VCF format. */ - @Output(doc="File to which variants should be written", required = true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; - @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) + @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) protected PrintStream graphWriter = null; /** - * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here - * does not include uninformative reads so that not every input read is emitted to the bam. + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. + * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. + * + * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to + * use in specific areas where you want to better understand why the HC is making specific calls. 
+ * + * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches + * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended + * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more + * easily see which reads go with these haplotype. + * + * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire + * active region, coming from read HC and a special read group. + * + * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean + * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to + * its next best haplotype. + * + * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, + * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV + * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen + * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png + * */ - @Hidden - @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + @Advanced + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) protected StingSAMFileWriter bamWriter = null; - private SAMFileHeader bamHeader = null; - private long uniqueNameCounter = 1; - private final static String readGroupId = "ArtificialHaplotype"; + private HaplotypeBAMWriter haplotypeBAMWriter; + + /** + * The type of BAM output we want to see. 
+ */ + @Advanced + @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) + public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ + @Advanced @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; @@ -166,8 +196,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; + @Advanced @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 2; + protected int MIN_PRUNE_FACTOR = 0; @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) @@ -175,7 +206,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. 
This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 13; + protected int maxNumHaplotypesInPopulation = 25; @Advanced @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) @@ -188,9 +219,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, * and may make use of them in assembly and calling, where possible. */ + @Hidden @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) protected boolean includeUnmappedReads = false; + @Advanced @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -202,6 +235,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) protected boolean justDetermineActiveRegions = false; + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. @@ -216,6 +257,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Records that are filtered in the comp track will be ignored. * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ + @Advanced @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -228,6 +270,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. */ + @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(Arrays.asList(new String[]{"ClippingRankSumTest"})); @@ -235,9 +278,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. 
*/ + @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + @Advanced + @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) + protected boolean mergeVariantsViaLD = false; + /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. */ @@ -247,9 +295,27 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + @Advanced @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + @Advanced + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) + protected int debugGraphTransformations = -1; + + @Hidden // TODO -- not currently useful + @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) + protected boolean useLowQualityBasesForAssembly = false; + + @Hidden + @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + protected boolean dontTrimActiveRegions = false; + + @Hidden + 
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) + protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -271,6 +337,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // reference base padding size private static final int REFERENCE_PADDING = 500; + // include at least this many bases around an event for calling it + private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20; + private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150; + + // the maximum extent into the full active region extension that we're willing to go in genotyping our events + private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25; + + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; @@ -291,6 +367,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem public void initialize() { super.initialize(); + if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) + throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); + // get all of the unique sample names Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); @@ -349,12 +428,21 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new 
UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer ); + // setup the assembler + assemblyEngine = new DeBruijnAssembler(DEBUG, debugGraphTransformations, minKmer, allowCyclesInKmerGraphToGeneratePaths); + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); + + final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); + + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); if ( bamWriter != null ) - setupBamWriter(); + haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); } //--------------------------------------------------------------------------------------------------------------- @@ -391,12 +479,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { - if( !allelesToGenotype.contains(vc) ) { - allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. 
Should be folded into a RefMetaDataTracker object - } - } - if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { + final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); + if( vcFromAllelesRod != null ) { + allelesToGenotype.add(vcFromAllelesRod); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object return new ActivityProfileState(ref.getLocus(), 1.0); } } @@ -423,7 +508,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final byte qual = p.getQual(); if( p.isDeletion() || qual > (byte) 18) { int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { AA = 2; BB = 0; if( p.isNextToSoftClip() ) { @@ -454,87 +539,240 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; - final List activeAllelesToGenotype = new ArrayList(); + if( !originalActiveRegion.isActive() ) { return 0; } // Not active so nothing to do! 
+ final List activeAllelesToGenotype = new ArrayList(); if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : allelesToGenotype ) { - if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { + if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode } } allelesToGenotype.removeAll( activeAllelesToGenotype ); + // No alleles found in this region so nothing to do! + if ( activeAllelesToGenotype.isEmpty() ) { return 0; } + } else { + if( originalActiveRegion.size() == 0 ) { return 0; } // No reads here so nothing to do! } - if( !activeRegion.isActive() ) { return 0; } // Not active so nothing to do! - if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! + // run the local assembler, getting back a collection of information on how we should proceed + final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); - finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails + // abort early if something is out of the acceptable range + if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! 
+ if (dontGenotype) return 1; // user requested we not proceed - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region - final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); - final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); - if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! - - final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria - if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! - - // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM - Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); - - // evaluate each sample's reads against all haplotypes - final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); + // filter out reads from genotyping which fail mapping quality based criteria + final List filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
- likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes ); + if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! - for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoods( UG_engine, - bestHaplotypes, - samplesList, - stratifiedReadMap, - perSampleFilteredReadList, - fullReferenceWithPadding, - paddedReferenceLoc, - activeRegion.getLocation(), - getToolkit().getGenomeLocParser(), - activeAllelesToGenotype ) ) { + // evaluate each sample's reads against all haplotypes + //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); + + // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) + final List bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap); + + final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, + bestHaplotypes, + stratifiedReadMap, + perSampleFilteredReadList, + assemblyResult.fullReferenceWithPadding, + assemblyResult.paddedReferenceLoc, + assemblyResult.regionForGenotyping.getLocation(), + getToolkit().getGenomeLocParser(), + activeAllelesToGenotype ); + + for( final VariantContext call : calledHaplotypes.getCalls() ) { // TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker. 
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } if ( bamWriter != null ) { - // write the haplotypes to the bam - for ( Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); + haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc, + bestHaplotypes, + calledHaplotypes.getCalledHaplotypes(), + stratifiedReadMap); + } - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); - for ( final Haplotype haplotype : haplotypes ) - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - // next, output the interesting reads for each sample aligned against the appropriate haplotype - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); - } + return 1; // One active region was processed during this map call + } + + private final static class AssemblyResult { + final List haplotypes; + final ActiveRegion regionForGenotyping; + final byte[] fullReferenceWithPadding; + final GenomeLoc paddedReferenceLoc; + + private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) { + this.haplotypes = haplotypes; + this.regionForGenotyping = regionForGenotyping; + 
this.fullReferenceWithPadding = fullReferenceWithPadding; + this.paddedReferenceLoc = paddedReferenceLoc; + } + } + + /** + * High-level function that runs the assembler on the active region reads, + * returning a data structure with the resulting information needed + * for further HC steps + * + * @param activeRegion the region we should assemble + * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) + * @return the AssemblyResult describing how to proceed with genotyping + */ + protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + // Create the reference haplotype which is the bases from the reference that make up the active region + finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails + + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); + final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); + final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); + + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); + + if ( ! 
dontTrimActiveRegions ) { + return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + } else { + // we don't want to or cannot create a trimmed active region, so go ahead and use the old one + return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc); + } + } + + /** + * Trim down the active region to just enough to properly genotype the events among the haplotypes + * + * This function merely creates the region, but it doesn't populate the reads back into the region + * + * @param region our full active region + * @param haplotypes the list of haplotypes we've created from assembly + * @param ref the reference bases over the full padded location + * @param refLoc the span of the reference bases + * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully + */ + private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List haplotypes, final byte[] ref, final GenomeLoc refLoc) { + EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + final TreeSet allContexts = EventMap.getAllVariantContexts(haplotypes); + final GenomeLocParser parser = getToolkit().getGenomeLocParser(); + + if ( allContexts.isEmpty() ) // no variants, so just return the current region + return null; + + final List withinActiveRegion = new LinkedList(); + int pad = PADDING_AROUND_SNPS_FOR_CALLING; + GenomeLoc trimLoc = null; + for ( final VariantContext vc : allContexts ) { + final GenomeLoc vcLoc = parser.createGenomeLoc(vc); + if ( region.getLocation().overlapsP(vcLoc) ) { + if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding + pad = PADDING_AROUND_OTHERS_FOR_CALLING; + trimLoc = trimLoc == null ? 
vcLoc : trimLoc.endpointSpan(vcLoc); + withinActiveRegion.add(vc); } } - if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } + // we don't actually have anything in the region after removing variants that don't overlap the region's full location + if ( trimLoc == null ) return null; - return 1; // One active region was processed during this map call + final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION); + final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad); + final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); + + final ActiveRegion trimmedRegion = region.trim(finalSpan); + if ( DEBUG ) { + logger.info("events : " + withinActiveRegion); + logger.info("trimLoc : " + trimLoc); + logger.info("pad : " + pad); + logger.info("idealSpan : " + idealSpan); + logger.info("maxSpan : " + maxSpan); + logger.info("finalSpan : " + finalSpan); + logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); + } + return trimmedRegion; + } + + /** + * Trim down the active region to just enough to properly genotype the events among the haplotypes + * + * @param originalActiveRegion our full active region + * @param haplotypes the list of haplotypes we've created from assembly + * @param fullReferenceWithPadding the reference bases over the full padded location + * @param paddedReferenceLoc the span of the reference bases + * @return an AssemblyResult containing the trimmed active region with all of the reads we should use + * trimmed down as well, and a revised set of haplotypes. 
If trimming failed this function + * may choose to use the originalActiveRegion without modification + */ + private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion, + final List haplotypes, + final byte[] fullReferenceWithPadding, + final GenomeLoc paddedReferenceLoc) { + final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + + if ( trimmedActiveRegion == null ) + return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + + // trim down the haplotypes + final Set haplotypeSet = new HashSet(haplotypes.size()); + for ( final Haplotype h : haplotypes ) { + final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); + if ( trimmed != null ) { + haplotypeSet.add(trimmed); + } else if ( DEBUG ) { + logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc()); + } + } + + // create the final list of trimmed haplotypes + final List trimmedHaplotypes = new ArrayList(haplotypeSet); + + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); + + if ( DEBUG ) { + logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + for ( final Haplotype remaining: trimmedHaplotypes ) { + logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar()); + } + } + + + // trim down the reads and add them to the trimmed active region + final List trimmedReads = new ArrayList(originalActiveRegion.getReads().size()); + for( final GATKSAMRecord read : originalActiveRegion.getReads() ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, 
trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); + if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + trimmedReads.add(clippedRead); + } + } + trimmedActiveRegion.clearReads(); + trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); + + return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + } + + /** + * Select the best N haplotypes according to their likelihoods, if appropriate + * + * @param haplotypes a list of haplotypes to consider + * @param stratifiedReadMap a map from samples -> read likelihoods + * @return the list of haplotypes to genotype + */ + protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { + if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + return haplotypes; + } else { + return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); + } } //--------------------------------------------------------------------------------------------------------------- @@ -564,8 +802,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // //--------------------------------------------------------------------------------------------------------------- - private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + 
activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); @@ -581,20 +819,33 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { - GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. 
What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + + // uncomment to remove hard clips from consideration at all + //clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); readsToUse.add(clippedRead); } } } - activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse)); + + activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { final List readsToRemove = new ArrayList(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { - if( rec.getReadLength() < 24 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { + if( rec.getReadLength() < 10 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { readsToRemove.add(rec); } } @@ -624,92 +875,5 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return returnMap; } - private void setupBamWriter() { - // prepare the bam header - bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - // include the original read groups plus a new artificial one for the haplotypes - final List readGroups = 
new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); - final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - - bamWriter.setPresorted(false); - bamWriter.writeHeader(bamHeader); - } - - private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(haplotype.getCigar()); - record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(paddedRefLoc.getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), readGroupId); - record.setFlags(16); - bamWriter.addAlignment(record); - } - - private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) { - - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2); - final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1(); - final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; - read.setAlignmentStart(readStartOnReference); - - final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar()); - read.setCigar(cigar); - - bamWriter.addAlignment(read); - } - - private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) { - - int currentReadPos = 0; - int currentHapPos = 0; - final List 
readCigarElements = new ArrayList(); - - for ( final CigarElement cigarElement : haplotypeCigar.getCigarElements() ) { - - if ( cigarElement.getOperator() == CigarOperator.D ) { - if ( currentReadPos > 0 ) - readCigarElements.add(cigarElement); - } else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) { - - final int elementLength = cigarElement.getLength(); - final int nextReadPos = currentReadPos + elementLength; - final int nextHapPos = currentHapPos + elementLength; - - // do we want this element? - if ( currentReadPos > 0 ) { - // do we want the entire element? - if ( nextReadPos < read.getReadLength() ) { - readCigarElements.add(cigarElement); - currentReadPos = nextReadPos; - } - // otherwise, we can finish up and return the cigar - else { - readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator())); - return new Cigar(readCigarElements); - } - } - // do we want part of the element to start? 
- else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) { - currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength()); - readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator())); - } - - currentHapPos = nextHapPos; - } - } - - return new Cigar(readCigarElements); - } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index c7cc84b9c..01ab421b3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -58,8 +58,8 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; @@ -84,17 +84,17 @@ import java.util.*; * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. * Records are annotated with the set and status attributes. * - *

      Input

      + *

      Input

      *

      * 2 variant files to resolve. *

      * - *

      Output

      + *

      Output

      *

      * A single consensus VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx1g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -125,7 +125,7 @@ public class HaplotypeResolver extends RodWalker {
           @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
           public List> variants;
       
      -    @Output(doc="File to which variants should be written", required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter baseWriter = null;
           private VariantContextWriter writer;
       
      @@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker {
               }
       
               // order results by start position
      -        final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1));
      -        final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2));
      +        final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1));
      +        final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2));
               if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
                   // TODO -- handle errors appropriately
                   logger.debug("No source alleles; aborting at " + refContext.getLocus());
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
      deleted file mode 100644
      index 90c2e6a2a..000000000
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
      +++ /dev/null
      @@ -1,445 +0,0 @@
      -/*
      -*  By downloading the PROGRAM you agree to the following terms of use:
      -*  
      -*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      -*  
      -*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      -*  
      -*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      -*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      -*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      -*  
      -*  1. DEFINITIONS
      -*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      -*  
      -*  2. LICENSE
      -*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      -*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      -*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      -*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      -*  
      -*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      -*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      -*  Copyright 2012 Broad Institute, Inc.
      -*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      -*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      -*  
      -*  4. INDEMNIFICATION
      -*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      -*  
      -*  5. NO REPRESENTATIONS OR WARRANTIES
      -*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      -*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      -*  
      -*  6. ASSIGNMENT
      -*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      -*  
      -*  7. MISCELLANEOUS
      -*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      -*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      -*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      -*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      -*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      -*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      -*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      -*/
      -
      -package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      -
      -import com.google.java.contract.Ensures;
      -import com.google.java.contract.Requires;
      -import net.sf.samtools.Cigar;
      -import net.sf.samtools.CigarElement;
      -import net.sf.samtools.CigarOperator;
      -import org.apache.commons.lang.ArrayUtils;
      -import org.broadinstitute.sting.utils.GenomeLoc;
      -import org.broadinstitute.sting.utils.Haplotype;
      -import org.broadinstitute.sting.utils.SWPairwiseAlignment;
      -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
      -import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      -import org.broadinstitute.variant.variantcontext.Allele;
      -import org.broadinstitute.variant.variantcontext.VariantContext;
      -
      -import java.io.Serializable;
      -import java.util.*;
      -
      -/**
      - * Created by IntelliJ IDEA.
      - * User: ebanks, rpoplin
      - * Date: Mar 23, 2011
      - */
      -// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
      -// This is different from most graph traversals because we want to test paths from any source node to any sink node.
      -public class KBestPaths {
      -
      -    // static access only
      -    protected KBestPaths() { }
      -    private static int MAX_PATHS_TO_HOLD = 100;
      -
      -    protected static class MyInt { public int val = 0; }
      -
      -    // class to keep track of paths
      -    protected static class Path {
      -
      -        // the last vertex seen in the path
      -        private final DeBruijnVertex lastVertex;
      -
      -        // the list of edges comprising the path
      -        private final List edges;
      -
      -        // the scores for the path
      -        private final int totalScore;
      -
      -        // the graph from which this path originated
      -        private final DeBruijnAssemblyGraph graph;
      -
      -        // used in the bubble state machine to apply Smith-Waterman to the bubble sequence
      -        // these values were chosen via optimization against the NA12878 knowledge base
      -        private static final double SW_MATCH = 20.0;
      -        private static final double SW_MISMATCH = -15.0;
      -        private static final double SW_GAP = -26.0;
      -        private static final double SW_GAP_EXTEND = -1.1;
      -        private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes();
      -
      -        public Path( final DeBruijnVertex initialVertex, final DeBruijnAssemblyGraph graph ) {
      -            lastVertex = initialVertex;
      -            edges = new ArrayList(0);
      -            totalScore = 0;
      -            this.graph = graph;
      -        }
      -
      -        public Path( final Path p, final DeBruijnEdge edge ) {
      -            if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); }
      -
      -            graph = p.graph;
      -            lastVertex = p.graph.getEdgeTarget(edge);
      -            edges = new ArrayList(p.edges);
      -            edges.add(edge);
      -            totalScore = p.totalScore + edge.getMultiplicity();
      -        }
      -
      -        /**
      -         * Does this path contain the given edge
      -         * @param edge  the given edge to test
      -         * @return      true if the edge is found in this path
      -         */
      -        public boolean containsEdge( final DeBruijnEdge edge ) {
      -            if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
      -
      -            for( final DeBruijnEdge e : edges ) {
      -                if( e.equals(graph, edge) ) {
      -                    return true;
      -                }
      -            }
      -
      -            return false;
      -        }
      -
      -        /**
      -         * Calculate the number of times this edge appears in the path
      -         * @param edge  the given edge to test
      -         * @return      number of times this edge appears in the path
      -         */
      -        public int numInPath( final DeBruijnEdge edge ) {
      -            if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
      -
      -            int numInPath = 0;
      -            for( final DeBruijnEdge e : edges ) {
      -                if( e.equals(graph, edge) ) {
      -                    numInPath++;
      -                }
      -            }
      -
      -            return numInPath;
      -        }
      -
      -        /**
      -         * Does this path contain a reference edge?
      -         * @return  true if the path contains a reference edge
      -         */
      -        public boolean containsRefEdge() {
      -            for( final DeBruijnEdge e : edges ) {
      -                if( e.isRef() ) { return true; }
      -            }
      -            return false;
      -        }
      -
      -        public List getEdges() { return edges; }
      -
      -        public int getScore() { return totalScore; }
      -
      -        public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
      -
      -        /**
      -         * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes
      -         * @return  non-null sequence of bases corresponding to this path
      -         */
      -        @Ensures({"result != null"})
      -        public byte[] getBases() {
      -            if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); }
      -            
      -            byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0)));
      -            for( final DeBruijnEdge e : edges ) {
      -                bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
      -            }
      -            return bases;
      -        }
      -
      -        /**
      -         * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble
      -         * @return  non-null Cigar string with reference length equal to the refHaplotype's reference length
      -         */
      -        @Ensures("result != null")
      -        public Cigar calculateCigar() {
      -
      -            final Cigar cigar = new Cigar();
      -            // special case for paths that start on reference but not at the reference source node
      -            if( edges.get(0).isRef() && !graph.isRefSource(edges.get(0)) ) {
      -                for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edges.get(0))).getCigarElements() ) {
      -                    cigar.add(ce);
      -                }
      -            }
      -
      -            // reset the bubble state machine
      -            final BubbleStateMachine bsm = new BubbleStateMachine(cigar);
      -
      -            for( final DeBruijnEdge e : edges ) {
      -                if( e.equals(graph, edges.get(0)) ) {
      -                    advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
      -                }
      -                advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e );
      -            }
      -
      -            // special case for paths that don't end on reference
      -            if( bsm.inBubble ) {
      -                for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) {
      -                    bsm.cigar.add(ce);
      -                }
      -            } else if( edges.get(edges.size()-1).isRef() && !graph.isRefSink(edges.get(edges.size()-1)) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit
      -                for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edges.get(edges.size()-1)), null).getCigarElements() ) {
      -                    bsm.cigar.add(ce);
      -                }
      -            }
      -
      -            return AlignmentUtils.consolidateCigar(bsm.cigar);
      -        }
      -
      -        /**
      -         * Advance the bubble state machine by incorporating the next node in the path.
      -         * @param bsm   the current bubble state machine
      -         * @param node  the node to be incorporated
      -         * @param e     the edge which generated this node in the path
      -         */
      -        @Requires({"bsm != null", "graph != null", "node != null"})
      -        private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DeBruijnVertex node, final DeBruijnEdge e ) {
      -            if( graph.isReferenceNode( node ) ) {
      -                if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else
      -                    if( e !=null && !e.isRef() ) {
      -                        if( graph.referencePathExists( graph.getEdgeSource(e), node) ) {
      -                            for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) {
      -                                bsm.cigar.add(ce);
      -                            }
      -                            bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
      -                        } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself
      -                            bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) );
      -                        } else {
      -                            bsm.inBubble = true;
      -                            bsm.bubbleBytes = null;
      -                            bsm.lastSeenReferenceNode = graph.getEdgeSource(e);
      -                            bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      -                        }
      -                    } else {
      -                        bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
      -                    }
      -                } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path
      -                    bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      -                } else { // close the bubble and use a local SW to determine the Cigar string
      -                    for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) {
      -                        bsm.cigar.add(ce);
      -                    }
      -                    bsm.inBubble = false;
      -                    bsm.bubbleBytes = null;
      -                    bsm.lastSeenReferenceNode = null;
      -                    bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
      -                }
      -            } else { // non-ref vertex
      -                if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path
      -                    bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      -                } else { // open up a bubble
      -                    bsm.inBubble = true;
      -                    bsm.bubbleBytes = null;
      -                    bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null );
      -                    bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      -                }
      -            }
      -        }
      -
      -        /**
      -         * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble
      -         * @param bubbleBytes   the bytes that comprise the alternate allele path in this bubble
      -         * @param fromVertex    the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex)
      -         * @param toVertex      the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex)
      -         * @return              the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble
      -         */
      -        @Requires({"graph != null"})
      -        @Ensures({"result != null"})
      -        private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) {
      -            final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null);
      -
      -            final Cigar returnCigar = new Cigar();
      -
      -            // add padding to anchor ref/alt bases in the SW matrix
      -            byte[] padding = STARTING_SW_ANCHOR_BYTES;
      -            boolean goodAlignment = false;
      -            SWPairwiseAlignment swConsensus = null;
      -            while( !goodAlignment && padding.length < 1000 ) {
      -                padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time
      -                final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding );
      -                final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding );
      -                swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
      -                if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) {
      -                    goodAlignment = true;
      -                }
      -            }
      -            if( !goodAlignment ) {
      -                returnCigar.add(new CigarElement(1, CigarOperator.N));
      -                return returnCigar;
      -            }
      -
      -            final Cigar swCigar = swConsensus.getCigar();
      -            if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference
      -                returnCigar.add(new CigarElement(1, CigarOperator.N));
      -            } else {
      -                int skipElement = -1;
      -                if( fromVertex == null ) {
      -                    for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
      -                        final CigarElement ce = swCigar.getCigarElement(iii);
      -                        if( ce.getOperator().equals(CigarOperator.D) ) {
      -                            skipElement = iii;
      -                            break;
      -                        }
      -                    }
      -                } else if (toVertex == null ) {
      -                    for( int iii = swCigar.numCigarElements() - 1; iii >= 0; iii-- ) {
      -                        final CigarElement ce = swCigar.getCigarElement(iii);
      -                        if( ce.getOperator().equals(CigarOperator.D) ) {
      -                            skipElement = iii;
      -                            break;
      -                        }
      -                    }
      -                }
      -                for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
      -                    // now we need to remove the padding from the cigar string
      -                    int length = swCigar.getCigarElement(iii).getLength();
      -                    if( iii == 0 ) { length -= padding.length; }
      -                    if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; }
      -                    if( length > 0 ) {
      -                        returnCigar.add(new CigarElement(length, (skipElement == iii ? CigarOperator.X : swCigar.getCigarElement(iii).getOperator())));
      -                    }
      -                }
      -                if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) {
      -                    throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar());
      -                }
      -            }
      -
      -            return returnCigar;
      -        }
      -
      -        // class to keep track of the bubble state machine
      -        protected static class BubbleStateMachine {
      -            public boolean inBubble = false;
      -            public byte[] bubbleBytes = null;
      -            public DeBruijnVertex lastSeenReferenceNode = null;
      -            public Cigar cigar = null;
      -
      -            public BubbleStateMachine( final Cigar initialCigar ) {
      -                inBubble = false;
      -                bubbleBytes = null;
      -                lastSeenReferenceNode = null;
      -                cigar = initialCigar;
      -            }
      -        }
      -    }
      -
      -    protected static class PathComparatorTotalScore implements Comparator, Serializable {
      -        @Override
      -        public int compare(final Path path1, final Path path2) {
      -            return path1.totalScore - path2.totalScore;
      -        }
      -    }
      -
      -    /**
      -     * Traverse the graph and pull out the best k paths.
      -     * Paths are scored via their comparator function. The default being PathComparatorTotalScore()
      -     * @param graph the graph from which to pull paths
      -     * @param k     the number of paths to find
      -     * @return      a list with at most k top-scoring paths from the graph
      -     */
      -    @Ensures({"result != null", "result.size() <= k"})
      -    public static List getKBestPaths( final DeBruijnAssemblyGraph graph, final int k ) {
      -        if( graph == null ) { throw  new IllegalArgumentException("Attempting to traverse a null graph."); }
      -        if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); }
      -
      -        final ArrayList bestPaths = new ArrayList();
      -        
      -        // run a DFS for best paths
      -        for( final DeBruijnVertex v : graph.vertexSet() ) {
      -            if( graph.inDegreeOf(v) == 0 ) {
      -                findBestPaths(new Path(v, graph), bestPaths);
      -            }
      -        }
      -
      -        Collections.sort(bestPaths, new PathComparatorTotalScore() );
      -        Collections.reverse(bestPaths);
      -        return bestPaths.subList(0, Math.min(k, bestPaths.size()));
      -    }
      -
      -    private static void findBestPaths( final Path path, final List bestPaths ) {
      -        findBestPaths(path, bestPaths, new MyInt());
      -    }
      -
      -    private static void findBestPaths( final Path path, final List bestPaths, final MyInt n ) {
      -
      -        // did we hit the end of a path?
      -        if ( allOutgoingEdgesHaveBeenVisited(path) ) {
      -            if( path.containsRefEdge() ) {
      -                if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
      -                    // clean out some low scoring paths
      -                    Collections.sort(bestPaths, new PathComparatorTotalScore() );
      -                    for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20
      -                }
      -                bestPaths.add(path);
      -            }
      -        } else if( n.val > 10000) {
      -            // do nothing, just return
      -        } else {
      -            // recursively run DFS
      -            final ArrayList edgeArrayList = new ArrayList();
      -            edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex));
      -            Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator());
      -            Collections.reverse(edgeArrayList);
      -            for ( final DeBruijnEdge edge : edgeArrayList ) {
      -                // make sure the edge is not already in the path
      -                if ( path.containsEdge(edge) )
      -                    continue;
      -
      -                final Path newPath = new Path(path, edge);
      -                n.val++;
      -                findBestPaths(newPath, bestPaths, n);
      -            }
      -        }
      -    }
      -
      -    /**
      -     * @param path  the path to test
      -     * @return      true if all the outgoing edges at the end of this path have already been visited
      -     */
      -    private static boolean allOutgoingEdgesHaveBeenVisited( final Path path ) {
      -        for( final DeBruijnEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) {
      -            if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles
      -                return false;
      -            }
      -        }
      -        return true;
      -    }
      -}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java
      new file mode 100644
      index 000000000..a7194f85f
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java
      @@ -0,0 +1,183 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import java.util.Collection;
      +import java.util.HashMap;
      +import java.util.Map;
      +
      +/**
      + * generic utility class that counts kmers
      + *
      + * Basically you add kmers to the counter, and it tells you how many occurrences of each kmer it's seen.
      + *
      + * User: depristo
      + * Date: 3/8/13
      + * Time: 1:16 PM
      + */
      +public class KMerCounter {
      +    //private final static Logger logger = Logger.getLogger(KMerCounter.class);
      +
      +    /**
      +     * A map of for each kmer to its num occurrences in addKmers
      +     */
      +    private final Map countsByKMer = new HashMap();
      +    private final int kmerLength;
      +
      +    /**
      +     * Create a new kmer counter
      +     *
      +     * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1
      +     */
      +    public KMerCounter(final int kmerLength) {
      +        if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength);
      +        this.kmerLength = kmerLength;
      +    }
      +
      +    /**
      +     * Get the count of kmer in this kmer counter
+     * @param kmer a non-null kmer to get the count for
+     * @return a non-negative integer (0 if the kmer has never been added)
      +     */
      +    public int getKmerCount(final Kmer kmer) {
      +        if ( kmer == null ) throw new IllegalArgumentException("kmer cannot be null");
      +        final CountedKmer counted = countsByKMer.get(kmer);
      +        return counted == null ? 0 : counted.count;
      +    }
      +
      +    /**
      +     * Get an unordered collection of the counted kmers in this counter
      +     * @return a non-null collection
      +     */
      +    public Collection getCountedKmers() {
      +        return countsByKMer.values();
      +    }
      +
      +    /**
      +     * Remove all current counts, resetting the counter to an empty state
      +     */
      +    public void clear() {
      +        countsByKMer.clear();
      +    }
      +
      +    /**
      +     * Add a kmer that occurred kmerCount times
      +     *
      +     * @param kmer a kmer
      +     * @param kmerCount the number of occurrences
      +     */
      +    public void addKmer(final Kmer kmer, final int kmerCount) {
      +        if ( kmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + kmer + " expected size " + kmerLength);
      +        if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount);
      +
      +        CountedKmer countFromMap = countsByKMer.get(kmer);
      +        if ( countFromMap == null ) {
      +            countFromMap = new CountedKmer(kmer);
      +            countsByKMer.put(kmer, countFromMap);
      +        }
      +        countFromMap.count += kmerCount;
      +    }
      +
      +    @Override
      +    public String toString() {
      +        final StringBuilder b = new StringBuilder("KMerCounter{");
      +        b.append("counting ").append(countsByKMer.size()).append(" distinct kmers");
      +        b.append("\n}");
      +        return b.toString();
      +    }
      +
+    protected static class CountedKmer implements Comparable<CountedKmer> {
      +        final Kmer kmer;
      +        int count = 0;
      +
      +        private CountedKmer(final Kmer kmer) {
      +            this.kmer = kmer;
      +        }
      +
      +        public Kmer getKmer() {
      +            return kmer;
      +        }
      +
      +        public int getCount() {
      +            return count;
      +        }
      +
      +        @Override
      +        public String toString() {
      +            return "CountedKmer{" +
      +                    "kmer='" + kmer + '\'' +
      +                    ", count=" + count +
      +                    '}';
      +        }
      +
      +        @Override
      +        public int compareTo(CountedKmer o) {
      +            return o.count - count;
      +        }
      +    }
      +
      +    // -------------------------------------------------------------------------------------
      +    // Protected methods for testing purposes only
      +    // -------------------------------------------------------------------------------------
      +
      +    /**
      +     * For testing purposes only
      +     */
      +    protected void addKmer(final String rawKmer, final int kmerCount) {
      +        addKmer(new Kmer(rawKmer), kmerCount);
      +    }
      +
      +    /**
      +     * For testing purposes
      +     *
      +     * @param kmers
      +     */
      +    protected void addKmers(final String ... kmers) {
      +        for ( final String kmer : kmers )
      +            addKmer(kmer, 1);
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java
      new file mode 100644
      index 000000000..9b0e1ac0a
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java
      @@ -0,0 +1,207 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import java.util.Arrays;
      +
      +/**
      + * Fast wrapper for byte[] kmers
      + *
      + * This objects has several important features that make it better than using a raw byte[] for a kmer:
      + *
+ * -- Can create kmer from a range of a larger byte[], allowing us to avoid Arrays.copyOfRange
      + * -- Fast equals and hashcode methods
      + * -- can get actual byte[] of the kmer, even if it's from a larger byte[], and this operation
      + *    only does the work of that operation once, updating its internal state
      + *
      + * User: depristo
      + * Date: 4/8/13
      + * Time: 7:54 AM
      + */
      +public class Kmer {
+    // these values may be updated in the course of interacting with this kmer
      +    private byte[] bases;
      +    protected int start;
      +
      +    // two constants
      +    final protected int length;
      +    final protected int hash;
      +
      +    /**
      +     * Create a new kmer using all bases in kmer
      +     * @param kmer a non-null byte[]
      +     */
      +    public Kmer(byte[] kmer) {
      +        this(kmer, 0, kmer.length);
      +    }
      +
      +    /**
      +     * Create a new kmer based on the string kmer
      +     *
      +     * This is not a good method to use for performance
      +     *
      +     * @param kmer the bases as a string
      +     */
      +    public Kmer(final String kmer) {
      +        this(kmer.getBytes());
      +    }
      +
      +    /**
      +     * Create a new kmer backed by the bases in bases, spanning start -> start + length
      +     *
      +     * Under no circumstances can bases be modified anywhere in the client code.  This does not make a copy
      +     * of bases for performance reasons
      +     *
      +     * @param bases an array of bases
+     * @param start the start of the kmer in bases, must be >= 0
+     * @param length the length of the kmer.  Must be >= 0 and start + length <= bases.length
      +     */
      +    public Kmer(final byte[] bases, final int start, final int length) {
      +        if ( bases == null ) throw new IllegalArgumentException("bases cannot be null");
      +        if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start);
      +        if ( length < 0 ) throw new IllegalArgumentException("length must be >= 0 but got " + length);
      +        if ( (start + length) > bases.length ) throw new IllegalArgumentException("start + length " + (start + length) + " must be <= bases.length " + bases.length + " but got " + start + " with length " + length);
      +        this.bases = bases;
      +        this.start = start;
      +        this.length = length;
      +        this.hash = myHashCode(bases, start, length);
      +    }
      +
      +    /**
      +     * Create a new kmer that's a shallow copy of kmer
      +     * @param kmer the kmer to shallow copy
      +     */
      +    public Kmer(final Kmer kmer) {
      +        this.bases = kmer.bases;
      +        this.start = kmer.start;
      +        this.length = kmer.length;
      +        this.hash = kmer.hash;
      +    }
      +
      +    /**
      +     * Create a derived shallow kmer that starts at newStart and has newLength bases
      +     * @param newStart the new start of kmer, where 0 means that start of the kmer, 1 means skip the first base
      +     * @param newLength the new length
      +     * @return a new kmer based on the data in this kmer.  Does not make a copy, so shares most of the data
      +     */
      +    public Kmer subKmer(final int newStart, final int newLength) {
      +        return new Kmer(bases, start + newStart, newLength);
      +    }
      +
      +    /**
      +     * Get the bases of this kmer.  May create a copy of the bases, depending on how this kmer was constructed.
      +     *
      +     * Note that this function is efficient in that if it needs to copy the bases this only occurs once.
      +     *
      +     * @return a non-null byte[] containing length() bases of this kmer, regardless of how this kmer was created
      +     */
      +    public byte[] bases() {
      +        if ( start != 0 || bases.length != length ) {
      +            // update operation.  Rip out the exact byte[] and update start so we don't ever do this again
      +            bases = Arrays.copyOfRange(bases, start, start + length);
      +            start = 0;
      +        }
      +
      +        return bases;
      +    }
      +
      +    /**
      +     * The length of this kmer
      +     * @return an integer >= 0
      +     */
      +    public int length() {
      +        return length;
      +    }
      +
      +    @Override
      +    public String toString() {
      +        return "Kmer{" + new String(bases()) + "}";
      +    }
      +
      +    @Override
      +    public boolean equals(Object o) {
      +        if (this == o) return true;
      +        if (o == null || getClass() != o.getClass()) return false;
      +
      +        final Kmer kmer = (Kmer) o;
      +
      +        // very fast test.  If hash aren't equal you are done, otherwise compare the bases
      +        if ( hash != kmer.hash ) return false;
      +        if ( length != kmer.length ) return false;
      +
      +        for ( int i = 0; i < length; i++ )
      +            if ( bases[start + i] != kmer.bases[kmer.start + i] )
      +                return false;
      +
      +        return true;
      +    }
      +
      +    @Override
      +    public int hashCode() {
      +        return hash;
      +    }
      +
      +    /**
      +     * Helper method that computes the hashcode for this kmer based only the bases in
      +     * a[], starting at start and running length bases
      +     *
      +     * @param a a non-null bases array
      +     * @param start where to start in bases
      +     * @param length the length of the bases
      +     * @return a hashcode value appropriate for a[start] -> a[start + length]
      +     */
      +    private static int myHashCode(final byte a[], final int start, final int length) {
      +        if (a == null)
      +            return 0;
      +
      +        int result = 1;
      +        for (int i = 0; i < length; i++)
      +            result = 31 * result + a[start + i];
      +
      +        return result;
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
      index c3e7276a6..8697833a6 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
      @@ -48,12 +48,15 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
       
       import com.google.java.contract.Ensures;
       import com.google.java.contract.Requires;
      +import org.apache.log4j.Logger;
      +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
       import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
      -import org.broadinstitute.sting.utils.Haplotype;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
       import org.broadinstitute.sting.utils.MathUtils;
       import org.broadinstitute.sting.utils.QualityUtils;
       import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
       import org.broadinstitute.sting.utils.exceptions.UserException;
      +import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
       import org.broadinstitute.sting.utils.pairhmm.*;
       import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       import org.broadinstitute.sting.utils.sam.ReadUtils;
      @@ -62,11 +65,20 @@ import org.broadinstitute.variant.variantcontext.Allele;
       import java.util.*;
       
       public class LikelihoodCalculationEngine {
      +    private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
       
           private static final double LOG_ONE_HALF = -Math.log10(2.0);
           private final byte constantGCP;
           private final boolean DEBUG;
           private final PairHMM pairHMM;
      +    private final int minReadLength = 20;
      +
      +    /**
      +     * The expected rate of random sequencing errors for a read originating from its true haplotype.
      +     *
      +     * For example, if this is 0.01, then we'd expect 1 error per 100 bp.
      +     */
      +    private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
       
           public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
       
      @@ -78,7 +90,7 @@ public class LikelihoodCalculationEngine {
                       pairHMM = new Log10PairHMM(false);
                       break;
                   case LOGLESS_CACHING:
      -                pairHMM = new LoglessCachingPairHMM();
      +                pairHMM = new LoglessPairHMM();
                       break;
                   default:
                       throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
      @@ -88,9 +100,16 @@ public class LikelihoodCalculationEngine {
               DEBUG = debug;
           }
       
      -    public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) {
      -
      -        final Map stratifiedReadMap = new HashMap();
      +    /**
      +     * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
      +     *
      +     * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
      +     * against the set of haplotypes
      +     *
      +     * @param haplotypes a non-null list of haplotypes
      +     * @param perSampleReadList a mapping from sample -> reads
      +     */
+    private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
               int X_METRIC_LENGTH = 0;
               for( final Map.Entry> sample : perSampleReadList.entrySet() ) {
                   for( final GATKSAMRecord read : sample.getValue() ) {
      @@ -104,19 +123,29 @@ public class LikelihoodCalculationEngine {
                   if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
               }
       
      -        // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
      -        X_METRIC_LENGTH += 2;
      -        Y_METRIC_LENGTH += 2;
      -
               // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
               pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
      +    }
       
      -        // for each sample's reads
+    public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
      +        // configure the HMM
      +        initializePairHMM(haplotypes, perSampleReadList);
      +
      +        // Add likelihoods for each sample's reads to our stratifiedReadMap
+        final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
               for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) {
                   //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
                   // evaluate the likelihood of the reads given those haplotypes
      -            stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue()));
      +            final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
      +
+            final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
      +//            logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey());
      +//            for ( final GATKSAMRecord read : removedReads )
      +//                logger.info("\tRemoved " + read.getReadName());
      +
      +            stratifiedReadMap.put(sampleEntry.getKey(), map);
               }
      +
               return stratifiedReadMap;
           }
       
      @@ -130,10 +159,14 @@ public class LikelihoodCalculationEngine {
       
               final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
               for( final GATKSAMRecord read : reads ) {
      +            if ( read.getReadLength() < minReadLength )
      +                // don't consider any reads that have a read length < the minimum
      +                continue;
      +
                   final byte[] overallGCP = new byte[read.getReadLength()];
                   Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
      -            Haplotype previousHaplotypeSeen = null;
      -            final byte[] readQuals = read.getBaseQualities();
      +            // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
      +            final byte[] readQuals = read.getBaseQualities().clone();
                   final byte[] readInsQuals = read.getBaseInsertionQualities();
                   final byte[] readDelQuals = read.getBaseDeletionQualities();
                   for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
      @@ -146,15 +179,14 @@ public class LikelihoodCalculationEngine {
       
                   for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
                       final Haplotype haplotype = haplotypes.get(jjj);
      +                final boolean isFirstHaplotype = jjj == 0;
      +                final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
      +                        read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
       
      -                final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
      -                previousHaplotypeSeen = haplotype;
      -
      -                perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype),
      -                        pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
      -                                readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0));
      +                perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
                   }
               }
      +
               return perReadAlleleLikelihoodMap;
           }
       
      @@ -162,17 +194,17 @@ public class LikelihoodCalculationEngine {
           @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
           public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
                                                                        final Map stratifiedReadMap,
      -                                                                 final List alleleOrdering ) {
      -        final TreeSet sampleSet = new TreeSet();
      -        sampleSet.add(sample);
      -        return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering);
+                                                                 final List<Allele> alleleOrdering,
      +                                                                 final boolean normalize ) {
      +        return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize);
           }
       
           @Requires({"alleleOrdering.size() > 0"})
           @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
           public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples,
                                                                        final Map stratifiedReadMap,
      -                                                                 final List alleleOrdering ) {
+                                                                 final List<Allele> alleleOrdering,
      +                                                                 final boolean normalize) {
       
               final int numHaplotypes = alleleOrdering.size();
               final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
      @@ -199,7 +231,7 @@ public class LikelihoodCalculationEngine {
               }
       
               // normalize the diploid likelihoods matrix
      -        return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
      +        return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix;
           }
       
           @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
      @@ -223,54 +255,127 @@ public class LikelihoodCalculationEngine {
               return likelihoodMatrix;
           }
       
      -    @Requires({"haplotypes.size() > 0"})
      -    @Ensures({"result.size() <= haplotypes.size()"})
      -    public List selectBestHaplotypes( final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation ) {
      +    // --------------------------------------------------------------------------------
      +    //
      +    // System to compute the best N haplotypes for genotyping
      +    //
      +    // --------------------------------------------------------------------------------
       
      -        final int numHaplotypes = haplotypes.size();
      -        final Set sampleKeySet = stratifiedReadMap.keySet();
      -        final List bestHaplotypesIndexList = new ArrayList();
      -        bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
      -        final List haplotypesAsAlleles = new ArrayList();
      -        for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); }
      +    /**
      +     * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
+     * @param map a map from the allele representation of each haplotype back to the haplotype itself
      +     * @param haplotypeAsAllele the allele version of the haplotype
      +     * @return the haplotype version, with its score incremented by 1 if its non-reference
      +     */
+    private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
      +        final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
      +        if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
      +        return h;
      +    }
       
      -        final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together
      +    /**
      +     * Take the best N haplotypes and return them as a list
      +     *
      +     * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
+     * as its preferred haplotype.  Takes the best N haplotypes from selectedHaplotypes in decreasing
      +     * order of score (so higher score haplotypes are preferred).  The N we take is determined by
      +     *
      +     * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
      +     *
+     * where 2 * nSamples is the number of chromosomes in nSamples diploid samples, plus 1 for the reference, and our workload is
      +     * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
      +     *
      +     * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
      +     * @param nSamples the number of samples used to select the haplotypes
      +     * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
      +     * @return a list of N or fewer haplotypes, with the reference haplotype first
      +     */
      +    private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
      +        final List selectedHaplotypesList = new ArrayList(selectedHaplotypes);
      +        Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
      +        final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
      +        final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
      +        final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
      +        if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
      +        return bestHaplotypes;
      +    }
       
      -        int hap1 = 0;
      -        int hap2 = 0;
      -        //double bestElement = Double.NEGATIVE_INFINITY;
      -        final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 );
      -        while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
      -            double maxElement = Double.NEGATIVE_INFINITY;
      -            for( int iii = 0; iii < numHaplotypes; iii++ ) {
      -                for( int jjj = 0; jjj <= iii; jjj++ ) {
      -                    if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
      -                        maxElement = haplotypeLikelihoodMatrix[iii][jjj];
      -                        hap1 = iii;
      -                        hap2 = jjj;
      -                    }
      -                }
      -            }
      -            if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
      -            if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
      -            haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
      +    /**
      +     * Select the best haplotypes for genotyping the samples in stratifiedReadMap
      +     *
      +     * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
      +     * haplotypes per sample.  What this means is that each sample computes the diploid genotype likelihoods for
       +     * all possible pairs of haplotypes, and each haplotype in the highest-likelihood pair receives
       +     * one extra count (so hom-var haplotypes get two counts).  After performing this calculation
      +     * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
      +     * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
      +     *
      +     * @param haplotypes a list of all haplotypes we're considering
      +     * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
      +     * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
      +     * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
      +     */
      +    public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
      +        if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
       
      -            if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
      -            if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
      +        if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
      +
      +        // all of the haplotypes that at least one sample called as one of the most likely
      +        final Set selectedHaplotypes = new HashSet();
      +        selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
      +
      +        // our annoying map from allele -> haplotype
      +        final Map allele2Haplotype = new HashMap();
      +        for ( final Haplotype h : haplotypes ) {
      +            h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
      +            allele2Haplotype.put(Allele.create(h, h.isReference()), h);
               }
       
      -        if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
      +        // for each sample, compute the most likely pair of haplotypes
      +        for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) {
      +            // get the two most likely haplotypes under a diploid model for this sample
      +            final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
       
      -        final List bestHaplotypes = new ArrayList();
      -        for( final int hIndex : bestHaplotypesIndexList ) {
      -            bestHaplotypes.add( haplotypes.get(hIndex) );
      +            if ( mla != null ) { // there was something to evaluate in this sample
      +                // note that there must be at least 2 haplotypes
      +                final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
      +                final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
      +
      +//            if ( DEBUG ) {
      +//                logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
      +//            }
      +
      +                // add these two haplotypes to the set of haplotypes that have been selected
      +                selectedHaplotypes.add(best);
      +                selectedHaplotypes.add(second);
      +
      +                // we've already selected all of our haplotypes, and we don't need to prune them down
      +                if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
      +                    break;
      +            }
      +        }
      +
      +        // take the best N haplotypes forward, in order of the number of samples that choose them
      +        final int nSamples = stratifiedReadMap.size();
      +        final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
      +
      +        if ( DEBUG ) {
      +            logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
      +            for ( final Haplotype h : bestHaplotypes ) {
      +                logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
      +            }
               }
               return bestHaplotypes;
           }
       
      -    public static int findReferenceIndex( final List haplotypes ) {
      +    /**
       +     * Find the reference haplotype (isReference() == true), throwing ReviewedStingException if none is found
      +     * @param haplotypes non-null list of haplotypes
      +     * @return the reference haplotype
      +     */
      +    private static Haplotype findReferenceHaplotype( final List haplotypes ) {
               for( final Haplotype h : haplotypes ) {
      -            if( h.isReference() ) { return haplotypes.indexOf(h); }
      +            if( h.isReference() ) return h;
               }
               throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
           }
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
      index 3efa342b1..4c0483ad6 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java
      @@ -47,10 +47,11 @@
       package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
       
       import org.broadinstitute.sting.utils.GenomeLoc;
      -import org.broadinstitute.sting.utils.Haplotype;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
       import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
       import org.broadinstitute.variant.variantcontext.VariantContext;
       
      +import java.io.PrintStream;
       import java.util.List;
       
       /**
      @@ -59,13 +60,46 @@ import java.util.List;
        * Date: Mar 14, 2011
        */
       public abstract class LocalAssemblyEngine {
      +    public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
       
      -    public enum ASSEMBLER {
      -        SIMPLE_DE_BRUIJN
      +    protected PrintStream graphWriter = null;
      +    protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
      +    protected int pruneFactor = 2;
      +    protected boolean errorCorrectKmers = false;
      +
      +    protected LocalAssemblyEngine() { }
      +
      +    public int getPruneFactor() {
      +        return pruneFactor;
           }
       
      -    protected LocalAssemblyEngine() {
      +    public void setPruneFactor(int pruneFactor) {
      +        this.pruneFactor = pruneFactor;
           }
       
      -    public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List activeAllelesToGenotype);
      +    public boolean shouldErrorCorrectKmers() {
      +        return errorCorrectKmers;
      +    }
      +
      +    public void setErrorCorrectKmers(boolean errorCorrectKmers) {
      +        this.errorCorrectKmers = errorCorrectKmers;
      +    }
      +
      +    public PrintStream getGraphWriter() {
      +        return graphWriter;
      +    }
      +
      +    public void setGraphWriter(PrintStream graphWriter) {
      +        this.graphWriter = graphWriter;
      +    }
      +
      +    public byte getMinBaseQualityToUseInAssembly() {
      +        return minBaseQualityToUseInAssembly;
      +    }
      +
      +    public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) {
      +        this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
      +    }
      +
      +    public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype);
       }
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java
      new file mode 100644
      index 000000000..be5a431c4
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java
      @@ -0,0 +1,190 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import java.io.Serializable;
      +import java.util.Collection;
      +import java.util.Comparator;
      +
      +/**
      + * simple edge class for connecting nodes in the graph
      + *
      + * Works equally well for all graph types (kmer or sequence)
      + *
      + * User: ebanks
      + * Date: Mar 23, 2011
      + */
      +public class BaseEdge {
      +    private int multiplicity;
      +    private boolean isRef;
      +
      +    /**
      +     * Create a new BaseEdge with weight multiplicity and, if isRef == true, indicates a path through the reference
      +     *
      +     * @param isRef indicates whether this edge is a path through the reference
      +     * @param multiplicity the number of observations of this edge
      +     */
      +    public BaseEdge(final boolean isRef, final int multiplicity) {
      +        if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0 but got " + multiplicity);
      +
      +        this.multiplicity = multiplicity;
      +        this.isRef = isRef;
      +    }
      +
      +    /**
      +     * Copy constructor
      +     *
      +     * @param toCopy
      +     */
      +    public BaseEdge(final BaseEdge toCopy) {
      +        this(toCopy.isRef(), toCopy.getMultiplicity());
      +    }
      +
      +    /**
      +     * Get the number of observations of paths connecting two vertices
       +     * @return a non-negative integer (>= 0)
      +     */
      +    public int getMultiplicity() {
      +        return multiplicity;
      +    }
      +
      +    /**
      +     * Set the multiplicity of this edge to value
      +     * @param value an integer >= 0
      +     */
      +    public void setMultiplicity( final int value ) {
      +        if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0");
      +        multiplicity = value;
      +    }
      +
      +    /**
      +     * Does this edge indicate a path through the reference graph?
      +     * @return true if so
      +     */
      +    public boolean isRef() {
      +        return isRef;
      +    }
      +
      +    /**
      +     * Indicate that this edge follows the reference sequence, or not
      +     * @param isRef true if this is a reference edge
      +     */
      +    public void setIsRef( final boolean isRef ) {
      +        this.isRef = isRef;
      +    }
      +
      +    /**
       +     * Do this edge and the given edge have the same source and target vertices in the graph?
      +     *
      +     * @param graph the graph containing both this and edge
      +     * @param edge our comparator edge
      +     * @param 
      +     * @return true if we have the same source and target vertices
      +     */
      +    public  boolean hasSameSourceAndTarget(final BaseGraph graph, final BaseEdge edge) {
      +        return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
      +    }
      +
      +    // For use when comparing edges across graphs!
      +    public  boolean seqEquals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) {
      +        return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge)));
      +    }
      +
      +    /**
      +     * Sorts a collection of BaseEdges in decreasing order of weight, so that the most
      +     * heavily weighted is at the start of the list
      +     */
      +    public static class EdgeWeightComparator implements Comparator, Serializable {
      +        @Override
      +        public int compare(final BaseEdge edge1, final BaseEdge edge2) {
      +            return edge2.multiplicity - edge1.multiplicity;
      +        }
      +    }
      +
      +    /**
      +     * Add edge to this edge, updating isRef and multiplicity as appropriate
      +     *
      +     * isRef is simply the or of this and edge
      +     * multiplicity is the sum
      +     *
      +     * @param edge the edge to add
      +     * @return this
      +     */
      +    public BaseEdge add(final BaseEdge edge) {
      +        if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
      +        this.multiplicity += edge.getMultiplicity();
      +        this.isRef = this.isRef || edge.isRef();
      +        return this;
      +    }
      +
      +    /**
      +     * Create a new BaseEdge with multiplicity and isRef that's an or of all edges
      +     *
      +     * @param edges a collection of edges to or their isRef values
      +     * @param multiplicity our desired multiplicity
      +     * @return a newly allocated BaseEdge
      +     */
      +    public static BaseEdge orRef(final Collection edges, final int multiplicity) {
      +        for ( final BaseEdge e : edges )
      +            if ( e.isRef() )
      +                return new BaseEdge(true, multiplicity);
      +        return new BaseEdge(false, multiplicity);
      +    }
      +
      +    /**
      +     * Return a new edge whose multiplicity is the max of this and edge, and isRef is or of this and edge
      +     *
      +     * isRef is simply the or of this and edge
      +     * multiplicity is the max
      +     *
      +     * @param edge the edge to max
      +     */
      +    public BaseEdge max(final BaseEdge edge) {
      +        if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
      +        return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity()));
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
      new file mode 100644
      index 000000000..7ce57e2e7
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java
      @@ -0,0 +1,636 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Invariant;
      +import com.google.java.contract.Requires;
      +import org.apache.commons.lang.ArrayUtils;
      +import org.apache.log4j.Logger;
      +import org.jgrapht.EdgeFactory;
      +import org.jgrapht.graph.DefaultDirectedGraph;
      +
      +import java.io.File;
      +import java.io.FileNotFoundException;
      +import java.io.FileOutputStream;
      +import java.io.PrintStream;
      +import java.util.*;
      +
      +/**
      + * Created with IntelliJ IDEA.
      + * User: rpoplin
      + * Date: 2/6/13
      + */
      +@Invariant("!this.isAllowingMultipleEdges()")
      +public class BaseGraph extends DefaultDirectedGraph {
      +    protected final static Logger logger = Logger.getLogger(BaseGraph.class);
      +    private final int kmerSize;
      +
      +    /**
      +     * Construct an empty BaseGraph
      +     */
      +    public BaseGraph() {
      +        this(11);
      +    }
      +
      +    /**
      +     * Edge factory that creates non-reference multiplicity 1 edges
      +     * @param  the new of our vertices
      +     */
      +    private static class MyEdgeFactory implements EdgeFactory {
      +        @Override
      +        public BaseEdge createEdge(T sourceVertex, T targetVertex) {
      +            return new BaseEdge(false, 1);
      +        }
      +    }
      +
      +    /**
      +     * Construct a DeBruijnGraph with kmerSize
      +     * @param kmerSize
      +     */
      +    public BaseGraph(final int kmerSize) {
      +        super(new MyEdgeFactory());
      +
      +        if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
      +        this.kmerSize = kmerSize;
      +    }
      +
      +    /**
      +     * How big of a kmer did we use to create this graph?
      +     * @return
      +     */
      +    public int getKmerSize() {
      +        return kmerSize;
      +    }
      +
      +    /**
      +     * @param v the vertex to test
      +     * @return  true if this vertex is a reference node (meaning that it appears on the reference path in the graph)
      +     */
      +    public boolean isReferenceNode( final T v ) {
      +        if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
      +        for( final BaseEdge e : edgesOf(v) ) {
      +            if( e.isRef() ) { return true; }
      +        }
      +        return false;
      +    }
      +
      +    /**
      +     * @param v the vertex to test
      +     * @return  true if this vertex is a source node (in degree == 0)
      +     */
      +    public boolean isSource( final T v ) {
      +        if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
      +        return inDegreeOf(v) == 0;
      +    }
      +
      +    /**
      +     * @param v the vertex to test
      +     * @return  true if this vertex is a sink node (out degree == 0)
      +     */
      +    public boolean isSink( final T v ) {
      +        if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
      +        return outDegreeOf(v) == 0;
      +    }
      +
      +    /**
      +     * Get the set of source vertices of this graph
      +     * @return a non-null set
      +     */
      +    public Set getSources() {
      +        final Set set = new LinkedHashSet();
      +        for ( final T v : vertexSet() )
      +            if ( isSource(v) )
      +                set.add(v);
      +        return set;
      +    }
      +
      +    /**
      +     * Get the set of sink vertices of this graph
      +     * @return a non-null set
      +     */
      +    public Set getSinks() {
      +        final Set set = new LinkedHashSet();
      +        for ( final T v : vertexSet() )
      +            if ( isSink(v) )
      +                set.add(v);
      +        return set;
      +    }
      +
      +    /**
      +     * Pull out the additional sequence implied by traversing this node in the graph
      +     * @param v the vertex from which to pull out the additional base sequence
      +     * @return  non-null byte array
      +     */
      +    @Ensures({"result != null"})
      +    public byte[] getAdditionalSequence( final T v ) {
      +        if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); }
      +        return v.getAdditionalSequence(isSource(v));
      +    }
      +
      +    /**
      +     * @param e the edge to test
      +     * @return  true if this edge is a reference source edge
      +     */
      +    public boolean isRefSource( final BaseEdge e ) {
      +        if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
      +        for( final BaseEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
      +            if( edgeToTest.isRef() ) { return false; }
      +        }
      +        return true;
      +    }
      +
      +    /**
      +     * @param v the vertex to test
      +     * @return  true if this vertex is a reference source
      +     */
      +    public boolean isRefSource( final T v ) {
      +        if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
      +        for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
      +            if( edgeToTest.isRef() ) { return false; }
      +        }
      +        return true;
      +    }
      +
      +    /**
      +     * @param e the edge to test
      +     * @return  true if this edge is a reference sink edge
      +     */
      +    public boolean isRefSink( final BaseEdge e ) {
      +        if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
      +        for( final BaseEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
      +            if( edgeToTest.isRef() ) { return false; }
      +        }
      +        return true;
      +    }
      +
      +    /**
      +     * @param v the vertex to test
      +     * @return  true if this vertex is a reference sink
      +     */
      +    public boolean isRefSink( final T v ) {
      +        if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
      +        for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
      +            if( edgeToTest.isRef() ) { return false; }
      +        }
      +        return true;
      +    }
      +
      +    /**
      +     * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph
      +     */
      +    public T getReferenceSourceVertex( ) {
      +        for( final T v : vertexSet() ) {
      +            if( isReferenceNode(v) && isRefSource(v) ) {
      +                return v;
      +            }
      +        }
      +        return null;
      +    }
      +
      +    /**
      +     * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph
      +     */
      +    public T getReferenceSinkVertex( ) {
      +        for( final T v : vertexSet() ) {
      +            if( isReferenceNode(v) && isRefSink(v) ) {
      +                return v;
      +            }
      +        }
      +        return null;
      +    }
      +
      +    /**
      +     * Traverse the graph and get the next reference vertex if it exists
      +     * @param v the current vertex, can be null
      +     * @return  the next reference vertex if it exists
      +     */
      +    public T getNextReferenceVertex( final T v ) {
      +        if( v == null ) { return null; }
      +        for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
      +            if( edgeToTest.isRef() ) {
      +                return getEdgeTarget(edgeToTest);
      +            }
      +        }
      +        return null;
      +    }
      +
      +    /**
      +     * Traverse the graph and get the previous reference vertex if it exists
      +     * @param v the current vertex, can be null
      +     * @return  the previous reference vertex if it exists
      +     */
      +    public T getPrevReferenceVertex( final T v ) {
      +        if( v == null ) { return null; }
      +        for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
      +            if( isReferenceNode(getEdgeSource(edgeToTest)) ) {
      +                return getEdgeSource(edgeToTest);
      +            }
      +        }
      +        return null;
      +    }
      +
      +    /**
      +     * Does a reference path exist between the two vertices?
      +     * @param fromVertex    from this vertex, can be null
      +     * @param toVertex      to this vertex, can be null
      +     * @return              true if a reference path exists in the graph between the two vertices
      +     */
      +    public boolean referencePathExists(final T fromVertex, final T toVertex) {
      +        T v = fromVertex;
      +        if( v == null ) {
      +            return false;
      +        }
      +        v = getNextReferenceVertex(v);
      +        if( v == null ) {
      +            return false;
      +        }
      +        while( !v.equals(toVertex) ) {
      +            v = getNextReferenceVertex(v);
      +            if( v == null ) {
      +                return false;
      +            }
      +        }
      +        return true;
      +    }
      +
      +    /**
      +     * Walk along the reference path in the graph and pull out the corresponding bases
      +     * @param fromVertex    starting vertex
      +     * @param toVertex      ending vertex
      +     * @param includeStart  should the starting vertex be included in the path
      +     * @param includeStop   should the ending vertex be included in the path
      +     * @return              byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
      +     */
      +    public byte[] getReferenceBytes( final T fromVertex, final T toVertex, final boolean includeStart, final boolean includeStop ) {
      +        if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); }
      +        if( toVertex == null ) { throw  new IllegalArgumentException("From vertex in requested path cannot be null."); }
      +
      +        byte[] bytes = null;
      +        T v = fromVertex;
      +        if( includeStart ) {
      +            bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
      +        }
      +        v = getNextReferenceVertex(v); // advance along the reference path
      +        while( v != null && !v.equals(toVertex) ) {
      +            bytes = ArrayUtils.addAll( bytes, getAdditionalSequence(v) );
      +            v = getNextReferenceVertex(v); // advance along the reference path
      +        }
      +        if( includeStop && v != null && v.equals(toVertex)) {
      +            bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
      +        }
      +        return bytes;
      +    }
      +
      +    /**
      +     * Convenience function to add multiple vertices to the graph at once
      +     * @param vertices one or more vertices to add
      +     */
      +    public void addVertices(final T ... vertices) {
      +        for ( final T v : vertices )
      +            addVertex(v);
      +    }
      +
      +    /**
      +     * Convenience function to add multiple vertices to the graph at once
      +     * @param vertices one or more vertices to add
      +     */
      +    public void addVertices(final Collection vertices) {
      +        for ( final T v : vertices )
      +            addVertex(v);
      +    }
      +
      +    /**
      +     * Convenience function to add multiple edges to the graph
      +     * @param start the first vertex to connect
      +     * @param remaining all additional vertices to connect
      +     */
      +    public void addEdges(final T start, final T ... remaining) {
      +        addEdges(new BaseEdge(false, 1), start, remaining);
      +    }
      +
      +    /**
      +     * Convenience function to add multiple edges to the graph
      +     * @param start the first vertex to connect
      +     * @param remaining all additional vertices to connect
      +     */
      +    public void addEdges(final BaseEdge template, final T start, final T ... remaining) {
      +        T prev = start;
      +        for ( final T next : remaining ) {
      +            addEdge(prev, next, new BaseEdge(template));
      +            prev = next;
      +        }
      +    }
      +
      +    /**
      +     * Get the set of vertices connected by outgoing edges of V
      +     * @param v a non-null vertex
      +     * @return a set of vertices connected by outgoing edges from v
      +     */
      +    public Set outgoingVerticesOf(final T v) {
      +        final Set s = new LinkedHashSet();
      +        for ( final BaseEdge e : outgoingEdgesOf(v) ) {
      +            s.add(getEdgeTarget(e));
      +        }
      +        return s;
      +    }
      +
      +    /**
      +     * Get the set of vertices connected to v by incoming edges
      +     * @param v a non-null vertex
      +     * @return a set of vertices {X} connected X -> v
      +     */
      +    public Set incomingVerticesOf(final T v) {
      +        final Set s = new LinkedHashSet();
      +        for ( final BaseEdge e : incomingEdgesOf(v) ) {
      +            s.add(getEdgeSource(e));
      +        }
      +        return s;
      +    }
      +
      +    /**
      +     * Print out the graph in the dot language for visualization
      +     * @param destination File to write to
      +     */
      +    public void printGraph(final File destination, final int pruneFactor) {
      +        PrintStream stream = null;
      +
      +        try {
      +            stream = new PrintStream(new FileOutputStream(destination));
      +            printGraph(stream, true, pruneFactor);
      +        } catch ( FileNotFoundException e ) {
      +            throw new RuntimeException(e);
      +        } finally {
      +            if ( stream != null ) stream.close();
      +        }
      +    }
      +
      +    public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) {
      +        if ( writeHeader )
      +            graphWriter.println("digraph assemblyGraphs {");
      +
      +        for( final BaseEdge edge : edgeSet() ) {
      +            graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
      +            if( edge.isRef() ) {
      +                graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];");
      +            }
      +        }
      +
      +        for( final T v : vertexSet() ) {
      +            graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]");
      +        }
      +
      +        if ( writeHeader )
      +            graphWriter.println("}");
      +    }
      +
      +    /**
      +     * Remove edges that are connected before the reference source and after the reference sink
      +     *
      +     * Also removes all vertices that are orphaned by this process
      +     */
      +    public void cleanNonRefPaths() {
      +        if( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
      +            return;
      +        }
      +
      +        // Remove non-ref edges connected before and after the reference path
      +        final Set edgesToCheck = new HashSet();
      +        edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex()));
      +        while( !edgesToCheck.isEmpty() ) {
      +            final BaseEdge e = edgesToCheck.iterator().next();
      +            if( !e.isRef() ) {
      +                edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) );
      +                removeEdge(e);
      +            }
      +            edgesToCheck.remove(e);
      +        }
      +
      +        edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex()));
      +        while( !edgesToCheck.isEmpty() ) {
      +            final BaseEdge e = edgesToCheck.iterator().next();
      +            if( !e.isRef() ) {
      +                edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) );
      +                removeEdge(e);
      +            }
      +            edgesToCheck.remove(e);
      +        }
      +
      +        removeSingletonOrphanVertices();
      +    }
      +
      +    /**
      +     * Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well
      +     *
      +     * @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
      +     */
      +    public void pruneGraph( final int pruneFactor ) {
      +        final List edgesToRemove = new ArrayList();
      +        for( final BaseEdge e : edgeSet() ) {
      +            if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
      +                edgesToRemove.add(e);
      +            }
      +        }
      +        removeAllEdges(edgesToRemove);
      +
      +        removeSingletonOrphanVertices();
      +    }
      +
      +    /**
      +     * Remove all vertices in the graph that have in and out degree of 0
      +     */
      +    protected void removeSingletonOrphanVertices() {
      +        // Run through the graph and clean up singular orphaned nodes
      +        final List verticesToRemove = new LinkedList();
      +        for( final T v : vertexSet() ) {
      +            if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
      +                verticesToRemove.add(v);
      +            }
      +        }
      +        removeAllVertices(verticesToRemove);
      +    }
      +
      +    /**
      +     * Remove all vertices on the graph that cannot be accessed by following any edge,
      +     * regardless of its direction, from the reference source vertex
      +     */
      +    public void removeVerticesNotConnectedToRefRegardlessOfEdgeDirection() {
      +        final HashSet toRemove = new HashSet(vertexSet());
      +
      +        final T refV = getReferenceSourceVertex();
      +        if ( refV != null ) {
      +            for ( final T v : new BaseGraphIterator(this, refV, true, true) ) {
      +                toRemove.remove(v);
      +            }
      +        }
      +
      +        removeAllVertices(toRemove);
      +    }
      +
      +    /**
      +     * Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex
      +     *
      +     * More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection,
      +     * as it requires vertices to not only be connected by a series of directed edges but also prunes away
      +     * paths that do not also meet eventually with the reference sink vertex
      +     */
      +    public void removePathsNotConnectedToRef() {
      +        if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
      +            throw new IllegalStateException("Graph must have ref source and sink vertices");
      +        }
      +
      +        // get the set of vertices we can reach by going forward from the ref source
      +        final Set onPathFromRefSource = new HashSet(vertexSet().size());
      +        for ( final T v : new BaseGraphIterator(this, getReferenceSourceVertex(), false, true) ) {
      +            onPathFromRefSource.add(v);
      +        }
      +
      +        // get the set of vertices we can reach by going backward from the ref sink
      +        final Set onPathFromRefSink = new HashSet(vertexSet().size());
      +        for ( final T v : new BaseGraphIterator(this, getReferenceSinkVertex(), true, false) ) {
      +            onPathFromRefSink.add(v);
      +        }
      +
      +        // we want to remove anything that's not in both the sink and source sets
      +        final Set verticesToRemove = new HashSet(vertexSet());
      +        onPathFromRefSource.retainAll(onPathFromRefSink);
      +        verticesToRemove.removeAll(onPathFromRefSource);
      +        removeAllVertices(verticesToRemove);
      +    }
      +
      +    /**
      +     * Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure
      +     *
      +     * By similar this means that both graphs have the same number of vertices, where each vertex can find
      +     * a vertex in the other graph that's seqEqual to it.  A similar constraint applies to the edges,
      +     * where all edges in g1 must have a corresponding edge in g2 where both source and target vertices are
      +     * seqEqual
      +     *
      +     * @param g1 the first graph to compare
      +     * @param g2 the second graph to compare
      +     * @param  the type of the nodes in those graphs
      +     * @return true if g1 and g2 are equals
      +     */
      +    public static  boolean graphEquals(final BaseGraph g1, BaseGraph g2) {
      +        final Set vertices1 = g1.vertexSet();
      +        final Set vertices2 = g2.vertexSet();
      +        final Set edges1 = g1.edgeSet();
      +        final Set edges2 = g2.edgeSet();
      +
      +        if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() )
      +            return false;
      +
      +        for ( final T v1 : vertices1 ) {
      +            boolean found = false;
      +            for ( final T v2 : vertices2 )
      +                found = found || v1.getSequenceString().equals(v2.getSequenceString());
      +            if ( ! found ) return false;
      +        }
      +
      +        for( final BaseEdge e1 : g1.edgeSet() ) {
      +            boolean found = false;
      +            for( BaseEdge e2 : g2.edgeSet() ) {
      +                if( e1.seqEquals(g1, e2, g2) ) { found = true; break; }
      +            }
      +            if( !found ) { return false; }
      +        }
      +        for( final BaseEdge e2 : g2.edgeSet() ) {
      +            boolean found = false;
      +            for( BaseEdge e1 : g1.edgeSet() ) {
      +                if( e2.seqEquals(g2, e1, g1) ) { found = true; break; }
      +            }
      +            if( !found ) { return false; }
      +        }
      +        return true;
      +    }
      +
      +    /**
      +     * Get the incoming edge of v.  Requires that there be only one such edge or throws an error
      +     * @param v our vertex
      +     * @return the single incoming edge to v, or null if none exists
      +     */
      +    public BaseEdge incomingEdgeOf(final T v) {
      +        return getSingletonEdge(incomingEdgesOf(v));
      +    }
      +
      +    /**
      +     * Get the outgoing edge of v.  Requires that there be only one such edge or throws an error
      +     * @param v our vertex
      +     * @return the single outgoing edge from v, or null if none exists
      +     */
      +    public BaseEdge outgoingEdgeOf(final T v) {
      +        return getSingletonEdge(outgoingEdgesOf(v));
      +    }
      +
      +    /**
      +     * Helper function that gets the a single edge from edges, null if edges is empty, or
      +     * throws an error is edges has more than 1 element
      +     * @param edges a set of edges
      +     * @return a edge
      +     */
      +    @Requires("edges != null")
      +    private BaseEdge getSingletonEdge(final Collection edges) {
      +        if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges);
      +        return edges.isEmpty() ? null : edges.iterator().next();
      +    }
      +
      +    /**
      +     * Add edge between source -> target if none exists, or add e to an already existing one if present
      +     *
      +     * @param source source vertex
      +     * @param target vertex
      +     * @param e edge to add
      +     */
      +    public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) {
      +        final BaseEdge prev = getEdge(source, target);
      +        if ( prev != null ) {
      +            prev.add(e);
      +        } else {
      +            addEdge(source, target, e);
      +        }
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java
      new file mode 100644
      index 000000000..7c33e060d
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java
      @@ -0,0 +1,120 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
       +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import java.util.HashSet;
      +import java.util.Iterator;
      +import java.util.LinkedList;
      +
      +/**
      + * General iterator that can iterate over all vertices in a BaseGraph, following either
      + * incoming, outgoing edge (as well as both or none) edges.  Supports traversal of graphs
      + * with cycles and other crazy structures.  Will only ever visit each vertex once.  The
      + * order in which the vertices are visited is undefined.
      + *
      + * User: depristo
      + * Date: 3/24/13
      + * Time: 4:41 PM
      + */
      +public class BaseGraphIterator implements Iterator, Iterable {
      +    final HashSet visited = new HashSet();
      +    final LinkedList toVisit = new LinkedList();
      +    final BaseGraph graph;
      +    final boolean followIncomingEdges, followOutgoingEdges;
      +
      +    /**
      +     * Create a new BaseGraphIterator starting its traversal at start
      +     *
      +     * Note that if both followIncomingEdges and followOutgoingEdges are false, we simply return the
      +     * start vertex
      +     *
      +     * @param graph the graph to iterator over.  Cannot be null
      +     * @param start the vertex to start at.  Cannot be null
      +     * @param followIncomingEdges should we follow incoming edges during our
      +     *                            traversal? (goes backward through the graph)
      +     * @param followOutgoingEdges should we follow outgoing edges during out traversal?
      +     */
      +    public BaseGraphIterator(final BaseGraph graph, final T start,
      +                             final boolean followIncomingEdges, final boolean followOutgoingEdges) {
      +        if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
      +        if ( start == null ) throw new IllegalArgumentException("start cannot be null");
      +        if ( ! graph.containsVertex(start) ) throw new IllegalArgumentException("start " + start + " must be in graph but it isn't");
      +        this.graph = graph;
      +        this.followIncomingEdges = followIncomingEdges;
      +        this.followOutgoingEdges = followOutgoingEdges;
      +
      +        toVisit.add(start);
      +    }
      +
      +    @Override
      +    public Iterator iterator() {
      +        return this;
      +    }
      +
      +    @Override
      +    public boolean hasNext() {
      +        return ! toVisit.isEmpty();
      +    }
      +
      +    @Override
      +    public T next() {
      +        final T v = toVisit.pop();
      +
      +        if ( ! visited.contains(v) ) {
      +            visited.add(v);
      +            if ( followIncomingEdges ) for ( final T prev : graph.incomingVerticesOf(v) ) toVisit.add(prev);
      +            if ( followOutgoingEdges ) for ( final T next : graph.outgoingVerticesOf(v) ) toVisit.add(next);
      +        }
      +
      +        return v;
      +    }
      +
      +    @Override
      +    public void remove() {
      +        throw new UnsupportedOperationException("Doesn't implement remove");
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java
      new file mode 100644
      index 000000000..b075a69a6
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java
      @@ -0,0 +1,179 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
       +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +
      +import java.util.Arrays;
      +
      +/**
      + * A graph vertex that holds some sequence information
      + *
      + * @author: depristo
      + * @since 03/2013
      + */
       +public class BaseVertex {
       +    // The bases this vertex contributes to the graph.  Never null; shared (not copied)
       +    // with the caller, so it must never be mutated after construction.
       +    final byte[] sequence;
       +    // Sentinel meaning the hash code has not been computed yet.  Note: if the sequence
       +    // legitimately hashes to -1, the hash is simply recomputed on each call (still correct).
       +    private final static int UNASSIGNED_HASHCODE = -1;
       +    // Lazily computed hash of sequence; caching is safe only because sequence is immutable by contract.
       +    int cachedHashCode = UNASSIGNED_HASHCODE;
       +
       +    /**
       +     * Create a new sequence vertex with sequence
       +     *
       +     * This code doesn't copy sequence for efficiency reasons, so sequence should absolutely not be modified
       +     * in any way after passing this sequence to the BaseVertex
       +     *
       +     * @param sequence a non-null sequence of bases contained in this vertex; may be empty
       +     *                 (an empty sequence denotes a dummy/structural vertex, see isEmpty())
       +     */
       +    public BaseVertex(final byte[] sequence) {
       +        if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
       +        this.sequence = sequence;
       +    }
       +
       +    /**
       +     * Does this vertex have an empty sequence?
       +     *
       +     * That is, is it a dummy node that's only present for structural reasons but doesn't actually
       +     * contribute to the sequence of the graph?
       +     *
       +     * @return true if sequence is empty, false otherwise
       +     */
       +    public boolean isEmpty() {
       +        return length() == 0;
       +    }
       +
       +    /**
       +     * Get the length of this sequence
       +     * @return the number of bases in this vertex, >= 0 (0 for dummy vertices)
       +     */
       +    public int length() {
       +        return sequence.length;
       +    }
       +
       +    /**
       +     * For testing purposes only -- low performance
       +     * @param sequence the sequence as a string
       +     */
       +    protected BaseVertex(final String sequence) {
       +        this(sequence.getBytes());
       +    }
       +
       +    // Equality is by exact class and base sequence only; consistent with hashCode() below.
       +    @Override
       +    public boolean equals(Object o) {
       +        if (this == o) return true;
       +        if (o == null || getClass() != o.getClass()) return false;
       +
       +        BaseVertex that = (BaseVertex) o;
       +
       +        if (!Arrays.equals(sequence, that.sequence)) return false;
       +
       +        return true;
       +    }
       +
       +    /**
       +     * Are b and this equal according to their base sequences?
       +     *
       +     * Unlike equals(), this ignores the runtime class of b, comparing sequences only.
       +     *
       +     * @param b the vertex to compare ourselves to
       +     * @return true if b and this have the same sequence, regardless of other attributes that might differentiate them
       +     */
       +    public boolean seqEquals(final BaseVertex b) {
       +        return Arrays.equals(this.getSequence(), b.getSequence());
       +    }
       +
       +    /**
       +     * necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
       +     * @return a hash of the base sequence, computed once and cached thereafter
       +     */
       +    @Override
       +    public int hashCode() {
       +        if ( cachedHashCode == UNASSIGNED_HASHCODE ) {
       +            cachedHashCode = Arrays.hashCode(sequence);
       +        }
       +        return cachedHashCode;
       +    }
       +
       +    @Override
       +    public String toString() {
       +        return getSequenceString();
       +    }
       +
       +    /**
       +     * Get the sequence of bases contained in this vertex
       +     *
       +     * Do not modify these bytes in any way!
       +     *
       +     * @return a non-null pointer to the bases contained in this vertex
       +     */
       +    @Ensures("result != null")
       +    public byte[] getSequence() {
       +        return sequence;
       +    }
       +
       +    /**
       +     * Get a string representation of the bases in this vertex
       +     * @return a non-null String
       +     */
       +    @Ensures("result != null")
       +    public String getSequenceString() {
       +        return new String(sequence);
       +    }
       +
       +    /**
       +     * Get the sequence unique to this vertex
       +     *
       +     * This function may not return the entire sequence stored in the vertex, as kmer graphs
       +     * really only provide 1 base of additional sequence (the last base of the kmer).
       +     *
       +     * The base implementation simply returns the sequence.
       +     *
       +     * @param source is this vertex a source vertex (i.e., no in nodes) in the graph
       +     * @return a byte[] of the sequence added by this vertex to the overall sequence
       +     */
       +    public byte[] getAdditionalSequence(final boolean source) {
       +        return getSequence();
       +    }
       +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java
      new file mode 100644
      index 000000000..0665186c6
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java
      @@ -0,0 +1,224 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
       +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Requires;
      +
      +import java.util.*;
      +
      +/**
      + * Split a collection of middle nodes in a graph into their shared prefix and suffix values
      + *
      + * This code performs the following transformation.  Suppose I have a set of vertices V, such
      + * that each vertex is composed of sequence such that
      + *
      + * Vi = prefix + seq_i + suffix
      + *
      + * where prefix and suffix are shared sequences across all vertices V.  This replaces each
      + * Vi with three nodes prefix, seq_i, and suffix connected in a simple chain.
      + *
      + * This operation can be performed in a very general case, without too much worry about the incoming
      + * and outgoing edge structure of each Vi.  The partner algorithm SharedSequenceMerger can
      + * put these pieces back together in a smart way that maximizes the sharing of nodes
      + * while respecting complex connectivity.
      + *
      + * User: depristo
      + * Date: 3/22/13
      + * Time: 8:31 AM
      + */
      +public class CommonSuffixSplitter {
      +    /**
      +     * Create a new graph that contains the vertices in toMerge with their shared suffix and prefix
      +     * sequences extracted out.
      +     *
      +     */
      +    public CommonSuffixSplitter() {}
      +
      +    /**
      +     * Simple single-function interface to split and then update a graph
      +     *
      +     * @param graph the graph containing the vertices in toMerge
      +     * @param v The bottom node whose incoming vertices we'd like to split
      +     * @return true if some useful splitting was done, false otherwise
      +     */
      +    public boolean split(final SeqGraph graph, final SeqVertex v) {
      +        if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
      +        if ( v == null ) throw new IllegalArgumentException("v cannot be null");
      +        if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v);
      +
      +        final Collection toSplit = graph.incomingVerticesOf(v);
      +        if ( toSplit.size() < 2 )
      +            // Can only split at least 2 vertices
      +            return false;
      +        else if ( ! safeToSplit(graph, v, toSplit) ) {
      +            return false;
      +        } else {
      +            final SeqVertex suffixVTemplate = commonSuffix(toSplit);
      +            if ( suffixVTemplate.isEmpty() ) {
      +                return false;
      +            } else if ( wouldEliminateRefSource(graph, suffixVTemplate, toSplit) ) {
      +                return false;
      +            } else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) {
      +                return false;
      +            } else {
      +                final List edgesToRemove = new LinkedList();
      +
      +//                graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0);
      +                for ( final SeqVertex mid : toSplit ) {
      +                    // create my own copy of the suffix
      +                    final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence());
      +                    graph.addVertex(suffixV);
      +                    final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence());
      +                    final BaseEdge out = graph.outgoingEdgeOf(mid);
      +
      +                    final SeqVertex incomingTarget;
      +                    if ( prefixV == null ) {
      +                        // this node is entirely explained by suffix
      +                        incomingTarget = suffixV;
      +                    } else {
      +                        incomingTarget = prefixV;
      +                        graph.addVertex(prefixV);
      +                        graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0));
      +                        edgesToRemove.add(out);
      +                    }
      +
      +                    graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out));
      +
      +                    for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) {
      +                        graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in));
      +                        edgesToRemove.add(in);
      +                    }
      +                }
      +
      +                graph.removeAllVertices(toSplit);
      +                graph.removeAllEdges(edgesToRemove);
      +//                graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0);
      +
      +                return true;
      +            }
      +        }
      +    }
      +
      +    /**
+     * Would factoring out this suffix result in eliminating the reference source vertex?
      +     * @param graph the graph
      +     * @param commonSuffix the common suffix of all toSplits
+     * @param toSplits the list of vertices we are trying to split
      +     * @return true if toSplit contains the reference source and this ref source has all and only the bases of commonSuffix
      +     */
      +    private boolean wouldEliminateRefSource(final SeqGraph graph, final SeqVertex commonSuffix, final Collection toSplits) {
      +        for ( final SeqVertex toSplit : toSplits ) {
      +            if ( graph.isRefSource(toSplit) )
      +                return toSplit.length() == commonSuffix.length();
      +        }
      +        return false;
      +    }
      +
      +//    private static int counter = 0;
      +
      +    /**
      +     * Would all vertices that we'd split just result in the common suffix?
      +     *
      +     * That is, suppose we have prefix nodes ABC and ABC.  After splitting all of the vertices would
      +     * just be ABC again, and we'd enter into an infinite loop.
      +     *
      +     * @param commonSuffix the common suffix of all vertices in toSplits
      +     * @param toSplits the collection of vertices we want to split
      +     * @return true if all of the vertices are equal to the common suffix
      +     */
      +    private boolean allVerticesAreTheCommonSuffix(final SeqVertex commonSuffix, final Collection toSplits) {
      +        for ( final SeqVertex toSplit : toSplits ) {
      +            if ( toSplit.length() != commonSuffix.length() )
      +                return false;
      +        }
      +
      +        return true;
      +    }
      +
      +    /**
      +     * Can we safely split up the vertices in toMerge?
      +     *
      +     * @param graph a graph
      +     * @param bot a vertex whose incoming vertices we want to split
      +     * @param toMerge the set of vertices we'd be splitting up
      +     * @return true if we can safely split up toMerge
      +     */
      +    private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection toMerge) {
      +        final Set outgoingOfBot = new HashSet(graph.outgoingVerticesOf(bot));
      +        for ( final SeqVertex m : toMerge ) {
      +            final Set outs = graph.outgoingEdgesOf(m);
      +            if ( m == bot || outs.size() != 1 || ! graph.outgoingVerticesOf(m).contains(bot) )
      +                // m == bot => don't allow self cycles in the graph
      +                return false;
      +            if ( outgoingOfBot.contains(m) )
      +                // forbid cycles from bottom -> mid
      +                return false;
      +        }
      +
      +        return true;
      +    }
      +
      +    /**
      +     * Return the longest suffix of bases shared among all provided vertices
      +     *
+     * For example, if the vertices have sequences AC, CC, and ATC, this would return
+     * a single C.  However, for ACC and TCC this would return CC.  And for AC and TG
+     * (no common suffix) this would return a vertex with an empty sequence.
      +     *
      +     * @param middleVertices a non-empty set of vertices
      +     * @return a single vertex that contains the common suffix of all middle vertices
      +     */
      +    @Requires("!middleVertices.isEmpty()")
      +    protected static SeqVertex commonSuffix(final Collection middleVertices) {
      +        final List kmers = GraphUtils.getKmers(middleVertices);
      +        final int min = GraphUtils.minKmerLength(kmers);
      +        final int suffixLen = GraphUtils.compSuffixLen(kmers, min);
      +        final byte[] kmer = kmers.get(0);
      +        final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length);
      +        return new SeqVertex(suffix);
      +    }
      +}
      \ No newline at end of file
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java
      new file mode 100644
      index 000000000..13135ddce
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java
      @@ -0,0 +1,139 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +
      +import java.util.Arrays;
      +import java.util.HashMap;
      +import java.util.Map;
      +
      +/**
      + * A DeBruijn kmer graph
      + *
      + * User: rpoplin
      + * Date: 2/6/13
      + */
      +public final class DeBruijnGraph extends BaseGraph {
      +    /**
      +     * Create an empty DeBruijnGraph with default kmer size
      +     */
      +    public DeBruijnGraph() {
      +        super();
      +    }
      +
      +    /**
      +     * Create an empty DeBruijnGraph with kmer size
      +     * @param kmerSize kmer size, must be >= 1
      +     */
      +    public DeBruijnGraph(int kmerSize) {
      +        super(kmerSize);
      +    }
      +
      +    /**
      +     * Pull kmers out of the given long sequence and throw them on in the graph
      +     * @param sequence      byte array holding the sequence with which to build the assembly graph
      +     * @param KMER_LENGTH   the desired kmer length to use
      +     * @param isRef         if true the kmers added to the graph will have reference edges linking them
      +     */
      +    public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
      +        if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
      +        final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
      +        for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
      +            addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
      +        }
      +    }
      +
      +    /**
      +     * Add edge to assembly graph connecting the two kmers
      +     * @param kmer1 the source kmer for the edge
      +     * @param kmer2 the target kmer for the edge
      +     * @param isRef true if the added edge is a reference edge
      +     */
      +    public void addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) {
      +        if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
      +        if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
      +        if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); }
      +
      +        final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 );
      +        final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 );
      +        final BaseEdge toAdd = new BaseEdge(isRef, multiplicity);
      +
      +        addVertices(v1, v2);
      +        addOrUpdateEdge(v1, v2, toAdd);
      +    }
      +
      +    /**
      +     * Convert this kmer graph to a simple sequence graph.
      +     *
      +     * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
      +     * graph.  Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
      +     *
      +     * @return a newly allocated SequenceGraph
      +     */
      +    @Ensures({"result != null"})
      +    public SeqGraph convertToSequenceGraph() {
      +        final SeqGraph seqGraph = new SeqGraph(getKmerSize());
      +        final Map vertexMap = new HashMap();
      +
      +        // create all of the equivalent seq graph vertices
      +        for ( final DeBruijnVertex dv : vertexSet() ) {
      +            final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
      +            vertexMap.put(dv, sv);
      +            seqGraph.addVertex(sv);
      +        }
      +
      +        // walk through the nodes and connect them to their equivalent seq vertices
      +        for( final BaseEdge e : edgeSet() ) {
      +            final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
      +            final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
      +            seqGraph.addEdge(seqInV, seqOutV, e);
      +        }
      +
      +        return seqGraph;
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java
      similarity index 80%
      rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java
      rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java
      index 28c735b5c..c240949d9 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java
      @@ -44,70 +44,82 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
       
      -import org.jgrapht.graph.DefaultDirectedGraph;
      -
      -import java.io.Serializable;
      -import java.util.Comparator;
      +import com.google.java.contract.Ensures;
       
       /**
      - * Created by IntelliJ IDEA.
      - * User: ebanks
      + * simple node class for storing kmer sequences
      + *
      + * User: ebanks, mdepristo
        * Date: Mar 23, 2011
        */
      -
      -// simple edge class for connecting nodes in the graph
      -public class DeBruijnEdge {
      -
      -    private int multiplicity;
      -    private boolean isRef;
      -
      -    public DeBruijnEdge() {
      -        multiplicity = 1;
      -        isRef = false;
      +public final class DeBruijnVertex extends BaseVertex {
      +    private final static byte[][] sufficesAsByteArray = new byte[256][];
      +    static {
      +        for ( int i = 0; i < sufficesAsByteArray.length; i++ )
      +            sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)};
           }
       
      -    public DeBruijnEdge( final boolean isRef ) {
      -        multiplicity = 1;
      -        this.isRef = isRef;
      +    public DeBruijnVertex( final byte[] sequence ) {
      +        super(sequence);
           }
       
      -    public DeBruijnEdge( final boolean isRef, final int multiplicity ) {
      -        this.multiplicity = multiplicity;
      -        this.isRef = isRef;
      +    /**
      +     * For testing purposes only
+     * @param sequence the kmer sequence for this vertex, given as a String
      +     */
      +    protected DeBruijnVertex( final String sequence ) {
      +        this(sequence.getBytes());
           }
       
      -    public int getMultiplicity() {
      -        return multiplicity;
      +    /**
      +     * Get the kmer size for this DeBruijnVertex
      +     * @return integer >= 1
      +     */
      +    @Ensures("result >= 1")
      +    public int getKmer() {
      +        return sequence.length;
           }
       
      -    public void setMultiplicity( final int value ) {
      -        multiplicity = value;
      +    /**
      +     * Get the string representation of the suffix of this DeBruijnVertex
      +     * @return a non-null non-empty string
      +     */
      +    @Ensures({"result != null", "result.length() >= 1"})
      +    public String getSuffixString() {
      +        return new String(getSuffixAsArray());
           }
       
      -    public boolean isRef() {
      -        return isRef;
      +    /**
      +     * Get the suffix byte of this DeBruijnVertex
      +     *
      +     * The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT
      +     * getSuffix would return T
      +     *
      +     * @return a byte
      +     */
      +    public byte getSuffix() {
      +        return sequence[getKmer() - 1];
           }
       
      -    public void setIsRef( final boolean isRef ) {
      -        this.isRef = isRef;
      +    /**
+     * Optimized version that returns a byte[] for the single byte suffix of this vertex without allocating memory.
      +     *
      +     * Should not be modified
      +     *
      +     * @return a byte[] that contains 1 byte == getSuffix()
      +     */
      +    @Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"})
      +    private byte[] getSuffixAsArray() {
      +        return sufficesAsByteArray[getSuffix()];
           }
       
      -    // For use when comparing edges pulled from the same graph
      -    public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge ) {
      -        return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
      -    }
      -
      -    // For use when comparing edges across graphs!
      -    public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge, final DeBruijnAssemblyGraph graph2 ) {
      -        return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
      -    }
      -
      -    public static class EdgeWeightComparator implements Comparator, Serializable {
      -        @Override
      -        public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) {
      -            return edge1.multiplicity - edge2.multiplicity;
      -        }
      +    /**
      +     * {@inheritDoc}
      +     */
      +    @Override
      +    public byte[] getAdditionalSequence(boolean source) {
      +        return source ? super.getAdditionalSequence(source) : getSuffixAsArray();
           }
       }
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
      new file mode 100644
      index 000000000..30c5be190
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java
      @@ -0,0 +1,138 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Requires;
      +
      +import java.util.ArrayList;
      +import java.util.Collection;
      +import java.util.List;
      +
      +/**
      + * Utility functions used in the graphs package
      + *
      + * User: depristo
      + * Date: 3/25/13
      + * Time: 9:42 PM
      + */
      +final class GraphUtils {
      +    private GraphUtils() {}
      +
      +    /**
      +     * Compute the maximum shared prefix length of list of bytes.
      +     *
      +     * @param listOfBytes a list of bytes with at least one element
      +     * @param minLength the min. length among all byte[] in listOfBytes
      +     * @return the number of shared bytes common at the start of all bytes
      +     */
      +    @Requires({"listOfBytes.size() >= 1", "minLength >= 0"})
      +    @Ensures("result >= 0")
      +    protected static int compPrefixLen(final List<byte[]> listOfBytes, final int minLength) {
      +        for ( int i = 0; i < minLength; i++ ) {
      +            final byte b = listOfBytes.get(0)[i];
      +            for ( int j = 1; j < listOfBytes.size(); j++ ) {
      +                if ( b != listOfBytes.get(j)[i] )
      +                    return i;
      +            }
      +        }
      +
      +        return minLength;
      +    }
      +
      +    /**
      +     * Compute the maximum shared suffix length of list of bytes.
      +     *
      +     * @param listOfBytes a list of bytes with at least one element
      +     * @param minLength the min. length among all byte[] in listOfBytes
      +     * @return the number of shared bytes common at the end of all bytes
      +     */
      +    @Requires({"listOfBytes.size() >= 1", "minLength >= 0"})
      +    @Ensures("result >= 0")
      +    protected static int compSuffixLen(final List<byte[]> listOfBytes, final int minLength) {
      +        for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) {
      +            final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1];
      +            for ( int j = 1; j < listOfBytes.size(); j++ ) {
      +                if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] )
      +                    return suffixLen;
      +            }
      +        }
      +        return minLength;
      +    }
      +
      +    /**
      +     * Get the list of kmers as byte[] from the vertices in the graph
      +     *
      +     * @param vertices a collection of vertices
      +     * @return a list of their kmers in order of the iterator on vertices
      +     */
      +    protected static List<byte[]> getKmers(final Collection<SeqVertex> vertices) {
      +        final List<byte[]> kmers = new ArrayList<byte[]>(vertices.size());
      +        for ( final SeqVertex v : vertices ) {
      +            kmers.add(v.getSequence());
      +        }
      +        return kmers;
      +    }
      +
      +    /**
      +     * Get the minimum length of a collection of byte[]
      +     *
      +     * @param kmers a list of kmers whose .length min we want
      +     * @return the min of the kmers, if kmers is empty the result is 0
      +     */
      +    protected static int minKmerLength(final Collection<byte[]> kmers) {
      +        if ( kmers == null ) throw new IllegalArgumentException("kmers cannot be null");
      +
      +        if ( kmers.isEmpty() ) return 0;
      +        int min = Integer.MAX_VALUE;
      +        for ( final byte[] kmer : kmers ) {
      +            min = Math.min(min, kmer.length);
      +        }
      +        return min;
      +    }
      +
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java
      new file mode 100644
      index 000000000..466148588
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java
      @@ -0,0 +1,185 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.common.collect.MinMaxPriorityQueue;
      +import com.google.java.contract.Ensures;
      +
      +import java.io.Serializable;
      +import java.util.*;
      +
      +/**
      + * Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
      + * This is different from most graph traversals because we want to test paths from any source node to any sink node.
      + *
      + * User: ebanks, rpoplin, mdepristo
      + * Date: Mar 23, 2011
      + */
      +public class KBestPaths {
      +    private final boolean allowCycles;
      +
      +    /**
      +     * Create a new KBestPaths finder that follows cycles in the graph
      +     */
      +    public KBestPaths() {
      +        this(true);
      +    }
      +
      +    /**
      +     * Create a new KBestPaths finder
      +     *
      +     * @param allowCycles should we allow paths that follow cycles in the graph?
      +     */
      +    public KBestPaths(final boolean allowCycles) {
      +        this.allowCycles = allowCycles;
      +    }
      +
      +    protected static class MyInt { public int val = 0; }
      +
      +    /**
      +     * Compare paths such that paths with greater weight are earlier in a list
      +     */
      +    protected static class PathComparatorTotalScore implements Comparator, Serializable {
      +        @Override
      +        public int compare(final Path path1, final Path path2) {
      +            return path2.getScore() - path1.getScore();
      +        }
      +    }
      +
      +    /**
      +     * @see #getKBestPaths(BaseGraph, int) retrieving the best 1000 paths
      +     */
      +    public List> getKBestPaths( final BaseGraph graph ) {
      +        return getKBestPaths(graph, 1000);
      +    }
      +
      +    /**
      +     * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retrieving the first 1000 paths
      +     * starting from all source vertices and ending with all sink vertices
      +     */
      +    public List> getKBestPaths( final BaseGraph graph, final int k ) {
      +        return getKBestPaths(graph, k, graph.getSources(), graph.getSinks());
      +    }
      +
      +    /**
      +     * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000
      +     */
      +    public List> getKBestPaths( final BaseGraph graph, final Set sources, final Set sinks ) {
      +        return getKBestPaths(graph, 1000, sources, sinks);
      +    }
      +
      +    /**
      +     * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000
      +     */
      +    public List> getKBestPaths( final BaseGraph graph, final T source, final T sink ) {
      +        return getKBestPaths(graph, 1000, source, sink);
      +    }
      +
      +    /**
      +     * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets
      +     */
      +    public List> getKBestPaths( final BaseGraph graph, final int k, final T source, final T sink ) {
      +        return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink));
      +    }
      +
      +        /**
      +         * Traverse the graph and pull out the best k paths.
      +         * Paths are scored via their comparator function. The default being PathComparatorTotalScore()
      +         * @param graph the graph from which to pull paths
      +         * @param k     the number of paths to find
      +         * @param sources a set of vertices we want to start paths with
      +         * @param sinks   a set of vertices we want to end paths with
      +         * @return      a list with at most k top-scoring paths from the graph
      +         */
      +    @Ensures({"result != null", "result.size() <= k"})
      +    public List> getKBestPaths( final BaseGraph graph, final int k, final Set sources, final Set sinks ) {
      +        if( graph == null ) { throw  new IllegalArgumentException("Attempting to traverse a null graph."); }
      +
      +        // a min max queue that will collect the best k paths
      +        final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create();
      +
      +        // run a DFS for best paths
      +        for ( final T source : sources ) {
      +            final Path startingPath = new Path(source, graph);
      +            findBestPaths(startingPath, sinks, bestPaths, new MyInt());
      +        }
      +
      +        // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result
      +        final List> toReturn = new ArrayList>(bestPaths);
      +        Collections.sort(toReturn, new PathComparatorTotalScore());
      +        return toReturn;
      +    }
      +
      +    /**
      +     * Recursive algorithm to find the K best paths in the graph from the current path to any of the sinks
      +     * @param path the current path progress
      +     * @param sinks a set of nodes that are sinks.  Will terminate and add a path if the last vertex of path is in this set
      +     * @param bestPaths a collection used to gather completed paths.
      +     * @param n used to limit the search by tracking the number of vertices visited across all paths
      +     */
      +    private void findBestPaths( final Path path, final Set sinks, final Collection> bestPaths, final MyInt n ) {
      +        if ( sinks.contains(path.getLastVertex())) {
      +            bestPaths.add(path);
      +        } else if( n.val > 10000 ) {
      +            // do nothing, just return, as we've done too much work already
      +        } else {
      +            // recursively run DFS
      +            final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex());
      +            Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator());
      +            for ( final BaseEdge edge : edgeArrayList ) {
      +                final T target = path.getGraph().getEdgeTarget(edge);
      +                // make sure the edge is not already in the path
      +                final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target);
      +                if ( ! alreadyVisited ) {
      +                    final Path newPath = new Path(path, edge);
      +                    n.val++;
      +                    findBestPaths(newPath, sinks, bestPaths, n);
      +                }
      +            }
      +        }
      +    }
      +}
      \ No newline at end of file
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
      new file mode 100644
      index 000000000..47676a498
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java
      @@ -0,0 +1,445 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Requires;
      +import net.sf.samtools.Cigar;
      +import net.sf.samtools.CigarElement;
      +import net.sf.samtools.CigarOperator;
      +import org.apache.commons.lang.ArrayUtils;
      +import org.broadinstitute.sting.utils.smithwaterman.Parameters;
      +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
      +import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      +
      +import java.util.*;
      +
      +/**
      + * A path through a BaseGraph
      + *
      + * class to keep track of paths
      + *
      + * User: depristo
      + * Date: 3/19/13
      + * Time: 2:34 PM
      + *
      + */
      +public class Path {
      +    private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20;
      +
      +    // the last vertex seen in the path
      +    private final T lastVertex;
      +
      +    // the list of edges comprising the path
      +    private Set edgesAsSet = null;
      +    private final LinkedList edgesInOrder;
      +
      +    // the scores for the path
      +    private final int totalScore;
      +
      +    // the graph from which this path originated
      +    private final BaseGraph graph;
      +
      +    // used in the bubble state machine to apply Smith-Waterman to the bubble sequence
      +    // these values were chosen via optimization against the NA12878 knowledge base
      +    public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1);
      +
      +    private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes();
      +
      +    /**
      +     * Create a new Path containing no edges and starting at initialVertex
      +     * @param initialVertex the starting vertex of the path
      +     * @param graph the graph this path with follow through
      +     */
      +    public Path(final T initialVertex, final BaseGraph graph) {
      +        if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null");
      +        if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
      +        if ( ! graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph);
      +
      +        lastVertex = initialVertex;
      +        edgesInOrder = new LinkedList();
      +        totalScore = 0;
      +        this.graph = graph;
      +    }
      +
      +    /**
      +     * Convenience constructor for testing that creates a path through vertices in graph
      +     */
      +    protected static  Path makePath(final List vertices, final BaseGraph graph) {
      +        Path path = new Path(vertices.get(0), graph);
      +        for ( int i = 1; i < vertices.size(); i++ )
      +            path = new Path(path, graph.getEdge(path.lastVertex, vertices.get(i)));
      +        return path;
      +    }
      +
      +    /**
      +     * Create a new Path extending p with edge
      +     *
      +     * @param p the path to extend
      +     * @param edge the edge to extend path by
      +     */
      +    public Path(final Path p, final BaseEdge edge) {
      +        if ( p == null ) throw new IllegalArgumentException("Path cannot be null");
      +        if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null");
      +        if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't");
      +        if ( ! p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); }
      +
      +        graph = p.graph;
      +        lastVertex = p.graph.getEdgeTarget(edge);
      +        edgesInOrder = new LinkedList(p.getEdges());
      +        edgesInOrder.add(edge);
      +        totalScore = p.totalScore + edge.getMultiplicity();
      +    }
      +
      +    /**
      +     * Get the collection of edges leaving the last vertex of this path
      +     * @return a non-null collection
      +     */
      +    public Collection getOutgoingEdgesOfLastVertex() {
      +        return getGraph().outgoingEdgesOf(getLastVertex());
      +    }
      +
      +    /**
      +     * Does this path contain the given edge
      +     * @param edge  the given edge to test
      +     * @return      true if the edge is found in this path
      +     */
      +    public boolean containsEdge( final BaseEdge edge ) {
      +        if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
      +        if ( edgesInOrder.isEmpty() ) return false;
      +
      +        // initialize contains cache if necessary
      +        if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder);
      +        return edgesAsSet.contains(edge);
      +    }
      +
      +    /**
      +     * Does this path contain the given vertex?
      +     *
      +     * @param v a non-null vertex
      +     * @return true if v occurs within this path, false otherwise
      +     */
      +    public boolean containsVertex(final T v) {
      +        if ( v == null ) throw new IllegalArgumentException("Vertex cannot be null");
      +
      +        // TODO -- warning this is expensive.  Need to do vertex caching
      +        return getVertices().contains(v);
      +    }
      +
      +    /**
      +     * Check that two paths have the same edges and total score
      +     * @param path the other path we might be the same as
      +     * @return true if this and path are the same
      +     */
      +    protected boolean pathsAreTheSame(Path path) {
      +        return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder);
      +    }
      +
      +    @Override
      +    public String toString() {
      +        final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path=");
      +        boolean first = true;
      +        for ( final T v : getVertices() ) {
      +            if ( first ) {
      +                first = false;
      +            } else {
      +                b.append(" -> ");
      +            }
      +            b.append(v.getSequenceString());
      +        }
      +        return b.toString();
      +    }
      +
      +    /**
      +     * Get the graph of this path
      +     * @return a non-null graph
      +     */
      +    @Ensures("result != null")
      +    public BaseGraph getGraph() {
      +        return graph;
      +    }
      +
      +    /**
      +     * Get the edges of this path in order
      +     * @return a non-null list of edges
      +     */
      +    @Ensures("result != null")
      +    public List getEdges() { return edgesInOrder; }
      +
      +    /**
      +     * Get the list of vertices in this path in order defined by the edges of the path
      +     * @return a non-null, non-empty list of vertices
      +     */
      +    @Ensures({"result != null", "!result.isEmpty()"})
      +    public List getVertices() {
      +        if ( getEdges().isEmpty() )
      +            return Collections.singletonList(lastVertex);
      +        else {
      +            final LinkedList vertices = new LinkedList();
      +            boolean first = true;
      +            for ( final BaseEdge e : getEdges() ) {
      +                if ( first ) {
      +                    vertices.add(graph.getEdgeSource(e));
      +                    first = false;
      +                }
      +                vertices.add(graph.getEdgeTarget(e));
      +            }
      +            return vertices;
      +        }
      +    }
      +
      +    /**
      +     * Get the total score of this path (bigger is better)
      +     * @return a positive integer
      +     */
      +    @Ensures("result >= 0")
      +    public int getScore() { return totalScore; }
      +
      +    /**
      +     * Get the final vertex of the path
      +     * @return a non-null vertex
      +     */
      +    @Ensures("result != null")
      +    public T getLastVertex() { return lastVertex; }
      +
      +    /**
      +     * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes
      +     * @return  non-null sequence of bases corresponding to this path
      +     */
      +    @Ensures({"result != null"})
      +    public byte[] getBases() {
      +        if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); }
      +
      +        byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst()));
      +        for( final BaseEdge e : edgesInOrder ) {
      +            bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
      +        }
      +        return bases;
      +    }
      +
      +    /**
      +     * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble
      +     * @return  non-null Cigar string with reference length equal to the refHaplotype's reference length
      +     */
      +    @Ensures("result != null")
      +    public Cigar calculateCigar() {
      +        final Cigar cigar = new Cigar();
      +        // special case for paths that start on reference but not at the reference source node
      +        if( edgesInOrder.getFirst().isRef() && !graph.isRefSource(edgesInOrder.getFirst()) ) {
      +            for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edgesInOrder.getFirst())).getCigarElements() ) {
      +                cigar.add(ce);
      +            }
      +        }
      +
      +        // reset the bubble state machine
      +        final BubbleStateMachine bsm = new BubbleStateMachine(cigar);
      +
      +        for( final BaseEdge e : getEdges() ) {
      +            if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) {
      +                advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
      +            }
      +            advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e );
      +        }
      +
      +        // special case for paths that don't end on reference
      +        if( bsm.inBubble ) {
      +            for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) {
      +                bsm.cigar.add(ce);
      +            }
      +        } else if( edgesInOrder.getLast().isRef() && !graph.isRefSink(edgesInOrder.getLast()) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit
      +            for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edgesInOrder.getLast()), null).getCigarElements() ) {
      +                bsm.cigar.add(ce);
      +            }
      +        }
      +
      +        return AlignmentUtils.consolidateCigar(bsm.cigar);
      +    }
      +
      +    /**
      +     * Advance the bubble state machine by incorporating the next node in the path.
      +     * @param bsm   the current bubble state machine
      +     * @param node  the node to be incorporated
      +     * @param e     the edge which generated this node in the path
      +     */
      +    @Requires({"bsm != null", "graph != null", "node != null"})
      +    private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) {
      +        if( graph.isReferenceNode( node ) ) {
      +            if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else
      +                if( e !=null && !e.isRef() ) {
      +                    if( graph.referencePathExists( graph.getEdgeSource(e), node) ) {
      +                        for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) {
      +                            bsm.cigar.add(ce);
      +                        }
      +                        bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
      +                    } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself
      +                        bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) );
      +                    } else {
      +                        bsm.inBubble = true;
      +                        bsm.bubbleBytes = null;
      +                        bsm.lastSeenReferenceNode = graph.getEdgeSource(e);
      +                        bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      +                    }
      +                } else {
      +                    bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
      +                }
      +            } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path
      +                bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      +            } else { // close the bubble and use a local SW to determine the Cigar string
      +                for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) {
      +                    bsm.cigar.add(ce);
      +                }
      +                bsm.inBubble = false;
      +                bsm.bubbleBytes = null;
      +                bsm.lastSeenReferenceNode = null;
      +                bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
      +            }
      +        } else { // non-ref vertex
      +            if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path
      +                bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      +            } else { // open up a bubble
      +                bsm.inBubble = true;
      +                bsm.bubbleBytes = null;
      +                bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null );
      +                bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
      +            }
      +        }
      +    }
      +
      +    /**
      +     * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble
      +     * @param bubbleBytes   the bytes that comprise the alternate allele path in this bubble
      +     * @param fromVertex    the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex)
      +     * @param toVertex      the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex)
      +     * @return              the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble
      +     */
      +    @Requires({"graph != null"})
      +    @Ensures({"result != null"})
      +    private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) {
      +        final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null);
      +
      +        final Cigar returnCigar = new Cigar();
      +
      +        // add padding to anchor ref/alt bases in the SW matrix
      +        byte[] padding = STARTING_SW_ANCHOR_BYTES;
      +        boolean goodAlignment = false;
      +        SWPairwiseAlignment swConsensus = null;
      +        while( !goodAlignment && padding.length < 1000 ) {
      +            padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time
      +            final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding );
      +            final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding );
      +            swConsensus = new SWPairwiseAlignment( reference, alternate, NEW_SW_PARAMETERS );
      +            if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) {
      +                goodAlignment = true;
      +            }
      +        }
      +        if( !goodAlignment ) {
      +            returnCigar.add(new CigarElement(1, CigarOperator.N));
      +            return returnCigar;
      +        }
      +
      +        final Cigar swCigar = swConsensus.getCigar();
      +        if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference
      +            returnCigar.add(new CigarElement(1, CigarOperator.N));
      +        } else {
      +            for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
      +                // now we need to remove the padding from the cigar string
      +                int length = swCigar.getCigarElement(iii).getLength();
      +                if( iii == 0 ) { length -= padding.length; }
      +                if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; }
      +                if( length > 0 ) {
      +                    returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator()));
      +                }
      +            }
      +            if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) {
      +                throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar());
      +            }
      +        }
      +
      +        return returnCigar;
      +    }
      +
      +    // class to keep track of the bubble state machine
      +    private static class BubbleStateMachine {
      +        public boolean inBubble = false;
      +        public byte[] bubbleBytes = null;
      +        public T lastSeenReferenceNode = null;
      +        public Cigar cigar = null;
      +
      +        public BubbleStateMachine( final Cigar initialCigar ) {
      +            inBubble = false;
      +            bubbleBytes = null;
      +            lastSeenReferenceNode = null;
      +            cigar = initialCigar;
      +        }
      +    }
      +
      +    /**
      +     * Tests that this and other have the same score and vertices in the same order with the same seq
      +     * @param other the other path to consider.  Cannot be null
      +     * @return true if this and path are equal, false otherwise
      +     */
      +    public boolean equalScoreAndSequence(final Path other) {
      +        if ( other == null ) throw new IllegalArgumentException("other cannot be null");
      +        return getScore() == other.getScore() && equalSequence(other);
      +    }
      +
      +    /**
      +     * Tests that this and other have the same vertices in the same order with the same seq
      +     * @param other the other path to consider.  Cannot be null
      +     * @return true if this and path are equal, false otherwise
      +     */
      +    public boolean equalSequence(final Path other) {
      +        final List mine = getVertices();
      +        final List yours = other.getVertices();
      +        if ( mine.size() == yours.size() ) { // hehehe
      +            for ( int i = 0; i < mine.size(); i++ )
      +                if ( ! mine.get(i).seqEquals(yours.get(i)) )
      +                    return false;
      +        }
      +        return true;
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java
      new file mode 100644
      index 000000000..bb4b26257
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java
      @@ -0,0 +1,544 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Requires;
      +
      +import java.io.File;
      +import java.util.HashSet;
      +import java.util.LinkedList;
      +import java.util.List;
      +import java.util.Set;
      +
      +/**
      + * A graph that contains base sequence at each node
      + *
      + * @author: depristo
      + * @since 03/2013
      + */
      +public final class SeqGraph extends BaseGraph {
      +    private final static boolean PRINT_SIMPLIFY_GRAPHS = false;
      +
      +    /**
      +     * The minimum number of common bp from the prefix (head merging) or suffix (tail merging)
      +     * required before we'll merge in such configurations.  A large value here is critical to avoid
      +     * merging inappropriate head or tail nodes, which introduces large insertion / deletion events
      +     * as the merge operation creates a link among the non-linked sink / source vertices
      +     */
      +    protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10;
      +
      +    /**
      +     * How many cycles of the graph simplifications algorithms will we run before
      +     * thinking something has gone wrong and throw an exception?
      +     */
      +    private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100;
      +
      +    /**
      +     * Construct an empty SeqGraph
      +     */
      +    public SeqGraph() {
      +        super();
      +    }
      +
      +    /**
      +     * Construct an empty SeqGraph where we'll add nodes based on a kmer size of kmer
      +     *
      +     * The kmer size is purely information.  It is useful when converting a Debruijn graph -> SeqGraph
      +     * for us to track the kmer used to make the transformation.
      +     *
      +     * @param kmer kmer
      +     */
      +    public SeqGraph(final int kmer) {
      +        super(kmer);
      +    }
      +
      +    /**
      +     * Simplify this graph, merging vertices together and restructuring the graph in an
      +     * effort to minimize the number of overall vertices in the graph without changing
      +     * in any way the sequences implied by a complex enumeration of all paths through the graph.
      +     */
      +    public void simplifyGraph() {
      +        simplifyGraph(Integer.MAX_VALUE);
      +    }
      +
      +    protected void simplifyGraph(final int maxCycles) {
      +        // start off with one round of zipping of chains for performance reasons
      +        zipLinearChains();
      +
      +        SeqGraph prevGraph = null;
      +        for( int i = 0; i < maxCycles; i++ ) {
      +            if ( i > MAX_REASONABLE_SIMPLIFICATION_CYCLES ) {
      +                logger.warn("Infinite loop detected in simpliciation routines.  Writing current graph to debugMeMark.dot");
      +                printGraph(new File("debugMeMark.dot"), 0);
      +                throw new IllegalStateException("Infinite loop detected in simplification routines for kmer graph " + getKmerSize());
      +            }
      +
      +            final boolean didSomeWork = simplifyGraphOnce(i);
      +            if ( ! didSomeWork )
      +                // no simplification algorithm could run, so stop
      +                break;
      +
      +            // we get five cycles before we start looking for changes in the graph
      +            // by cloning ourselves and then checking for any changes
      +            if ( i > 5 ) {
      +                // the previous graph and this graph have the same structure, so the simplification
      +                // algorithms are looping endless between states.  Just break and consider ourselves done
      +                if ( prevGraph != null && graphEquals(prevGraph, this) )
      +                    break;
      +
      +                prevGraph = (SeqGraph)clone();
      +            }
      +        }
      +    }
      +
      +    /**
      +     * Run one full cycle of the graph simplification algorithms
      +     * @return true if any algorithms said they did some simplification
      +     */
      +    private boolean simplifyGraphOnce(final int iteration) {
      +        //logger.info("simplifyGraph iteration " + i);
      +        // iterate until we haven't don't anything useful
      +        boolean didSomeWork = false;
      +        if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0);
      +        didSomeWork |= new MergeDiamonds().transformUntilComplete();
      +        didSomeWork |= new MergeTails().transformUntilComplete();
      +        if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0);
      +
      +        didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
      +        if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0);
      +        didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
      +        if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
      +
      +        didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
      +        didSomeWork |= zipLinearChains();
      +        return didSomeWork;
      +    }
      +
      +    /**
      +     * Zip up all of the simple linear chains present in this graph.
      +     *
      +     * Merges together all pairs of vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence
      +     *
      +     * Only works on vertices where v1's only outgoing edge is to v2 and v2's only incoming edge is from v1.
      +     *
      +     * If such a pair of vertices is found, they are merged and the graph is update.  Otherwise nothing is changed.
      +     *
      +     * @return true if any such pair of vertices could be found, false otherwise
      +     */
      +    public boolean zipLinearChains() {
      +        // create the list of start sites [doesn't modify graph yet]
      +        final List zipStarts = new LinkedList();
      +        for ( final SeqVertex source : vertexSet() ) {
      +            if ( isLinearChainStart(source) )
      +                zipStarts.add(source);
      +        }
      +
      +        if ( zipStarts.isEmpty() ) // nothing to do, as nothing could start a chain
      +            return false;
      +
      +        // At this point, zipStarts contains all of the vertices in this graph that might start some linear
      +        // chain of vertices.  We walk through each start, building up the linear chain of vertices and then
      +        // zipping them up with mergeLinearChain, if possible
      +        boolean mergedOne = false;
      +        for ( final SeqVertex zipStart : zipStarts ) {
      +            final LinkedList linearChain = traceLinearChain(zipStart);
      +
      +            // merge the linearized chain, recording if we actually did some useful work
      +            mergedOne |= mergeLinearChain(linearChain);
      +        }
      +
      +        return mergedOne;
      +    }
      +
      +    /**
      +     * Is source vertex potentially a start of a linear chain of vertices?
      +     *
      +     * We are a start of a zip chain if our out degree is 1 and either the
      +     * the vertex has no incoming connections or 2 or more (we must start a chain) or
      +     * we have exactly one incoming vertex and that one has out-degree > 1 (i.e., source's incoming
      +     * vertex couldn't be a start itself
      +     *
      +     * @param source a non-null vertex
      +     * @return true if source might start a linear chain
      +     */
      +    @Requires("source != null")
      +    private boolean isLinearChainStart(final SeqVertex source) {
      +        return outDegreeOf(source) == 1
      +                && ( inDegreeOf(source) != 1
      +                     || outDegreeOf(incomingVerticesOf(source).iterator().next()) > 1 );
      +    }
      +
      +    /**
      +     * Get all of the vertices in a linear chain of vertices starting at zipStart
      +     *
      +     * Build a list of vertices (in order) starting from zipStart such that each sequential pair of vertices
      +     * in the chain A and B can be zipped together.
      +     *
      +     * @param zipStart a vertex that starts a linear chain
      +     * @return a list of vertices that comprise a linear chain starting with zipStart.  The resulting
      +     *         list will always contain at least zipStart as the first element.
      +     */
      +    @Requires("isLinearChainStart(zipStart)")
      +    @Ensures({"result != null", "result.size() >= 1"})
      +    private LinkedList traceLinearChain(final SeqVertex zipStart) {
      +        final LinkedList linearChain = new LinkedList();
      +        linearChain.add(zipStart);
      +
      +        boolean lastIsRef = isReferenceNode(zipStart); // remember because this calculation is expensive
      +        SeqVertex last = zipStart;
      +        while (true) {
      +            if ( outDegreeOf(last) != 1 )
      +                // cannot extend a chain from last if last has multiple outgoing branches
      +                break;
      +
      +            // there can only be one (outgoing edge of last) by contract
      +            final SeqVertex target = getEdgeTarget(outgoingEdgeOf(last));
      +
      +            if ( inDegreeOf(target) != 1 || last.equals(target) )
      +                // cannot zip up a target that has multiple incoming nodes or that's a cycle to the last node
      +                break;
      +
      +            final boolean targetIsRef = isReferenceNode(target);
      +            if ( lastIsRef != targetIsRef ) // both our isRef states must be equal
      +                break;
      +
      +            linearChain.add(target); // extend our chain by one
      +
      +            // update our last state to be the current state, and continue
      +            last = target;
      +            lastIsRef = targetIsRef;
      +        }
      +
      +        return linearChain;
      +    }
      +
      +    /**
      +     * Merge a linear chain of vertices into a single combined vertex, and update this graph to such that
      +     * the incoming edges into the first element of the linearChain and the outgoing edges from linearChain.getLast()
      +     * all point to this new combined vertex.
      +     *
      +     * @param linearChain a non-empty chain of vertices that can be zipped up into a single vertex
      +     * @return true if we actually merged at least two vertices together
      +     */
      +    protected boolean mergeLinearChain(final LinkedList linearChain) {
      +        if ( linearChain.isEmpty() ) throw new IllegalArgumentException("BUG: cannot have linear chain with 0 elements but got " + linearChain);
      +
      +        final SeqVertex first = linearChain.getFirst();
      +        final SeqVertex last = linearChain.getLast();
      +
      +        if ( first == last ) return false; // only one element in the chain, cannot be extended
      +
      +        // create the combined vertex, and add it to the graph
      +        // TODO -- performance problem -- can be optimized if we want
      +        final List seqs = new LinkedList();
      +        for ( SeqVertex v : linearChain ) seqs.add(v.getSequence());
      +        final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{}));
      +        final SeqVertex addedVertex = new SeqVertex( seqsCat );
      +        addVertex(addedVertex);
      +
      +        final Set inEdges = incomingEdgesOf(first);
      +        final Set outEdges = outgoingEdgesOf(last);
      +
      +        final int nEdges = inEdges.size() + outEdges.size();
      +        int sharedWeightAmongEdges = nEdges == 0 ? 0 : sumEdgeWeightAlongChain(linearChain) / nEdges;
      +        final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy
      +
      +        // update the incoming and outgoing edges to point to the new vertex
      +        for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge).add(inc)); }
      +        for( final BaseEdge edge : inEdges )  { addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge).add(inc)); }
      +
      +        removeAllVertices(linearChain);
      +        return true;
      +    }
      +
      +    /**
      +     * Get the sum of the edge weights on a linear chain of at least 2 elements
      +     *
      +     * @param chain a linear chain of vertices with at least 2 vertices
      +     * @return the sum of the multiplicities along all edges connecting vertices within the chain
      +     */
      +    @Requires({"chain != null", "chain.size() >= 2"})
      +    private int sumEdgeWeightAlongChain(final LinkedList chain) {
      +        int sum = 0;
      +        SeqVertex prev = null;
      +
      +        for ( final SeqVertex v : chain ) {
      +            if ( prev != null ) {
      +                final BaseEdge e = getEdge(prev, v);
      +                if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v);
      +                sum += e.getMultiplicity();
      +            }
      +            prev = v;
      +        }
      +
      +        return sum;
      +    }
      +
      +    /**
      +     * Base class for transformation operations that need to iterate over proposed vertices, where
      +     * each proposed vertex is a seed vertex for a potential transformation.
      +     *
      +     * transformUntilComplete will iteratively apply the tryToTransform function on each vertex in the graph
      +     * until no vertex can be found that can be transformed.
      +     *
      +     * Note that in order to eventually terminate tryToTransform must transform the graph such that eventually
      +     * no vertices are candidates for further transformations.
      +     */
      +    private abstract class VertexBasedTransformer {
      +        /**
      +         * For testing purposes we sometimes want to test that can be transformed capabilities are working
      +         * without actually modifying the graph */
      +        private boolean dontModifyGraphEvenIfPossible = false;
      +
      +        public boolean dontModifyGraphEvenIfPossible() { return dontModifyGraphEvenIfPossible; }
      +        public void setDontModifyGraphEvenIfPossible() { this.dontModifyGraphEvenIfPossible = true; }
      +
      +        /**
      +         * Merge until the graph has no vertices that are candidates for merging
      +         */
      +        public boolean transformUntilComplete() {
      +            boolean didAtLeastOneTranform = false;
      +            boolean foundNodesToMerge = true;
      +            while( foundNodesToMerge ) {
      +                foundNodesToMerge = false;
      +
      +                for( final SeqVertex v : vertexSet() ) {
      +                    foundNodesToMerge = tryToTransform(v);
      +                    if ( foundNodesToMerge ) {
      +                        didAtLeastOneTranform = true;
      +                        break;
      +                    }
      +                }
      +            }
      +
      +            return didAtLeastOneTranform;
      +        }
      +
      +        /**
      +         * Merge, if possible, seeded on the vertex v
      +         * @param v the proposed seed vertex to merge
      +         * @return true if some useful merging happened, false otherwise
      +         */
      +        abstract boolean tryToTransform(final SeqVertex v);
      +    }
      +
      +    /**
      +     * Merge diamond configurations:
      +     *
      +     * Performance the transformation:
      +     *
      +     * { A -> x + S_i + y -> Z }
      +     *
      +     * goes to:
      +     *
      +     * { A -> x -> S_i -> y -> Z }
      +     *
      +     * for all nodes that match this configuration.
      +     */
      +    protected class MergeDiamonds extends VertexBasedTransformer {
      +        @Override
      +        protected boolean tryToTransform(final SeqVertex top) {
      +            final Set middles = outgoingVerticesOf(top);
      +            if ( middles.size() <= 1 )
      +                // we can only merge if there's at least two middle nodes
      +                return false;
      +
      +            SeqVertex bottom = null;
      +            for ( final SeqVertex mi : middles ) {
      +                // all nodes must have at least 1 connection
      +                if ( outDegreeOf(mi) < 1 )
      +                    return false;
      +
      +                // can only have 1 incoming node, the root vertex
      +                if ( inDegreeOf(mi) != 1 )
      +                    return false;
      +
      +                // make sure that all outgoing vertices of mi go only to the bottom node
      +                for ( final SeqVertex mt : outgoingVerticesOf(mi) ) {
      +                    if ( bottom == null )
      +                        bottom = mt;
      +                    else if ( ! bottom.equals(mt) )
      +                        return false;
      +                }
      +            }
      +
      +            // bottom has some connections coming in from other nodes, don't allow
      +            if ( inDegreeOf(bottom) != middles.size() )
      +                return false;
      +
      +            if ( dontModifyGraphEvenIfPossible() ) return true;
      +
      +            // actually do the merging, returning true if at least 1 base was successfully split
      +            final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles);
      +            if (splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(1))
      +                return splitter.splitAndUpdate(top, bottom);
      +            else
      +                return false;
      +        }
      +    }
      +
      +    /**
      +     * Merge tail configurations:
      +     *
      +     * Performs the transformation:
      +     *
      +     * { A -> x + S_i + y }
      +     *
      +     * goes to:
      +     *
      +     * { A -> x -> S_i -> y }
      +     *
      +     * for all nodes that match this configuration.
      +     *
      +     * Differs from the diamond transform in that no bottom node is required
      +     */
      +    protected class MergeTails extends VertexBasedTransformer {
      +        @Override
      +        protected boolean tryToTransform(final SeqVertex top) {
      +            final Set tails = outgoingVerticesOf(top);
      +            if ( tails.size() <= 1 )
      +                return false;
      +
      +            for ( final SeqVertex t : tails )
      +                if ( ! isSink(t) || inDegreeOf(t) > 1 )
      +                    return false;
      +
      +            if ( dontModifyGraphEvenIfPossible() ) return true;
      +
      +            final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails);
      +
      +            if (splitter.meetsMinMergableSequenceForSuffix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES))
      +                return splitter.splitAndUpdate(top, null);
      +            else
      +                return false;
      +        }
      +    }
      +
      +    /**
      +     * Merge headless configurations:
      +     *
      +     * Performs the transformation:
      +     *
      +     * { x + S_i -> y -> Z }
      +     *
      +     * goes to:
      +     *
      +     * { x -> S_i -> y + Z }
      +     *
      +     * for all nodes that match this configuration.
      +     */
      +    protected class MergeCommonSuffices extends VertexBasedTransformer {
      +        @Override
      +        boolean tryToTransform(final SeqVertex bottom) {
      +            return new SharedSequenceMerger().merge(SeqGraph.this, bottom);
      +        }
      +    }
      +
      +    /**
      +     * Performs the transformation:
      +     *
      +     * { x + S_i + y -> Z }
      +     *
      +     * goes to:
      +     *
      +     * { x -> S_i -> y -> Z }
      +     *
      +     * for all nodes that match this configuration.
      +     *
      +     * Differs from the diamond transform in that no top node is required
      +     */
      +    protected class SplitCommonSuffices extends VertexBasedTransformer {
      +        final Set alreadySplit = new HashSet();
      +
      +        @Override
      +        boolean tryToTransform(final SeqVertex bottom) {
      +            if ( alreadySplit.contains(bottom) )
      +                return false;
      +            else {
      +                alreadySplit.add(bottom);
      +                return new CommonSuffixSplitter().split(SeqGraph.this, bottom);
      +            }
      +        }
      +    }
      +
      +    /**
      +     * Merge headless configurations:
      +     *
      +     * Performs the transformation:
      +     *
      +     * { x + S_i + y -> Z }
      +     *
      +     * goes to:
      +     *
      +     * { x -> S_i -> y -> Z }
      +     *
      +     * for all nodes that match this configuration.
      +     *
      +     * Differs from the diamond transform in that no top node is required
      +     */
      +    protected class MergeHeadlessIncomingSources extends VertexBasedTransformer {
      +        @Override
      +        boolean tryToTransform(final SeqVertex bottom) {
      +            final Set incoming = incomingVerticesOf(bottom);
      +            if ( incoming.size() <= 1 )
      +                return false;
      +
      +            for ( final SeqVertex inc : incoming )
      +                if ( ! isSource(inc) || outDegreeOf(inc) > 1 )
      +                    return false;
      +
      +            if ( dontModifyGraphEvenIfPossible() ) return true;
      +
      +            final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming);
      +            if (splitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES))
      +                return splitter.splitAndUpdate(null, bottom);
      +            else
      +                return false;
      +        }
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java
      new file mode 100644
      index 000000000..f192b54aa
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java
      @@ -0,0 +1,167 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Requires;
      +import org.broadinstitute.sting.utils.Utils;
      +import java.util.Arrays;
      +
      +/**
      + * A graph vertex containing a sequence of bases and a unique ID that
      + * allows multiple distinct nodes in the graph to have the same sequence.
      + *
      + * This is essential when thinking about representing the actual sequence of a haplotype
      + * in a graph.  There can be many parts of the sequence that have the same sequence, but
      + * are distinct elements in the graph because they have a different position in the graph.  For example:
      + *
      + * A -> C -> G -> A -> T
      + *
      + * The two As are not the same, because they occur with different connections.  In a kmer graph equals()
      + * is based on the sequence itself, as each distinct kmer can only be represented once.  But the transformation
      + * of the kmer graph into a graph of base sequences, without their kmer prefixes, means that nodes that
      + * where once unique including their prefix can become equal after shedding the prefix.  So we need to
      + * use some mechanism -- here a unique ID per node -- to separate nodes that have the same sequence
      + * but are distinct elements of the graph.
      + *
      + * @author depristo
      + * @since 03/2013
      + */
      +public final class SeqVertex extends BaseVertex {
      +    private static int idCounter = 0;
      +    public final int id;
      +
      +    /**
      +     * Create a new SeqVertex with sequence and the next available id
      +     * @param sequence our base sequence
      +     */
      +    public SeqVertex(final byte[] sequence) {
      +        super(sequence);
      +        this.id = idCounter++;
      +    }
      +
      +    /**
      +     * Create a new SeqVertex having bases of sequence.getBytes()
      +     * @param sequence the string representation of our bases
      +     */
      +    public SeqVertex(final String sequence) {
      +        super(sequence);
      +        this.id = idCounter++;
      +    }
      +
      +    /**
      +     * Create a copy of toCopy
      +     * @param toCopy a SeqVertex to copy into this newly allocated one
      +     */
      +    public SeqVertex(final SeqVertex toCopy) {
      +        super(toCopy.sequence);
      +        this.id = toCopy.id;
      +    }
      +
      +    /**
      +     * Get the unique ID for this SeqVertex
      +     * @return a non-negative integer >= 0
      +     */
      +    public int getId() {
      +        return id;
      +    }
      +
      +    @Override
      +    public String toString() {
      +        return "SeqVertex_id_" + id + "_seq_" + getSequenceString();
      +    }
      +
      +    /**
      +     * Two SeqVertex are equal only if their ids are equal
      +     * @param o the object to compare against
      +     * @return true if o is a SeqVertex with the same id as this one
      +     */
      +    @Override
      +    public boolean equals(Object o) {
      +        if (this == o) return true;
      +        if (o == null || getClass() != o.getClass()) return false;
      +
      +        SeqVertex seqVertex = (SeqVertex) o;
      +        if (id != seqVertex.id) return false;
      +
      +        // note that we don't test for super equality here because the ids are unique
      +        //if (!super.equals(o)) return false;
      +
      +        return true;
      +    }
      +
      +    @Override
      +    public int hashCode() {
      +        return id;
      +    }
      +
      +    /**
      +     * Return a new SeqVertex derived from this one but not including the suffix bases
      +     *
      +     * @param suffix the suffix bases to remove from this vertex
      +     * @return a newly allocated SeqVertex with appropriate prefix, or null if suffix removes all bases from this node
      +     */
      +    @Requires("Utils.endsWith(sequence, suffix)")
      +    public SeqVertex withoutSuffix(final byte[] suffix) {
      +        final int prefixSize = sequence.length - suffix.length;
      +        return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null;
      +    }
      +
      +    /**
      +     * Return a new SeqVertex derived from this one but not including prefix or suffix bases
      +     *
      +     * @param prefix the prefix bases to remove
      +     * @param suffix the suffix bases to remove from this vertex
      +     * @return a newly allocated SeqVertex
      +     */
      +    @Requires("Utils.endsWith(sequence, suffix)")
      +    public SeqVertex withoutPrefixAndSuffix(final byte[] prefix, final byte[] suffix) {
      +        final int start = prefix.length;
      +        final int length = sequence.length - suffix.length - prefix.length;
      +        final int stop = start + length;
      +        return length > 0 ? new SeqVertex(Arrays.copyOfRange(sequence, start, stop)) : null;
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java
      new file mode 100644
      index 000000000..1c53f2332
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java
      @@ -0,0 +1,138 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.apache.commons.lang.ArrayUtils;
      +
      +import java.util.*;
      +
      +/**
      + * Merges the incoming vertices of a vertex V of a graph
      + *
      + * Looks at the vertices that are incoming to V (i.e., have an outgoing edge connecting to V).  If
      + * they all have the same sequence, merges them into the sequence of V, and updates the graph
      + * as appropriate
      + *
      + * User: depristo
      + * Date: 3/22/13
      + * Time: 8:31 AM
      + */
      +public class SharedSequenceMerger {
      +    public SharedSequenceMerger() { }
      +
      +    /**
      +     * Attempt to merge the incoming vertices of v
      +     *
      +     * @param graph the graph containing the vertex v
      +     * @param v the vertex whose incoming vertices we want to merge
      +     * @return true if some useful merging was done, false otherwise
      +     */
      +    public boolean merge(final SeqGraph graph, final SeqVertex v) {
      +        if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
      +        if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v);
      +
      +        final Set prevs = graph.incomingVerticesOf(v);
      +        if ( ! canMerge(graph, v, prevs) )
      +            return false;
      +        else {
      +//            graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0);
      +
      +            final List edgesToRemove = new LinkedList();
      +            final byte[] prevSeq = prevs.iterator().next().getSequence();
      +            final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence()));
      +            graph.addVertex(newV);
      +
      +            for ( final SeqVertex prev : prevs ) {
      +                for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) {
      +                    graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn));
      +                    edgesToRemove.add(prevIn);
      +                }
      +            }
      +
      +            for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) {
      +                graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e));
      +            }
      +
      +            graph.removeAllVertices(prevs);
      +            graph.removeVertex(v);
      +            graph.removeAllEdges(edgesToRemove);
      +
      +//            graph.printGraph(new File("csm." + counter++ + "." + v.getSequenceString() + "_post.dot"), 0);
      +
      +            return true;
      +        }
      +    }
      +
      +    //private static int counter = 0;
      +
      +    /**
      +     * Can we safely merge the incoming vertices of v
      +     *
      +     * @param graph the graph containing v and incomingVertices
      +     * @param v the vertex we want to merge into
      +     * @param incomingVertices the incoming vertices of v
      +     * @return true if we can safely merge incomingVertices
      +     */
      +    private boolean canMerge(final SeqGraph graph, final SeqVertex v, final Collection incomingVertices) {
      +        if ( incomingVertices.isEmpty() )
      +            return false;
      +
      +        final SeqVertex first = incomingVertices.iterator().next();
      +        for ( final SeqVertex prev : incomingVertices) {
      +            if ( ! prev.seqEquals(first) )
      +                return false;
      +            final Collection prevOuts = graph.outgoingVerticesOf(prev);
      +            if ( prevOuts.size() != 1 )
      +                return false;
      +            if ( prevOuts.iterator().next() != v )
      +                return false;
      +        }
      +
      +        return true;
      +    }
      +
      +}
      \ No newline at end of file
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java
      new file mode 100644
      index 000000000..f6ee4c3c3
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java
      @@ -0,0 +1,329 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Requires;
      +import org.broadinstitute.sting.utils.collections.Pair;
      +
      +import java.util.*;
      +
      +/**
      + * Split a collection of middle nodes in a graph into their shared prefix and suffix values
      + *
      + * This code performs the following transformation.  Suppose I have a set of vertices V, such
      + * that each vertex is composed of sequence such that
      + *
      + * Vi = prefix + seq_i + suffix
      + *
      + * where prefix and suffix are shared sequences across all vertices V
      + *
      + * This algorithm creates a new SeqGraph with the following configuration
      + *
      + * prefix -> has outgoing edges to all seq_i
      + * suffix -> has incoming edges for all seq_i
      + *
      + * There are a few special cases that must be handled.  First, Vi could be simply
      + * == to the prefix or the suffix.  These generate direct connections between
      + * the prefix and suffix nodes, and they are handled internally by the algorithm.
      + *
      + * Note that for convenience, we will always create newTop and newBottom nodes, but
      + * these may be empty nodes (i.e., they contain no sequence).  That allows them to be
      + * trivially merged, if desired, when the graph is incorporated into an overall
      + * graph.
      + *
      + * The product of this operation is a SeqGraph that contains the split.  There's a
      + * function to merge/reconnect this graph into the graph that contains the middle nodes
      + *
      + * The process guarantees a few things about the output:
      + *
      + * -- Preserves the paths and weights among all vertices
      + *
      + * It produces a graph that has some unusual properties
      + *
      + * -- May add nodes with no sequence (isEmpty() == true) to preserve connectivity among the graph
      + * -- May introduce edges with no multiplicity to preserve paths through the graph
      + *
      + * The overall workflow of using this class is simple:
      + *
      + * find vertices V in graph that you want to split out
      + * s = new SharedVertexSequenceSplitter(graph, V)
      + * s.updateGraph(graph)
      + *
      + * to update the graph with the modifications created by this splitter
      + *
      + * User: depristo
      + * Date: 3/22/13
      + * Time: 8:31 AM
      + */
      +public class SharedVertexSequenceSplitter {
      +    final private SeqGraph outer;
      +    final protected SeqVertex prefixV, suffixV;
      +    final protected Collection toSplits;
      +
      +    // updated in split routine
      +    protected SeqGraph splitGraph = null;
      +    protected Collection newMiddles = null;
      +    protected List edgesToRemove = null;
      +
      +    /**
      +     * Create a new graph that contains the vertices in toSplitsArg with their shared suffix and prefix
      +     * sequences extracted out.
      +     *
      +     * @param graph the graph containing the vertices in toSplitsArg
      +     * @param toSplitsArg a collection of vertices to split.  Must be contained within graph, and have only connections
      +     *                    from a single shared top and/or bottom node
      +     */
      +    public SharedVertexSequenceSplitter(final SeqGraph graph, final Collection toSplitsArg) {
      +        if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
      +        if ( toSplitsArg == null ) throw new IllegalArgumentException("toSplitsArg cannot be null");
      +        if ( toSplitsArg.size() < 2 ) throw new IllegalArgumentException("Can only split at least 2 vertices but only got " + toSplitsArg);
      +        if ( ! graph.vertexSet().containsAll(toSplitsArg) ) throw new IllegalArgumentException("graph doesn't contain all of the vertices to split");
      +
      +        this.outer = graph;
      +        this.toSplits = toSplitsArg;
      +
      +        // all of the edges point to the same sink, so it's time to merge
      +        final Pair prefixAndSuffix = commonPrefixAndSuffixOfVertices(toSplits);
      +        prefixV = prefixAndSuffix.getFirst();
      +        suffixV = prefixAndSuffix.getSecond();
      +    }
      +
      +    /**
      +     * Given sequences that are all equal, does this splitter make those into prefix or suffix nodes?
      +     * @return true if we merge equal nodes into prefix nodes or suffix nodes
      +     */
      +    protected static boolean prefersPrefixMerging() {
      +        return true;
      +    }
      +
      +    /**
      +     * Simple single-function interface to split and then update a graph
      +     *
      +     * @see #updateGraph(SeqVertex, SeqVertex) for a full description of top and bottom
      +     *
      +     * @param top the top vertex, may be null
      +     * @param bottom the bottom vertex, may be null
      +     * @return true if some useful splitting was done, false otherwise
      +     */
      +    public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom) {
      +        split();
      +        updateGraph(top, bottom);
      +        return true;
      +    }
      +
      +    /**
      +     * Does either the common suffix or prefix have at least minCommonSequence bases in it?
      +     * @param minCommonSequence a minimum length of the common sequence, must be >= 0
      +     * @return true if either suffix or prefix length >= minCommonSequence
      +     */
      +    public boolean meetsMinMergableSequenceForEitherPrefixOrSuffix(final int minCommonSequence) {
      +        return meetsMinMergableSequenceForPrefix(minCommonSequence) || meetsMinMergableSequenceForSuffix(minCommonSequence);
      +    }
      +
      +    /**
      +     * Does the common prefix have at least minCommonSequence bases in it?
      +     * @param minCommonSequence a minimum length of the common sequence, must be >= 0
      +     * @return true if prefix length >= minCommonSequence
      +     */
      +    public boolean meetsMinMergableSequenceForPrefix(final int minCommonSequence) {
      +        return prefixV.length() >= minCommonSequence;
      +    }
      +
      +    /**
      +     * Does the common suffix have at least minCommonSequence bases in it?
      +     * @param minCommonSequence a minimum length of the common sequence, must be >= 0
      +     * @return true if suffix length >= minCommonSequence
      +     */
      +    public boolean meetsMinMergableSequenceForSuffix(final int minCommonSequence) {
      +        return suffixV.length() >= minCommonSequence;
      +    }
      +
      +    /**
      +     * Actually do the splitting up of the vertices
      +     *
      +     * Must be called before calling updateGraph
      +     */
      +    public void split() {
      +        splitGraph = new SeqGraph();
      +        newMiddles = new LinkedList();
      +        edgesToRemove = new LinkedList();
      +
      +        splitGraph.addVertices(prefixV, suffixV);
      +
      +        for ( final SeqVertex mid : toSplits ) {
      +            final BaseEdge toMid = processEdgeToRemove(mid, outer.incomingEdgeOf(mid));
      +            final BaseEdge fromMid = processEdgeToRemove(mid, outer.outgoingEdgeOf(mid));
      +
      +            final SeqVertex remaining = mid.withoutPrefixAndSuffix(prefixV.getSequence(), suffixV.getSequence());
      +            if ( remaining != null ) {
      +                // there's some sequence prefix + seq + suffix, so add the node and make edges
      +                splitGraph.addVertex(remaining);
      +                newMiddles.add(remaining);
      +                // update edge from top -> middle to be top -> without suffix
      +                splitGraph.addEdge(prefixV, remaining, toMid);
      +                splitGraph.addEdge(remaining, suffixV, fromMid);
      +            } else {
      +                // prefix + suffix completely explain this node
      +                splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid));
      +            }
      +        }
      +    }
      +
      +    /**
       +     * Update graph outer, replacing the previously split-out middle vertices with the new
       +     * split subgraph, linking this subgraph into the graph at top and bot (the vertex with
       +     * edges into all middle nodes and the vertex with edges out of all middle nodes, respectively)
      +     *
      +     * @param top an optional top node that must have outgoing edges to all split vertices.  If null, this subgraph
      +     *            will be added without any incoming edges
      +     * @param bot an optional bottom node that must have incoming edges to all split vertices.  If null, this subgraph
      +     *            will be added without any outgoing edges to the rest of the graph
      +     */
      +    public void updateGraph(final SeqVertex top, final SeqVertex bot) {
      +        if ( ! outer.vertexSet().containsAll(toSplits) ) throw new IllegalArgumentException("graph doesn't contain all of the original vertices to split");
      +        if ( top == null && bot == null ) throw new IllegalArgumentException("Cannot update graph without at least one top or bot vertex, but both were null");
      +        if ( top != null && ! outer.containsVertex(top) ) throw new IllegalArgumentException("top " + top + " not found in graph " + outer);
      +        if ( bot != null && ! outer.containsVertex(bot) ) throw new IllegalArgumentException("bot " + bot + " not found in graph " + outer);
      +        if ( splitGraph == null ) throw new IllegalStateException("Cannot call updateGraph until split() has been called");
      +
      +        outer.removeAllVertices(toSplits);
      +        outer.removeAllEdges(edgesToRemove);
      +
      +        outer.addVertices(newMiddles);
      +
      +        final boolean hasPrefixSuffixEdge = splitGraph.getEdge(prefixV, suffixV) != null;
      +        final boolean hasOnlyPrefixSuffixEdges = hasPrefixSuffixEdge && splitGraph.outDegreeOf(prefixV) == 1;
      +        final boolean needPrefixNode = ! prefixV.isEmpty() || (top == null && ! hasOnlyPrefixSuffixEdges);
      +        final boolean needSuffixNode = ! suffixV.isEmpty() || (bot == null && ! hasOnlyPrefixSuffixEdges);
      +
      +        // if prefix / suffix are needed, keep them
      +        final SeqVertex topForConnect = needPrefixNode ? prefixV : top;
      +        final SeqVertex botForConnect = needSuffixNode ? suffixV : bot;
      +
      +        if ( needPrefixNode ) {
      +            outer.addVertex(prefixV);
      +            if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0));
      +        }
      +
      +        if ( needSuffixNode ) {
      +            outer.addVertex(suffixV);
      +            if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0));
      +        }
      +
      +        if ( topForConnect != null ) {
      +            for ( final BaseEdge e : splitGraph.outgoingEdgesOf(prefixV) ) {
      +                final SeqVertex target = splitGraph.getEdgeTarget(e);
      +
      +                if ( target == suffixV ) { // going straight from prefix -> suffix
      +                    if ( botForConnect != null )
      +                        outer.addEdge(topForConnect, botForConnect, e);
      +                } else {
      +                    outer.addEdge(topForConnect, target, e);
      +                }
      +            }
      +        }
      +
      +        if ( botForConnect != null ) {
      +            for ( final BaseEdge e : splitGraph.incomingEdgesOf(suffixV) ) {
      +                outer.addEdge(splitGraph.getEdgeSource(e), botForConnect, e);
      +            }
      +        }
      +    }
      +
       +    /**
       +     * Compute the longest common prefix and longest common suffix of bases shared among all provided vertices
       +     *
       +     * For example, if the vertices have sequences AC, CC, and ATC, the common suffix is
       +     * a single C.  For ACC and TCC the common suffix is CC.  For AC and TG, which share
       +     * no common bases, both the prefix and suffix are empty.
       +     *
       +     * @param middleVertices a non-empty collection of vertices
       +     * @return a non-null Pair of a prefix SeqVertex and a suffix SeqVertex; either may carry an empty sequence
       +     */
      +    @Requires("!middleVertices.isEmpty()")
      +    protected static Pair commonPrefixAndSuffixOfVertices(final Collection middleVertices) {
      +        final List kmers = new ArrayList(middleVertices.size());
      +
      +        int min = Integer.MAX_VALUE;
      +        for ( final SeqVertex v : middleVertices ) {
      +            kmers.add(v.getSequence());
      +            min = Math.min(min, v.getSequence().length);
      +        }
      +
      +        final int prefixLen = GraphUtils.compPrefixLen(kmers, min);
      +        final int suffixLen = GraphUtils.compSuffixLen(kmers, min - prefixLen);
      +
      +        final byte[] kmer = kmers.get(0);
      +        final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen);
      +        final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length);
      +        return new Pair(new SeqVertex(prefix), new SeqVertex(suffix));
      +    }
      +
       +    /**
       +     * Helper function that returns an edge that we should use for splitting
       +     *
       +     * If e is null, creates a new 0 multiplicity edge, marked as ref if v is a reference node
       +     * If e is not null, returns a new copy of e, and schedules e for removal
       +     *
       +     * @param v the non-null vertex whose incoming or outgoing edge is being processed
       +     * @param e the edge to process; may be null
       +     * @return a non-null edge
       +     */
      +    @Requires("v != null")
      +    @Ensures("result != null")
      +    private BaseEdge processEdgeToRemove(final SeqVertex v, final BaseEdge e) {
      +        if ( e == null ) {
      +            // there's no edge, so we return a newly allocated one and don't schedule e for removal
      +            // the weight must be 0 to preserve sum through the diamond
      +            return new BaseEdge(outer.isReferenceNode(v), 0);
      +        } else {
      +            // schedule edge for removal, and return a freshly allocated one for our graph to use
      +            edgesToRemove.add(e);
      +            return new BaseEdge(e);
      +        }
      +    }
      +}
      \ No newline at end of file
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
      index f7686bdf5..cd4ea778d 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
      @@ -47,7 +47,7 @@
       package org.broadinstitute.sting.gatk.walkers.indels;
       
       import net.sf.samtools.SAMRecord;
      -import org.broadinstitute.sting.utils.Haplotype;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
       import org.broadinstitute.sting.utils.MathUtils;
       import org.broadinstitute.sting.utils.QualityUtils;
       import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
      index c7d24f475..c0848663e 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
      @@ -62,7 +62,8 @@ import org.broadinstitute.sting.gatk.walkers.BAQMode;
       import org.broadinstitute.sting.gatk.walkers.ReadWalker;
       import org.broadinstitute.sting.utils.BaseUtils;
       import org.broadinstitute.sting.utils.GenomeLoc;
      -import org.broadinstitute.sting.utils.SWPairwiseAlignment;
      +import org.broadinstitute.sting.utils.smithwaterman.Parameters;
      +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
       import org.broadinstitute.sting.utils.Utils;
       import org.broadinstitute.sting.utils.baq.BAQ;
       import org.broadinstitute.sting.utils.collections.Pair;
      @@ -87,7 +88,7 @@ import java.io.IOException;
       import java.util.*;
       
       /**
      - * Performs local realignment of reads based on misalignments due to the presence of indels.
      + * Performs local realignment of reads to correct misalignments due to the presence of indels.
        *
        * 

      * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases @@ -100,39 +101,46 @@ import java.util.*; * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and * specifically identify indels. - *

      + *

      *
        There are 2 steps to the realignment process: *
      1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
      2. *
      3. Running the realigner over those intervals (IndelRealigner)
      4. *
      - *

      - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. *

      - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). + * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 + *

      * - *

      Input

      + *

      Input

      *

      * One or more aligned BAM files and optionally one or more lists of known indels. *

      * - *

      Output

      + *

      Output

      *

      * A realigned version of your input BAM file(s). *

      * - *

      Examples

      + *

      Example

      *
        * java -Xmx4g -jar GenomeAnalysisTK.jar \
      - *   -I input.bam \
      - *   -R ref.fasta \
        *   -T IndelRealigner \
      + *   -R ref.fasta \
      + *   -I input.bam \
        *   -targetIntervals intervalListFromRTC.intervals \
        *   -o realignedBam.bam \
        *   [-known /path/to/indels.vcf] \
        *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
        * 
      * + *

      Caveats

      + * + *
      • + * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + *
      • + * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + *
      + * * @author ebanks */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @@ -168,7 +176,7 @@ public class IndelRealigner extends ReadWalker { /** * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). */ - @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + @Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true) protected IntervalBinding intervalsFile = null; /** @@ -182,7 +190,7 @@ public class IndelRealigner extends ReadWalker { /** * The realigned bam file. */ - @Output(required=false, doc="Output bam") + @Output(required=false, doc="Output bam", defaultToStdout=false) protected StingSAMFileWriter writer = null; protected ConstrainedMateFixingManager manager = null; protected SAMFileWriter writerToUse = null; @@ -203,7 +211,7 @@ public class IndelRealigner extends ReadWalker { * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. */ @Advanced - @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + @Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false) protected double MISMATCH_THRESHOLD = 0.15; /** @@ -225,21 +233,21 @@ public class IndelRealigner extends ReadWalker { * For expert users only! 
*/ @Advanced - @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false) + @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false) protected int MAX_POS_MOVE_ALLOWED = 200; /** * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. */ @Advanced - @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) + @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) protected int MAX_CONSENSUSES = 30; /** * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. */ @Advanced - @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) + @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) protected int MAX_READS_FOR_CONSENSUSES = 120; /** @@ -247,7 +255,7 @@ public class IndelRealigner extends ReadWalker { * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. 
*/ @Advanced - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false) protected int MAX_READS = 20000; @Advanced @@ -263,7 +271,7 @@ public class IndelRealigner extends ReadWalker { * * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). */ - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)") protected String N_WAY_OUT = null; @Hidden @@ -288,15 +296,15 @@ public class IndelRealigner extends ReadWalker { protected boolean KEEP_ALL_PG_RECORDS = false; @Hidden - @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") + @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") protected String OUT_INDELS = null; @Hidden - @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) protected String OUT_STATS = null; @Hidden - @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="SNPsFileForDebugging", 
shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) protected String OUT_SNPS = null; // fasta reference reader to supplement the edges of the reference sequence @@ -321,10 +329,7 @@ public class IndelRealigner extends ReadWalker { // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; - private static final double SW_MATCH = 30.0; // 1.0; - private static final double SW_MISMATCH = -10.0; //-1.0/3.0; - private static final double SW_GAP = -10.0; //-1.0-1.0/3.0; - private static final double SW_GAP_EXTEND = -2.0; //-1.0/.0; + private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0); // reference base padding size // TODO -- make this a command-line argument if the need arises @@ -992,7 +997,7 @@ public class IndelRealigner extends ReadWalker { private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); if ( c != null ) altConsensesToPopulate.add(c); @@ -1009,7 +1014,7 @@ public class IndelRealigner extends ReadWalker { } // do a pairwise alignment against the reference SWalignmentRuns++; - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters); Consensus c = 
createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); if ( c != null ) { altConsensesToPopulate.add(c); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index ff21893f1..532d13690 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -68,17 +68,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. * - *

      Input

      + *

      Input

      *

      * A bam file to left-align. *

      * - *

      Output

      + *

      Output

      *

      * A left-aligned bam. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx3g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
      index 45162fdba..363f7a357 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
      @@ -48,7 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
       
       import com.google.java.contract.Ensures;
       import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
      -import org.broadinstitute.sting.utils.Haplotype;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
       import org.broadinstitute.sting.utils.MathUtils;
       import org.broadinstitute.sting.utils.clipping.ReadClipper;
       import org.broadinstitute.sting.utils.exceptions.UserException;
      @@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       import org.broadinstitute.sting.utils.sam.ReadUtils;
       import org.broadinstitute.variant.variantcontext.Allele;
       
      -import java.io.PrintStream;
       import java.util.Arrays;
       import java.util.LinkedHashMap;
       import java.util.Map;
      @@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel {
                                                                               final ReferenceContext ref,
                                                                               final int eventLength,
                                                                               final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
      -                                                                        final double downsamplingFraction,
      -                                                                        final PrintStream downsamplingLog) {
      +                                                                        final double downsamplingFraction) {
               final int numHaplotypes = haplotypeMap.size();
       
               final int readCounts[] = new int[pileup.getNumberOfElements()];
               final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
      -        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
      +        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
               return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
               
           }
      @@ -247,8 +245,13 @@ public class PairHMMIndelErrorModel {
                       }
                   }
                   else {
      -                final int refWindowStart = ref.getWindow().getStart();
      -                final int refWindowStop  = ref.getWindow().getStop();
      +                // extra padding on candidate haplotypes to make sure reads are always strictly contained
       +                // in them - a value of 1 will in theory do but we use a slightly higher one just for safety's sake, mostly
      +                // in case bases at edge of reads have lower quality.
      +                final int trailingBases = 3;
      +                final int extraOffset = Math.abs(eventLength);
      +                final int refWindowStart = ref.getWindow().getStart()+(trailingBases+extraOffset);
      +                final int refWindowStop  = ref.getWindow().getStop()-(trailingBases+extraOffset);
       
                       if (DEBUG) {
                           System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
      @@ -257,10 +260,10 @@ public class PairHMMIndelErrorModel {
                       GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
       
                       if (!read.isEmpty() && (read.getSoftEnd() > refWindowStop && read.getSoftStart() < refWindowStop))
      -                    read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
      +                    read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, refWindowStop);
       
                       if (!read.isEmpty() && (read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart))
      -                    read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
      +                    read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, refWindowStart);
       
                       if (read.isEmpty())
                           continue;
      @@ -272,7 +275,6 @@ public class PairHMMIndelErrorModel {
                           continue;
       
                       // get bases of candidate haplotypes that overlap with reads
      -                final int trailingBases = 3;
                       final long readStart = read.getSoftStart();
                       final long readEnd = read.getSoftEnd();
       
      @@ -288,7 +290,6 @@ public class PairHMMIndelErrorModel {
                       final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
                       final byte [] unclippedReadBases = read.getReadBases();
                       final byte [] unclippedReadQuals = read.getBaseQualities();
      -                final int extraOffset = Math.abs(eventLength);
       
                       /**
                        * Compute genomic locations that candidate haplotypes will span.
      @@ -315,6 +316,7 @@ public class PairHMMIndelErrorModel {
                           startLocationInRefForHaplotypes = ref.getWindow().getStop();                                        // read starts after haplotype: read will have to be clipped completely;
                       }
       
      +                // candidate haplotype cannot go beyond reference context
                       if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
                           stopLocationInRefForHaplotypes = ref.getWindow().getStop();                                         // check also if end of read will go beyond reference context
                       }
      @@ -349,7 +351,6 @@ public class PairHMMIndelErrorModel {
       
                           int j=0;
       
      -                    byte[] previousHaplotypeSeen = null;
                           final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
                           final byte[] contextLogGapContinuationProbabilities  = new byte[readBases.length];
       
      @@ -389,37 +390,30 @@ public class PairHMMIndelErrorModel {
                                   System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
                                           indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
       
      -                        final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
      -                                (int)indStart, (int)indStop);
      +                        final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
       
      -                        final int X_METRIC_LENGTH = readBases.length+2;
      -                        final int Y_METRIC_LENGTH = haplotypeBases.length+2;
      +                        // it's possible that the indel starts at the last base of the haplotypes
      +                        if ( haplotypeBases.length == 0 ) {
      +                            readLikelihood = -Double.MAX_VALUE;
      +                        } else {
      +                            if (firstHap) {
      +                                //no need to reallocate arrays for each new haplotype, as length won't change
      +                                pairHMM.initialize(readBases.length, haplotypeBases.length);
      +                                firstHap = false;
      +                            }
       
      -                        if (previousHaplotypeSeen == null) {
      -                            //no need to reallocate arrays for each new haplotype, as length won't change
      -                            pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
      +                            readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
      +                                    baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap);
                               }
       
      -                        int startIndexInHaplotype = 0;
      -                        if (previousHaplotypeSeen != null)
      -                            startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
      -                        previousHaplotypeSeen = haplotypeBases.clone();
      -
      -                        readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
      -                                baseInsertionQualities, baseDeletionQualities,
      -                                contextLogGapContinuationProbabilities, startIndexInHaplotype, firstHap);
      -
      -
                               if (DEBUG) {
                                   System.out.println("H:"+new String(haplotypeBases));
                                   System.out.println("R:"+new String(readBases));
                                   System.out.format("L:%4.2f\n",readLikelihood);
      -                            System.out.format("StPos:%d\n", startIndexInHaplotype);
                               }
       
                               perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
                               readLikelihoods[readIdx][j++] = readLikelihood;
      -                        firstHap = false;
                           }
                       }
                   }
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
      index dea17cd02..caeb1e8d7 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
      @@ -91,27 +91,30 @@ import java.util.TreeSet;
        *     
    • Running the realigner over those intervals (see the IndelRealigner tool)
    • *
*

- * An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * Important note 1: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

- * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * Important note 2: when multiple potential indels are found by the tool in the same general region, the tool will choose the most likely + * one for realignment to the exclusion of the others. This is a known limitation of the tool. + *

+ * Important note 3: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. * - *

Input

+ *

Input

*

* One or more aligned BAM files and optionally one or more lists of known indels. *

* - *

Output

+ *

Output

*

* A list of target intervals to pass to the Indel Realigner. *

* - *

Examples

+ *

Examples

*
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I input.bam \
- *   -R ref.fasta \
  *   -T RealignerTargetCreator \
+ *   -R ref.fasta \
+ *   -I input.bam \
  *   -o forIndelRealigner.intervals \
  *   [--known /path/to/indels.vcf]
  * 
@@ -140,7 +143,7 @@ public class RealignerTargetCreator extends RodWalker> known = Collections.emptyList(); /** - * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. + * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. Must be > 1. */ @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) protected int windowSize = 10; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 54a324411..a4c1caf86 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -90,7 +90,7 @@ import java.util.*; *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • * * - *

    Input

    + *

    Input

    *

    *

      *
    • A VCF variant set containing trio(s) and/or parent/child pair(s).
    • @@ -108,12 +108,12 @@ import java.util.*; *
    *

    * - *

    Output

    + *

    Output

    *

    * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where unambiguous. *

    * - *

    Examples

    + *

    Examples

    *
      * java -Xmx2g -jar GenomeAnalysisTK.jar \
      *   -R ref.fasta \
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java
    index eb2bb62ef..bb8c14ef7 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java
    @@ -65,17 +65,17 @@ import java.util.*;
      * [Functionality of this walker]
      * 

    *

    - *

    Input

    + *

    Input

    *

    * [Input description] *

    *

    - *

    Output

    + *

    Output

    *

    * [Output description] *

    *

    - *

    Examples

    + *

    Examples

    *
      *    java
      *      -jar GenomeAnalysisTK.jar
    diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
    index 7f2cdd3d0..a297b38cf 100644
    --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
    +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
    @@ -86,17 +86,17 @@ import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersF
      * Performs physical phasing of SNP calls, based on sequencing reads.
      * 

    * - *

    Input

    + *

    Input

    *

    * VCF file of SNP calls, BAM file of sequence reads. *

    * - *

    Output

    + *

    Output

    *

    * Phased VCF file. *

    * - *

    Examples

    + *

    Examples

    *
      *    java
      *      -jar GenomeAnalysisTK.jar
    @@ -131,7 +131,7 @@ public class ReadBackedPhasing extends RodWalkerInput
    + * 

    + * The original and reduced BAM files. + *

    + * + *

    Output

    + *

    + * A list of intervals present in one bam but not the other. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -I:original original.bam \
    + *   -I:reduced reduced.bam \
    + *   -R ref.fasta \
    + *   -T AssessReducedCoverage \
    + *   -o output.intervals
    + * 
    + * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) +@Hidden +public class AssessReducedCoverage extends LocusWalker implements TreeReducible { + + private static final String original = "original"; + private static final String reduced = "reduced"; + + @Output + protected PrintStream out; + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) + public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; + + public void initialize() {} + + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if ( tracker == null ) + return null; + + final Set tags = getAllTags(context.getBasePileup()); + return (tags.contains(original) && !tags.contains(reduced)) || + (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; + } + + private Set getAllTags(final ReadBackedPileup pileup) { + + final Set tags = new HashSet(10); + + for ( final PileupElement p : pileup ) { + if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) + tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); + } + + return tags; + } + + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java similarity index 62% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java index 0aea54fa0..a3bdc6691 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -44,129 +44,160 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.io.PrintStream; +import java.util.List; -class IntervalStatistics { +/** + * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of + * the reduced bam are above sufficient threshold) + * + *

    Input

    + *

    + * The original and reduced BAM files. + *

    + * + *

    Output

    + *

    + * A list of intervals in which the differences between the original and reduced bam quals are bigger than epsilon. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -I:original original.bam \
    + *   -I:reduced reduced.bam \
    + *   -R ref.fasta \
    + *   -T AssessReducedQuals \
    + *   -o output.intervals
    + * 
    + * + * @author ami + */ - private final Map samples; - private final GenomeLoc interval; - private boolean hasNref = false; +public class AssessReducedQuals extends LocusWalker implements TreeReducible { - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) + private static final String reduced = "reduced"; + private static final int originalQualsIndex = 0; + private static final int reducedQualsIndex = 1; - /* - private double minMedianDepth = 20.0; - private double badMedianDepthPercentage = 0.20; - private double votePercentage = 0.50; - */ - public IntervalStatistics(Set samples, GenomeLoc interval/*, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality*/) { - this.interval = interval; - this.samples = new HashMap(samples.size()); - for (String sample : samples) - this.samples.put(sample, new SampleStatistics(interval /*, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality*/)); + @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) + public int sufficientQualSum = 600; + + @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) + public double qual_epsilon = 0.10; + + @Argument(fullName = "exclude_low_mq", shortName = "excludeMQ", doc = "ignore reads with mapping quality below this number", required = false) + public int excludeMQ = 0; + + @Output + protected PrintStream out; + + public void initialize() { + if ( qual_epsilon < 0.0 || qual_epsilon > 1.0 ) + throw new UserException.BadArgumentValue("qual_epsilon", "must be a number between 0 and 1"); } - public SampleStatistics getSample(String sample) 
{ - return samples.get(sample); + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Override + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return null; + + boolean reportLocus; + final int[] quals = getPileupQuals(context.getBasePileup()); + final int epsilon = MathUtils.fastRound(quals[originalQualsIndex] * qual_epsilon); + final int calcOriginalQuals = Math.min(quals[originalQualsIndex], sufficientQualSum); + final int calcReducedQuals = Math.min(quals[reducedQualsIndex], sufficientQualSum); + final int originalReducedQualDiff = calcOriginalQuals - calcReducedQuals; + reportLocus = originalReducedQualDiff > epsilon || originalReducedQualDiff < -1 * epsilon; + + return reportLocus ? ref.getLocus() : null; } - public GenomeLoc getInterval() { - return interval; - } + private int[] getPileupQuals(final ReadBackedPileup readPileup) { - /** - * The function to populate data into the Statistics from the walker. 
- * This takes the input and manages passing the data to the SampleStatistics and Locus Statistics - * - * @param context The alignment context given from the walker - * @param ref the reference context given from the walker - * @param thresholds the class contains the statistical threshold for making calls - */ - public void addLocus(AlignmentContext context, ReferenceContext ref, ThresHolder thresholds) { - ReadBackedPileup pileup = context.getBasePileup(); + final int[] quals = new int[2]; - //System.out.println(ref.getLocus().toString()); - - Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); - - for (Map.Entry entry : samplePileups.entrySet()) { - String sample = entry.getKey(); - ReadBackedPileup samplePileup = entry.getValue(); - SampleStatistics sampleStatistics = samples.get(sample); - - if (sampleStatistics == null) - throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); - - sampleStatistics.addLocus(context.getLocation(), samplePileup, thresholds); + for ( final PileupElement p : readPileup ) { + final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); + if ( isGoodRead(p) ) { + final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); + final int tagIndex = getTagIndex(tags); + quals[tagIndex] += tempQual; + } } - if (!hasNref && ref.getBase() == 'N') - hasNref = true; + return quals; } - public double averageCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return (double) preComputedTotalCoverage / interval.size(); + private boolean isGoodRead(final PileupElement p) { + return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= excludeMQ; } - private void calculateTotalCoverage() { - preComputedTotalCoverage = 0; - for (SampleStatistics sample : samples.values()) - preComputedTotalCoverage += sample.totalCoverage(); + private int getTagIndex(final List 
tags) { + return tags.contains(reduced) ? 1 : 0; } - /** - * Return the Callable statuses for the interval as a whole - * todo -- add missingness filter - * - * @param thresholds the class contains the statistical threshold for making calls - * @return the callable status(es) for the whole interval - */ - public Set callableStatuses(ThresHolder thresholds) { - Set output = new HashSet(); + @Override + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } - // Initialize the Map - Map votes = new HashMap(); - for (CallableStatus status : CallableStatus.values()) - votes.put(status, 0); + @Override + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; - // tally up the votes - for (SampleStatistics sample : samples.values()) - for (CallableStatus status : sample.getCallableStatuses(thresholds)) - votes.put(status, votes.get(status) + 1); + if ( rhs == null ) + return lhs; - // output tall values above the threshold - for (CallableStatus status : votes.keySet()) { - if (votes.get(status) > (samples.size() * thresholds.getVotePercentageThreshold()) && !(status.equals(CallableStatus.PASS))) - output.add(status); - } + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } - if (hasNref) - output.add(CallableStatus.REF_N); + @Override + public GenomeLoc reduceInit() { + return null; + } - // get median DP of each sample - int nLowMedianDepth = 0; - for (SampleStatistics sample : samples.values()) { - if (sample.getQuantileDepth(0.5) < thresholds.getMinimumMedianDepth()) - nLowMedianDepth++; - } + @Override + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; - if (nLowMedianDepth > (samples.size() * thresholds.getLowMedianDepthThreshold())) 
- output.add(CallableStatus.LOW_MEDIAN_DEPTH); + if ( sum == null ) + return value; - return output; + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java index d6a814ee8..6af39c0b0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java @@ -99,14 +99,14 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; *

    * * - *

    Input

    + *

    Input

    *

    * A BAM file to make calls on and a VCF file to use as truth validation dataset. * * You also have the option to invert the roles of the files using the command line options listed below. *

    * - *

    Output

    + *

    Output

    *

    * GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a * 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true @@ -176,7 +176,7 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; * * * - *

    Examples

    + *

    Examples

    *
      *
    1. * Genotypes BAM file from new technology using the VCF as a truth dataset: diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java index 5c216928b..ad723f0cf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java @@ -85,17 +85,17 @@ import java.util.*; * * User can additionally restrict output to a particular type of variant (SNP, Indel, etc.) * - *

      Input

      + *

      Input

      *

      * One or more variant sets to choose from. *

      * - *

      Output

      + *

      Output

      *

      * A sites-only VCF with the desired number of randomly selected sites. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -145,7 +145,7 @@ public class ValidationSiteSelector extends RodWalker {
           /**
            * The output VCF file
            */
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter vcfWriter = null;
       
           /**
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
      index f2120213a..e15b99824 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
      @@ -81,7 +81,7 @@ import java.util.*;
        * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
        * slightly lower quality level.
        *
      - * 

      Input

      + *

      Input

      *

      * The input raw variants to be recalibrated. *

      @@ -89,11 +89,11 @@ import java.util.*; *

      * The tranches file that was generated by the VariantRecalibrator walker. * - *

      Output

      + *

      Output

      *

      * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. * - *

      Examples

      + *

      Examples

      *
        * java -Xmx3g -jar GenomeAnalysisTK.jar \
        *   -T ApplyRecalibration \
      @@ -128,7 +128,7 @@ public class ApplyRecalibration extends RodWalker implements T
           /////////////////////////////
           // Outputs
           /////////////////////////////
      -    @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true)
      +    @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value")
           private VariantContextWriter vcfWriter = null;
       
           /////////////////////////////
      @@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker implements T
               hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
               hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
               hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
      +        hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
      +        hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
           }
       
           //---------------------------------------------------------------------------------------------------------------
      @@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker implements T
                       // Annotate the new record with its VQSLOD and the worst performing annotation
                       builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
                       builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY));
      +                if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY))
      +                    builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
      +                if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY))
      +                    builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
       
                       for( int i = tranches.size() - 1; i >= 0; i-- ) {
                           final Tranche tranche = tranches.get(i);
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
      index 3f6b6ed09..40032a886 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
      @@ -127,30 +127,46 @@ public class VariantDataManager {
               }
           }
       
      -     public void addTrainingSet( final TrainingSet trainingSet ) {
      -         trainingSets.add( trainingSet );
      -     }
      +    /**
       +     * Convert a normalized point to its original annotation value
      +     *
      +     * norm = (orig - mu) / sigma
      +     * orig = norm * sigma + mu
      +     *
      +     * @param normalizedValue the normalized value of the ith annotation
      +     * @param annI the index of the annotation value
      +     * @return the denormalized value for the annotation
      +     */
      +    public double denormalizeDatum(final double normalizedValue, final int annI) {
      +        final double mu = meanVector[annI];
      +        final double sigma = varianceVector[annI];
      +        return normalizedValue * sigma + mu;
      +    }
       
      -     public boolean checkHasTrainingSet() {
      -         for( final TrainingSet trainingSet : trainingSets ) {
      -             if( trainingSet.isTraining ) { return true; }
      -         }
      -         return false;
      -     }
      +    public void addTrainingSet( final TrainingSet trainingSet ) {
      +        trainingSets.add( trainingSet );
      +    }
       
      -     public boolean checkHasTruthSet() {
      -         for( final TrainingSet trainingSet : trainingSets ) {
      -             if( trainingSet.isTruth ) { return true; }
      -         }
      -         return false;
      -     }
      +    public boolean checkHasTrainingSet() {
      +        for( final TrainingSet trainingSet : trainingSets ) {
      +            if( trainingSet.isTraining ) { return true; }
      +        }
      +        return false;
      +    }
       
      -     public boolean checkHasKnownSet() {
      -         for( final TrainingSet trainingSet : trainingSets ) {
      -             if( trainingSet.isKnown ) { return true; }
      -         }
      -         return false;
      -     }
      +    public boolean checkHasTruthSet() {
      +        for( final TrainingSet trainingSet : trainingSets ) {
      +            if( trainingSet.isTruth ) { return true; }
      +        }
      +        return false;
      +    }
      +
      +    public boolean checkHasKnownSet() {
      +        for( final TrainingSet trainingSet : trainingSets ) {
      +            if( trainingSet.isKnown ) { return true; }
      +        }
      +        return false;
      +    }
       
           public ExpandingArrayList getTrainingData() {
               final ExpandingArrayList trainingData = new ExpandingArrayList();
      @@ -260,7 +276,7 @@ public class VariantDataManager {
                   value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
                   if( Double.isInfinite(value) ) { value = Double.NaN; }
                   if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
      -                  value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
      +                value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
                   }
       
                   if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
      @@ -297,7 +313,7 @@ public class VariantDataManager {
       
           private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
               return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) &&
      -                        (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples());
      +                (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples());
           }
       
           protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) {
      @@ -335,19 +351,17 @@ public class VariantDataManager {
                   }} );
       
               // create dummy alleles to be used
      -        final List alleles = new ArrayList(2);
      -        alleles.add(Allele.create("N", true));
      -        alleles.add(Allele.create("", false));
      -
      -        // to be used for the important INFO tags
      -        final HashMap attributes = new HashMap(3);
      +        final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false));
       
               for( final VariantDatum datum : data ) {
      -            attributes.put(VCFConstants.END_KEY, datum.loc.getStop());
      -            attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
      -            attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
      +            VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles);
      +            builder.attribute(VCFConstants.END_KEY, datum.loc.getStop());
      +            builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
      +            builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
      +
      +            if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
      +            if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
       
      -            VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes);
                   recalWriter.add(builder.make());
               }
           }
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
      index 57d9c219c..824ef1f6e 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
      @@ -80,6 +80,7 @@ import java.util.*;
        *
        * 

      * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker. + *

      * *

      * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. @@ -91,24 +92,26 @@ import java.util.*; * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + *

      * *

      * NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). * See http://www.r-project.org for more info on how to download and install R. + *

      * - *

      Input

      + *

      Input

      *

      * The input raw variants to be recalibrated. *

      * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. * - *

      Output

      + *

      Output

      *

      * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. *

      * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * - *

      Example

      + *

      Example

      *
        * java -Xmx4g -jar GenomeAnalysisTK.jar \
        *   -T VariantRecalibrator \
      @@ -132,6 +135,8 @@ public class VariantRecalibrator extends RodWalker> resource = Collections.emptyList();
       
           /////////////////////////////
      @@ -168,9 +173,9 @@ public class VariantRecalibrator extends RodWalkerInput
      + * 

      Input

      *

      * A variant set to regenotype. *

      * - *

      Output

      + *

      Output

      *

      * A re-genotyped VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -104,7 +104,7 @@ public class RegenotypeVariants extends RodWalker implements T
       
           @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter vcfWriter = null;
       
           private UnifiedGenotyperEngine UG_engine = null;
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java
      similarity index 55%
      rename from protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java
      rename to protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java
      index dd131b797..4609c3209 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java
      @@ -44,165 +44,151 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.downsampling;
      +package org.broadinstitute.sting.utils.haplotype;
       
      -import org.apache.log4j.Logger;
      -import org.broadinstitute.sting.BaseTest;
      -import org.broadinstitute.sting.utils.exceptions.UserException;
      -import org.testng.Assert;
      -import org.testng.annotations.Test;
      -
      -import java.io.File;
      -import java.util.HashMap;
      -import java.util.HashSet;
      -import java.util.Map;
      -import java.util.Set;
      +import com.google.java.contract.Requires;
      +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine;
      +import org.broadinstitute.sting.utils.MathUtils;
      +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
       
      +import java.util.*;
       
       /**
      - * Basic unit test for AlleleBiasedDownsamplingUtils
       + * Computes the likelihood-based probability that haplotypes for first and second variant contexts
      + * only appear in their fully linked form (x11 and x22) given a set of haplotypes where they might occur
      + * and read likelihoods per sample
      + *
      + * User: depristo
      + * Date: 3/29/13
      + * Time: 9:23 AM
        */
      -public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
      +public class HaplotypeLDCalculator {
      +    private final List haplotypes;
      +    private final Map haplotypeReadMap;
      +    private List> haplotypeLikelihoodsPerSample = null;
       
       +    // linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1]
      +    private final double[] table = new double[4];
       
      -    @Test
      -    public void testSmartDownsampling() {
      -
      -        final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50};
      -        final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0};
      -
      -        // no contamination, no removal
      -        testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -
      -        // hom sample, het contaminant, different alleles
      -        testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -        testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -        testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -
      -        // hom sample, hom contaminant, different alleles
      -        testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -        testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -        testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
      -
      -        // het sample, het contaminant, different alleles
      -        testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -
      -        // het sample, hom contaminant, different alleles
      -        testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -
      -        // hom sample, het contaminant, overlapping alleles
      -        final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0};
      -        testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
      -        testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
      -        testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
      -
      -        // hom sample, hom contaminant, overlapping alleles
      -        testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0});
      -
      -        // het sample, het contaminant, overlapping alleles
      -        testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55});
      -        testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -
      -        // het sample, hom contaminant, overlapping alleles
      -        testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      -        testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
      +    /**
      +     * For testing
      +     */
      +    protected HaplotypeLDCalculator() {
      +        haplotypes = Collections.emptyList();
      +        haplotypeReadMap = Collections.emptyMap();
           }
       
      -    private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction,
      -                                    final int pileupSize, final int[] initialCounts, final int[] targetCounts) {
      -
      -        final int[] actualCounts = initialCounts.clone();
      -        actualCounts[0] += addA;
      -        actualCounts[1] += addC;
      -        actualCounts[2] += addG;
      -        actualCounts[3] += addT;
      -
      -        final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction));
      -        Assert.assertTrue(countsAreEqual(results, targetCounts));
      +    public HaplotypeLDCalculator(List haplotypes, Map haplotypeReadMap) {
      +        this.haplotypes = haplotypes;
      +        this.haplotypeReadMap = haplotypeReadMap;
           }
       
      -    private static boolean countsAreEqual(final int[] counts1, final int[] counts2) {
      -        for ( int i = 0; i < 4; i++ ) {
      -            if ( counts1[i] != counts2[i] )
      -                return false;
      +    /**
      +     * Construct the cached list of summed haplotype likelihoods per sample if it
      +     * hasn't already been computed.  This data structure is lazy created but only
      +     * needs to be made once when we make 1 merge decision as the data doesn't change
      +     * no matter how many calls to computeProbOfBeingPhased
      +     */
      +    private void buildHaplotypeLikelihoodsPerSampleIfNecessary() {
      +        if ( haplotypeLikelihoodsPerSample == null ) {
      +            // do the lazy computation
      +            final Set samples = haplotypeReadMap.keySet();
      +            haplotypeLikelihoodsPerSample = new LinkedList>();
      +            for( final String sample : samples ) {
      +                final Map map = new HashMap(haplotypes.size());
      +                for( final Haplotype h : haplotypes ) {
      +                    // count up the co-occurrences of the events for the R^2 calculation
      +                    final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0];
      +                    map.put(h, haplotypeLikelihood);
      +                }
      +                haplotypeLikelihoodsPerSample.add(map);
      +            }
               }
      -        return true;
           }
       
      +    /**
       +     * Compute the likelihood-based probability that the haplotypes for first and second are only x11 and x22
      +     *
      +     * As opposed to the hypothesis that all four haplotypes (x11, x12, x21, and x22) exist in the population
      +     *
      +     * @param first a non-null VariantContext
      +     * @param second a non-null VariantContext
      +     * @return the probability that only x11 and x22 exist among the samples
      +     */
      +    protected double computeProbOfBeingPhased(final VariantContext first, final VariantContext second) {
      +        buildHaplotypeLikelihoodsPerSampleIfNecessary();
       
      -    @Test
      -    public void testLoadContaminationFile1(){
      -        Logger logger=org.apache.log4j.Logger.getRootLogger();
      +        Arrays.fill(table, Double.NEGATIVE_INFINITY);
       
      -        final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/";
      -        final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt");
      -
      -        Map Contam1=new HashMap();
      -        Set Samples1=new HashSet();
      -
      -        Contam1.put("NA11918",0.15);
      -        Samples1.addAll(Contam1.keySet());
      -        testLoadFile(ContamFile1,Samples1,Contam1,logger);
      -
      -        Contam1.put("NA12842",0.13);
      -        Samples1.addAll(Contam1.keySet());
      -        testLoadFile(ContamFile1,Samples1,Contam1,logger);
      -
      -        Samples1.add("DUMMY");
      -        testLoadFile(ContamFile1,Samples1,Contam1,logger);
      -   }
      -
      -    private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){
      -        Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger);
      -        Assert.assertTrue(loadedMap.equals(map));
      -    }
      -
      -    @Test
      -    public void testLoadContaminationFiles(){
      -        Logger logger=org.apache.log4j.Logger.getRootLogger();
      -        final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/";
      -
      -        for(int i=1; i<=5; i++){
      -            File ContamFile=new File(ArtificalBAMLocation+String.format("contamination.case.%d.txt",i));
      -            Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile,0.0,null,logger).size()==2);
      +        for ( final Map entry : haplotypeLikelihoodsPerSample ) {
      +            for ( final Map.Entry haplotypeLikelihood : entry.entrySet() ) {
      +                final Haplotype h = haplotypeLikelihood.getKey();
      +                // count up the co-occurrences of the events for the R^2 calculation
      +                final VariantContext thisHapVC = h.getEventMap().get(first.getStart());
      +                final VariantContext nextHapVC = h.getEventMap().get(second.getStart()); // TODO -- add function to take a VC
      +                final int i = thisHapVC == null ? 0 : 1;
      +                final int j = nextHapVC == null ? 0 : 1;
      +                final int index = 2 * i + j;
      +                table[index] = MathUtils.approximateLog10SumLog10(table[index], haplotypeLikelihood.getValue());
      +            }
               }
       
      +        return pPhased(table);
           }
       
      -    @Test(expectedExceptions = UserException.MalformedFile.class)
      -    public void testLoadBrokenContaminationFile1(){
      -        testLoadBrokenContaminationFile(1);
      +    /**
      +     * Compute probability that two variants are in phase with each other and that no
      +     * compound hets exist in the population.
      +     *
      +     * Implemented as a likelihood ratio test of the hypothesis:
      +     *
      +     * x11 and x22 are the only haplotypes in the populations
      +     *
      +     * vs.
      +     *
      +     * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population.
      +     *
      +     * Now, since we have to have both variants in the population, we exclude the x11 & x11 state.  So the
      +     * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22).
      +     *
      +     * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are:
      +     *
      +     * - P(x11 & x12 & x21) -- we have hom-ref and both hets
      +     * - P(x22 & x12 & x21) -- we have hom-alt and both hets
      +     * - P(x22 & x12) -- one haplotype is 22 and the other is het 12
      +     * - P(x22 & x21) -- one haplotype is 22 and the other is het 21
      +     *
      +     * The probability is just p11_22 / (p11_22 + p hets)
      +     *
       +     * @param table linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1]
      +     *      doesn't have to be normalized as this function does the normalization internally
      +     * @return the real space probability that the data is phased
      +     */
      +    @Requires("table.length == 4")
      +    protected double pPhased( double[] table ) {
      +        final double[] normTable = MathUtils.normalizeFromLog10(table, true);
      +
      +        final double x11 = normTable[0], x12 = normTable[1], x21 = normTable[2], x22 = normTable[3];
      +
      +        // probability that we are only x11 && x22
      +        final double p11_22 = MathUtils.approximateLog10SumLog10(x11 + x22, x22 + x22);
      +
      +        // probability of having any of the other pairs
      +        final double p11_12_21 = MathUtils.approximateLog10SumLog10(x11 + x12, x11 + x21, x12 + x21);
      +        final double p22_12_21 = MathUtils.approximateLog10SumLog10(x22 + x12, x22 + x21, x12 + x21);
      +        final double p22_12 = x22 + x12;
      +        final double p22_21 = x22 + x21;
      +        final double pOthers = MathUtils.approximateLog10SumLog10(new double[]{p11_12_21, p22_12_21, p22_12, p22_21});
      +
       +            // probability of being phased is the ratio of p11_22 / pOthers which in log space is just a subtraction
      +        final double log10phased = p11_22 - (MathUtils.approximateLog10SumLog10(p11_22, pOthers));
      +
      +        return Math.pow(10.0, log10phased);
           }
       
      -    @Test(expectedExceptions = UserException.MalformedFile.class)
      -    public void testLoadBrokenContaminationFile2(){
      -        testLoadBrokenContaminationFile(2);
      +    protected double pPhasedTest( final double x11, final double x12, final double x21, final double x22 ) {
      +        return pPhased(new double[]{x11, x12, x21, x22});
           }
      -    @Test(expectedExceptions = UserException.MalformedFile.class)
      -    public void testLoadBrokenContaminationFile3(){
      -        testLoadBrokenContaminationFile(3);
      -    }
      -
      -    @Test(expectedExceptions = UserException.MalformedFile.class)
      -    public void testLoadBrokenContaminationFile4(){
      -        testLoadBrokenContaminationFile(4);
      -    }
      -
      -
      -    public void testLoadBrokenContaminationFile(final int i){
      -        Logger logger=org.apache.log4j.Logger.getRootLogger();
      -        final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/";
      -
      -        File ContaminationFile=new File(ArtificalBAMLocation+String.format("contamination.case.broken.%d.txt",i));
      -        AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile,0.0,null,logger);
      -
      -    }
      -
      -
       }
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java
      new file mode 100644
      index 000000000..bbedd1b1a
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java
      @@ -0,0 +1,305 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotype;
      +
      +import org.apache.commons.lang.ArrayUtils;
      +import org.apache.log4j.Logger;
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
      +import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
      +
      +import java.util.*;
      +
      +/**
      + * Merges VariantContexts in a series of haplotypes according to their pairwise LD
      + *
      + * User: depristo
      + * Date: 3/28/13
      + * Time: 6:17 PM
      + */
      +public class LDMerger extends MergeVariantsAcrossHaplotypes {
      +    private final static Logger logger = Logger.getLogger(LDMerger.class);
      +
      +    private final boolean DEBUG;
      +    private final int minSamplesToMergeSNPs;
      +    private final int minSamplesToMergeOtherEvents;
      +
      +    public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) {
      +        super();
      +        this.DEBUG = DEBUG;
      +        this.minSamplesToMergeSNPs = minSamplesToMergeSNPs;
      +        this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents;
      +    }
      +
      +    protected LDMerger() {
      +        this(false, 1, 1);
      +    }
      +
      +    // TODO -- should be class arguments and static variables in HC
      +    protected final static int MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE = 6;
      +    protected final static int MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE = 25;
      +
      +    /**
      +     * We require 99% confidence that only the phased haplotypes exist in the population to merge the records
      +     */
      +    protected final static double MERGE_EVENTS_PROB_PHASED_THRESHOLD = 0.99;
      +
      +    /**
      +     * Merge as many events among the haplotypes as possible based on pairwise LD among variants
      +     *
      +     * @param haplotypes a list of haplotypes whose events we want to merge
      +     * @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype
      +     * @param startPosKeySet a set of starting positions of all events among the haplotypes
      +     * @param ref the reference bases
      +     * @param refLoc the span of the reference bases
      +     */
      +    @Override
      +    public boolean merge( final List haplotypes,
      +                          final Map haplotypeReadMap,
      +                          final TreeSet startPosKeySet,
      +                          final byte[] ref,
      +                          final GenomeLoc refLoc ) {
      +        if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null");
      +        if ( haplotypeReadMap == null ) throw new IllegalArgumentException("haplotypeReadMap cannot be null");
      +        if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null");
      +        if ( ref == null ) throw new IllegalArgumentException("ref cannot be null");
      +        if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null");
      +        if ( refLoc.size() != ref.length ) throw new IllegalArgumentException("refLoc size " + refLoc.size() + " != ref.length " + ref.length + " at " + refLoc);
      +
      +        if( startPosKeySet.size() <= 1 ) { return false; }
      +
      +        final int nSamples = haplotypeReadMap.keySet().size();
      +        final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, haplotypeReadMap);
      +        boolean somethingWasMerged = false;
      +        boolean mapWasUpdated = true;
      +        while( mapWasUpdated ) {
      +            mapWasUpdated = mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calculator, nSamples, startPosKeySet, ref, refLoc);
      +            somethingWasMerged |= mapWasUpdated;
      +        }
      +        return somethingWasMerged;
      +    }
      +
      +    /**
      +     * Merge the next pair of events, if possible
      +     *
      +     * @param haplotypes a list of haplotypes whose events we want to merge
      +     * @param ldCalculator calculates R^2 for pairs of events on demand
      +     * @param startPosKeySet a set of starting positions of all events among the haplotypes
      +     * @param ref the reference bases
      +     * @param refLoc the span of the reference bases
      +     * @return true if something was merged, false otherwise
      +     */
      +    protected boolean mergeConsecutiveEventsBasedOnLDOnce( final List haplotypes,
      +                                                           final HaplotypeLDCalculator ldCalculator,
      +                                                           final int nSamples,
      +                                                           final TreeSet startPosKeySet,
      +                                                           final byte[] ref,
      +                                                           final GenomeLoc refLoc ) {
      +        // loop over the set of start locations and consider pairs that start near each other
      +        final Iterator iter = startPosKeySet.iterator();
      +        int thisStart = iter.next();
      +        while( iter.hasNext() ) {
      +            final int nextStart = iter.next();
      +            final LDMergeData toMerge = getPairOfEventsToMerge(haplotypes, thisStart, nextStart);
      +
      +            if ( toMerge.canBeMerged(nSamples) ) {
      +                final double pPhased = ldCalculator.computeProbOfBeingPhased(toMerge.firstVC, toMerge.secondVC);
      +
      +                if( DEBUG ) {
      +                    logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", pPhased));
      +                    logger.info("-- " + toMerge.firstVC);
      +                    logger.info("-- " + toMerge.secondVC);
      +                }
      +
      +                if( pPhased > MERGE_EVENTS_PROB_PHASED_THRESHOLD) {
      +                    final VariantContext mergedVC = createMergedVariantContext(toMerge.firstVC, toMerge.secondVC, ref, refLoc);
      +                    // if for some reason the merging results in a bad allele, mergedVC will be null, and we will just remove first and second
      +                    replaceVariantContextsInMap(haplotypes, startPosKeySet, mergedVC, toMerge.firstVC, toMerge.secondVC);
      +                    return true; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
      +                }
      +            }
      +
      +            thisStart = nextStart;
      +        }
      +
      +        return false;
      +    }
      +
      +    /**
      +     * Info about potential LD merge of two variant contexts
      +     */
      +    private class LDMergeData {
      +        VariantContext firstVC = null, secondVC = null;
      +        boolean canBeMerged = true;
      +
      +        /** Tell this object that it can't be merged for some reason */
      +        public LDMergeData cantBeMerged() {
      +            canBeMerged = false;
      +            return this;
      +        }
      +
      +        /**
      +         * Can these two events be merged
      +         * @param nSamples the number of samples we're considering
      +         * @return true if we can merge our two variant contexts
      +         */
      +        public boolean canBeMerged(final int nSamples) {
      +            if ( ! canBeMerged || firstVC == null || secondVC == null )
      +                return false;
      +
      +            final int distance = secondVC.getStart() - firstVC.getEnd();
      +            if ( firstVC.isSNP() && secondVC.isSNP() ) {
      +                return nSamples >= minSamplesToMergeSNPs && distance <= MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE;
      +            } else {
      +                return nSamples >= minSamplesToMergeOtherEvents && distance <= MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE;
      +            }
      +        }
      +    }
      +
      +    /**
      +     * Get the information about the potential merge of two events starting at thisStart and nextStart
      +     * @param haplotypes our haplotypes
      +     * @param thisStart the starting position of the first event to merge
      +     * @param nextStart the starting position of the next event to merge
      +     * @return an LDMergeData holding the first and second VariantContexts (if found) and whether they may be merged
      +     */
      +    private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) {
      +        final LDMergeData mergeData = new LDMergeData();
      +
      +        for( final Haplotype h : haplotypes ) {
      +            // only make complex substitutions out of consecutive biallelic sites
      +            final VariantContext thisHapVC = h.getEventMap().get(thisStart);
      +            if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype
      +                if( mergeData.firstVC == null ) {
      +                    mergeData.firstVC = thisHapVC;
      +                } else if( !thisHapVC.hasSameAllelesAs( mergeData.firstVC) ) {
      +                    return mergeData.cantBeMerged();
      +                }
      +            }
      +            final VariantContext nextHapVC = h.getEventMap().get(nextStart);
      +            if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype
      +                if( mergeData.secondVC == null ) {
      +                    mergeData.secondVC = nextHapVC;
      +                } else if( !nextHapVC.hasSameAllelesAs( mergeData.secondVC) ) {
      +                    return mergeData.cantBeMerged();
      +                }
      +            }
      +        }
      +
      +        // don't try to merge overlapping events
      +        if ( mergeData.firstVC != null && mergeData.secondVC != null && mergeData.firstVC.getEnd() >= mergeData.secondVC.getStart() )
      +            return mergeData.cantBeMerged();
      +
      +        return mergeData;
      +    }
      +
      +    // BUGBUG: make this merge function more general
      +    protected VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) {
      +        final int thisStart = thisVC.getStart();
      +        final int nextStart = nextVC.getStart();
      +        byte[] refBases = new byte[]{};
      +        byte[] altBases = new byte[]{};
      +        refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases());
      +        altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases());
      +        int locus;
      +        for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
      +            final byte refByte = ref[locus - refLoc.getStart()];
      +            refBases = ArrayUtils.add(refBases, refByte);
      +            altBases = ArrayUtils.add(altBases, refByte);
      +        }
      +        refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel
      +        altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
      +
      +        int iii = 0;
      +        if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele
      +            while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; }
      +            if ( iii == refBases.length ) {
      +                // we've become a null allele, such as with CA/C + A/AA -> CA/CA => after trimming there's nothing left
      +                // so return a null variant context so we can eliminate the variants from consideration
      +                return null;
      +            }
      +        }
      +
      +
      +        final Allele refAllele = Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true );
      +        final Allele altAllele =  Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false );
      +        return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), Arrays.asList(refAllele, altAllele)).make();
      +    }
      +
      +    /**
      +     * Update the event maps in all haplotypes, replacing update1 and update2 with replacement
      +     *
      +     * @param haplotypes the haplotypes whose event maps we need to update
      +     * @param startPosKeySet a sorted set of start positions that we must update
      +     * @param replacement a VariantContext to replace update1 and update2 with.  Can be null, indicating that we just want to remove update1 and update2
      +     * @param update1 the first VC we want to update
      +     * @param update2 the second VC we want to update
      +     */
      +    private void replaceVariantContextsInMap(final List haplotypes,
      +                                             final TreeSet startPosKeySet,
      +                                             final VariantContext replacement,
      +                                             final VariantContext update1, final VariantContext update2) {
      +        // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
      +        for( final Haplotype h : haplotypes ) {
      +            // if we had both events, add replacement.  In some cases the haplotype may not have both
      +            // events but they were still merged because the haplotype isn't a particularly informative
      +            // haplotype in any case.  The order of operations here is important because we are modifying the map
      +            final boolean shouldAdd = h.getEventMap().containsKey(update1.getStart()) && h.getEventMap().containsKey(update2.getStart());
      +            h.getEventMap().remove(update1.getStart());
      +            h.getEventMap().remove(update2.getStart());
      +            if ( shouldAdd && replacement != null ) {
      +                h.getEventMap().addVC(replacement, false); // cannot merge with other events at the same position
      +            }
      +        }
      +
      +        startPosKeySet.remove(update1.getStart());
      +        startPosKeySet.remove(update2.getStart());
      +        if ( replacement != null ) startPosKeySet.add(replacement.getStart());
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java
      new file mode 100644
      index 000000000..fc47807e0
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java
      @@ -0,0 +1,79 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotype;
      +
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
      +
      +import java.util.List;
      +import java.util.Map;
      +import java.util.TreeSet;
      +
      +/**
      + * Baseclass for code that wants to merge variants together in the haplotype caller
      + *
      + * This root class is basically a no-op, and can be used to not do any merging
      + */
      +public class MergeVariantsAcrossHaplotypes {
      +    /**
      +     * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate
      +     *
      +     * @param haplotypes a list of haplotypes whose events we want to merge
      +     * @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype
      +     * @param startPosKeySet a set of starting positions of all events among the haplotypes
      +     * @param ref the reference bases
      +     * @param refLoc the span of the reference bases
      +     * @return true if anything was merged
      +     */
      +    public boolean merge( final List haplotypes,
      +                          final Map haplotypeReadMap,
      +                          final TreeSet startPosKeySet,
      +                          final byte[] ref,
      +                          final GenomeLoc refLoc ) {
      +        return false;
      +    }
      +}
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java
      new file mode 100644
      index 000000000..54061c781
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java
      @@ -0,0 +1,98 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotypeBAMWriter;
      +
      +import net.sf.samtools.*;
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
      +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
      +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
      +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +
      +import java.util.*;
      +
      +/**
      + * A haplotype bam writer that writes out all haplotypes as reads and then
       + * the alignment of each read to its best match among the best haplotypes.
      + *
      + * Primarily useful for people working on the HaplotypeCaller method itself
      + *
      + * User: depristo
      + * Date: 2/22/13
      + * Time: 1:50 PM
      + */
      +class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
      +    public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
      +        super(bamWriter);
      +    }
      +
      +    /**
      +     * {@inheritDoc}
      +     */
      +    @Override
       +    public void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
       +                                              final GenomeLoc paddedReferenceLoc,
       +                                              final List<Haplotype> bestHaplotypes,
       +                                              final Set<Haplotype> calledHaplotypes,
       +                                              final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
       +        writeHaplotypesAsReads(haplotypes, new HashSet<Haplotype>(bestHaplotypes), paddedReferenceLoc);
      +
      +        // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently
       +        final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<Allele, Haplotype>(haplotypes.size());
      +        for ( final Haplotype haplotype : haplotypes )
      +            alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
      +
      +        // next, output the interesting reads for each sample aligned against the appropriate haplotype
      +        for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
       +            for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
      +                final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
      +                writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart());
      +            }
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java
      similarity index 74%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java
      rename to protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java
      index 18e4bbfc2..d63cf65fc 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java
      @@ -44,78 +44,65 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
      +package org.broadinstitute.sting.utils.haplotypeBAMWriter;
       
      -import net.sf.samtools.SAMFileHeader;
      +import net.sf.samtools.SAMFileWriter;
       import org.broadinstitute.sting.utils.GenomeLoc;
      -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
      +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
      +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
       import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      -import org.testng.Assert;
      -import org.testng.annotations.DataProvider;
      -import org.testng.annotations.Test;
      +import org.broadinstitute.variant.variantcontext.Allele;
       
      -public class SampleStatisticsUnitTest/* extends BaseTest */ {
      +import java.util.*;
       
      -    @DataProvider(name = "QuartileValues")
      -    public Object[][] getQuantileValues() {
      -
      -        int[] a1 = {5};
      -        int[] a2 = {1, 2};
      -        int[] a5 = {10, 20, 30, 40, 50};
      -        int[] a10 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
      -
      -
      -        return new Object[][]{
      -                new Object[]{a1, 0.5, 5},
      -                new Object[]{a1, 0, 5},
      -                new Object[]{a1, 1, 5},
      -                new Object[]{a2, 0.5, 1.5},
      -                new Object[]{a2, 0.25, 1},
      -                new Object[]{a2, 0.75, 2},
      -                new Object[]{a5, 0.5, 30},
      -                new Object[]{a5, 0.25, 20},
      -                new Object[]{a5, 0.75, 40},
      -                new Object[]{a5, 0, -1},
      -                new Object[]{a10, 0.5, 5.5},
      -                new Object[]{a10, 0.25, 3},
      -                new Object[]{a10, 0.75, 8}
      -        };
      +/**
      + * Writes a BAM containing just the reads in stratifiedReadMap aligned to their
      + * most likely haplotype among all of the called haplotypes.
      + *
      + * Primarily useful for users of the HaplotypeCaller who want to better understand the
      + * support of their calls w.r.t. the reads.
      + *
      + * User: depristo
      + * Date: 2/22/13
      + * Time: 1:50 PM
      + */
      +class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
      +    public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) {
      +        super(bamWriter);
           }
       
      -    @Test(dataProvider = "QuartileValues")
      -    public void testGetQuartile(int[] dataList, double percentage, double expected) {
      -        Assert.assertEquals(SampleStatistics.getQuartile(dataList, percentage), expected);
      +    /**
      +     * {@inheritDoc}
      +     */
      +    @Override
       +    public void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
       +                                              final GenomeLoc paddedReferenceLoc,
       +                                              final List<Haplotype> bestHaplotypes,
       +                                              final Set<Haplotype> calledHaplotypes,
       +                                              final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
      +        if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes
      +            return;
       
      +        writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc);
      +
      +        // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently
       +        final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<Allele, Haplotype>(haplotypes.size());
      +        for ( final Haplotype haplotype : calledHaplotypes ) {
      +            alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
      +        }
      +
      +        // the set of all alleles that were actually called
       +        final Set<Allele> allelesOfCalledHaplotypes = alleleToHaplotypeMap.keySet();
      +
      +        // next, output the interesting reads for each sample aligned against one of the called haplotypes
      +        for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
       +            for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
      +                if ( entry.getKey().getMappingQuality() > 0 ) {
      +                    final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
      +                    writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart());
      +                }
      +            }
      +        }
           }
      -
      -    @DataProvider(name = "ReadsAndMates")
      -    public Object[][] getReadAndMates() {
      -        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
      -
      -        GATKSAMRecord noPair = ArtificialSAMUtils.createArtificialRead(header, "test", 0, 100, 50);
      -        GATKSAMRecord good = ArtificialSAMUtils.createPair(header, "test", 30, 100, 150, true, false).get(0);
      -        GATKSAMRecord bigInsertSize = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, false).get(0);
      -        GATKSAMRecord inverted = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0);
      -        GATKSAMRecord sameOrientation = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, true).get(0);
      -
      -        GATKSAMRecord pairNotMapped = ArtificialSAMUtils.createPair(header, "test", 30, 100, 140, true, false).get(1);
      -        pairNotMapped.setMateUnmappedFlag(true);
      -
      -        // finish test
      -        return new Object[][]{
      -                new Object[]{noPair, false},
      -                new Object[]{good, true},
      -                new Object[]{bigInsertSize, false},
      -                new Object[]{inverted, false},
      -                new Object[]{sameOrientation, false},
      -                new Object[]{pairNotMapped, false}
      -        };
      -    }
      -
      -    @Test(dataProvider = "ReadsAndMates")
      -    public void testHasValidMate(GATKSAMRecord read, boolean expected) {
      -        //50 is out maximum insert size
      -        Assert.assertEquals(new SampleStatistics(GenomeLoc.UNMAPPED).hasValidMate(read, ThresHolder.DEFAULTS), expected);
      -    }
      -
       }
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java
      new file mode 100644
      index 000000000..2eea664d9
      --- /dev/null
      +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java
      @@ -0,0 +1,304 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotypeBAMWriter;
      +
      +import net.sf.samtools.*;
      +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
      +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
      +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
      +import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      +
      +import java.util.*;
      +
      +/**
      + * A BAMWriter that aligns reads to haplotypes and emits their best alignments to a BAM file
      + *
      + * User: depristo
      + * Date: 2/22/13
      + * Time: 2:59 PM
      + */
      +public abstract class HaplotypeBAMWriter {
      +    /**
      +     * Allows us to write out unique names for our synthetic haplotype reads
      +     */
      +    private long uniqueNameCounter = 1;
      +
      +    protected final static String READ_GROUP_ID = "ArtificialHaplotype";
      +    protected final static String HAPLOTYPE_TAG = "HC";
      +
      +    final SAMFileWriter bamWriter;
      +    final SAMFileHeader bamHeader;
      +
      +    /**
      +     * Possible modes for writing haplotypes to BAMs
      +     */
      +    public static enum Type {
      +        /**
      +         * A mode that's for method developers.  Writes out all of the possible
      +         * haplotypes considered, as well as reads aligned to each
      +         */
      +        ALL_POSSIBLE_HAPLOTYPES,
      +
      +        /**
      +         * A mode for users.  Writes out the reads aligned only to the called
      +         * haplotypes.  Useful to understand why the caller is calling what it is
      +         */
      +        CALLED_HAPLOTYPES
      +    }
      +
      +    /**
      +     * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer
      +     *
      +     * @param type the type of the writer we want to create
      +     * @param stingSAMWriter the destination, must not be null
      +     * @param header the header of the input BAMs used to make calls, must not be null
      +     * @return a new HaplotypeBAMWriter
      +     */
      +    public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) {
      +        if ( header == null ) throw new IllegalArgumentException("header cannot be null");
      +        if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null");
      +        if ( type == null ) throw new IllegalArgumentException("type cannot be null");
      +
      +        // prepare the bam header
      +        final SAMFileHeader bamHeader = new SAMFileHeader();
      +        bamHeader.setSequenceDictionary(header.getSequenceDictionary());
      +        bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
      +
      +        // include the original read groups plus a new artificial one for the haplotypes
       +        final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(header.getReadGroups());
      +        final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID);
      +        rg.setSample("HC");
      +        rg.setSequencingCenter("BI");
      +        readGroups.add(rg);
      +        bamHeader.setReadGroups(readGroups);
      +
      +        // TODO -- this will be a performance problem at high-scale
      +        stingSAMWriter.setPresorted(false);
      +        stingSAMWriter.writeHeader(bamHeader);
      +        return create(type, stingSAMWriter);
      +    }
      +
      +    /**
      +     * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer
      +     *
      +     * Note that writer must have its presorted bit set to false, as reads
      +     * may come in out of order during writing
      +     *
      +     * @param type the type of the writer we want to create
      +     * @param writer the destination, must not be null
      +     * @return a new HaplotypeBAMWriter
      +     */
      +    public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) {
      +        if ( writer == null ) throw new IllegalArgumentException("writer cannot be null");
      +        if ( type == null ) throw new IllegalArgumentException("type cannot be null");
      +
      +        switch ( type ) {
      +            case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer);
      +            case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer);
      +            default: throw new IllegalArgumentException("Unknown type " + type);
      +        }
      +    }
      +
      +    /**
      +     * Create a new HaplotypeBAMWriter writing its output to bamWriter
      +     *
      +     * Assumes that the header has been fully initialized with a single
      +     * read group READ_GROUP_ID
      +     *
      +     * @param bamWriter our output destination
      +     */
      +    protected HaplotypeBAMWriter(SAMFileWriter bamWriter) {
      +        this.bamWriter = bamWriter;
      +        this.bamHeader = bamWriter.getFileHeader();
      +    }
      +
      +    /**
      +     * Write out a BAM representing for the haplotype caller at this site
      +     *
      +     * @param haplotypes a list of all possible haplotypes at this loc
       +     * @param paddedReferenceLoc the span of the padded reference here
      +     * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping
       +     * @param calledHaplotypes a list of the haplotypes that were actually called as non-reference
      +     * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes
      +     */
       +    public abstract void writeReadsAlignedToHaplotypes(final List<Haplotype> haplotypes,
       +                                                       final GenomeLoc paddedReferenceLoc,
       +                                                       final List<Haplotype> bestHaplotypes,
       +                                                       final Set<Haplotype> calledHaplotypes,
       +                                                       final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap);
      +
      +    /**
      +     * Write out read aligned to haplotype to the BAM file
      +     *
       +     * Aligns the read to the haplotype, and then projects this alignment of read -> hap onto the reference
      +     * via the alignment of haplotype (via its getCigar) method.
      +     *
      +     * @param originalRead the read we want to write aligned to the reference genome
      +     * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
      +     * @param referenceStart the start of the reference that haplotype is aligned to.  Provides global coordinate frame.
      +     */
      +    protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead,
      +                                             final Haplotype haplotype,
      +                                             final int referenceStart) {
      +        final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart);
      +        if ( alignedToRef != null )
      +            bamWriter.addAlignment(alignedToRef);
      +    }
      +
      +    /**
       +     * Aligns the read to the haplotype, and then projects this alignment of read -> hap onto the reference
      +     * via the alignment of haplotype (via its getCigar) method.
      +     *
      +     * @param originalRead the read we want to write aligned to the reference genome
      +     * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
      +     * @param referenceStart the start of the reference that haplotype is aligned to.  Provides global coordinate frame.
      +     * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible
      +     */
      +    protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead,
      +                                                   final Haplotype haplotype,
      +                                                   final int referenceStart) {
      +        if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null");
      +        if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null");
      +        if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype);
      +        if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart);
      +
      +        try {
      +            // compute the smith-waterman alignment of read -> haplotype
      +            final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), Path.NEW_SW_PARAMETERS);
      +            //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases());
      +            if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 )
      +                // sw can fail (reasons not clear) so if it happens just don't write the read
      +                return null;
      +            final Cigar swCigar = AlignmentUtils.consolidateCigar(swPairwiseAlignment.getCigar());
      +
      +            // since we're modifying the read we need to clone it
      +            final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone();
      +
      +            addHaplotypeTag(read, haplotype);
      +
      +            // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar
      +            final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000);
      +            final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1());
      +            final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype;
      +            read.setAlignmentStart(readStartOnReference);
      +
      +            // compute the read -> ref alignment by mapping read -> hap -> ref from the
      +            // SW of read -> hap mapped through the given by hap -> ref
      +            final Cigar haplotypeToRef = AlignmentUtils.trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1);
      +            final Cigar readToRefCigarRaw = AlignmentUtils.applyCigarToCigar(swCigar, haplotypeToRef);
      +            final Cigar readToRefCigarClean = AlignmentUtils.cleanUpCigar(readToRefCigarRaw);
      +            final Cigar readToRefCigar = AlignmentUtils.leftAlignIndel(readToRefCigarClean, haplotype.getBases(),
      +                    originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true);
      +
      +            read.setCigar(readToRefCigar);
      +
      +            if ( readToRefCigar.getReadLength() != read.getReadLength() )
      +                throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength()
      +                        + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength()
      +                        + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength());
      +
      +            return read;
      +        } catch ( CloneNotSupportedException e ) {
      +            throw new IllegalStateException("GATKSAMRecords should support clone but this one does not " + originalRead);
      +        }
      +    }
      +
      +    /**
      +     * Add a haplotype tag to the read based on haplotype
      +     *
      +     * @param read the read to add the tag to
       +     * @param haplotype the haplotype that gives rise to the read
      +     */
      +    private void addHaplotypeTag(final GATKSAMRecord read, final Haplotype haplotype) {
      +        // add a tag to the read that indicates which haplotype it best aligned to.  It's a uniquish integer
      +        read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode());
      +    }
      +
      +    /**
      +     * Write out haplotypes as reads to the BAM, marking specifically those that are among the best haplotypes
      +     *
      +     * @param haplotypes a collection of haplotypes to write to the BAM
      +     * @param bestHaplotypes a subset of haplotypes that contains those that are best "either good or called"
      +     * @param paddedReferenceLoc the genome loc of the padded reference
      +     */
      +    protected void writeHaplotypesAsReads(final Collection haplotypes,
      +                                          final Set bestHaplotypes,
      +                                          final GenomeLoc paddedReferenceLoc) {
      +        for ( final Haplotype haplotype : haplotypes )
      +            writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
      +    }
      +
      +    /**
      +     * Write out a representation of this haplotype as a read
      +     *
      +     * @param haplotype a haplotype to write out.  Cannot be null
      +     * @param paddedRefLoc the reference location.  Cannot be null
      +     * @param isAmongBestHaplotypes true if among the best haplotypes, false if it was just one possible but not so good
      +     */
      +    private void writeHaplotype(final Haplotype haplotype,
      +                                final GenomeLoc paddedRefLoc,
      +                                final boolean isAmongBestHaplotypes) {
      +        final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
      +        record.setReadBases(haplotype.getBases());
      +        record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
      +        record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
      +        record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar()));
      +        record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0);
      +        record.setReadName("HC" + uniqueNameCounter++);
      +        addHaplotypeTag(record, haplotype);
      +        record.setReadUnmappedFlag(false);
      +        record.setReferenceIndex(paddedRefLoc.getContigIndex());
      +        record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID);
      +        record.setFlags(16);
      +        bamWriter.addAlignment(record);
      +    }
      +}
      \ No newline at end of file
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java
      similarity index 69%
      rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java
      rename to protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java
      index 6f8bec94f..ab2a5bb2a 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java
      @@ -55,34 +55,27 @@ import org.broadinstitute.sting.utils.QualityUtils;
        * User: rpoplin, carneiro
        * Date: 10/16/12
        */
      -public class LoglessCachingPairHMM extends PairHMM {
      -    protected static final double SCALE_FACTOR_LOG10 = 300.0;
      +public final class LoglessPairHMM extends PairHMM {
      +    protected static final double INITIAL_CONDITION = Math.pow(2, 1020);
      +    protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
       
      -    double[][] constantMatrix = null; // The cache
      -    double[][] distanceMatrix = null; // The cache
      -    boolean constantsAreInitialized = false;
      +    private static final int matchToMatch = 0;
      +    private static final int indelToMatch = 1;
      +    private static final int matchToInsertion = 2;
      +    private static final int insertionToInsertion = 3;
      +    private static final int matchToDeletion = 4;
      +    private static final int deletionToDeletion = 5;
       
      -    /**
      -     * Cached data structure that describes the first row's edge condition in the HMM
      -     */
      -    protected static final double [] firstRowConstantMatrix = {
      -            QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
      -            QualityUtils.qualToProb(DEFAULT_GCP),
      -            QualityUtils.qualToErrorProb(DEFAULT_GOP),
      -            QualityUtils.qualToErrorProb(DEFAULT_GCP),
      -            1.0,
      -            1.0
      -    };
       
           /**
            * {@inheritDoc}
            */
           @Override
      -    public void initialize( final int readMaxLength, final int haplotypeMaxLength) {
      +    public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
               super.initialize(readMaxLength, haplotypeMaxLength);
       
      -        constantMatrix = new double[X_METRIC_MAX_LENGTH][6];
      -        distanceMatrix = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH];
      +        transition = new double[paddedMaxReadLength][6];
      +        prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
           }
       
           /**
      @@ -97,26 +90,35 @@ public class LoglessCachingPairHMM extends PairHMM {
                                                                      final byte[] overallGCP,
                                                                      final int hapStartIndex,
                                                                      final boolean recacheReadValues ) {
      -        if ( ! constantsAreInitialized || recacheReadValues )
      -            initializeConstants( haplotypeBases.length, readBases.length, insertionGOP, deletionGOP, overallGCP );
      -        initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
       
      -        // NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this
      -        // read and haplotype lengths, not the max lengths
      -        final int readXMetricLength = readBases.length + 2;
      -        final int hapYMetricLength = haplotypeBases.length + 2;
      -
      -        for (int i = 2; i < readXMetricLength; i++) {
      -            // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
      -            for (int j = hapStartIndex+1; j < hapYMetricLength; j++) {
      -                updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
      +        if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) {
      +            final double initialValue = INITIAL_CONDITION / haplotypeBases.length;
      +            // set the initial value (free deletions in the beginning) for the first row in the deletion matrix
      +            for( int j = 0; j < paddedHaplotypeLength; j++ ) {
      +                deletionMatrix[0][j] = initialValue;
                   }
               }
       
      -        // final probability is the log10 sum of the last element in all three state arrays
      -        final int endI = readXMetricLength - 1;
      -        final int endJ = hapYMetricLength - 1;
      -        return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
      +        if ( ! constantsAreInitialized || recacheReadValues )
      +            initializeProbabilities(insertionGOP, deletionGOP, overallGCP);
      +        initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
      +
      +        for (int i = 1; i < paddedReadLength; i++) {
      +            // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
      +            for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) {
      +                updateCell(i, j, prior[i][j], transition[i]);
      +            }
      +        }
      +
      +        // final probability is the log10 sum of the last element in the Match and Insertion state arrays
      +        // this way we ignore all paths that ended in deletions! (huge)
      +        // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
      +        final int endI = paddedReadLength - 1;
      +        double finalSumProbabilities = 0.0;
      +        for (int j = 1; j < paddedHaplotypeLength; j++) {
      +            finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j];
      +        }
      +        return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10;
           }
       
           /**
      @@ -128,10 +130,7 @@ public class LoglessCachingPairHMM extends PairHMM {
            * @param readQuals      the base quality scores of the read
            * @param startIndex     where to start updating the distanceMatrix (in case this read is similar to the previous read)
            */
      -    public void initializeDistanceMatrix( final byte[] haplotypeBases,
      -                                          final byte[] readBases,
      -                                          final byte[] readQuals,
      -                                          final int startIndex ) {
      +    public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
       
               // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
               // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
      @@ -141,7 +140,7 @@ public class LoglessCachingPairHMM extends PairHMM {
                   final byte qual = readQuals[i];
                   for (int j = startIndex; j < haplotypeBases.length; j++) {
                       final byte y = haplotypeBases[j];
      -                distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
      +                prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
                               QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
                   }
               }
      @@ -150,46 +149,26 @@ public class LoglessCachingPairHMM extends PairHMM {
           /**
            * Initializes the matrix that holds all the constants related to quality scores.
            *
      -     * @param haplotypeSize the number of bases in the haplotype we are testing
      -     * @param readSize the number of bases in the read we are testing
            * @param insertionGOP   insertion quality scores of the read
            * @param deletionGOP    deletion quality scores of the read
            * @param overallGCP     overall gap continuation penalty
            */
           @Requires({
      -            "haplotypeSize > 0",
      -            "readSize > 0",
      -            "insertionGOP != null && insertionGOP.length == readSize",
      -            "deletionGOP != null && deletionGOP.length == readSize",
      -            "overallGCP != null && overallGCP.length == readSize"
      +            "insertionGOP != null",
      +            "deletionGOP != null",
      +            "overallGCP != null"
           })
           @Ensures("constantsAreInitialized")
      -    private void initializeConstants( final int haplotypeSize,
      -                                      final int readSize,
      -                                      final byte[] insertionGOP,
      -                                      final byte[] deletionGOP,
      -                                      final byte[] overallGCP ) {
      -        // the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init
      -        matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10) / getNPotentialXStarts(haplotypeSize, readSize);
      -
      -        // fill in the first row
      -        for( int jjj = 2; jjj < Y_METRIC_MAX_LENGTH; jjj++ ) {
      -            updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
      -        }
      -
      -        final int l = insertionGOP.length;
      -        constantMatrix[1] = firstRowConstantMatrix;
      -        for (int i = 0; i < l; i++) {
      +    private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
      +        for (int i = 0; i < insertionGOP.length; i++) {
                   final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
      -            constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
      -            constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
      -            constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
      -            constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
      -            constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
      -            constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
      +            transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP);
      +            transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]);
      +            transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]);
      +            transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
      +            transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
      +            transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
               }
      -        constantMatrix[l+1][4] = 1.0;
      -        constantMatrix[l+1][5] = 1.0;
       
               // note that we initialized the constants
               constantsAreInitialized = true;
      @@ -204,18 +183,14 @@ public class LoglessCachingPairHMM extends PairHMM {
            * @param indI             row index in the matrices to update
            * @param indJ             column index in the matrices to update
            * @param prior            the likelihood editing distance matrix for the read x haplotype
      -     * @param constants        an array with the six constants relevant to this location
      -     * @param matchMetricArray the matches likelihood matrix
      -     * @param XMetricArray     the insertions likelihood matrix
      -     * @param YMetricArray     the deletions likelihood matrix
       +     * @param transition        an array with the six transitions relevant to this location
            */
      -    private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
      -                             final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
      +    private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
       
      -        matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
      -                                                 XMetricArray[indI - 1][indJ - 1] * constants[1] +
      -                                                 YMetricArray[indI - 1][indJ - 1] * constants[1] );
      -        XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
      -        YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
      +        matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] +
      +                                                 insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] +
      +                                                 deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] );
      +        insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion];
      +        deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion];
           }
       }
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java
      index 113ea2222..3f8fd0e88 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java
      @@ -62,6 +62,9 @@ public class BQSRReadTransformer extends ReadTransformer {
           private boolean enabled;
           private BaseRecalibration bqsr = null;
       
      +    @Override
      +    public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; }
      +
           @Override
           public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
               this.enabled = engine.hasBQSRArgumentSet();
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java
      index 6d98803c9..ae6b56e19 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java
      @@ -46,6 +46,8 @@
       
       package org.broadinstitute.sting.utils.recalibration;
       
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Requires;
       import net.sf.samtools.SAMFileHeader;
       import org.apache.log4j.Logger;
       import org.broadinstitute.sting.gatk.report.GATKReport;
      @@ -59,7 +61,6 @@ import org.broadinstitute.sting.utils.R.RScriptExecutor;
       import org.broadinstitute.sting.utils.Utils;
       import org.broadinstitute.sting.utils.classloader.PluginManager;
       import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
      -import org.broadinstitute.sting.utils.collections.NestedHashMap;
       import org.broadinstitute.sting.utils.collections.Pair;
       import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
       import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
      @@ -81,7 +82,7 @@ import java.util.*;
        *
        * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
        * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
      - * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration.
      + * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads.
        */
       
       public class RecalUtils {
      @@ -423,7 +424,7 @@ public class RecalUtils {
       
           private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) {
       
      -        final NestedHashMap deltaTable = new NestedHashMap();
      +        final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length);
       
               // add the quality score table to the delta table
               final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable();
      @@ -470,24 +471,57 @@ public class RecalUtils {
                   covariateNameMap.put(covariate, parseCovariateName(covariate));
       
               // print each data line
      -        for (final NestedHashMap.Leaf leaf : deltaTable.getAllLeaves()) {
      +        for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) {
                   final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap);
      -            final RecalDatum deltaDatum = (RecalDatum)leaf.value;
      +            final RecalDatum deltaDatum = leaf.value;
                   deltaTableFile.print(Utils.join(",", deltaKeys));
                   deltaTableFile.print("," + deltaDatum.stringForCSV());
                   deltaTableFile.println("," + recalibrationMode);
               }
           }
       
      -    protected static List generateValuesFromKeys(final List keys, final Covariate[] covariates, final Map covariateNameMap) {
      +    /*
      +     * Return an initialized nested integer array with appropriate dimensions for use with the delta tables
      +     *
      +     * @param recalibrationTables     the recal tables
      +     * @param numCovariates           the total number of covariates being used
      +     * @return a non-null nested integer array
      +     */
      +    @Requires("recalibrationTables != null && numCovariates > 0")
      +    @Ensures("result != null")
      +    private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) {
      +
      +        final int[] dimensionsForDeltaTable = new int[4];
      +
      +        // initialize the dimensions with those of the qual table to start with
      +        final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable();
      +        final int[] dimensionsOfQualTable = qualTable.getDimensions();
      +        dimensionsForDeltaTable[0] = dimensionsOfQualTable[0];    // num read groups
      +        dimensionsForDeltaTable[1] = numCovariates + 1;           // num covariates
      +        dimensionsForDeltaTable[2] = dimensionsOfQualTable[1];
      +        dimensionsForDeltaTable[3] = dimensionsOfQualTable[2];
      +
      +        // now, update the dimensions based on the optional covariate tables as needed
      +        for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) {
      +            final NestedIntegerArray covTable = recalibrationTables.getTable(i);
      +            final int[] dimensionsOfCovTable = covTable.getDimensions();
      +            dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]);
      +            dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]);
      +        }
      +
      +        return new NestedIntegerArray(dimensionsForDeltaTable);
      +    }
      +
      +    protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) {
               final List values = new ArrayList(4);
      -        values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey((Integer)keys.get(0)));
      -        final int covariateIndex = (Integer)keys.get(1);
      +        values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0]));
      +
      +        final int covariateIndex = keys[1];
      +        final int covariateKey = keys[2];
               final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex];
      -        final int covariateKey = (Integer)keys.get(2);
               values.add(covariate.formatKey(covariateKey));
               values.add(covariateNameMap.get(covariate));
      -        values.add(EventType.eventFrom((Integer)keys.get(3)).prettyPrint());
      +        values.add(EventType.eventFrom(keys[3]).prettyPrint());
       
               return values;
           }
      @@ -501,20 +535,14 @@ public class RecalUtils {
            * @param deltaKey the key to the table
            * @param recalDatum the recal datum to combine with the accuracyDatum element in the table
            */
      -    private static void addToDeltaTable(final NestedHashMap deltaTable, final int[] deltaKey, final RecalDatum recalDatum) {
      -        Object[] wrappedKey = wrapKeys(deltaKey);
      -        final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(wrappedKey); // check if we already have a RecalDatum for this key
      +    private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) {
      +        final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
               if (deltaDatum == null)
      -            deltaTable.put(new RecalDatum(recalDatum), wrappedKey); // if we don't have a key yet, create a new one with the same values as the curent datum
      +            // if we don't have a key yet, create a new one with the same values as the current datum
      +            deltaTable.put(new RecalDatum(recalDatum), deltaKey);
               else
      -            deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one.
      -    }
      -
      -    private static Object[] wrapKeys(final int[] keys) {
      -        final Object[] wrappedKeys = new Object[keys.length];
      -        for (int i = 0; i < keys.length; i++)
      -            wrappedKeys[i] = keys[i];
      -        return wrappedKeys;
      +            // if we do have a datum, combine it with this one
      +            deltaDatum.combine(recalDatum);
           }
       
           /**
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
      index a3fec6a22..ea45c2abf 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
      @@ -372,4 +372,11 @@ public class RecalibrationReport {
           public Covariate[] getCovariates() {
               return requestedCovariates;
           }
      +
      +    /**
      +     * @return true if the report has no data
      +     */
      +    public boolean isEmpty() {
      +        return recalibrationTables.isEmpty();
      +    }
       }
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java
      index 15b6c8571..7d1a9f956 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java
      @@ -124,6 +124,16 @@ public final class RecalibrationTables {
               return tables.size();
           }
       
      +    /**
      +     * @return true if all the tables contain no RecalDatums
      +     */
      +    public boolean isEmpty() {
      +        for( final NestedIntegerArray table : tables ) {
      +            if( !table.getAllValues().isEmpty() ) { return false; }
      +        }
      +        return true;
      +    }
      +
           /**
            * Allocate a new quality score table, based on requested parameters
            * in this set of tables, without any data in it.  The return result
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java
      index bcb42f7ef..f585299f4 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java
      @@ -237,7 +237,9 @@ public class CycleCovariate implements StandardCovariate {
       
               // Unknown platforms
               else {
      -            throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
      +            throw new UserException("The platform (" + read.getReadGroup().getPlatform()
      +                    + ") associated with read group " + read.getReadGroup()
      +                    + " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString());
               }
           }
       
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java
      index 5469b38c8..a16fdcaa1 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java
      @@ -53,17 +53,17 @@ package org.broadinstitute.sting.utils.recalibration.covariates;
        * [Functionality of this walker]
        * 

      *

      - *

      Input

      + *

      Input

      *

      * [Input description] *

      *

      - *

      Output

      + *

      Output

      *

      * [Output description] *

      *

      - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java
      index bb55ed0c5..4267c1ffd 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java
      @@ -53,17 +53,17 @@ package org.broadinstitute.sting.utils.recalibration.covariates;
        * [Functionality of this walker]
        * 

      *

      - *

      Input

      + *

      Input

      *

      * [Input description] *

      *

      - *

      Output

      + *

      Output

      *

      * [Output description] *

      *

      - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java
      index 9ade37019..045b21527 100644
      --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java
      +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java
      @@ -53,17 +53,17 @@ package org.broadinstitute.sting.utils.recalibration.covariates;
        * [Functionality of this walker]
        * 

      *

      - *

      Input

      + *

      Input

      *

      * [Input description] *

      *

      - *

      Output

      + *

      Output

      *

      * [Output description] *

      *

      - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
      index d82d920a8..5866075a7 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
      @@ -226,15 +226,29 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
           }
       
           @Test
      -    public void testSnpEffAnnotationsUnsupportedVersion() {
      +    public void testSnpEffAnnotationsUnsupportedVersionGATKMode() {
               WalkerTestSpec spec = new WalkerTestSpec(
      -            "-T VariantAnnotator -R " + hg19Reference + " --no_cmdline_in_header -o %s -A SnpEff --variant " +
      -            validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile  " + validationDataLocation +
      -            "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000",
      +            "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " +
      +            "--variant " + privateTestDir + "vcf4.1.example.vcf " +
      +            "--snpEffFile  " + privateTestDir + "snpEff_unsupported_version_gatk_mode.vcf " +
      +            "-L 1:10001292-10012424",
      +            1,
      +            Arrays.asList("7352cf23a4d45d3d2bb34ab44a4100ae")
      +        );
      +        executeTest("Testing SnpEff annotations (unsupported version, GATK mode)", spec);
      +    }
      +
      +    @Test
      +    public void testSnpEffAnnotationsUnsupportedVersionNoGATKMode() {
      +        WalkerTestSpec spec = new WalkerTestSpec(
      +            "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " +
      +            "--variant " + privateTestDir + "vcf4.1.example.vcf " +
      +            "--snpEffFile  " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " +
      +            "-L 1:10001292-10012424",
                   1,
                   UserException.class
               );
      -        executeTest("Testing SnpEff annotations (unsupported version)", spec);
      +        executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec);
           }
       
           @Test
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
      index f82f24439..658b8527d 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
      @@ -69,6 +69,7 @@ public class BQSRGathererUnitTest extends BaseTest {
           private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table");
           private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table");
           private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table");
      +    private static File recalEmpty = new File(privateTestDir + "HiSeq.1mb.1RG.empty.table");
       
           private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table");
       
      @@ -110,6 +111,26 @@ public class BQSRGathererUnitTest extends BaseTest {
               testReports(originalReport, calculatedReport);
           }
       
      +    @Test(enabled = true)
      +    public void testGatherBQSRWithEmptyFile() {
      +        BQSRGatherer gatherer = new BQSRGatherer();
      +        List recalFiles = new LinkedList ();
      +        final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
      +
      +        recalFiles.add(recal1);
      +        recalFiles.add(recal2);
      +        recalFiles.add(recal3);
      +        recalFiles.add(recal4);
      +        recalFiles.add(recal5);
      +        recalFiles.add(recalEmpty);
      +        gatherer.gather(recalFiles, output);
      +
      +        GATKReport originalReport = new GATKReport(recal_original);
      +        GATKReport calculatedReport = new GATKReport(output);
      +
      +        testReports(originalReport, calculatedReport);
      +    }
      +
           private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) {
       
               // test the Arguments table
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
      index 8a40b44e6..907046704 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
      @@ -82,7 +82,7 @@ public class BQSRIntegrationTest extends WalkerTest {
                           " -I " + bam +
                           " -L " + interval +
                           args +
      -                    " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) +
      +                    " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : (reference.equals(b37KGReference) ? b37dbSNP129 : hg18dbSNP132)) +
                           " --allow_potentially_misencoded_quality_scores" +  // TODO -- remove me when we get new SOLiD bams
                           " -o %s" +
                           " -sortAllCols";
      @@ -115,6 +115,8 @@ public class BQSRIntegrationTest extends WalkerTest {
                       {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "c1c3cda8caceed619d3d439c3990cd26")},
                       {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "c9953f020a65c1603a6d71aeeb1b95f3")},
                       {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5bfff0c699345cca12a9b33acf95588f")},
       +                // make sure we work with an Ion Torrent BAM
      +                {new BQSRTest(b37KGReference, privateTestDir + "iontorrent.bam", "20:10,000,000-10,200,000", "", "7375c7b692e76b651c278a9fb478fa1c")},
               };
           }
       
      @@ -151,7 +153,7 @@ public class BQSRIntegrationTest extends WalkerTest {
                               " -sortAllCols" +
                               " --plot_pdf_file /dev/null" +
                               " --intermediate_csv_file %s",
      -                Arrays.asList("dd6e0e1e3f53f8ae0c8f5de21ded6ee9"));
      +                Arrays.asList("90ad19143024684e3c4410dc8fd2bd9d"));
               executeTest("testBQSR-CSVfile", spec);
           }
       
      @@ -257,4 +259,17 @@ public class BQSRIntegrationTest extends WalkerTest {
                       UserException.class);
               executeTest("testPRFailWithLowMaxCycle", spec);
           }
      +
      +    @Test
      +    public void testPRFailWithBadPL() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                " -T BaseRecalibrator" +
      +                        " -R " + b37KGReference +
      +                        " -I " + privateTestDir + "badPLForBQSR.bam" +
      +                        " -L 1:10,000,000-10,200,000" +
      +                        " -o %s",
      +                1,
      +                UserException.class);
      +        executeTest("testPRFailWithBadPL", spec);
      +    }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java
      index 7f41836fa..f988471a0 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java
      @@ -179,7 +179,7 @@ public class BaseCountsUnitTest extends BaseTest {
               BaseCounts counts = new BaseCounts();
       
               for ( int qual : test.quals )
      -            counts.incr(BaseIndex.A, (byte)qual);
      +            counts.incr(BaseIndex.A, (byte)qual, 20, false);
       
               final int actualSum = (int)counts.getSumQuals((byte)'A');
               final int expectedSum = qualSum(test.quals);
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java
      index c48c7cdc7..32791dd97 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java
      @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
       
       
       import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.MathUtils;
       import org.testng.Assert;
       import org.testng.annotations.DataProvider;
       import org.testng.annotations.Test;
      @@ -118,28 +119,27 @@ public class HeaderElementUnitTest extends BaseTest {
               Assert.assertFalse(headerElement.hasFilteredData());
               Assert.assertFalse(headerElement.hasInsertionToTheRight());
               Assert.assertTrue(headerElement.isEmpty());
      -        Assert.assertEquals(headerElement.getRMS(), 0.0);
           }
       
           private void testHeaderData(final HeaderElement headerElement, final HETest test) {
      -        Assert.assertEquals(headerElement.getRMS(), (double)test.MQ);
               Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip);
               Assert.assertFalse(headerElement.isEmpty());
               Assert.assertFalse(headerElement.hasInsertionToTheRight());
      -        Assert.assertEquals(headerElement.hasConsensusData(), headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual));
      -        Assert.assertEquals(headerElement.hasFilteredData(), !headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual));
      -        Assert.assertFalse(headerElement.isVariantFromMismatches(0.05));
      -        Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip);
      +        Assert.assertEquals(headerElement.hasConsensusData(), test.MQ >= minMappingQual);
      +        Assert.assertEquals(headerElement.hasFilteredData(), test.MQ < minMappingQual);
      +        Assert.assertEquals(headerElement.hasConsensusData() ? headerElement.getConsensusBaseCounts().getRMS() :  headerElement.getFilteredBaseCounts().getRMS(), (double)test.MQ);
      +        Assert.assertFalse(headerElement.isVariantFromMismatches(0.05, 0.05));
      +        Assert.assertEquals(headerElement.isVariant(0.05, 0.05, 0.05), test.isClip);
           }
       
       
           private class AllelesTest {
               public final int[] counts;
      -        public final double proportion;
      +        public final double pvalue;
       
      -        private AllelesTest(final int[] counts, final double proportion) {
      +        private AllelesTest(final int[] counts, final double pvalue) {
                   this.counts = counts;
      -            this.proportion = proportion;
      +            this.pvalue = pvalue;
               }
           }
       
      @@ -148,14 +148,16 @@ public class HeaderElementUnitTest extends BaseTest {
               List tests = new ArrayList();
       
               final int[] counts = new int[]{ 0, 5, 10, 15, 20 };
      -        final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 };
      +        final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 };
       
      -        for ( final int count1 : counts ) {
      -            for ( final int count2 : counts ) {
      -                for ( final int count3 : counts ) {
      -                    for ( final int count4 : counts ) {
      -                        for ( final double proportion : proportions ) {
      -                            tests.add(new Object[]{new AllelesTest(new int[]{count1, count2, count3, count4}, proportion)});
      +        for ( final int countA : counts ) {
      +            for ( final int countC : counts ) {
      +                for ( final int countG : counts ) {
      +                    for ( final int countT : counts ) {
      +                        for ( final int countD : counts ) {
      +                            for ( final double pvalue : pvalues ) {
      +                                tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)});
      +                            }
                               }
                           }
                       }
      @@ -170,30 +172,43 @@ public class HeaderElementUnitTest extends BaseTest {
       
               HeaderElement headerElement = new HeaderElement(1000, 0);
               for ( int i = 0; i < test.counts.length; i++ ) {
      -            BaseIndex base = BaseIndex.values()[i];
      +            final BaseIndex base = BaseIndex.values()[i];
                   for ( int j = 0; j < test.counts[i]; j++ )
                       headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false);
               }
       
      -        final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion);
      -        final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion);
      +        final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue, test.pvalue);
      +        final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue);
       
               Assert.assertEquals(nAllelesSeen, nAllelesExpected);
           }
       
      -    private static int calculateExpectedAlleles(final int[] counts, final double proportion) {
      -        double total = 0.0;
      +    private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) {
      +        int total = 0;
               for ( final int count : counts ) {
                   total += count;
               }
       
      -        final int minCount = (int)(proportion * total);
      -
               int result = 0;
      -        for ( final int count : counts ) {
      -            if ( count > 0 && count >= minCount )
      +        for ( int index = 0; index < counts.length; index++ ) {
      +            final int count = counts[index];
      +            if ( count == 0 )
      +                continue;
      +
      +            final boolean isSignificant;
      +            if ( count <= HeaderElement.MIN_COUNT_FOR_USING_PVALUE ) {
      +                isSignificant = MathUtils.binomialCumulativeProbability(total, 0, count) > targetPvalue;
      +            } else {
      +                isSignificant = (count >= targetPvalue * total);
      +            }
      +
      +            if ( isSignificant ) {
      +                if ( index == BaseIndex.D.index )
      +                    return -1;
                       result++;
      +            }
               }
      +
               return result;
           }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java
      index 970829162..b5963498a 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java
      @@ -47,12 +47,17 @@
       package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
       
       import org.broadinstitute.sting.WalkerTest;
      +import org.broadinstitute.sting.utils.collections.Pair;
      +import org.broadinstitute.sting.utils.exceptions.UserException;
       import org.testng.annotations.Test;
       
      +import java.io.File;
       import java.util.Arrays;
      +import java.util.List;
       
       public class ReduceReadsIntegrationTest extends WalkerTest {
           final static String REF = b37KGReference;
      +    final static String DBSNP = b37dbSNP132;
           final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
           final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam";
           final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam";
      @@ -64,50 +69,172 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
           final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam";
           final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057";
           final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam";
      +    final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam";
           final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam";
       
      -    private void RRTest(String testName, String args, String md5) {
      -        String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
      -        WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5));
      -        executeTest(testName, spec);
      +    final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e";
      +
      +    protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) {
      +        return executeTest(name, spec, emptyFileMd5);
           }
       
      -    @Test(enabled = true)
      +    protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) {
      +        final Pair, List> result = super.executeTest(name, spec);
      +
      +        // perform some Reduce Reads specific testing now
      +        if ( result != null ) {
      +
      +            // generate a new command-line based on the old one
      +            spec.disableImplicitArgs();
      +            final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" ");
      +
      +            final StringBuilder reducedInputs = new StringBuilder();
      +            for ( final File file : result.getFirst() ) {
      +                reducedInputs.append(" -I:reduced ");
      +                reducedInputs.append(file.getAbsolutePath());
      +            }
      +
       +            // the coverage test is a less strict version of the quals test, so we can safely ignore it for now
      +            //final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs);
      +            //super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5)));
      +
      +            // run the quals test
      +            final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs);
      +            super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5)));
      +        }
      +
      +        return result;
      +    }
      +
       +    /**
      +     * Generate a new command-line based on the old one
      +     *
      +     * @param walkerName    the new walker name to use
      +     * @param originalArgs  the original arguments used for the test
      +     * @return the new command line
      +     */
      +    private String createCommandLine(final String walkerName, final String[] originalArgs) {
      +
      +        final StringBuilder newArgs = new StringBuilder();
      +
      +        for ( int i = 0; i < originalArgs.length; i++ ) {
      +            final String arg = originalArgs[i];
      +
      +            if ( arg.equals("-T") ) {
      +                newArgs.append("-T ");
      +                newArgs.append(walkerName);
      +            } else if ( arg.startsWith("-I") ) {
      +                newArgs.append("-I:original ");
      +                newArgs.append(originalArgs[++i]);
      +            } else if ( arg.equals("-R") || arg.equals("-L") ) {
      +                newArgs.append(arg);
      +                newArgs.append(" ");
      +                newArgs.append(originalArgs[++i]);
      +            }
      +
      +            // always add a trailing space
      +            newArgs.append(" ");
      +        }
      +
      +        newArgs.append("-o %s");
      +
      +        return newArgs.toString();
      +    }
      +
      +    protected Pair, List> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) {
      +        return super.executeTest(name, spec);
      +    }
      +
      +    private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) {
      +        this.RRTest(testName, args, md5, useKnowns, emptyFileMd5);
      +    }
      +
      +    private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) {
      +        String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " ";
      +        WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5));
      +        executeTest(testName, spec, qualsTestMD5);
      +    }
      +
      +        @Test(enabled = true)
           public void testDefaultCompression() {
      -        RRTest("testDefaultCompression ", L, "17908e8515217c4693d303ed68108ccc");
      +        RRTest("testDefaultCompression ", L, "fa1cffc4539e0c20b818a11da5dba5b9", false);
           }
       
           @Test(enabled = true)
      -    public void testInsertionsAtEdgeOfConsensus() {
      -        String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s ";
      -        executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("3103667fc68c3136a8cfa8e22429f94e")));
      +    public void testDefaultCompressionWithKnowns() {
      +        RRTest("testDefaultCompressionWithKnowns ", L, "d1b5fbc402810d9cdc020bb3503f1325", true);
           }
       
      +    private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
      +
           @Test(enabled = true)
           public void testMultipleIntervals() {
      -        String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
      -        RRTest("testMultipleIntervals ", intervals, "497c5e36c2beaad2fcdbd02a0b9c121b");
      +        RRTest("testMultipleIntervals ", intervals, "7e9dcd157ad742d4ebae7e56bc4af663", false);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testMultipleIntervalsWithKnowns() {
      +        RRTest("testMultipleIntervalsWithKnowns ", intervals, "dbb1e95e1bcad956701142afac763717", true);
           }
       
           @Test(enabled = true)
           public void testHighCompression() {
      -        RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "0ff4142e4d7b6a9a9c76012246ad9e2d");
      +        RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "8f8fd1a53fa0789116f45e4cf2625906", false);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testHighCompressionWithKnowns() {
      +        RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "52fd2a77802a4677b604abb18e15d96a", true);
           }
       
           @Test(enabled = true)
           public void testLowCompression() {
      -        RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7890a37444a0e05b902f63a83238ce37");
      +        RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "79c6543d5ce84ebc2ca74404498edbd1", false);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testLowCompressionWithKnowns() {
      +        RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "271aec358b309603291a974b5ba3bd60", true);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testBadPvalueInput() {
      +        final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01";
      +        WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class);
      +        executeTest("testBadPvalueInput", spec);
           }
       
           @Test(enabled = true)
           public void testIndelCompression() {
      -        RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f58ae2154e0e5716be0e850b7605856e");
      +        final String md5 = "d20e6012300898a0315c795cab7583d8";
      +        RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false);
      +        RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true);
           }
       
           @Test(enabled = true)
           public void testFilteredDeletionCompression() {
               String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
      -        executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bfe0693aea74634f1035a9bd11302517")));
      +        executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("e5da09662708f562c0c617ba73cf4763")), "4f916da29d91852077f0a2fdbdd2c7f6");
      +    }
      +
      +    private static final String COREDUCTION_QUALS_TEST_MD5 = "26d84a2bd549a01a63fcebf8847a1b7d";
      +
      +    @Test(enabled = true)
      +    public void testCoReduction() {
      +        String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
      +        executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5f4d2c1d9c010dfd6865aeba7d0336fe")), COREDUCTION_QUALS_TEST_MD5);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testCoReductionWithKnowns() {
      +        String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
      +        executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("ca48dd972bf57595c691972c0f887cb4")), COREDUCTION_QUALS_TEST_MD5);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testInsertionsAtEdgeOfConsensus() {
      +        String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s ";
      +        executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("760500a5b036b987f84099f45f26a804")));
           }
       
           /**
      @@ -121,7 +248,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
           @Test(enabled = true)
           public void testAddingReadAfterTailingTheStash() {
               String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
      -        executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("f118e83c394d21d901a24230379864fc")));
      +        executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("67f8a3a647f8ec5212104bdaafd8c862")), "3eab32c215ba68e75efd5ab7e9f7a2e7");
           }
       
           /**
      @@ -131,24 +258,28 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
           @Test(enabled = true)
           public void testDivideByZero() {
               String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
      -        executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bd5198a3e21034887b741faaaa3964bf")));
      -    }
      -
      -    @Test(enabled = true)
      -    public void testCoReduction() {
      -        String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
      -        executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("13c44a9afa92ae728bf55b7075cc5de3")));
      +        // we expect to lose coverage due to the downsampling so don't run the systematic tests
      +        executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("1663f35802f82333c5e15653e437ce2d")));
           }
       
           /**
      -     * Bug happens when reads are soft-clipped off the  contig (usually in the MT). This test guarantees no changes to the upstream code will
      +     * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will
            * break the current hard-clipping routine that protects reduce reads from such reads.
            */
           @Test(enabled = true)
           public void testReadOffContig() {
               String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s ";
      -        executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("922be8b1151dd0d92602af93b77f7a51")));
      +        executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("0ce693b4ff925998867664e4099f3248")));
           }
       
      +    /**
       +     * Confirm that if both ends of a read pair fall in the same variant region, the compressed names of both ends are identical.
      +     */
      +    @Test(enabled = true)
      +    public void testPairedReadsInVariantRegion() {
      +        String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) +
      +                " -o %s  --downsample_coverage 250 -dcov 50  ";
      +        executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7e7b358443827ca239db3b98f299aec6")), "2af063d1bd3c322b03405dbb3ecf59a9");
      +    }
       }
       
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java
      new file mode 100644
      index 000000000..6032affa7
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java
      @@ -0,0 +1,214 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
      +
      +import it.unimi.dsi.fastutil.objects.*;
      +import net.sf.samtools.SAMFileHeader;
      +import org.broad.tribble.Feature;
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.commandline.RodBinding;
      +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
      +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
      +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
      +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
      +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.GenomeLocParser;
      +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
      +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
      +import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
      +import org.testng.Assert;
      +import org.testng.annotations.BeforeClass;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.List;
      +import java.util.Random;
      +
      +
      +public class ReduceReadsUnitTest extends BaseTest {
      +
      +    Random random = new Random(987743);
      +    Object2LongOpenHashMap hash = new Object2LongOpenHashMap();
      +    long nextNumber = 0L;
      +
      +    /**
      +     * Combinatorial unit test data provider example.
      +     *
      +     * Creates data for testMyData test function, containing two arguments, start and size at each value
      +     *
      +     * @return Object[][] for testng DataProvider
      +     */
      +    @DataProvider(name = "ReadNameProvider")
      +    public Object[][] readNameProvider() {
      +        final int readNameLength = 4;
      +        final int nReads = 100000;
      +        final int charVariety = 20;
      +        ObjectArrayList tests = new ObjectArrayList();
      +        ObjectOpenHashSet truthSet = new ObjectOpenHashSet();
      +        byte[] bytes = new byte[readNameLength];
      +        for ( int i = 0; i tests = new ObjectArrayList();
      +
      +        // test single
      +        tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))});
      +
      +        // test multiple at one position
      +        tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))});
      +
      +        // test multiple
      +        tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
      +
      +        // test indel not used
      +        tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))});
      +        tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))});
      +
      +        // test read clears
      +        tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
      +        tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))});
      +        tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
      +        tests.add(new Object[]{4, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))});
      +        tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))});
      +        tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) {
      +        final List x = new ArrayList();
      +        x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known"));
      +        final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()));
      +        return new RefMetaDataTracker(Arrays.asList(rods));
      +    }
      +
      +    @Test(dataProvider = "PopulateKnownsProvider")
      +    public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List trackers) {
      +        final ReduceReads rr = new ReduceReads();
      +        RodBinding.resetNameCounter();
      +        rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known"));
      +        rr.knownSnpPositions = new ObjectAVLTreeSet();
      +
      +        final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
      +        engine.setGenomeLocParser(genomeLocParser);
      +        rr.setToolkit(engine);
      +
      +        for ( final RefMetaDataTracker tracker : trackers )
      +            rr.populateKnownSNPs(tracker);
      +        Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear);
      +
      +        rr.clearStaleKnownPositions(read);
      +        Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear);
      +    }
      +
      +}
      \ No newline at end of file
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java
      index a66809b2e..56ad02084 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java
      @@ -46,12 +46,14 @@
       
       package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
       
      +import it.unimi.dsi.fastutil.objects.*;
       import net.sf.picard.reference.IndexedFastaSequenceFile;
       import net.sf.samtools.Cigar;
       import net.sf.samtools.CigarElement;
       import net.sf.samtools.CigarOperator;
       import net.sf.samtools.SAMFileHeader;
       import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.BaseUtils;
       import org.broadinstitute.sting.utils.GenomeLoc;
       import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
       import org.broadinstitute.sting.utils.Utils;
      @@ -69,8 +71,8 @@ import java.io.File;
       import java.io.FileNotFoundException;
       import java.util.ArrayList;
       import java.util.Arrays;
      +import java.util.LinkedList;
       import java.util.List;
      -import java.util.Set;
       
       public class SlidingWindowUnitTest extends BaseTest {
       
      @@ -198,17 +200,16 @@ public class SlidingWindowUnitTest extends BaseTest {
           @Test(enabled = true)
           public void testMarkVariantRegion() {
               final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
      -        SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites();
      -        markedSites.updateRegion(100, 100);
      +        slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100);
       
      -        slidingWindow.markVariantRegion(markedSites, 40);
      -        Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 21);
      +        slidingWindow.markVariantRegion(40);
      +        Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21);
       
      -        slidingWindow.markVariantRegion(markedSites, 5);
      -        Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 37);
      +        slidingWindow.markVariantRegion(5);
      +        Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37);
       
      -        slidingWindow.markVariantRegion(markedSites, 95);
      -        Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 52);
      +        slidingWindow.markVariantRegion(95);
      +        Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52);
           }
       
           private static int countTrueBits(final boolean[] bitset) {
      @@ -227,7 +228,7 @@ public class SlidingWindowUnitTest extends BaseTest {
       
           private static final int readLength = 100;
           private static final int testRegionSize = 1000;
      -    private final List basicReads = new ArrayList(20);
      +    private final ObjectList basicReads = new ObjectArrayList(20);
           private IndexedFastaSequenceFile seq;
           private SAMFileHeader header;
       
      @@ -250,12 +251,15 @@ public class SlidingWindowUnitTest extends BaseTest {
           }
       
           private class ConsensusCreationTest {
      -        public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression;
      +        public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage;
               public final List myReads = new ArrayList(20);
      +        public final String description;
       
      -        private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) {
      +        private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) {
                   this.expectedNumberOfReads = expectedNumberOfReads;
                   this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression;
      +            this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage;
      +            this.description = String.format("%d %d %d", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage);
       
                   // first, add the basic reads to the collection
                   myReads.addAll(basicReads);
      @@ -265,9 +269,11 @@ public class SlidingWindowUnitTest extends BaseTest {
                       myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality, CigarOperator.M));
               }
       
      -        private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) {
      +        private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) {
                   this.expectedNumberOfReads = expectedNumberOfReads;
                   this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression;
      +            this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage;
      +            this.description = String.format("%s %d %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage);
       
                   // first, add the basic reads to the collection
                   myReads.addAll(basicReads);
      @@ -277,6 +283,8 @@ public class SlidingWindowUnitTest extends BaseTest {
                       myReads.add(createVariantRead(loc, false, false, operator));
               }
       
      +        public String toString() { return  description; }
      +
               private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality,
                                                       final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) {
       
      @@ -313,68 +321,187 @@ public class SlidingWindowUnitTest extends BaseTest {
           private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295);
           private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309);
           private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310);
      +    private static final GenomeLoc loc320 = new UnvalidatingGenomeLoc("1", 0, 1000320, 1000320);
           private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100);
       
      +    private static final int DEEP_COVERAGE_ITERATIONS = 100;
      +
           @DataProvider(name = "ConsensusCreation")
           public Object[][] createConsensusCreationTestData() {
               List tests = new ArrayList();
       
               // test high quality reads and bases
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 5)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 6, 5 + DEEP_COVERAGE_ITERATIONS)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc320), false, false, 11, 10, 4 + (6 * DEEP_COVERAGE_ITERATIONS))});
       
               // test low quality reads
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 2, 2, 2)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 2, 2, 2)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 2, 2, 2)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 2, 2, 2)});
       
               // test low quality bases
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 1, 1)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 1, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 1, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 1, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 1, 1, 1)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 1, 1, 1)});
       
               // test mixture
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 3, 3)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2, 2)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 1, 1, 1)});
       
               // test I/D operators
      -        // TODO -- uncomment this test when the deletion bug is fixed!
      -        // tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 5)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 9, 9)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 10, 10)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 10, 10)});
      -        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 11, 11)});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9, 2 + (7 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 9, 9, 2 + (7 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))});
      +        tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))});
       
               return tests.toArray(new Object[][]{});
           }
       
           @Test(dataProvider = "ConsensusCreation", enabled = true)
           public void testConsensusCreationTest(ConsensusCreationTest test) {
      -        // test WITHOUT het compression allowed
      -        SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false);
      +        final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet();
      +
      +        // test WITHOUT het compression
      +        SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
               for ( final GATKSAMRecord read : test.myReads )
                   slidingWindow.addRead(read);
      -        Pair, CompressionStash> result = slidingWindow.close();
      -
      +        Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
               Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads);
       
      -        // test WITH het compression allowed
      -        slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, true);
      +        // test WITH het compression at KNOWN sites
      +        slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
               for ( final GATKSAMRecord read : test.myReads )
                   slidingWindow.addRead(read);
      -        result = slidingWindow.close();
      -
      +        for ( int i = 0; i < 1200; i++ )
      +            knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
      +        result = slidingWindow.close(knownSNPs);
               Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
      +
      +        // test WITH het compression at ALL sites
      +        slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
      +        for ( final GATKSAMRecord read : test.myReads )
      +            slidingWindow.addRead(read);
      +        result = slidingWindow.close(null);
      +        Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
      +
      +        // test with deep coverage
      +        slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 0, ReduceReads.DownsampleStrategy.Normal, false);
      +        for ( int i = 0; i < DEEP_COVERAGE_ITERATIONS; i++ ) {
      +            for ( final GATKSAMRecord read : test.myReads ) {
      +                final GATKSAMRecord copy = ArtificialSAMUtils.createArtificialRead(header, read.getReadName() + "_" + (i+1), 0, read.getAlignmentStart(), readLength);
      +                copy.setReadBases(read.getReadBases());
      +                copy.setBaseQualities(read.getBaseQualities());
      +                copy.setMappingQuality(read.getMappingQuality());
      +                copy.setReadNegativeStrandFlag(read.getReadNegativeStrandFlag());
      +                if ( read.getCigar() != null )
      +                    copy.setCigar(read.getCigar());
      +                slidingWindow.addRead(copy);
      +            }
      +        }
      +        result = slidingWindow.close(null);
      +        Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsAtDeepCoverage);
      +    }
      +
      +    @Test
      +    public void testConsensusCreationForMultiallelic() {
      +
      +        final int totalNumReads = 7;
      +        final ObjectList myReads = new ObjectArrayList(totalNumReads);
      +
      +        for ( int i = 0; i < totalNumReads; i++ ) {
      +            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength);
      +            read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
      +            read.setMappingQuality(30);
      +            read.setReadNegativeStrandFlag(false);
      +
      +            final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 'C' : 'G');
      +            read.setReadBases(Utils.dupBytes((byte) base, readLength));
      +
      +            myReads.add(read);
      +        }
      +
      +        final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet();
      +
      +        // test WITHOUT het compression
      +        SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
      +        for ( final GATKSAMRecord read : myReads )
      +            slidingWindow.addRead(read);
      +        Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
      +        Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
      +
      +        // test WITH het compression at KNOWN sites
      +        slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
      +        for ( final GATKSAMRecord read : myReads )
      +            slidingWindow.addRead(read);
      +        for ( int i = 0; i < readLength; i++ )
      +            knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
      +        result = slidingWindow.close(knownSNPs);
      +        Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
      +
      +        // test WITH het compression at ALL sites
      +        slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
      +        for ( final GATKSAMRecord read : myReads )
      +            slidingWindow.addRead(read);
      +        result = slidingWindow.close(knownSNPs);
      +        Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
      +    }
      +
      +    @Test
      +    public void testAddingReadPairWithSameCoordinates() {
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10);
      +
      +        final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1);
      +        read1.setReadBases(new byte[]{(byte)'A'});
      +        read1.setBaseQualities(new byte[]{(byte)'A'});
      +        read1.setMappingQuality(30);
      +        read1.setReadNegativeStrandFlag(false);
      +        slidingWindow.addRead(read1);
      +
      +        final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1);
      +        read2.setReadBases(new byte[]{(byte)'A'});
      +        read2.setBaseQualities(new byte[]{(byte)'A'});
      +        read2.setMappingQuality(30);
      +        read2.setReadNegativeStrandFlag(true);
      +        slidingWindow.addRead(read2);
      +
      +        Assert.assertEquals(slidingWindow.readsInWindow.size(), 2);
      +    }
      +
      +    @Test
      +    public void testOnlySpanningReadHasLowQual() {
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
      +
      +        final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead1", 0, globalStartPosition, 100);
      +        final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead2", 0, globalStartPosition + 50, 100);
      +
      +        final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
      +        read1.setReadBases(bases);
      +        read2.setReadBases(bases);
      +
      +        final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength);
      +        baseQuals[80] = (byte)10;
      +        read1.setBaseQualities(baseQuals);
      +        read2.setBaseQualities(baseQuals);
      +
      +        read1.setMappingQuality(30);
      +        read2.setMappingQuality(30);
      +
      +        slidingWindow.addRead(read1);
      +        slidingWindow.addRead(read2);
      +
      +        Assert.assertEquals(slidingWindow.close(null).getFirst().size(), 1);
           }
       
       
      @@ -382,30 +509,22 @@ public class SlidingWindowUnitTest extends BaseTest {
           //// This section tests the downsampling functionality ////
           ///////////////////////////////////////////////////////////
       
      -    private class DSTest {
      -        public final int dcov;
      -
      -        private DSTest(final int dcov) {
      -            this.dcov = dcov;
      -        }
      -    }
      -
           @DataProvider(name = "Downsampling")
           public Object[][] createDownsamplingTestData() {
               List tests = new ArrayList();
       
               for ( int i = 1; i < basicReads.size() + 10; i++ )
      -            tests.add(new Object[]{new DSTest(i)});
      +            tests.add(new Object[]{i});
       
               return tests.toArray(new Object[][]{});
           }
       
           @Test(dataProvider = "Downsampling", enabled = true)
      -    public void testDownsamplingTest(DSTest test) {
      -        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, test.dcov, ReduceReads.DownsampleStrategy.Normal, false, false);
      -        final List result = slidingWindow.downsampleVariantRegion(basicReads);
      +    public void testDownsamplingTest(final int dcov) {
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
      +        final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads);
       
      -        Assert.assertEquals(result.size(), Math.min(test.dcov, basicReads.size()));
      +        Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size()));
           }
       
       
      @@ -450,10 +569,10 @@ public class SlidingWindowUnitTest extends BaseTest {
       
           @Test(dataProvider = "ConsensusQuals", enabled = true)
           public void testConsensusQualsTest(QualsTest test) {
      -        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false);
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
               for ( final GATKSAMRecord read : test.myReads )
                   slidingWindow.addRead(read);
      -        final Pair, CompressionStash> result = slidingWindow.close();
      +        final Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet());
       
               Assert.assertEquals(result.getFirst().size(), 1);
               final GATKSAMRecord read = result.getFirst().iterator().next();
      @@ -484,5 +603,176 @@ public class SlidingWindowUnitTest extends BaseTest {
           }
       
       
      +    ////////////////////////////////////////////////////
      +    //// This section tests the new header creation ////
      +    ////////////////////////////////////////////////////
       
      +    @DataProvider(name = "CreateNewHeader")
      +    public Object[][] CreateNewHeaderTestData() {
      +        List tests = new ArrayList();
      +
      +        for ( final int start : Arrays.asList(-10, -1, 0, 1, 10) ) {
      +            for ( final int stop : Arrays.asList(-10, -1, 0, 1, 10) ) {
      +                tests.add(new Object[]{start, stop});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "CreateNewHeader", enabled = true)
      +    public void createNewHeaderTest(final int start, final int stop) {
      +
      +        // set up the window header
      +        final int currentHeaderStart = 100;
      +        final int currentHeaderLength = 50;
      +        final LinkedList windowHeader = new LinkedList();
      +        for ( int i = 0; i < currentHeaderLength; i++ )
      +            windowHeader.add(new HeaderElement(currentHeaderStart + i));
      +
      +        // set up the read
      +        final int readStart = currentHeaderStart + start;
      +        final int readLength = currentHeaderLength + stop - start;
      +        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength);
      +        read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
      +        read.setBaseQualities(Utils.dupBytes((byte) 30, readLength));
      +        read.setMappingQuality(30);
      +
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
      +        int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start);
      +
      +        Assert.assertEquals(newIndex, start > 0 ? start : 0);
      +
      +        final int expectedNewLength = currentHeaderLength + (start < 0 ? -start : 0) + (stop > 0 ? stop : 0);
      +        Assert.assertEquals(windowHeader.size(), expectedNewLength);
      +    }
      +
      +
      +    ////////////////////////////////////////////////////////////
      +    //// This section tests updating the header from a read ////
      +    ////////////////////////////////////////////////////////////
      +
      +    @DataProvider(name = "UpdateHeaderForRead")
      +    public Object[][] UpdateHeaderForReadTestData() {
      +        List tests = new ArrayList();
      +
      +        for ( final int start : Arrays.asList(0, 1, 10) ) {
      +            for ( final int readLength : Arrays.asList(1, 5, 10) ) {
      +                tests.add(new Object[]{start, readLength});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "UpdateHeaderForRead", enabled = true)
      +    public void updateHeaderForReadTest(final int start, final int readLength) {
      +
      +        // set up the window header
      +        final int currentHeaderStart = 100;
      +        final int currentHeaderLength = 50;
      +        final LinkedList windowHeader = new LinkedList();
      +        for ( int i = 0; i < currentHeaderLength; i++ )
      +            windowHeader.add(new HeaderElement(currentHeaderStart + i));
      +
      +        // set up the read
      +        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + start, readLength);
      +        read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
      +        read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
      +        read.setMappingQuality(30);
      +
      +        // add the read
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
      +        slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start);
      +        for ( int i = 0; i < start; i++ )
      +            Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0);
      +        for ( int i = 0; i < readLength; i++ )
      +            Assert.assertEquals(windowHeader.get(start + i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 1);
      +        for ( int i = start + readLength; i < currentHeaderLength; i++ )
      +            Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0);
      +
      +        // now remove the read
      +        slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, true, start);
      +        for ( int i = 0; i < currentHeaderLength; i++ )
      +            Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0);
      +    }
      +
      +    //////////////////////////////////////////////////////////////////////////////////
      +    //// This section tests functionality related to polyploid consensus creation ////
      +    //////////////////////////////////////////////////////////////////////////////////
      +
      +    @DataProvider(name = "MatchesKnownProvider")
      +    public Object[][] matchesKnownProvider() {
      +
      +        final ObjectArrayList tests = new ObjectArrayList();
      +
      +        // test no knowns
      +        tests.add(new Object[]{new ObjectAVLTreeSet(), loc290.getStart(), false});
      +
      +        final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet();
      +        knownSnpPositions.add(loc290);
      +        knownSnpPositions.add(loc295);
      +        knownSnpPositions.add(loc310);
      +
      +        // test overlap
      +        tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true});
      +        tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true});
      +        tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true});
      +        tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "MatchesKnownProvider")
      +    public void testMatchesKnown(final ObjectSortedSet knownSnpPositions, final int targetLoc, final boolean expectedResult) {
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10);
      +        Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult);
      +    }
      +
      +    @DataProvider(name = "SignificantSoftclipsProvider")
      +    public Object[][] SignificantSoftclipsTestData() {
      +        List tests = new ArrayList();
      +
      +        for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) {
      +            for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) {
      +                tests.add(new Object[]{indexWithSoftclips, indexToSkip});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "SignificantSoftclipsProvider", enabled = true)
      +    public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) {
      +
      +        // set up the window header
      +        final int currentHeaderStart = 100;
      +        final int currentHeaderLength = 10;
      +        final LinkedList windowHeader = new LinkedList();
      +        for ( int i = 0; i < currentHeaderLength; i++ )
      +            windowHeader.add(new HeaderElement(currentHeaderStart + i));
      +
      +        // set up the normal read
      +        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength);
      +        read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength));
      +        read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength));
      +        read.setMappingQuality(30);
      +
      +        // add the read
      +        final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
      +        slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0);
      +
      +        // set up and add a soft-clipped read if requested
      +        if ( indexWithSoftclips != -1 ) {
      +            final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + indexWithSoftclips, 1);
      +            softclippedRead.setReadBases(new byte[]{(byte) 'A'});
      +            softclippedRead.setBaseQualities(new byte[]{(byte) 30});
      +            softclippedRead.setMappingQuality(30);
      +            softclippedRead.setCigarString("1S");
      +            slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips);
      +        }
      +
      +        final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip);
      +        Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip);
      +    }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistributionIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java
      similarity index 99%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistributionIntegrationTest.java
      rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java
      index 53153c100..27f140337 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistributionIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java
      @@ -44,7 +44,7 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
      +package org.broadinstitute.sting.gatk.walkers.diagnostics;
       
       import org.broadinstitute.sting.WalkerTest;
       import org.testng.annotations.DataProvider;
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java
      similarity index 93%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java
      rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java
      index 6a52a42e5..bac09f30d 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java
      @@ -44,33 +44,33 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
      +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
       
      +import org.broadinstitute.sting.BaseTest;
       import org.broadinstitute.sting.WalkerTest;
       import org.testng.annotations.Test;
       
       import java.util.Arrays;
       
       public class DiagnoseTargetsIntegrationTest extends WalkerTest {
      -    final static String REF = b37KGReference;
      -    final String singleSample = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
      -    final String multiSample = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
      -    final String L = validationDataLocation + "DT-itest.interval_list";
      +    final static String REF = BaseTest.b37KGReference;
      +    final String singleSample = BaseTest.validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
      +    final String multiSample = BaseTest.validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
      +    final String L = BaseTest.validationDataLocation + "DT-itest.interval_list";
       
           private void DTTest(String testName, String args, String md5) {
               String base = String.format("-T DiagnoseTargets  --no_cmdline_in_header -R %s -L %s", REF, L) + " -o %s ";
               WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5));
      -        //spec.disableShadowBCF();
               executeTest(testName, spec);
           }
       
           @Test(enabled = true)
           public void testSingleSample() {
      -        DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f");
      +        DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "850304909477afa8c2a8f128d6eedde9");
           }
       
           @Test(enabled = true)
           public void testMultiSample() {
      -        DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c");
      +        DTTest("testMultiSample ", "-I " + multiSample, "bedd19bcf21d1a779f6706c0351c9d26");
           }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java
      similarity index 88%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java
      rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java
      index 9ab4621b9..fe3010e02 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java
      @@ -44,40 +44,46 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
      +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
       
       import org.testng.Assert;
      +import org.testng.annotations.BeforeClass;
       import org.testng.annotations.DataProvider;
       import org.testng.annotations.Test;
       
      -import java.util.Set;
      +import java.util.List;
       
      -public class LocusStatisticsUnitTest /*extends BaseTest*/ {
      +public class LocusStatisticsUnitTest {
      +
      +    ThresHolder thresholds = new ThresHolder();
      +
      +    @BeforeClass
      +    public void init() {
      +        DiagnoseTargets.loadAllPlugins(thresholds);
      +    }
       
           @Test(dataProvider = "StatusTestValues")
           public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) {
      -        // The min Coverage threshold is 10, the max is 100
      -        ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);
      -        Set statuses = new LocusStatistics(coverage, rawCoverage).callableStatuses(thresholds);
      -        // Check to make sure the status provides matches the actual
      +        List statuses = new LocusStratification(coverage, rawCoverage, thresholds).callableStatuses();
               Assert.assertTrue((status == null) ? statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1));
      -
           }
       
           @DataProvider(name = "StatusTestValues")
           public Object[][] getStatusTestValues() {
      +        final int max = thresholds.maximumCoverage;
      +        final int min = thresholds.minimumCoverage;
               return new Object[][]{
      -                new Object[]{100, 100, null},
      -                new Object[]{100, 101, null},
      -                new Object[]{101, 101, CallableStatus.EXCESSIVE_COVERAGE},
      -                new Object[]{10, 101, null},
      -                new Object[]{9, 101, CallableStatus.POOR_QUALITY},
      -                new Object[]{9, 10, CallableStatus.POOR_QUALITY},
      -                new Object[]{9, 9, CallableStatus.LOW_COVERAGE},
      +                new Object[]{max, max, null},
      +                new Object[]{max, max+1, null},
      +                new Object[]{max+1, max+1, CallableStatus.EXCESSIVE_COVERAGE},
      +                new Object[]{min, max+1, null},
      +                new Object[]{min-1, max+1, CallableStatus.POOR_QUALITY},
      +                new Object[]{min-1, min, CallableStatus.POOR_QUALITY},
      +                new Object[]{min-1, min-1, CallableStatus.LOW_COVERAGE},
                       new Object[]{0, 0, CallableStatus.COVERAGE_GAPS},
      -                new Object[]{0, 9, CallableStatus.LOW_COVERAGE},
      -                new Object[]{0, 101, CallableStatus.POOR_QUALITY},
      -                new Object[]{10, Integer.MAX_VALUE, null},
      +                new Object[]{0, min-1, CallableStatus.LOW_COVERAGE},
      +                new Object[]{0, max+1, CallableStatus.POOR_QUALITY},
      +                new Object[]{min, Integer.MAX_VALUE, null},
                       new Object[]{Integer.MAX_VALUE, Integer.MAX_VALUE, CallableStatus.EXCESSIVE_COVERAGE},
               };
           }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
      index c93f68ef8..5a308928d 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
      @@ -74,10 +74,10 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
       
           @DataProvider(name = "data")
           public Object[][] createData() {
      -        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "aea3d5df32a2acd400da48d06b4dbc60");
      -        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "3f46f5a964f7c34015d972256fe49a35");
      -        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, "e71e23e7ebfbe768e59527bc62f8918d");
      -        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "47bf16c27c9e2c657a7e1d13f20880c9");
      +        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "71869ddf9665773a842a9def4cc5f3c8");
      +        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "cec7c644c84ef9c96aacaed604d9ec9b");
      +        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, "47546e03344103020e49d8037a7e0727");
      +        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "d27b37f7a366c8dacca5cd2590d3c6ce");
               return TestParams.getTests(TestParams.class);
           }
       
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java
      index 84729647a..6a29ff255 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java
      @@ -47,6 +47,7 @@
       package org.broadinstitute.sting.gatk.walkers.filters;
       
       import org.broadinstitute.sting.WalkerTest;
      +import org.broadinstitute.sting.utils.exceptions.UserException;
       import org.testng.annotations.Test;
       
       import java.util.Arrays;
      @@ -98,6 +99,22 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
               executeTest("test mask extend", spec3);
           }
       
+    @Test
+    public void testMaskReversed() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                baseTestString() + " -maskName outsideGoodSites -filterNotInMask --mask:BED " + privateTestDir + "goodMask.bed --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
+                Arrays.asList("e65d27c13953fc3a77dcad27a4357786"));
+        executeTest("test filter sites not in mask", spec);
+    }
      +
      +    @Test
      +    public void testIllegalFilterName() {
      +        WalkerTestSpec spec = new WalkerTestSpec(
      +                baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName 'foo < foo' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
      +                UserException.class);
      +        executeTest("test illegal filter name", spec);
      +    }
      +
           @Test
           public void testFilter1() {
               WalkerTestSpec spec = new WalkerTestSpec(
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java
      index 3f2ace800..77c9f96c9 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java
      @@ -67,7 +67,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
           //
           // --------------------------------------------------------------------------------------------------------------
       
      -    @Test
      +    @Test(enabled = false)
           public void testContaminationDownsamplingFlat() {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1,
      @@ -75,7 +75,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
               executeTest("test contamination_percentage_to_filter 0.20", spec);
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testContaminationDownsamplingFlatAndPerSample() {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_per_sample_file " + ArtificalBAMLocation + "NA12878.NA19240.contam.txt --contamination_fraction_to_filter 0.10", 1,
      @@ -83,7 +83,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
               executeTest("test contamination_percentage_to_filter per-sample and .20 overall", spec);
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testContaminationDownsamplingPerSampleOnly() {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contaminationFile " + ArtificalBAMLocation + "NA19240.contam.txt", 1,
      @@ -98,7 +98,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
           //
           // --------------------------------------------------------------------------------------------------------------
       
      -    @Test
      +    @Test(enabled = false)
           private void testDefaultContamination() {
               final String bam1 = "NA11918.with.1.NA12842.reduced.bam";
               final String bam2 = "NA12842.with.1.NA11918.reduced.bam";
      @@ -116,47 +116,47 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
               executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec);
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase1() {
               testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "e2e5a8dd313f8d7e382e7d49dfac59a2");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase2() {
               testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "549737002f98775fea8f46e7ea174dde");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase3() {
               testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "529d82c2a33fcc303a5dc55de2d56979");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase4() {
               testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.1, "b5689972fbb7d230a372ee5f0da1c6d7");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase5() {
               testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.2, "9dceee2e921b53fbc1ce137a7e0b7b74");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase6() {
               testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.3, "d6a74061033503af80dcaea065bfa075");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase7() {
               testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "7d1b5efab58a1b8f9d99fcf5af82f15a");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase8() {
               testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "a7f8d5c79626aff59d7f426f79d8816e");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testFlatContaminationCase9() {
               testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.3, "fcf482398b7c908e3e2d1e4d5da6377b");
           }
      @@ -168,42 +168,42 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
               executeTest("test contamination on Artificial Contamination (per-sample) on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase1() {
               testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "e00278527a294833259e9e411728e395");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase2() {
               testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "a443e793f0b0e2ffce1b751634d706e2");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase3() {
               testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "e11d83a7815ce757afbcf7689568cb25");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase4() {
               testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "615042eeeffe042bd1c86279d34f80b6");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase5() {
               testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "9bc99fc79ca34744bf26cb19ee4ef44d");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase6() {
               testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "143626fe5fce765d6c997a64f058a813");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase7() {
               testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "f2593674cef894eda4e0be9cf3158f57");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleContaminationCase8() {
               testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "fb7ce0740767ae3896b3e552026da1e4");
           }
      @@ -227,17 +227,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
       
           // verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleEqualsFlatContaminationCase1() {
               testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0, "");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleEqualsFlatContaminationCase2() {
               testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15, "");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testPerSampleEqualsFlatContaminationCase3() {
               testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3, "");
           }
      @@ -250,7 +250,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
           // --------------------------------------------------------------------------------------------------------------
       
       
      -    @Test
      +    @Test(enabled = false)
           public void testHCContaminationDownsamplingFlat() {
               final String baseCommand = "-T HaplotypeCaller -R " + b36KGReference + " --no_cmdline_in_header --dbsnp " + b36dbSNP129;
               WalkerTestSpec spec = new WalkerTestSpec(
      @@ -260,7 +260,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
           }
       
           //  HaplotypeCaller can only (currently) use flat contamination reduction, not per-sample. Until that is implemented, this test
      -    @Test
      +    @Test(enabled = false)
           public void testHCCannotProcessPerSampleContamination() {
               final String baseCommand = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header  -L 20:3,000,000-5,000,000";
               final String bam1 = "NA11918.with.1.NA12842.reduced.bam";
      @@ -281,17 +281,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest {
               executeTest("HC test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec);
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testHCFlatContaminationCase1() {
               testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "c3e695381d8627e3922d8c642b66c3ce");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testHCFlatContaminationCase2() {
               testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "002d2b45336d88d7c04e19f9f26e29d9");
           }
       
      -    @Test
      +    @Test(enabled = false)
           public void testHCFlatContaminationCase3() {
               testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "1809a33ac112d1a3bd7a071c566794dd");
           }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
      index 23596db83..657cd9c0c 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
      @@ -50,10 +50,16 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
       // the imports for unit testing.
       
       
      +import org.apache.commons.lang.ArrayUtils;
       import org.broadinstitute.sting.BaseTest;
       import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
       import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
       import org.broadinstitute.sting.utils.MathUtils;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
      +import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
       import org.testng.Assert;
       import org.testng.annotations.BeforeClass;
       import org.testng.annotations.BeforeMethod;
      @@ -102,4 +108,23 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
               Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref);
               Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample");
           }
      +
+    @Test(enabled=true)
+    public void testTooManyAlleles() {
+
+        for ( int numAltAlleles = 0; numAltAlleles < 100; numAltAlleles++ ) {
+
+            final Set<Allele> alleles = new HashSet<Allele>();
+            alleles.add(Allele.create("A", true));        // ref allele
+
+            for (int len = 1; len <= numAltAlleles; len++) {
+                // add alt allele of length len+1
+                alleles.add(Allele.create(Utils.dupString('A', len + 1), false));
+            }
+            final VariantContext vc = new VariantContextBuilder("test", "chr1", 1000, 1000, alleles).make();
+            final boolean result = ugEngine.canVCbeGenotyped(vc);
+            Assert.assertEquals(result, vc.getNAlleles() <= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
+        }
+    }
      +
       }
      \ No newline at end of file
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java
      new file mode 100644
      index 000000000..34b19ed2d
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java
      @@ -0,0 +1,84 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.genotyper;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES;
      +
+/**
+ * Integration tests (suite 1) for the UnifiedGenotyper in general-ploidy
+ * (pooled) mode: SNP, indel and combined calling across several ploidies,
+ * genotyping modes and output modes, verified against expected MD5 checksums.
+ *
+ * Author: delangel, 4/5/12
+ */
      +public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTest {
      +
      +    private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor();
      +
      +    @Test(enabled = true)
      +    public void testSNP_ACS_Pools() {
      +        executor.PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "df0e67c975ef74d593f1c704daab1705");
      +    }
      +
      +    @Test(enabled = true)
      +    public void testBOTH_GGA_Pools() {
      +        executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "71f16e19b7d52e8edee46f4121e59f54");
      +    }
      +
      +    @Test(enabled = true)
      +    public void testINDEL_GGA_Pools() {
      +        executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "3f7d763c654f1d708323f369ea4a099b");
      +    }
      +
      +    @Test(enabled = true)
      +    public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
      +        executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "603416111f34e2a735163fa97e1a8272");
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java
      new file mode 100644
      index 000000000..8a165cbeb
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java
      @@ -0,0 +1,73 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.genotyper;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM;
      +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS;
      +
      +public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest {
      +
      +    private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor();
      +
      +    @Test(enabled = true)
      +    public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
      +        executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","13de8558acaa0b9082f2df477b45de9b");
      +    }
      +
      +    @Test(enabled = true)
      +    public void testMT_SNP_DISCOVERY_sp4() {
      +        executor.PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b");
      +    }
      +
      +    @Test(enabled = true)
      +    public void testMT_SNP_GGA_sp10() {
      +        executor.PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8");
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java
      similarity index 76%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
      rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java
      index 6a381e0cf..53d32832b 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java
      @@ -47,90 +47,47 @@
       package org.broadinstitute.sting.gatk.walkers.genotyper;
       
       import org.broadinstitute.sting.WalkerTest;
      -import org.testng.annotations.Test;
       
       import java.util.Arrays;
       
      -/**
      - * Created by IntelliJ IDEA.
      - * User: delangel
      - * Date: 4/5/12
      - * Time: 11:28 AM
      - * To change this template use File | Settings | File Templates.
      - */
      -public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
      +public class UnifiedGenotyperGeneralPloidyTestExecutor extends WalkerTest {
           final static String REF = b37KGReference;
      -    final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
      -    final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
      -    final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
      -    final String REFSAMPLE_NAME = "NA12878";
      -    final String MTINTERVALS = "MT:1-1000";
      -    final String LSVINTERVALS = "20:40,500,000-41,000,000";
      -    final String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000";
      -    final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
      -    final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
      -    final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
      +    final static String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
      +    final static String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
      +    final static String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
      +    final static String REFSAMPLE_NAME = "NA12878";
      +    final static String MTINTERVALS = "MT:1-1000";
      +    final static String LSVINTERVALS = "20:40,500,000-41,000,000";
      +    final static String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000";
      +    final static String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
      +    final static String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
      +    final static String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
       
      -    private void PC_MT_Test(String bam, String args, String name, String md5) {
      +    public void PC_MT_Test(String bam, String args, String name, String md5) {
               final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -ignoreLane ",
                       REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
               final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
               executeTest("testPoolCaller:"+name+" args=" + args, spec);
           }
       
      -    private void PC_LSV_Test(String args, String name, String model, String md5) {
      +    public void PC_LSV_Test(String args, String name, String model, String md5) {
               final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ",
                       REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
               final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
               executeTest("testPoolCaller:"+name+" args=" + args, spec);
           }
       
      -    private void PC_LSV_Test_short(String args, String name, String model, String md5) {
      +    public void PC_LSV_Test_short(String args, String name, String model, String md5) {
               final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ",
                       REF, LSV_BAM, LSVINTERVALS_SHORT, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
               final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
               executeTest("testPoolCaller:"+name+" args=" + args, spec);
           }
       
      -    private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
      +    public void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
               final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane",
                       REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
               final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
               executeTest("testPoolCaller:"+name+" args=" + args, spec);
           }
      -
      -    @Test(enabled = true)
      -    public void testSNP_ACS_Pools() {
      -        PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705");
      -    }
      -
      -    @Test(enabled = true)
      -    public void testBOTH_GGA_Pools() {
      -        PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","71f16e19b7d52e8edee46f4121e59f54");
      -    }
      -
      -    @Test(enabled = true)
      -    public void testINDEL_GGA_Pools() {
      -        PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","3f7d763c654f1d708323f369ea4a099b");
      -    }
      -
      -    @Test(enabled = true)
      -    public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
      -        PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","3a321896c4b8b6457973c76c486da4d4");
      -    }
      -
      -    @Test(enabled = true)
      -    public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
      -        PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","5812da66811887d834d0379a33e655c0");
      -    }
      -
      -    @Test(enabled = true)
      -    public void testMT_SNP_DISCOVERY_sp4() {
      -         PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b");
      -    }
      -
      -    @Test(enabled = true)
      -    public void testMT_SNP_GGA_sp10() {
      -        PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8");
      -    }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java
      new file mode 100644
      index 000000000..52970d70d
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java
      @@ -0,0 +1,208 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.genotyper;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.util.Arrays;
      +import java.util.Collections;
      +import java.util.List;
      +
      +public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
      +
      +    private final static String baseCommandIndels = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
      +    private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
      +
      +    // --------------------------------------------------------------------------------------------------------------
      +    //
      +    // testing indel caller
      +    //
      +    // --------------------------------------------------------------------------------------------------------------
      +    // Basic indel testing with SLX data
      +    @Test
      +    public void testSimpleIndels() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels +
      +                        " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" +
      +                        " -o %s" +
      +                        " -L 1:10,000,000-10,500,000",
      +                1,
      +                Arrays.asList("d8b0c5be39ec6b239641c2f2646d2bc3"));
      +
      +        executeTest(String.format("test indel caller in SLX"), spec);
      +    }
      +
      +    // Basic indel testing with SLX data
      +    @Test
      +    public void testIndelsWithLowMinAlleleCnt() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels +
      +                        " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" +
      +                        " -o %s" +
      +                        " -minIndelCnt 1" +
      +                        " -L 1:10,000,000-10,100,000",
      +                1,
      +                Arrays.asList("d9572a227ccb13a6baa6dc4fb65bc1e5"));
      +
      +        executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
      +    }
      +
      +    @Test
      +    public void testMultiTechnologyIndels() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels +
      +                        " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
      +                        " -o %s" +
      +                        " -L 1:10,000,000-10,500,000",
      +                1,
      +                Arrays.asList("8d9b8f8a1479322961c840e461b6dba8"));
      +
      +        executeTest(String.format("test indel calling, multiple technologies"), spec);
      +    }
      +
      +    @Test
      +    public void testWithIndelAllelesPassedIn1() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
      +                        "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
      +                Arrays.asList("16d975480ff1e689113171805b916b62"));
      +        executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
      +    }
      +
      +    @Test
      +    public void testWithIndelAllelesPassedIn2() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
      +                        + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
      +                        "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
      +                Arrays.asList("60ed3f8d5bc3f765e6ce3fa698b68bb7"));
      +        executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
      +    }
      +
      +    @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 20 minutes
      +    public void testMultiSampleIndels1() {
      +        // since we're going to test the MD5s with GGA only do one here
      +        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
      +                Arrays.asList(""));
      +        List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
      +
      +        WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
      +                        "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
      +                Arrays.asList("3d4d66cc253eac55f16e5b0a36f17d8d"));
      +        executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
      +    }
      +
      +    @Test
      +    public void testGGAwithNoEvidenceInReads() {
      +        final String vcf = "small.indel.test.vcf";
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation +
      +                        "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
      +                Arrays.asList("d76eacc4021b78ccc0a9026162e814a7"));
      +        executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
      +    }
      +
      +    @Test
      +    public void testBaseIndelQualityScores() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandIndelsb37 +
      +                        " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" +
      +                        " -o %s" +
      +                        " -L 20:10,000,000-10,100,000",
      +                1,
      +                Arrays.asList("8a7966e4b67334bca6083670c5a16b67"));
      +
      +        executeTest(String.format("test UG with base indel quality scores"), spec);
      +    }
      +
      +    // --------------------------------------------------------------------------------------------------------------
      +    //
      +    // testing MinIndelFraction
      +    //
      +    // --------------------------------------------------------------------------------------------------------------
      +
      +    final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation
      +            + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030";
      +
      +    @Test
      +    public void testMinIndelFraction0() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                assessMinIndelFraction + " -minIndelFrac 0.0", 1,
      +                Arrays.asList("264325878b988acc11d8e5d9d2ba0b7f"));
      +        executeTest("test minIndelFraction 0.0", spec);
      +    }
      +
      +    @Test
      +    public void testMinIndelFraction25() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                assessMinIndelFraction + " -minIndelFrac 0.25", 1,
      +                Arrays.asList("98abcfb0a008050eba8b9c285a25b2a0"));
      +        executeTest("test minIndelFraction 0.25", spec);
      +    }
      +
      +    @Test
      +    public void testMinIndelFraction100() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                assessMinIndelFraction + " -minIndelFrac 1", 1,
      +                Arrays.asList("3f07efb768e08650a7ce333edd4f9a52"));
      +        executeTest("test minIndelFraction 1.0", spec);
      +    }
      +
      +    // No testing of MD5 here; we previously blew up due to a 0-length haplotype, so we just need to pass
      +    @Test
      +    public void testHaplotype0Length() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null",
      +                0,
      +                Collections.emptyList());
      +        executeTest("testHaplotype0Length", spec);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
      index 4342b8bfc..d55a923dc 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
      @@ -51,10 +51,8 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
       import org.broadinstitute.sting.utils.exceptions.UserException;
       import org.testng.annotations.Test;
       
      -import java.io.File;
       import java.util.Arrays;
       import java.util.Collections;
      -import java.util.List;
       
       // ********************************************************************************** //
       // Note that this class also serves as an integration test for the VariantAnnotator!  //
      @@ -62,78 +60,122 @@ import java.util.List;
       
       public class UnifiedGenotyperIntegrationTest extends WalkerTest {
       
      -    private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
      -    private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
      -    private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
      -    private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
      +    private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
      +    private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
       
           // --------------------------------------------------------------------------------------------------------------
           //
      -    // testing normal calling
      +    // testing parameters
           //
           // --------------------------------------------------------------------------------------------------------------
      +
           @Test
      -    public void testMultiSamplePilot1() {
      +    public void testMinBaseQualityScore() {
               WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
      -                Arrays.asList("2f15ef1ead56d875a3f1d53772f52b3a"));
      -        executeTest("test MultiSample Pilot1", spec);
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
      +                Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7"));
      +        executeTest("test min_base_quality_score 26", spec);
           }
       
           @Test
      -    public void testWithAllelesPassedIn1() {
      +    public void testSLOD() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
      +                Arrays.asList("4aa226c00a242047cf427d0919003048"));
      +        executeTest("test SLOD", spec);
      +    }
      +
      +    @Test
      +    public void testNDA() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
      +                Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3"));
      +        executeTest("test NDA", spec);
      +    }
      +
      +    @Test
      +    public void testCompTrack() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
      +                Arrays.asList("50937942e3d228614d2531c3be237709"));
      +        executeTest("test using comp track", spec);
      +    }
      +
      +    @Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine
      +    public void testNoCmdLineHeaderStdout() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0,
      +                Collections.emptyList());
      +        executeTest("testNoCmdLineHeaderStdout", spec);
      +    }
      +
      +    @Test
      +    public void testOutputParameterSitesOnly() {
      +        testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d");
      +    }
      +
      +    @Test
      +    public void testOutputParameterAllConfident() {
      +        testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b");
      +    }
      +
      +    @Test
      +    public void testOutputParameterAllSites() {
      +        testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8");
      +    }
      +
      +    private void testOutputParameters(final String args, final String md5) {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + args, 1,
      +                Arrays.asList(md5));
      +        executeTest(String.format("testParameter[%s]", args), spec);
      +    }
      +
      +    @Test
      +    public void testConfidence() {
               WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
      -                Arrays.asList("5b31b811072a4df04524e13604015f9b"));
      -        executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
      +                Arrays.asList("918109938ef355d759dafc3ebb47d8a5"));
      +        executeTest("test confidence 1", spec1);
           }
       
           @Test
      -    public void testWithAllelesPassedIn2() {
      -        WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
      -                Arrays.asList("d9992e55381afb43742cc9b30fcd7538"));
      -        executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
      +    public void testNoPrior() {
      +        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
      +                Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
      +        executeTest("test no prior 1", spec1);
      +
      +    }
      +    @Test
      +    public void testUserPrior() {
      +        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
      +                Arrays.asList("04d05900849d5a3f6f3f98bd0f262369"));
      +        executeTest("test user prior 1", spec1);
      +
      +    }
      +
      +    // --------------------------------------------------------------------------------------------------------------
      +    //
      +    // testing heterozygosity
      +    //
      +    // --------------------------------------------------------------------------------------------------------------
      +    @Test
      +    public void testHeterozyosity1() {
      +        testHeterozosity( 0.01, "3b66f82dbb746875638e076bf51a1583" );
           }
       
           @Test
      -    public void testSingleSamplePilot2() {
      +    public void testHeterozyosity2() {
      +        testHeterozosity( 1.0 / 1850, "714c1795334c7c62c046a75479381ae6" );
      +    }
      +
      +    private void testHeterozosity(final double arg, final String md5) {
               WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
      -                Arrays.asList("33ab66c2f062cfa1f7fcc077165f778c"));
      -        executeTest("test SingleSample Pilot2", spec);
      -    }
      -
      -    @Test
      -    public void testMultipleSNPAlleles() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
      -                Arrays.asList("9fac00485419878749b03706ae6b852f"));
      -        executeTest("test Multiple SNP alleles", spec);
      -    }
      -
      -    @Test
      -    public void testBadRead() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
      -                Arrays.asList("d915535c1458733f09f82670092fcab6"));
      -        executeTest("test bad read", spec);
      -    }
      -
      -    @Test
      -    public void testReverseTrim() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
      -                Arrays.asList("eb9604b77a7d6baab60c81ac3db5e47b"));
      -        executeTest("test reverse trim", spec);
      -    }
      -
      -    @Test
      -    public void testMismatchedPLs() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
      -                Arrays.asList("de2c5707c1805d17d70acaecd36b7372"));
      -        executeTest("test mismatched PLs", spec);
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 --heterozygosity " + arg, 1,
      +                Arrays.asList(md5));
      +        executeTest(String.format("test heterozyosity[%s]", arg), spec);
           }
       
           // --------------------------------------------------------------------------------------------------------------
      @@ -142,7 +184,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
           //
           // --------------------------------------------------------------------------------------------------------------
       
      -    private final static String COMPRESSED_OUTPUT_MD5 = "d5a7326fdcf6d441b73c381912ad3a2a";
      +    private final static String COMPRESSED_OUTPUT_MD5 = "6f79205f7ed8006470f056f6805db6c8";
       
           @Test
           public void testCompressedOutput() {
      @@ -185,105 +227,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
               executeTest("test parallelization (4 threads)", spec3);
           }
       
      -    // --------------------------------------------------------------------------------------------------------------
      -    //
      -    // testing parameters
      -    //
      -    // --------------------------------------------------------------------------------------------------------------
      -
      -    @Test
      -    public void testMinBaseQualityScore() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
      -                Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff"));
      -        executeTest("test min_base_quality_score 26", spec);
      -    }
      -
      -    @Test
      -    public void testSLOD() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
      -                Arrays.asList("55760482335497086458b09e415ecf54"));
      -        executeTest("test SLOD", spec);
      -    }
      -
      -    @Test
      -    public void testNDA() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
      -                Arrays.asList("938e888a40182878be4c3cc4859adb69"));
      -        executeTest("test NDA", spec);
      -    }
      -
      -    @Test
      -    public void testCompTrack() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
      -                Arrays.asList("7dc186d420487e4e156a24ec8dea0951"));
      -        executeTest("test using comp track", spec);
      -    }
      -
      -    @Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine
      -    public void testNoCmdLineHeaderStdout() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0,
      -                Collections.emptyList());
      -        executeTest("testNoCmdLineHeaderStdout", spec);
      -    }
      -
      -    @Test
      -    public void testOutputParameterSitesOnly() {
      -        testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9");
      -    }
      -
      -    @Test
      -    public void testOutputParameterAllConfident() {
      -        testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72");
      -    }
      -
      -    @Test
      -    public void testOutputParameterAllSites() {
      -        testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209");
      -    }
      -
      -    private void testOutputParameters(final String args, final String md5) {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + args, 1,
      -                Arrays.asList(md5));
      -        executeTest(String.format("testParameter[%s]", args), spec);
      -    }
      -
      -    @Test
      -    public void testConfidence() {
      -        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
      -                Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
      -        executeTest("test confidence 1", spec1);
      -    }
      -
      -    // --------------------------------------------------------------------------------------------------------------
      -    //
      -    // testing heterozygosity
      -    //
      -    // --------------------------------------------------------------------------------------------------------------
      -    @Test
      -    public void testHeterozyosity1() {
      -        testHeterozosity( 0.01, "ffc1f83a045dc09360e11de7a8efd159" );
      -    }
      -
      -    @Test
      -    public void testHeterozyosity2() {
      -        testHeterozosity( 1.0 / 1850, "5426a98df9f5fd70aef295d889c4e4f1" );
      -    }
      -
      -    private void testHeterozosity(final double arg, final String md5) {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 --heterozygosity " + arg, 1,
      -                Arrays.asList(md5));
      -        executeTest(String.format("test heterozyosity[%s]", arg), spec);
      -    }
      -
      -
           // --------------------------------------------------------------------------------------------------------------
           //
           // testing calls with SLX, 454, and SOLID data
      @@ -297,7 +240,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                               " -o %s" +
                               " -L 1:10,000,000-10,100,000",
                       1,
      -                Arrays.asList("68961b19a29ae224059c33ef41cdcb58"));
      +                Arrays.asList("31be725b2a7c15e9769391ad940c0587"));
       
               executeTest(String.format("test multiple technologies"), spec);
           }
      @@ -316,115 +259,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
                               " -L 1:10,000,000-10,100,000" +
                               " -baq CALCULATE_AS_NECESSARY",
                       1,
      -                Arrays.asList("9fcb234f7573209dec4dae86db091efd"));
      +                Arrays.asList("dcc5cec42730567982def16da4a7f286"));
       
               executeTest(String.format("test calling with BAQ"), spec);
           }
       
      -    // --------------------------------------------------------------------------------------------------------------
      -    //
      -    // testing indel caller
      -    //
      -    // --------------------------------------------------------------------------------------------------------------
      -    // Basic indel testing with SLX data
      -    @Test
      -    public void testSimpleIndels() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndels +
      -                        " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" +
      -                        " -o %s" +
      -                        " -L 1:10,000,000-10,500,000",
      -                1,
      -                Arrays.asList("1cb469b9cc8e6c70430021540bf1af8b"));
      -
      -        executeTest(String.format("test indel caller in SLX"), spec);
      -    }
      -
      -    // Basic indel testing with SLX data
      -    @Test
      -    public void testIndelsWithLowMinAlleleCnt() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndels +
      -                        " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" +
      -                        " -o %s" +
      -                        " -minIndelCnt 1" +
      -                        " -L 1:10,000,000-10,100,000",
      -                1,
      -                Arrays.asList("c7e59f9ab718df4c604626a0f51af606"));
      -
      -        executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
      -    }
      -
      -    @Test
      -    public void testMultiTechnologyIndels() {
      -         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                 baseCommandIndels +
      -                         " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
      -                         " -o %s" +
      -                         " -L 1:10,000,000-10,500,000",
      -                 1,
      -                 Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c"));
      -
      -         executeTest(String.format("test indel calling, multiple technologies"), spec);
      -     }
      -
      -    @Test
      -    public void testWithIndelAllelesPassedIn1() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
      -                        "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
      -                Arrays.asList("86880ec78755ae91cb5bb34a0631a32c"));
      -        executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
      -    }
      -
      -    @Test
      -    public void testWithIndelAllelesPassedIn2() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
      -                        + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
      -                        "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
      -                Arrays.asList("2584d5e3ade1b548f1fe9cdcafbe1b28"));
      -        executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
      -    }
      -
      -    @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes
      -    public void testMultiSampleIndels1() {
      -        // since we're going to test the MD5s with GGA only do one here
      -        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
      -                Arrays.asList(""));
      -        List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
      -
      -        WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
      -                        "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
      -                Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf"));
      -        executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
      -    }
      -
      -    @Test
      -    public void testGGAwithNoEvidenceInReads() {
      -        final String vcf = "small.indel.test.vcf";
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation +
      -                        "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
      -                Arrays.asList("d76eacc4021b78ccc0a9026162e814a7"));
      -        executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
      -    }
      -
      -    @Test
      -    public void testBaseIndelQualityScores() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                baseCommandIndelsb37 +
      -                        " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" +
      -                        " -o %s" +
      -                        " -L 20:10,000,000-10,100,000",
      -                1,
      -                Arrays.asList("8a7966e4b67334bca6083670c5a16b67"));
      -
      -        executeTest(String.format("test UG with base indel quality scores"), spec);
      -    }
      -
           // --------------------------------------------------------------------------------------------------------------
           //
           // testing SnpEff
      @@ -441,39 +280,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
               executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec);
           }
       
      -    // --------------------------------------------------------------------------------------------------------------
      -    //
      -    // testing MinIndelFraction
      -    //
      -    // --------------------------------------------------------------------------------------------------------------
      -
      -    final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation
      -            + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030";
      -
      -    @Test
      -    public void testMinIndelFraction0() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                assessMinIndelFraction + " -minIndelFrac 0.0", 1,
      -                Arrays.asList("556c214366e82e4682e753ce93307a4e"));
      -        executeTest("test minIndelFraction 0.0", spec);
      -    }
      -
      -    @Test
      -    public void testMinIndelFraction25() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                assessMinIndelFraction + " -minIndelFrac 0.25", 1,
      -                Arrays.asList("1df02b805d9dfbd532fa3632875a989d"));
      -        executeTest("test minIndelFraction 0.25", spec);
      -    }
      -
      -    @Test
      -    public void testMinIndelFraction100() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                assessMinIndelFraction + " -minIndelFrac 1", 1,
      -                Arrays.asList("3f07efb768e08650a7ce333edd4f9a52"));
      -        executeTest("test minIndelFraction 1.0", spec);
      -    }
      -
           // --------------------------------------------------------------------------------------------------------------
           //
           // testing Ns in CIGAR
      @@ -483,41 +289,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
           @Test
           public void testNsInCigar() {
               WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
      -                Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6"));
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
      +                Arrays.asList("2ae3fd39c53a6954d32faed8703adfe8"));
               executeTest("test calling on reads with Ns in CIGAR", spec);
           }
      -
      -    // --------------------------------------------------------------------------------------------------------------
      -    //
      -    // testing reduced reads
      -    //
      -    // --------------------------------------------------------------------------------------------------------------
      -
      -    @Test
      -    public void testReducedBam() {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
      -                Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304"));
      -        executeTest("test calling on a ReducedRead BAM", spec);
      -    }
      -
      -    @Test
      -    public void testReducedBamSNPs() {
      -        testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f");
      -    }
      -
      -    @Test
      -    public void testReducedBamINDELs() {
      -        testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21");
      -    }
      -
      -
      -    private void testReducedCalling(final String model, final String md5) {
      -        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1,
      -                Arrays.asList(md5));
      -        executeTest("test calling on a ReducedRead BAM with " + model, spec);
      -    }
      -
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java
      new file mode 100644
      index 000000000..8256a8496
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java
      @@ -0,0 +1,126 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.genotyper;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import java.util.Arrays;
      +
      +public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
      +
      +    private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
      +
      +    // --------------------------------------------------------------------------------------------------------------
      +    //
      +    // testing normal calling
      +    //
      +    // --------------------------------------------------------------------------------------------------------------
      +    @Test
      +    public void testMultiSamplePilot1() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
      +                Arrays.asList("a6c224235c21b4af816b1512eb0624df"));
      +        executeTest("test MultiSample Pilot1", spec);
      +    }
      +
      +    @Test
      +    public void testWithAllelesPassedIn1() {
      +        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
      +                Arrays.asList("ebfcc3dd8c1788929cb50050c5d456df"));
      +        executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
      +    }
      +
      +    @Test
      +    public void testWithAllelesPassedIn2() {
      +        WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
      +                Arrays.asList("698e54aeae3130779d246b9480a4052c"));
      +        executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
      +    }
      +
      +    @Test
      +    public void testSingleSamplePilot2() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
      +                Arrays.asList("aaadb2a355d87344eabb6ac4495a11e4"));
      +        executeTest("test SingleSample Pilot2", spec);
      +    }
      +
      +    @Test
      +    public void testMultipleSNPAlleles() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
      +                Arrays.asList("09a1a4d4bf0289bcc5e8a958f783a989"));
      +        executeTest("test Multiple SNP alleles", spec);
      +    }
      +
      +    @Test
      +    public void testBadRead() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
      +                Arrays.asList("d915535c1458733f09f82670092fcab6"));
      +        executeTest("test bad read", spec);
      +    }
      +
      +    @Test
      +    public void testReverseTrim() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
      +                Arrays.asList("57a1bb44967988f2b7ae7779127990ae"));
      +        executeTest("test reverse trim", spec);
      +    }
      +
      +    @Test
      +    public void testMismatchedPLs() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
      +                Arrays.asList("3011c20165951ca43c8a4e86a5835dbd"));
      +        executeTest("test mismatched PLs", spec);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
      new file mode 100644
      index 000000000..f7ac87cda
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
      @@ -0,0 +1,87 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.genotyper;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import java.util.Arrays;
      +
      +public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
      +
      +    // --------------------------------------------------------------------------------------------------------------
      +    //
      +    // testing reduced reads
      +    //
      +    // --------------------------------------------------------------------------------------------------------------
      +
      +    @Test
      +    public void testReducedBam() {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
      +                Arrays.asList("e6565060b44a7804935973efcd56e596"));
      +        executeTest("test calling on a ReducedRead BAM", spec);
      +    }
      +
      +    @Test
      +    public void testReducedBamSNPs() {
      +        testReducedCalling("SNP", "ab776d74c41ce2b859e2b2466a76204a");
      +    }
      +
      +    @Test
      +    public void testReducedBamINDELs() {
      +        testReducedCalling("INDEL", "9a986b98ed014576ce923e07452447f4");
      +    }
      +
      +
      +    private void testReducedCalling(final String model, final String md5) {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
      +                Arrays.asList(md5));
      +        executeTest("test calling on a ReducedRead BAM with " + model, spec);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
      index c4f5befcf..2bdf5078d 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
      @@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
                   final int nPriorValues = 2*nSamples+1;
                   final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
                   final double[] humanPriors = new double[nPriorValues];
      -            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
      +            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList());
       
                   for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
                       for ( AFCalc model : calcs ) {
      @@ -575,6 +575,39 @@ public class AFCalcUnitTest extends BaseTest {
               return tests.toArray(new Object[][]{});
           }
       
      +
      +    @Test(enabled = true, dataProvider =  "Models")
      +    public void testNoPrior(final AFCalc model) {
      +        for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
      +            final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
      +
      +            final double[] flatPriors = new double[]{0.0,0.0,0.0};
      +            final double[] noPriors = new double[3];
      +            // test that function computeAlleleFrequency correctly operates when the flat prior option is set
      +            // computeAlleleFrequencyPriors takes linear priors
      +            final ArrayList inputPrior = new ArrayList();
      +            inputPrior.add(1.0/3);
      +            inputPrior.add(1.0/3);
      +            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior);
      +
      +            GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
      +            GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
      +            final AFCalcResult resultTrackerFlat = cfgFlatPrior.execute();
      +            final AFCalcResult resultTrackerNoPrior = cfgNoPrior.execute();
      +
      +            final double pRefWithNoPrior = AB.getLikelihoods().getAsVector()[0];
      +            final double pHetWithNoPrior = AB.getLikelihoods().getAsVector()[1]  - Math.log10(0.5);
      +            final double nonRefPost = Math.pow(10, pHetWithNoPrior) / (Math.pow(10, pRefWithNoPrior) + Math.pow(10, pHetWithNoPrior));
      +            final double log10NonRefPost = Math.log10(nonRefPost);
      +
      +            if ( ! Double.isInfinite(log10NonRefPost) ) {
      +                // check that the no-prior and flat-prior constructions yield same result
      +                Assert.assertEquals(resultTrackerFlat.getLog10PosteriorOfAFGT0(), resultTrackerNoPrior.getLog10PosteriorOfAFGT0());
      +            }
      +
      +        }
      +    }
      +
           @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
           public void testBiallelicPriors(final AFCalc model) {
       
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java
      index f4a6d5494..e6dea4d11 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java
      @@ -55,10 +55,13 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
       import net.sf.samtools.Cigar;
       import net.sf.samtools.CigarElement;
       import net.sf.samtools.CigarOperator;
      +import net.sf.samtools.SAMFileHeader;
       import org.broadinstitute.sting.BaseTest;
      -import org.broadinstitute.sting.utils.Haplotype;
      +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
       import org.broadinstitute.sting.utils.Utils;
       import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
       import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       import org.testng.Assert;
       import org.testng.annotations.DataProvider;
      @@ -67,161 +70,20 @@ import org.testng.annotations.Test;
       import java.util.*;
       
       public class DeBruijnAssemblerUnitTest extends BaseTest {
      +    private final static boolean DEBUG = false;
       
      -
      -    private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
      -        public byte[] sequence;
      -        public int KMER_LENGTH;
      -
      -        public MergeNodesWithNoVariationTestProvider(String seq, int kmer) {
      -            super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq));
      -            sequence = seq.getBytes();
      -            KMER_LENGTH = kmer;
      -        }
      -
      -        public DeBruijnAssemblyGraph expectedGraph() {
      -            DeBruijnVertex v = new DeBruijnVertex(sequence, KMER_LENGTH);
      -            DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
      -            graph.addVertex(v);
      -            return graph;
      -        }
      -
      -        public DeBruijnAssemblyGraph calcGraph() {
      -
      -            DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
      -            final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
      -            for (int i = 0; i < kmersInSequence - 1; i++) {
      -                // get the kmers
      -                final byte[] kmer1 = new byte[KMER_LENGTH];
      -                System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH);
      -                final byte[] kmer2 = new byte[KMER_LENGTH];
      -                System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH);
      -
      -                graph.addKmersToGraph(kmer1, kmer2, false);
      -            }
      -            DeBruijnAssembler.mergeNodes(graph);
      -            return graph;
      -        }
      -    }
      -
      -    @DataProvider(name = "MergeNodesWithNoVariationTestProvider")
      -    public Object[][] makeMergeNodesWithNoVariationTests() {
      -        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3);
      -        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4);
      -        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5);
      -        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6);
      -        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7);
      -        new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
      -        new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
      -        new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
      -
      -        return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class);
      -    }
      -
      -    @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true)
      -    public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) {
      -        logger.warn(String.format("Test: %s", cfg.toString()));
      -        Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph()));
      -    }
      -
      -    @Test(enabled = true)
      -    public void testPruneGraph() {
      -        DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
      -        DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph();
      -
      -        DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 1);
      -        DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 1);
      -        DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 1);
      -        DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 1);
      -        DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 1);
      -        DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 1);
      -
      -        graph.addVertex(v);
      -        graph.addVertex(v2);
      -        graph.addVertex(v3);
      -        graph.addVertex(v4);
      -        graph.addVertex(v5);
      -        graph.addVertex(v6);
      -        graph.addEdge(v, v2, new DeBruijnEdge(false, 1));
      -        graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
      -        graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
      -        graph.addEdge(v5, v6, new DeBruijnEdge(false, 2));
      -
      -        expectedGraph.addVertex(v2);
      -        expectedGraph.addVertex(v3);
      -        expectedGraph.addVertex(v4);
      -        expectedGraph.addVertex(v5);
      -        expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
      -        expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
      -        expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
      -
      -        DeBruijnAssembler.pruneGraph(graph, 2);
      -
      -        Assert.assertTrue(graphEquals(graph, expectedGraph));
      -
      -        graph = new DeBruijnAssemblyGraph();
      -        expectedGraph = new DeBruijnAssemblyGraph();
      -
      -        graph.addVertex(v);
      -        graph.addVertex(v2);
      -        graph.addVertex(v3);
      -        graph.addVertex(v4);
      -        graph.addVertex(v5);
      -        graph.addVertex(v6);
      -        graph.addEdge(v, v2, new DeBruijnEdge(true, 1));
      -        graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
      -        graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
      -
      -        expectedGraph.addVertex(v);
      -        expectedGraph.addVertex(v2);
      -        expectedGraph.addVertex(v3);
      -        expectedGraph.addVertex(v4);
      -        expectedGraph.addVertex(v5);
      -        expectedGraph.addEdge(v, v2, new DeBruijnEdge(true, 1));
      -        expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
      -        expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
      -        expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
      -
      -        DeBruijnAssembler.pruneGraph(graph, 2);
      -
      -        Assert.assertTrue(graphEquals(graph, expectedGraph));
      -    }
      -
      -    private boolean graphEquals(DeBruijnAssemblyGraph g1, DeBruijnAssemblyGraph g2) {
      -        if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) {
      -            return false;
      -        }
      -        for( DeBruijnEdge e1 : g1.edgeSet() ) {
      -            boolean found = false;
      -            for( DeBruijnEdge e2 : g2.edgeSet() ) {
      -                if( e1.equals(g1, e2, g2) ) { found = true; break; }
      -            }
      -            if( !found ) { return false; }
      -        }
      -        for( DeBruijnEdge e2 : g2.edgeSet() ) {
      -            boolean found = false;
      -            for( DeBruijnEdge e1 : g1.edgeSet() ) {
      -                if( e2.equals(g2, e1, g1) ) { found = true; break; }
      -            }
      -            if( !found ) { return false; }
      -        }
      -        return true;
      -    }
      -
      -    @Test(enabled = true)
      +    @Test(enabled = !DEBUG)
           public void testReferenceCycleGraph() {
               String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC";
               String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC";
      -        final DeBruijnAssemblyGraph g1 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false);
      -        final DeBruijnAssemblyGraph g2 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false);
      +        final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true));
      +        final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true));
       
               Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation.");
               Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation.");
           }
       
      -    @Test(enabled = true)
      +    @Test(enabled = !DEBUG)
           public void testLeftAlignCigarSequentially() {
               String preRefString = "GATCGATCGATC";
               String postRefString = "TTT";
      @@ -255,7 +117,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
                               String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString;
                               String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString;
       
      -                        Cigar calculatedCigar = DeBruijnAssembler.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0);
      +                        Cigar calculatedCigar = new DeBruijnAssembler().leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0);
                               Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!");
                           }
                       }
      @@ -263,4 +125,77 @@ public class DeBruijnAssemblerUnitTest extends BaseTest {
               }
           }
       
      +    private static class MockBuilder extends DeBruijnGraphBuilder {
      +        public final List addedPairs = new LinkedList();
      +
      +        private MockBuilder(final int kmerSize) {
      +            super(new DeBruijnGraph(kmerSize));
      +        }
      +
      +        @Override
      +        public void addKmerPair(Kmer kmerPair, int multiplicity) {
      +            logger.info("addKmerPair" + kmerPair);
      +            addedPairs.add(kmerPair);
      +        }
      +
      +        @Override
      +        public void flushKmersToGraph(boolean addRefEdges) {
      +            // do nothing
      +        }
      +    }
      +
      +    @DataProvider(name = "AddReadKmersToGraph")
      +    public Object[][] makeAddReadKmersToGraphData() {
      +        List tests = new ArrayList();
      +
      +        // this functionality can be adapted to provide input data for whatever you might want in your data
      +        final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
      +        final int readLen = bases.length();
      +        final List allBadStarts = new ArrayList(readLen);
      +        for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i);
      +
      +        for ( final int kmerSize : Arrays.asList(3, 4, 5) ) {
      +            for ( final int nBadQuals : Arrays.asList(0, 1, 2) ) {
      +                for ( final List badStarts : Utils.makePermutations(allBadStarts, nBadQuals, false) ) {
      +                    tests.add(new Object[]{bases, kmerSize, badStarts});
      +                }
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "AddReadKmersToGraph")
      +    public void testAddReadKmersToGraph(final String bases, final int kmerSize, final List badQualsSites) {
      +        final int readLen = bases.length();
      +        final DeBruijnAssembler assembler = new DeBruijnAssembler();
      +        final MockBuilder builder = new MockBuilder(kmerSize);
      +
      +        final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
      +
      +        final byte[] quals = Utils.dupBytes((byte)20, bases.length());
      +        for ( final int badSite : badQualsSites ) quals[badSite] = 0;
      +        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLen);
      +        read.setReadBases(bases.getBytes());
      +        read.setBaseQualities(quals);
      +
      +        final Set expectedBases = new HashSet();
      +        final Set expectedStarts = new LinkedHashSet();
      +        for ( int i = 0; i < readLen; i++) {
      +            boolean good = true;
      +            for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing
      +                good &= i + j < readLen && quals[i+j] >= assembler.getMinBaseQualityToUseInAssembly();
      +            }
      +            if ( good ) {
      +                expectedStarts.add(i);
      +                expectedBases.add(bases.substring(i, i + kmerSize + 1));
      +            }
      +        }
      +
      +        assembler.addReadKmersToGraph(builder, Arrays.asList(read));
      +        Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size());
      +        for ( final Kmer addedKmer : builder.addedPairs ) {
      +            Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases);
      +        }
      +    }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java
      index 5a1497236..a13618dae 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java
      @@ -47,6 +47,7 @@
       package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
       
       import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
       import org.testng.Assert;
       import org.testng.annotations.DataProvider;
       import org.testng.annotations.Test;
      @@ -75,7 +76,7 @@ public class DeBruijnAssemblyGraphUnitTest {
               }
       
               public byte[] calculatedReferenceBytes() {
      -            DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
      +            DeBruijnGraph graph = new DeBruijnGraph();
                   graph.addSequenceToGraph(refSequence, KMER_LENGTH, true);
                   if( altSequence.length > 0 ) {
                       graph.addSequenceToGraph(altSequence, KMER_LENGTH, false);
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
      index 8b09e91ae..8633a1d9d 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
      @@ -56,9 +56,10 @@ import net.sf.picard.reference.ReferenceSequenceFile;
       import org.broadinstitute.sting.BaseTest;
       import org.broadinstitute.sting.utils.*;
       import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
      +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
       import org.broadinstitute.variant.variantcontext.Allele;
       import org.broadinstitute.variant.variantcontext.VariantContext;
      -import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
       import org.testng.Assert;
       import org.testng.annotations.BeforeClass;
       import org.testng.annotations.DataProvider;
      @@ -198,7 +199,8 @@ public class GenotypingEngineUnitTest extends BaseTest {
               
               public Map calcAlignment() {
                   final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap);
      -            return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
      +            final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar());
      +            return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
               }
           }
       
      @@ -277,148 +279,6 @@ public class GenotypingEngineUnitTest extends BaseTest {
               Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap));
           }
       
      -    /**
      -     * Tests that we get the right values from the R^2 calculation
      -     */
      -    @Test
      -    public void testCalculateR2LD() {
      -        logger.warn("Executing testCalculateR2LD");
      -
      -        Assert.assertEquals(GenotypingEngine.calculateR2LD(1,1,1,1), 0.0, 0.00001);
      -        Assert.assertEquals(GenotypingEngine.calculateR2LD(100,100,100,100), 0.0, 0.00001);
      -        Assert.assertEquals(GenotypingEngine.calculateR2LD(1,0,0,1), 1.0, 0.00001);
      -        Assert.assertEquals(GenotypingEngine.calculateR2LD(100,0,0,100), 1.0, 0.00001);
      -        Assert.assertEquals(GenotypingEngine.calculateR2LD(1,2,3,4), (0.1 - 0.12) * (0.1 - 0.12) / (0.3 * 0.7 * 0.4 * 0.6), 0.00001);
      -    }
      -
      -    @Test
      -    public void testCreateMergedVariantContext() {
      -        logger.warn("Executing testCreateMergedVariantContext");
      -
      -        final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes();
      -        final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length);
      -
      -        // SNP + SNP = simple MNP
      -        VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      -        VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make();
      -        VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make();
      -        VariantContext mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // SNP + ref + SNP = MNP with ref base gap
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // insertion + SNP
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // SNP + insertion
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // deletion + SNP
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // SNP + deletion
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // insertion + deletion = MNP
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // insertion + deletion
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // insertion + insertion
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // deletion + deletion
      -        thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // deletion + insertion (abutting)
      -        thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -
      -        // complex + complex
      -        thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make();
      -        nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make();
      -        truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make();
      -        mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      -        logger.warn(truthVC + " == " + mergedVC);
      -        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      -        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      -        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      -    }
      -    
           /**
            * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext
            */
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
      new file mode 100644
      index 000000000..d3f3a9936
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
      @@ -0,0 +1,99 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import java.util.Arrays;
      +
      +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM;
      +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF;
      +
      +public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
      +
      +    private void HCTestComplexVariants(String bam, String args, String md5) {
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
      +        final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      +        executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
      +    }
      +
      +    @Test
      +    public void testHaplotypeCallerMultiSampleComplex1() {
      +        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "27db36467d40c3cde201f5826e959d78");
      +    }
      +
      +    private void HCTestSymbolicVariants(String bam, String args, String md5) {
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
      +        final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      +        executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
      +    }
      +
      +    // TODO -- need a better symbolic allele test
      +    @Test
      +    public void testHaplotypeCallerSingleSampleSymbolic() {
      +        HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554");
      +    }
      +
      +    private void HCTestComplexGGA(String bam, String args, String md5) {
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
      +        final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      +        executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
      +    }
      +
      +    @Test
      +    public void testHaplotypeCallerMultiSampleGGAComplex() {
      +        HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
      +                "ed3b577e6f7d68bba6774a62d9df9cd9");
      +    }
      +
      +    @Test
      +    public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
      +        HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
      +                "a594a28d8053c3e969c39de81a9d03d6");
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
      index 856ef58a1..50165bd01 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
      @@ -46,37 +46,49 @@
       
       package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
       
      +import net.sf.picard.reference.IndexedFastaSequenceFile;
      +import org.broad.tribble.TribbleIndexedFeatureReader;
       import org.broadinstitute.sting.WalkerTest;
      +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.GenomeLocParser;
      +import org.broadinstitute.sting.utils.collections.Pair;
      +import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
      +import org.broadinstitute.variant.vcf.VCFCodec;
       import org.testng.annotations.Test;
       
      -import java.util.Arrays;
      -import java.util.Collections;
      +import java.io.File;
      +import java.io.FileNotFoundException;
      +import java.io.IOException;
      +import java.util.*;
       
       public class HaplotypeCallerIntegrationTest extends WalkerTest {
           final static String REF = b37KGReference;
      -    final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
      -    final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
      -    final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
      -    final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
      -    final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
      +    final static String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
      +    final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
      +    final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
      +    final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
      +    final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam";
      +    final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
       
           private void HCTest(String bam, String args, String md5) {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
               final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
               executeTest("testHaplotypeCaller: args=" + args, spec);
           }
       
           @Test
           public void testHaplotypeCallerMultiSample() {
      -        HCTest(CEUTRIO_BAM, "", "aac5517a0a64ad291b6b00825d982f7f");
      +        HCTest(CEUTRIO_BAM, "", "aeab5f0d40852e6332b96481981a0e46");
           }
       
           @Test
           public void testHaplotypeCallerSingleSample() {
      -        HCTest(NA12878_BAM, "", "3bfab723fb0f3a65998d82152b67ed15");
      +        HCTest(NA12878_BAM, "", "18d5671d8454e8a0c05ee5f6e9fabfe3");
           }
       
      -    @Test(enabled = false)
      +    @Test(enabled = false) // can't annotate the rsID's yet
           public void testHaplotypeCallerSingleSampleWithDbsnp() {
               HCTest(NA12878_BAM, "-D " + b37dbSNP132, "");
           }
      @@ -84,59 +96,60 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
           @Test
           public void testHaplotypeCallerMultiSampleGGA() {
               HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
      -                "283524b3e3397634d4cf0dc2b8723002");
      -    }
      -
      -    private void HCTestComplexGGA(String bam, String args, String md5) {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
      -        final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      -        executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
      +                "28c3b1f276ec8198801aafe880e40fb6");
           }
       
           @Test
      -    public void testHaplotypeCallerMultiSampleGGAComplex() {
      -        HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
      -                "417174e043dbb8b86cc3871da9b50536");
      -    }
      -
      -    @Test
      -    public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
      -        HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
      -                "f2df7a8f53ce449e4a8e8f8496e7c745");
      -    }
      -
      -    private void HCTestComplexVariants(String bam, String args, String md5) {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
      -        final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      -        executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
      -    }
      -
      -    @Test
      -    public void testHaplotypeCallerMultiSampleComplex() {
      -        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d");
      -    }
      -
      -    private void HCTestSymbolicVariants(String bam, String args, String md5) {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
      -        final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      -        executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
      -    }
      -
      -    // TODO -- need a better symbolic allele test
      -    @Test
      -    public void testHaplotypeCallerSingleSampleSymbolic() {
      -        HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "56f2ef9acc6c0d267cf2b7a447d87fb7");
      +    public void testHaplotypeCallerInsertionOnEdgeOfContig() {
      +        HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
           }
       
           private void HCTestIndelQualityScores(String bam, String args, String md5) {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
               final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
               executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
           }
       
           @Test
           public void testHaplotypeCallerSingleSampleIndelQualityScores() {
      -        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f1f867dbbe3747f16a0d9e5f11e6ed64");
      +        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "bac6f98e910290722df28da44b41f06f");
      +    }
      +
      +    private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
      +        try {
      +            final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
      +            final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
      +
      +            final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
      +            final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
      +            for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
      +                if( containsDuplicateRecord(vcf, parser) ) {
      +                    throw new IllegalStateException("Duplicate records detected but there should be none.");
      +                }
      +            }
      +        } catch( FileNotFoundException e ) {
      +            throw new IllegalStateException("Could not find the b37 reference file.");
      +        }
      +    }
      +
      +    private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) {
      +        final List<Pair<GenomeLoc, GenotypingEngine.Event>> VCs = new ArrayList<Pair<GenomeLoc, GenotypingEngine.Event>>();
      +        try {
      +            for( final VariantContext vc :  GATKVCFUtils.readVCF(vcf).getSecond() ) {
      +                VCs.add(new Pair<GenomeLoc, GenotypingEngine.Event>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc)));
      +            }
      +        } catch( IOException e ) {
      +            throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read.");
      +        }
      +
      +        final Set<Pair<GenomeLoc, GenotypingEngine.Event>> VCsAsSet = new HashSet<Pair<GenomeLoc, GenotypingEngine.Event>>(VCs);
      +        return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events.
      +    }
      +
      +
      +    @Test
      +    public void testHaplotypeCallerNearbySmallIntervals() {
      +        HCTestNearbySmallIntervals(NA12878_BAM, "", "65e7b1b72a2411d6360138049914aa3a");
           }
       
           // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
      @@ -145,22 +158,22 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
           // any of the calls in that region because it is so messy.
           @Test
           public void HCTestProblematicReadsModifiedInActiveRegions() {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
      -        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ccd30e226f097a40cdeebaa035a290a7"));
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
      +        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
               executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
           }
       
           @Test
           public void HCTestStructuralIndels() {
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
      -        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a17e95c1191e3aef7892586fe38ca050"));
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
      +        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cb190c935541ebb9f660f713a882b922"));
               executeTest("HCTestStructuralIndels: ", spec);
           }
       
           @Test
           public void HCTestDoesNotFailOnBadRefBase() {
               // don't care about the output - just want to make sure it doesn't fail
      -        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
      +        final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
               final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.<String>emptyList());
               executeTest("HCTestDoesNotFailOnBadRefBase: ", spec);
           }
      @@ -174,16 +187,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
           @Test
           public void HCTestReducedBam() {
               WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
      -                Arrays.asList("adb08cb25e902cfe0129404a682b2169"));
      +                "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
      +                Arrays.asList("0df626cd0d76aca8a05a545d0b36bf23"));
               executeTest("HC calling on a ReducedRead BAM", spec);
           }
       
           @Test
           public void testReducedBamWithReadsNotFullySpanningDeletion() {
               WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      -                "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
      -                Arrays.asList("6debe567cd5ed7eb5756b6605a151f56"));
      +                "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
      +                Arrays.asList("8adfa8a27a312760dab50787da595c57"));
               executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
           }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java
      new file mode 100644
      index 000000000..27b429353
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java
      @@ -0,0 +1,85 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
      +import org.testng.annotations.Test;
      +
      +import java.util.Arrays;
      +import java.util.Collections;
      +
      +public class HaplotypeCallerModesIntegrationTest extends WalkerTest {
      +    // --------------------------------------------------------------------------------------------------------------
      +    //
      +    // testing that writing a BAM works
      +    //
      +    // I don't really care about the MD5s, so I'm just not providing them here, so they don't have to be
      +    // updated.  These tests are basically ensuring that the code doesn't just randomly blow up.
      +    //
      +    // TODO -- what i'd really like to ensure here isn't the MD5 but that the BAMs can be read by the GATK or IGV
      +    //
      +    // --------------------------------------------------------------------------------------------------------------
      +
      +    @Test
      +    public void HCTestBamWriterCalledHaplotypes() {
      +        HCTestBamWriter(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, ""); // current MD5 = 9a2b6157f14b44b872a77f4e75c56023
      +    }
      +
      +    @Test
      +    public void HCTestBamWriterAllHaplotypes() {
      +        HCTestBamWriter(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, ""); // current MD5 = 06d885d82be81b8eef13bbfcd8041189
      +    }
      +
      +    public void HCTestBamWriter(final HaplotypeBAMWriter.Type type, final String md5) {
      +        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
      +                "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o /dev/null " +
      +                        "-bamout %s -L 20:10,000,000-10,010,000 -bamWriterType " + type, 1,
      +                Arrays.asList(md5));
      +        executeTest("HC writing bams with mode " + type, spec);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java
      deleted file mode 100644
      index 53400b790..000000000
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java
      +++ /dev/null
      @@ -1,246 +0,0 @@
      -/*
      -*  By downloading the PROGRAM you agree to the following terms of use:
      -*  
      -*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      -*  
      -*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      -*  
      -*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      -*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      -*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      -*  
      -*  1. DEFINITIONS
      -*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      -*  
      -*  2. LICENSE
      -*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      -*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      -*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      -*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      -*  
      -*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      -*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      -*  Copyright 2012 Broad Institute, Inc.
      -*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      -*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      -*  
      -*  4. INDEMNIFICATION
      -*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      -*  
      -*  5. NO REPRESENTATIONS OR WARRANTIES
      -*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      -*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      -*  
      -*  6. ASSIGNMENT
      -*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      -*  
      -*  7. MISCELLANEOUS
      -*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      -*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      -*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      -*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      -*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      -*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      -*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      -*/
      -
      -package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      -
      -import net.sf.samtools.Cigar;
      -import net.sf.samtools.CigarElement;
      -import net.sf.samtools.CigarOperator;
      -import org.apache.commons.lang.ArrayUtils;
      -import org.broadinstitute.sting.utils.Utils;
      -import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      -import org.jgrapht.graph.DefaultDirectedGraph;
      -import org.testng.Assert;
      -import org.testng.annotations.DataProvider;
      -import org.testng.annotations.Test;
      -
      -import java.util.ArrayList;
      -import java.util.Arrays;
      -import java.util.List;
      -
      -/**
      - * Created with IntelliJ IDEA.
      - * User: rpoplin
      - * Date: 1/31/13
      - */
      -
      -public class KBestPathsUnitTest {
      -    @DataProvider(name = "BasicBubbleDataProvider")
      -    public Object[][] makeBasicBubbleDataProvider() {
      -        List tests = new ArrayList();
      -        for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
      -            for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
      -                tests.add(new Object[]{refBubbleLength, altBubbleLength});
      -            }
      -        }
      -        return tests.toArray(new Object[][]{});
      -    }
      -
      -    @Test(dataProvider = "BasicBubbleDataProvider")
      -    public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) {
      -        // Construct the assembly graph
      -        DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
      -        final int KMER_LENGTH = 3;
      -        final String preRef = "ATGG";
      -        final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGGGC";
      -
      -        DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH);
      -        DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
      -        DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
      -        DeBruijnVertex v3 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH);
      -
      -        graph.addVertex(v);
      -        graph.addVertex(v2Ref);
      -        graph.addVertex(v2Alt);
      -        graph.addVertex(v3);
      -        graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10));
      -        graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10));
      -        graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5));
      -
      -        // Construct the test path
      -        KBestPaths.Path path = new KBestPaths.Path(v, graph);
      -        path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt));
      -        path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3));
      -
      -        // Construct the actual cigar string implied by the test path
      -        Cigar expectedCigar = new Cigar();
      -        expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
      -        if( refBubbleLength > altBubbleLength ) {
      -            expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
      -            expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M));
      -        } else if ( refBubbleLength < altBubbleLength ) {
      -            expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M));
      -            expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
      -        } else {
      -            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      -        }
      -        expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
      -
      -        Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
      -    }
      -
      -
      -    @DataProvider(name = "TripleBubbleDataProvider")
      -    public Object[][] makeTripleBubbleDataProvider() {
      -        List tests = new ArrayList();
      -        for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
      -            for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
      -                for ( final boolean offRefBeginning : Arrays.asList(false) ) {
      -                    for ( final boolean offRefEnding : Arrays.asList(true, false) ) {
      -                        tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding});
      -                    }
      -                }
      -            }
      -        }
      -        return tests.toArray(new Object[][]{});
      -    }
      -
      -    @Test(dataProvider = "TripleBubbleDataProvider")
      -    public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) {
      -        // Construct the assembly graph
      -        DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
      -        final int KMER_LENGTH = 3;
      -        final String preAltOption = "ATCGATCGATCGATCGATCG";
      -        final String postAltOption = "CCCC";
      -        final String preRef = "ATGG";
      -        final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGCCG";
      -        final String midRef1 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "TTCCT";
      -        final String midRef2 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "CCCAAAAAAAAAAAA";
      -
      -        DeBruijnVertex preV = new DeBruijnVertex(preAltOption.getBytes(), KMER_LENGTH);
      -        DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH);
      -        DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
      -        DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
      -        DeBruijnVertex v4Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'C', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
      -        DeBruijnVertex v4Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'C', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
      -        DeBruijnVertex v6Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'G', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH);
      -        DeBruijnVertex v6Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'G', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH);
      -        DeBruijnVertex v3 = new DeBruijnVertex(midRef1.getBytes(), KMER_LENGTH);
      -        DeBruijnVertex v5 = new DeBruijnVertex(midRef2.getBytes(), KMER_LENGTH);
      -        DeBruijnVertex v7 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH);
      -        DeBruijnVertex postV = new DeBruijnVertex(postAltOption.getBytes(), KMER_LENGTH);
      -
      -        graph.addVertex(preV);
      -        graph.addVertex(v);
      -        graph.addVertex(v2Ref);
      -        graph.addVertex(v2Alt);
      -        graph.addVertex(v3);
      -        graph.addVertex(v4Ref);
      -        graph.addVertex(v4Alt);
      -        graph.addVertex(v5);
      -        graph.addVertex(v6Ref);
      -        graph.addVertex(v6Alt);
      -        graph.addVertex(v7);
      -        graph.addVertex(postV);
      -        graph.addEdge(preV, v, new DeBruijnEdge(false, 1));
      -        graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10));
      -        graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10));
      -        graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v3, v4Ref, new DeBruijnEdge(true, 10));
      -        graph.addEdge(v4Ref, v5, new DeBruijnEdge(true, 10));
      -        graph.addEdge(v3, v4Alt, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v4Alt, v5, new DeBruijnEdge(false, 5));
      -        graph.addEdge(v5, v6Ref, new DeBruijnEdge(true, 11));
      -        graph.addEdge(v6Ref, v7, new DeBruijnEdge(true, 11));
      -        graph.addEdge(v5, v6Alt, new DeBruijnEdge(false, 55));
      -        graph.addEdge(v6Alt, v7, new DeBruijnEdge(false, 55));
      -        graph.addEdge(v7, postV, new DeBruijnEdge(false, 1));
      -
      -        // Construct the test path
      -        KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? preV : v), graph);
      -        if( offRefBeginning ) {
      -            path = new KBestPaths.Path(path, graph.getEdge(preV, v));
      -        }
      -        path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt));
      -        path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3));
      -        path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref));
      -        path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5));
      -        path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt));
      -        path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7));
      -        if( offRefEnding ) {
      -            path = new KBestPaths.Path(path, graph.getEdge(v7,postV));
      -        }
      -
      -        // Construct the actual cigar string implied by the test path
      -        Cigar expectedCigar = new Cigar();
      -        if( offRefBeginning ) {
      -            expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I));
      -        }
      -        expectedCigar.add(new CigarElement(preRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
      -        // first bubble
      -        if( refBubbleLength > altBubbleLength ) {
      -            expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
      -            expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M));
      -        } else if ( refBubbleLength < altBubbleLength ) {
      -            expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M));
      -            expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
      -        } else {
      -            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      -        }
      -        expectedCigar.add(new CigarElement(midRef1.length() - (KMER_LENGTH - 1), CigarOperator.M));
      -        // second bubble is ref path
      -        expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      -        expectedCigar.add(new CigarElement(midRef2.length() - (KMER_LENGTH - 1), CigarOperator.M));
      -        // third bubble
      -        if( refBubbleLength > altBubbleLength ) {
      -            expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
      -            expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M));
      -        } else if ( refBubbleLength < altBubbleLength ) {
      -            expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M));
      -            expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
      -        } else {
      -            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      -        }
      -        expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M));
      -        if( offRefEnding ) {
      -            expectedCigar.add(new CigarElement(postAltOption.length() - (KMER_LENGTH - 1), CigarOperator.I));
      -        }
      -
      -        Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
      -    }
      -}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java
      new file mode 100644
      index 000000000..c049121a3
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java
      @@ -0,0 +1,84 @@
      +/*
      + *  By downloading the PROGRAM you agree to the following terms of use:
      + *  
      + *  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      + *  
      + *  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      + *  
      + *  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      + *  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      + *  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      + *  
      + *  1. DEFINITIONS
      + *  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      + *  
      + *  2. LICENSE
      + *  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      + *  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      + *  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      + *  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      + *  
      + *  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      + *  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      + *  Copyright 2012 Broad Institute, Inc.
      + *  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      + *  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      + *  
      + *  4. INDEMNIFICATION
      + *  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      + *  
      + *  5. NO REPRESENTATIONS OR WARRANTIES
      + *  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      + *  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      + *  
      + *  6. ASSIGNMENT
      + *  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      + *  
      + *  7. MISCELLANEOUS
      + *  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      + *  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      + *  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      + *  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      + *  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      + *  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      + *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      + */
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.Test;
      +
      +public class KMerCounterCaseFixUnitTest extends BaseTest {
      +    @Test
      +    public void testMyData() {
      +        final KMerCounter counter = new KMerCounter(3);
      +
      +        Assert.assertNotNull(counter.toString());
      +
      +        counter.addKmers(
      +                "ATG", "ATG", "ATG", "ATG",
      +                "ACC", "ACC", "ACC",
      +                "AAA", "AAA",
      +                "CTG",
      +                "NNA",
      +                "CCC"
      +        );
      +
      +        testCounting(counter, "ATG", 4);
      +        testCounting(counter, "ACC", 3);
      +        testCounting(counter, "AAA", 2);
      +        testCounting(counter, "CTG", 1);
      +        testCounting(counter, "NNA", 1);
      +        testCounting(counter, "CCC", 1);
      +        testCounting(counter, "NNN", 0);
      +        testCounting(counter, "NNC", 0);
      +
      +        Assert.assertNotNull(counter.toString());
      +    }
      +
      +    private void testCounting(final KMerCounter counter, final String in, final int expectedCount) {
      +        Assert.assertEquals(counter.getKmerCount(new Kmer(in)), expectedCount);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java
      new file mode 100644
      index 000000000..989c38628
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java
      @@ -0,0 +1,133 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.List;
      +
      +public class KmerUnitTest extends BaseTest {
      +    @DataProvider(name = "KMerCreationData")
      +    public Object[][] makeKMerCreationData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
      +
      +        final String bases = "ACGTAACCGGTTAAACCCGGGTTT";
      +        for ( int start = 0; start < bases.length(); start++ ) {
      +            for ( int length = 1; start + length < bases.length(); length++ ) {
      +                final String myBases = bases.substring(start, start+length);
      +                tests.add(new Object[]{bases.getBytes(), start, length, myBases});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "KMerCreationData")
      +    public void testFullConstructor(final byte[] allBases, final int start, final int length, final String expected) {
      +        testKmerCreation(new Kmer(allBases, start, length), start, length, expected);
      +    }
      +
      +    @Test(dataProvider = "KMerCreationData")
      +    public void testCopyConstructor(final byte[] allBases, final int start, final int length, final String expected) {
      +        testKmerCreation(new Kmer(new Kmer(allBases, start, length)), start, length, expected);
      +    }
      +
      +    @Test(dataProvider = "KMerCreationData")
      +    public void testByteConstructor(final byte[] allBases, final int start, final int length, final String expected) {
      +        testKmerCreation(new Kmer(Arrays.copyOfRange(allBases, start, start + length)), 0, length, expected);
      +    }
      +
      +    @Test(dataProvider = "KMerCreationData")
      +    public void testStringConstructor(final byte[] allBases, final int start, final int length, final String expected) {
      +        testKmerCreation(new Kmer(new String(Arrays.copyOfRange(allBases, start, start + length))), 0, length, expected);
      +    }
      +
      +    private void testKmerCreation(final Kmer kmer, final int start, final int length, final String expected) {
      +        Assert.assertEquals(kmer.start, start);
      +        Assert.assertEquals(kmer.length(), length);
      +        Assert.assertEquals(new String(kmer.bases()), expected);
      +
      +        // check that the caching is working by calling again
      +        Assert.assertEquals(kmer.start, 0);
      +        Assert.assertEquals(kmer.length(), length);
      +        Assert.assertEquals(new String(kmer.bases()), expected);
      +    }
      +
      +    @Test
      +    public void testEquals() {
      +        final byte[] bases = "ACGTACGT".getBytes();
      +        final Kmer eq1 = new Kmer(bases, 0, 3);
      +        final Kmer eq2 = new Kmer(bases, 4, 3);
      +        final Kmer eq3 = new Kmer(new Kmer(bases, 4, 3));
      +        final Kmer eq4 = new Kmer(new Kmer(bases, 4, 3).bases());
      +        final Kmer neq = new Kmer(bases, 1, 3);
      +
      +//        for ( final Kmer eq : Arrays.asList(eq1, eq2) ) { // TODO -- deal with me
      +        for ( final Kmer eq : Arrays.asList(eq1, eq2, eq3, eq4) ) {
      +            Assert.assertEquals(eq1, eq, "Should have been equal but wasn't: " + eq1.hash + " vs " + eq.hash); // , "should be equals " + eq1 + " with " + eq);
      +            Assert.assertEquals(eq1.hashCode(), eq.hashCode());
      +            Assert.assertNotEquals(eq, neq, "incorrectly equals " + eq + " with " + neq);
      +        }
      +    }
      +
      +    @Test
      +    public void testSubkmer() {
      +        final String bases = "ACGT";
      +        final Kmer one = new Kmer(bases.getBytes());
      +
      +        for ( int start = 0; start < bases.length(); start++ ) {
      +            for ( int length = 0; start + length < bases.length(); length++ ) {
      +                Assert.assertEquals(new String(one.subKmer(start,length).bases()), bases.substring(start, start+length));
      +            }
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java
      index 58f9a2e74..48c9d3c1a 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java
      @@ -53,14 +53,10 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
        */
       
       import org.broadinstitute.sting.BaseTest;
      -import org.broadinstitute.sting.utils.Haplotype;
       import org.broadinstitute.sting.utils.MathUtils;
       import org.testng.Assert;
      -import org.testng.annotations.DataProvider;
       import org.testng.annotations.Test;
       
      -import java.util.*;
      -
       /**
        * Unit tests for LikelihoodCalculationEngine
        */
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java
      similarity index 74%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java
      rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java
      index c94130d18..7df6ee6c8 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java
      @@ -44,88 +44,78 @@
       *  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
       */
       
      -package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
       
      -import net.sf.samtools.SAMFileHeader;
       import org.broadinstitute.sting.BaseTest;
      -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
      -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
      -import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
       import org.testng.annotations.Test;
       
      +import java.util.ArrayList;
       import java.util.Arrays;
      -import java.util.Random;
      +import java.util.Collections;
      +import java.util.List;
       
      -public class SyntheticReadUnitTest extends BaseTest {
      -    final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1);
      -    final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic");
      -    final String artificialContig = "1";
      -    final int artificialContigIndex = 0;
      -    final String artificialReadName = "synth";
      -    final int artificialRefStart = 1;
      -    final double artificialMappingQuality = 60;
      +public class BaseEdgeUnitTest extends BaseTest {
      +    @DataProvider(name = "EdgeCreationData")
      +    public Object[][] makeMyDataProvider() {
+        List<Object[]> tests = new ArrayList<Object[]>();
       
      -    final Random random = new Random(8854875);
      -
      -
      -@Test
      -public void testBaseCounts() {
      -        BaseIndex [] bases = new BaseIndex[] {BaseIndex.A,BaseIndex.A,BaseIndex.A,BaseIndex.A};
      -        Byte[] quals = new Byte[] {20, 20, 20, 20 };
      -
      -        TestRead [] testReads = new TestRead [] {
      -                new TestRead(bases, quals, new Byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}),
      -                new TestRead(bases, quals, new Byte[] {1, 100, 100, 0},     new byte [] {1, 99, 99, -1}),
      -                new TestRead(bases, quals, new Byte[] {127, 100, 0, 1},     new byte [] {127, -27, -127, -126}),
      -                new TestRead(bases, quals, new Byte[] {1, 127, 51, 126},    new byte [] {1, 126, 50, 125})};
      -
      -        for (TestRead testRead : testReads) {
      -            SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
      -            Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
      +        // this functionality can be adapted to provide input data for whatever you might want in your data
      +        for ( final int multiplicity : Arrays.asList(1, 2, 3) ) {
      +            for ( final boolean isRef : Arrays.asList(true, false) ) {
      +                tests.add(new Object[]{isRef, multiplicity});
      +            }
               }
      -}
       
      -private class TestRead {
      -    BaseIndex[] bases;
      -    Byte[] quals;
      -    Byte[] insQuals;
      -    Byte[] delQuals;
      -    Byte[] counts;
      -    byte [] expectedCounts;
      -
      -    private TestRead(BaseIndex[] bases, Byte[] quals, Byte[] counts, byte[] expectedCounts) {
      -        this.bases = bases;
      -        this.quals = quals;
      -        this.insQuals = quals;
      -        this.delQuals = quals;
      -        this.counts = counts;
      -        this.expectedCounts = expectedCounts;
      +        return tests.toArray(new Object[][]{});
           }
       
      -    public BaseIndex[] getBases() {
      -        return bases;
      +    @Test(dataProvider = "EdgeCreationData")
      +    public void testBasic(final boolean isRef, final int mult) {
      +        final BaseEdge e = new BaseEdge(isRef, mult);
      +        Assert.assertEquals(e.isRef(), isRef);
      +        Assert.assertEquals(e.getMultiplicity(), mult);
      +
      +        e.setIsRef(!isRef);
      +        Assert.assertEquals(e.isRef(), !isRef);
      +
      +        e.setMultiplicity(mult + 1);
      +        Assert.assertEquals(e.getMultiplicity(), mult + 1);
      +
      +        final BaseEdge copy = new BaseEdge(e);
      +        Assert.assertEquals(copy.isRef(), e.isRef());
      +        Assert.assertEquals(copy.getMultiplicity(), e.getMultiplicity());
           }
       
      -    public Byte[] getQuals() {
      -        return quals;
      +    @Test
      +    public void testEdgeWeightComparator() {
      +        final BaseEdge e10 = new BaseEdge(false, 10);
      +        final BaseEdge e5 = new BaseEdge(true, 5);
      +        final BaseEdge e2 = new BaseEdge(false, 2);
      +        final BaseEdge e1 = new BaseEdge(false, 1);
      +
+        final List<BaseEdge> edges = new ArrayList<BaseEdge>(Arrays.asList(e1, e2, e5, e10));
      +        Collections.sort(edges, new BaseEdge.EdgeWeightComparator());
      +        Assert.assertEquals(edges.get(0), e10);
      +        Assert.assertEquals(edges.get(1), e5);
      +        Assert.assertEquals(edges.get(2), e2);
      +        Assert.assertEquals(edges.get(3), e1);
           }
       
      -    public Byte[] getInsQuals() {
      -        return insQuals;
      -    }
      -
      -    public Byte[] getDelQuals() {
      -        return delQuals;
      -    }
      -
      -    public Byte[] getCounts() {
      -        return counts;
      -    }
      -
      -    public byte[] getExpectedCounts() {
      -        return expectedCounts;
      +    @Test
      +    public void testMax() {
      +        for ( final boolean firstIsRef : Arrays.asList(true, false) ) {
      +            for ( final boolean secondIsRef : Arrays.asList(true, false) ) {
      +                for ( final int firstMulti : Arrays.asList(1, 4) ) {
      +                    for ( final int secondMulti : Arrays.asList(2, 3) ) {
      +                        final BaseEdge expected = new BaseEdge(firstIsRef || secondIsRef, Math.max(firstMulti, secondMulti));
      +                        final BaseEdge actual = new BaseEdge(firstIsRef, firstMulti).max(new BaseEdge(secondIsRef, secondMulti));
      +                        Assert.assertEquals(actual.getMultiplicity(), expected.getMultiplicity());
      +                        Assert.assertEquals(actual.isRef(), expected.isRef());
      +                    }
      +                }
      +            }
      +        }
           }
       }
      -
      -}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
      new file mode 100644
      index 000000000..c829488ba
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java
      @@ -0,0 +1,315 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.BeforeMethod;
      +import org.testng.annotations.Test;
+import java.util.Arrays;
      +
      +import java.io.File;
      +import java.util.*;
      +
      +public class BaseGraphUnitTest extends BaseTest {
      +    SeqGraph graph;
      +    SeqVertex v1, v2, v3, v4, v5;
      +
      +    @BeforeMethod
      +    public void setUp() throws Exception {
      +        graph = new SeqGraph();
      +
      +        v1 = new SeqVertex("A");
      +        v2 = new SeqVertex("C");
      +        v3 = new SeqVertex("C");
      +        v4 = new SeqVertex("C");
      +        v5 = new SeqVertex("C");
      +
      +        graph.addVertices(v1, v2, v3, v4, v5);
      +        graph.addEdge(v1, v2);
      +        graph.addEdge(v2, v4);
      +        graph.addEdge(v3, v2);
      +        graph.addEdge(v2, v3);
      +        graph.addEdge(v4, v5);
      +    }
      +
      +    @Test
      +    public void testIncomingAndOutgoingVertices() throws Exception {
      +        assertVertexSetEquals(graph.outgoingVerticesOf(v1), v2);
      +        assertVertexSetEquals(graph.incomingVerticesOf(v1));
      +
      +        assertVertexSetEquals(graph.outgoingVerticesOf(v2), v3, v4);
      +        assertVertexSetEquals(graph.incomingVerticesOf(v2), v1, v3);
      +
      +        assertVertexSetEquals(graph.outgoingVerticesOf(v3), v2);
      +        assertVertexSetEquals(graph.incomingVerticesOf(v3), v2);
      +
      +        assertVertexSetEquals(graph.outgoingVerticesOf(v4), v5);
      +        assertVertexSetEquals(graph.incomingVerticesOf(v4), v2);
      +
      +        assertVertexSetEquals(graph.outgoingVerticesOf(v5));
      +        assertVertexSetEquals(graph.incomingVerticesOf(v5), v4);
      +    }
      +
      +    @Test
      +         public void testRemoveSingletonOrphanVertices() throws Exception {
      +        // all vertices in graph are connected
      +        final List kept = new LinkedList(graph.vertexSet());
      +        final SeqVertex rm1 = new SeqVertex("CAGT");
      +        final SeqVertex rm2 = new SeqVertex("AGTC");
      +        graph.addVertices(rm1, rm2);
      +        Assert.assertEquals(graph.vertexSet().size(), kept.size() + 2);
      +        final BaseEdge rm12e = new BaseEdge(false, 1);
      +        graph.addEdge(rm1, rm2, rm12e);
      +
      +        final SeqGraph original = (SeqGraph)graph.clone();
      +        graph.removeSingletonOrphanVertices();
      +        Assert.assertTrue(BaseGraph.graphEquals(original, graph), "Graph with disconnected component but edges between components shouldn't be modified");
      +
      +        graph.removeEdge(rm12e); // now we should be able to remove rm1 and rm2
      +        graph.removeSingletonOrphanVertices();
      +        Assert.assertTrue(graph.vertexSet().containsAll(kept));
      +        Assert.assertFalse(graph.containsVertex(rm1));
      +        Assert.assertFalse(graph.containsVertex(rm2));
      +    }
      +
      +    @Test
      +    public void testRemovePathsNotConnectedToRef() throws Exception {
      +        final SeqGraph graph = new SeqGraph();
      +
      +        SeqVertex src = new SeqVertex("A");
      +        SeqVertex end = new SeqVertex("A");
      +        SeqVertex g1 = new SeqVertex("C");
      +        SeqVertex g2 = new SeqVertex("G");
      +        SeqVertex g3 = new SeqVertex("T");
      +        SeqVertex g4 = new SeqVertex("AA");
      +        SeqVertex g5 = new SeqVertex("AA");
      +        SeqVertex g6 = new SeqVertex("AA");
      +        SeqVertex g8 = new SeqVertex("AA");
      +        SeqVertex g7 = new SeqVertex("AA");
      +        SeqVertex b1 = new SeqVertex("CC");
      +        SeqVertex b2 = new SeqVertex("GG");
      +        SeqVertex b3 = new SeqVertex("TT");
      +        SeqVertex b4 = new SeqVertex("AAA");
      +        SeqVertex b5 = new SeqVertex("CCC");
      +        SeqVertex b6 = new SeqVertex("GGG");
      +        SeqVertex b7 = new SeqVertex("AAAA");
      +        SeqVertex b8 = new SeqVertex("GGGG");
      +        SeqVertex b9 = new SeqVertex("CCCC");
      +
      +        graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8);
      +        graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end);
      +        graph.addEdges(src, g1, g5, g6, g7, end);
      +        graph.addEdges(src, g1, g5, g8, g7, end);
      +        graph.addEdges(src, g1, g3, end);
      +
      +        // the current state of the graph is the good one
      +        final SeqGraph good = (SeqGraph)graph.clone();
      +
      +        // now add the bads to the graph
      +        graph.addVertices(b1, b2, b3, b4, b5, b6, b7, b8, b9);
      +        graph.addEdges(src, b1); // source -> b1 is dead
      +        graph.addEdges(b6, src); // x -> source is bad
      +        graph.addEdges(g4, b2); // off random vertex is bad
      +        graph.addEdges(g3, b3, b4); // two vertices that don't connect to end are bad
      +        graph.addEdges(end, b5); // vertex off end is bad
      +        graph.addEdges(g3, b7, b8, b7); // cycle is bad
      +        graph.addEdges(g3, b9, b9); // self-cycle is bad
      +
      +        final boolean debug = false;
      +        if ( debug ) good.printGraph(new File("expected.dot"), 0);
      +        if ( debug ) graph.printGraph(new File("bad.dot"), 0);
      +        graph.removePathsNotConnectedToRef();
      +        if ( debug ) graph.printGraph(new File("actual.dot"), 0);
      +
      +        Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes");
      +    }
      +
      +    @Test
      +    public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception {
      +        final SeqGraph graph = new SeqGraph();
      +
      +        SeqVertex src = new SeqVertex("A");
      +        SeqVertex end = new SeqVertex("A");
      +        SeqVertex g1 = new SeqVertex("C");
      +        SeqVertex g2 = new SeqVertex("G");
      +        SeqVertex g3 = new SeqVertex("T");
      +        SeqVertex g4 = new SeqVertex("AA");
      +        SeqVertex g5 = new SeqVertex("AA");
      +        SeqVertex g6 = new SeqVertex("AA");
      +        SeqVertex g8 = new SeqVertex("AA");
      +        SeqVertex g7 = new SeqVertex("AA");
      +        SeqVertex gPrev = new SeqVertex("AA");
      +        SeqVertex gPrev1 = new SeqVertex("AA");
      +        SeqVertex gPrev2 = new SeqVertex("AA");
      +        SeqVertex gAfter = new SeqVertex("AA");
      +        SeqVertex gAfter1 = new SeqVertex("AA");
      +        SeqVertex gAfter2 = new SeqVertex("AA");
      +        SeqVertex b1 = new SeqVertex("CC");
      +        SeqVertex b2 = new SeqVertex("GG");
      +        SeqVertex b3 = new SeqVertex("TT");
      +        SeqVertex b4 = new SeqVertex("AAA");
      +        SeqVertex b5 = new SeqVertex("CCC");
      +        SeqVertex b6 = new SeqVertex("GGG");
      +
      +        graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2);
      +        graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end);
      +        graph.addEdges(src, g1, g5, g6, g7, end);
      +        graph.addEdges(src, g1, g5, g8, g7, end);
      +        graph.addEdges(src, g1, g3, end);
      +
      +        // these should be kept, but are in the wrong direction
      +        graph.addEdges(gPrev, src);
      +        graph.addEdges(gPrev1, gPrev2, src);
      +        graph.addEdges(end, gAfter);
      +        graph.addEdges(end, gAfter1, gAfter2);
      +
      +        // the current state of the graph is the good one
      +        final SeqGraph good = (SeqGraph)graph.clone();
      +
      +        // now add the bads to the graph
      +        graph.addVertices(b1, b2, b3, b4, b5, b6);
      +        graph.addEdges(b2, b3); // b2 -> b3
      +        graph.addEdges(b4, b5, b4); // cycle
      +        graph.addEdges(b6, b6); // isolated self cycle
      +
      +        final boolean debug = false;
      +        if ( debug ) good.printGraph(new File("expected.dot"), 0);
      +        if ( debug ) graph.printGraph(new File("bad.dot"), 0);
      +        graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
      +        if ( debug ) graph.printGraph(new File("actual.dot"), 0);
      +
      +        Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes");
      +    }
      +
      +    @Test
      +    public void testPrintEmptyGraph() throws Exception {
      +        final File tmp = File.createTempFile("tmp", "dot");
      +        tmp.deleteOnExit();
      +        new SeqGraph().printGraph(tmp, 10);
      +        new DeBruijnGraph().printGraph(tmp, 10);
      +    }
      +
      +    @Test
      +    public void testComplexGraph() throws Exception {
      +        final File tmp = File.createTempFile("tmp", "dot");
      +        tmp.deleteOnExit();
      +        graph.printGraph(tmp, 10);
      +    }
      +
      +    private void assertVertexSetEquals(final Collection actual, final SeqVertex ... expected) {
      +        final Set actualSet = new HashSet(actual);
      +        Assert.assertEquals(actualSet.size(), actual.size(), "Duplicate elements found in vertex list");
      +        final Set expectedSet = expected == null ? Collections.emptySet() : new HashSet(Arrays.asList(expected));
      +        Assert.assertEquals(actualSet, expectedSet);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testPruneGraph() {
      +        DeBruijnGraph graph = new DeBruijnGraph();
      +        DeBruijnGraph expectedGraph = new DeBruijnGraph();
      +
      +        DeBruijnVertex v = new DeBruijnVertex("ATGG");
      +        DeBruijnVertex v2 = new DeBruijnVertex("ATGGA");
      +        DeBruijnVertex v3 = new DeBruijnVertex("ATGGT");
      +        DeBruijnVertex v4 = new DeBruijnVertex("ATGGG");
      +        DeBruijnVertex v5 = new DeBruijnVertex("ATGGC");
      +        DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC");
      +
      +        graph.addVertex(v);
      +        graph.addVertex(v2);
      +        graph.addVertex(v3);
      +        graph.addVertex(v4);
      +        graph.addVertex(v5);
      +        graph.addVertex(v6);
      +        graph.addEdge(v, v2, new BaseEdge(false, 1));
      +        graph.addEdge(v2, v3, new BaseEdge(false, 3));
      +        graph.addEdge(v3, v4, new BaseEdge(false, 5));
      +        graph.addEdge(v4, v5, new BaseEdge(false, 3));
      +        graph.addEdge(v5, v6, new BaseEdge(false, 2));
      +
      +        expectedGraph.addVertex(v2);
      +        expectedGraph.addVertex(v3);
      +        expectedGraph.addVertex(v4);
      +        expectedGraph.addVertex(v5);
      +        expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
      +        expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
      +        expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));
      +
      +        graph.pruneGraph(2);
      +
      +        Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
      +
      +        graph = new DeBruijnGraph();
      +        expectedGraph = new DeBruijnGraph();
      +
      +        graph.addVertex(v);
      +        graph.addVertex(v2);
      +        graph.addVertex(v3);
      +        graph.addVertex(v4);
      +        graph.addVertex(v5);
      +        graph.addVertex(v6);
      +        graph.addEdge(v, v2, new BaseEdge(true, 1));
      +        graph.addEdge(v2, v3, new BaseEdge(false, 3));
      +        graph.addEdge(v3, v4, new BaseEdge(false, 5));
      +        graph.addEdge(v4, v5, new BaseEdge(false, 3));
      +
      +        expectedGraph.addVertex(v);
      +        expectedGraph.addVertex(v2);
      +        expectedGraph.addVertex(v3);
      +        expectedGraph.addVertex(v4);
      +        expectedGraph.addVertex(v5);
      +        expectedGraph.addEdge(v, v2, new BaseEdge(true, 1));
      +        expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3));
      +        expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5));
      +        expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3));
      +
      +        graph.pruneGraph(2);
      +
      +        Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph));
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java
      new file mode 100644
      index 000000000..859892e33
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java
      @@ -0,0 +1,92 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
+*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.Test;
      +
      +public class BaseVertexUnitTest extends BaseTest {
      +    @Test
      +    public void testBasic() {
      +        final byte[] bases = "ACT".getBytes();
      +        final BaseVertex v = new BaseVertex(bases);
      +        Assert.assertEquals(v.getSequence(), bases);
      +        Assert.assertEquals(v.getAdditionalSequence(false), bases);
      +        Assert.assertEquals(v.getAdditionalSequence(true), bases);
      +        Assert.assertEquals(v.getSequenceString(), new String(bases));
      +        Assert.assertEquals(v.toString(), v.getSequenceString());
      +        Assert.assertEquals(v.length(), bases.length);
      +    }
      +
      +    @Test(expectedExceptions = IllegalArgumentException.class)
      +    public void testCreationNull() {
      +        new BaseVertex((byte[])null);
      +    }
      +
      +    @Test()
      +    public void testCreationEmptySeq() {
      +        final BaseVertex v = new BaseVertex(new byte[0]);
      +        Assert.assertTrue(v.isEmpty(), "Version with length == 0 should be empty");
      +    }
      +
      +    @Test
      +    public void testEqualsAndHashCode() {
      +        final BaseVertex v1 = new BaseVertex("ACT".getBytes());
      +        final BaseVertex v1_eq = new BaseVertex("ACT".getBytes());
      +        final BaseVertex v2 = new BaseVertex("ACG".getBytes());
      +
      +        Assert.assertEquals(v1, v1);
      +        Assert.assertEquals(v1.hashCode(), v1.hashCode());
      +        Assert.assertEquals(v1, v1_eq);
      +        Assert.assertEquals(v1.hashCode(), v1_eq.hashCode());
      +        Assert.assertFalse(v1.equals(v2));
      +        Assert.assertFalse(v2.equals(v1));
      +        Assert.assertFalse(v2.hashCode() == v1.hashCode());
      +        Assert.assertFalse(v2.equals(v1));
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java
      new file mode 100644
      index 000000000..8682ae5e4
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java
      @@ -0,0 +1,169 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
+*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.util.*;
      +
      +public class CommonSuffixMergerUnitTest extends BaseTest {
      +    private final static boolean PRINT_GRAPHS = true; // when true, dump .dot graphs for inspection on assertion failure
      +
      +    @DataProvider(name = "CompleteCycleData")
      +    public Object[][] makeCompleteCycleData() {
      +        return makeSplitMergeData(-1); // -1 == no cap on the number of generated cases
      +    }
      +
      +    // Bundle of a graph, the merge target vertex v, and the suffix shared by v's incoming vertices.
      +    public static class SplitMergeData {
      +        final SeqGraph graph;
      +        final SeqVertex v;
      +        final String commonSuffix;
      +
      +        public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) {
      +            this.graph = graph;
      +            this.v = v;
      +            this.commonSuffix = commonSuffix;
      +        }
      +
      +        @Override
      +        public String toString() {
      +            return "SplitMergeData{" +
      +                    "graph=" + graph +
      +                    ", v=" + v +
      +                    ", commonSuffix='" + commonSuffix + '\'' +
      +                    '}';
      +        }
      +    }
      +
      +    // Enumerates graphs shaped tops -> mids -> v -> bots where every mid vertex ends with commonSuffix.
      +    public static Object[][] makeSplitMergeData(final int maxTests) {
      +        final List<Object[]> tests = new ArrayList<Object[]>();
      +
      +        final List<String> bases = Arrays.asList("A", "C", "G", "T");
      +        for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) {
      +            for ( final int nBots : Arrays.asList(0, 1, 2) ) {
      +                for ( final int nMids : Arrays.asList(1, 2, 3) ) {
      +                    for ( int nTops = 0; nTops < nMids; nTops++ ) {
      +                        for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) {
      +                            int multi = 1; // unique multiplicity per edge so edges stay distinguishable
      +                            final SeqGraph graph = new SeqGraph();
      +                            final SeqVertex v = new SeqVertex("GGGG");
      +                            graph.addVertex(v);
      +
      +                            final LinkedList<SeqVertex> tops = new LinkedList<SeqVertex>();
      +                            final LinkedList<SeqVertex> mids = new LinkedList<SeqVertex>();
      +
      +                            for ( int i = 0; i < nMids; i++) {
      +                                final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix);
      +                                graph.addVertex(mid);
      +                                graph.addEdge(mid, v, new BaseEdge(i == 0, multi++));
      +                                mids.add(mid);
      +
      +                                tops.add(new SeqVertex(bases.get(i)));
      +                            }
      +
      +                            graph.addVertices(tops);
      +                            for ( final SeqVertex t : tops ) {
      +                                for ( int i = 0; i < nTopConnections; i++ ) {
      +                                    graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++));
      +                                }
      +                            }
      +
      +                            for ( int i = 0; i < nBots; i++ ) {
      +                                final SeqVertex bot = new SeqVertex(bases.get(i));
      +                                graph.addVertex(bot);
      +                                graph.addEdge(v, bot, new BaseEdge(i == 0, multi++));
      +
      +                            }
      +
      +                            tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)});
      +                        }
      +                    }
      +                }
      +            }
      +        }
      +
      +        final List<Object[]> toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests));
      +        return toUse.toArray(new Object[][]{});
      +    }
      +
      +    // Asserts every haplotype of actual also occurs in original; when path counts match, paths must align pairwise.
      +    public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) {
      +        try {
      +            final Set<String> haplotypes = new HashSet<String>();
      +            final List<Path> originalPaths = new KBestPaths().getKBestPaths(original);
      +            for ( final Path path : originalPaths )
      +                haplotypes.add(new String(path.getBases()));
      +
      +            final List<Path> splitPaths = new KBestPaths().getKBestPaths(actual);
      +            for ( final Path path : splitPaths ) {
      +                final String h = new String(path.getBases());
      +                Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
      +            }
      +
      +            if ( splitPaths.size() == originalPaths.size() ) {
      +                for ( int i = 0; i < originalPaths.size(); i++ ) {
      +                    Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
      +                }
      +            }
      +        } catch ( AssertionError e ) {
      +            if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.%d.original.dot", name, actual.vertexSet().size())), 0);
      +            if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.%d.actual.dot", name, actual.vertexSet().size())), 0);
      +            throw e;
      +        }
      +    }
      +
      +    @Test(dataProvider = "CompleteCycleData")
      +    public void testMerging(final SplitMergeData data) {
      +        final SeqGraph original = (SeqGraph)data.graph.clone();
      +        final SharedSequenceMerger splitter = new SharedSequenceMerger();
      +        splitter.merge(data.graph, data.v);
      +        assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java
      new file mode 100644
      index 000000000..1ed20e5f4
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java
      @@ -0,0 +1,170 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.util.Arrays;
      +
      +public class CommonSuffixSplitterUnitTest extends BaseTest {
      +    private final static boolean DEBUG = false;
      +
      +    @DataProvider(name = "SplitData")
      +    public Object[][] makeSplitData() {
      +        return CommonSuffixMergerUnitTest.makeSplitMergeData(-1);
      +    }
      +
      +    @Test(dataProvider = "SplitData", enabled = !DEBUG)
      +    public void testSplit(final CommonSuffixMergerUnitTest.SplitMergeData data) {
      +        final boolean expectedMerge = ! data.commonSuffix.isEmpty() && data.graph.inDegreeOf(data.v) > 1;
      +
      +        final SeqGraph original = (SeqGraph)data.graph.clone();
      +//        original.printGraph(new File("original.dot"), 0);
      +        final CommonSuffixSplitter splitter = new CommonSuffixSplitter();
      +        final boolean succeed = splitter.split(data.graph, data.v);
      +//        data.graph.printGraph(new File("actual.dot"), 0);
      +        Assert.assertEquals(succeed, expectedMerge, "Not excepted merge success/fail result");
      +        if ( succeed ) {
      +            Assert.assertEquals(data.graph.incomingVerticesOf(data.v).iterator().next().getSequenceString(), data.commonSuffix, "Common suffix not computed correctly");
      +        }
      +
      +        CommonSuffixMergerUnitTest.assertSameHaplotypes(String.format("suffixSplit.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original);
      +    }
      +
      +    @Test(enabled = !DEBUG)
      +    public void testSplitPrevHaveMultipleEdges() {
      +        final SeqGraph original = new SeqGraph();
      +        final SeqVertex v1 = new SeqVertex("A");
      +        final SeqVertex v2 = new SeqVertex("A");
      +        final SeqVertex v3 = new SeqVertex("A");
      +        final SeqVertex v4 = new SeqVertex("A");
      +
      +        original.addVertices(v1, v2, v3, v4);
      +        original.addEdges(v1, v3);
      +
      +        Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with only one vertex");
      +
      +        original.addEdges(v2, v3);
      +        original.addEdges(v2, v4);
      +
      +        Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with multiple outgoing edges from middle nodes");
      +    }
      +
      +    @Test(enabled = !DEBUG)
      +    public void testSplitNoCycles() {
      +        final SeqGraph original = new SeqGraph();
      +        final SeqVertex v1 = new SeqVertex("A");
      +        final SeqVertex v2 = new SeqVertex("AC");
      +        final SeqVertex v3 = new SeqVertex("TC");
      +        final SeqVertex v4 = new SeqVertex("G");
      +
      +        original.addVertices(v1, v2, v3, v4);
      +        original.addEdges(v1, v3, v4);
      +        original.addEdges(v1, v2, v4);
      +
      +        Assert.assertTrue(new CommonSuffixSplitter().split((SeqGraph)original.clone(), v4), "Should be able to split pre-cycle graph");
      +
      +        original.addEdges(v4, v4);
      +        Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list");
      +    }
      +
      +    @Test(timeOut = 10000, enabled = !DEBUG)
      +    public void testSplitComplexCycle() {
      +        final SeqGraph original = new SeqGraph();
      +        final SeqVertex r1 = new SeqVertex("ACTG");
      +        final SeqVertex r2 = new SeqVertex("ATGC");
      +        final SeqVertex cat1 = new SeqVertex("CAT");
      +        final SeqVertex cat2 = new SeqVertex("CAT");
      +        final SeqVertex c1 = new SeqVertex("C");
      +        final SeqVertex c2 = new SeqVertex("C");
      +
      +        original.addVertices(r1, r2, cat1, cat2, c1, c2);
      +        original.addEdges(r1, cat1, c1, cat2, c1);
      +        original.addEdges(r2, c2, cat2);
      +
      +        //original.printGraph(new File("testSplitComplexCycle.dot"), 0);
      +
      +        for ( final SeqVertex v : Arrays.asList(cat2) ) { // original.vertexSet() ) {
      +            final SeqGraph graph = (SeqGraph)original.clone();
      +            final boolean success = new CommonSuffixSplitter().split(graph, v);
      +            if ( success ) graph.printGraph(new File("testSplitComplexCycle.fail.dot"), 0);
      +            Assert.assertFalse(success, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v);
      +        }
      +    }
      +
      +    @Test(timeOut = 10000)
      +    public void testSplitInfiniteCycleFailure() {
      +        final SeqGraph original = new SeqGraph();
      +        final SeqVertex v1 = new SeqVertex("GC");
      +        final SeqVertex v2 = new SeqVertex("X");
      +        final SeqVertex v3 = new SeqVertex("N");
      +        final SeqVertex v4 = new SeqVertex("C");
      +
      +        original.addVertices(v1, v2, v3, v4);
      +        original.addEdge(v1, v2, new BaseEdge(false, 12));
      +        original.addEdge(v2, v3, new BaseEdge(false, 23));
      +        original.addEdge(v3, v4, new BaseEdge(false, 34));
      +        original.addEdge(v4, v2, new BaseEdge(false, 42));
      +
      +        original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0);
      +
      +        final SeqGraph graph = (SeqGraph)original.clone();
      +        final boolean success = new CommonSuffixSplitter().split(graph, v2);
      +        Assert.assertTrue(success);
      +
      +        for ( final SeqVertex v : graph.vertexSet() ) {
      +            graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0);
      +            final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v);
      +            if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0);
      +            Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v);
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java
      new file mode 100644
      index 000000000..bdc8ab36d
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java
      @@ -0,0 +1,68 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.annotations.Test;
      +import org.testng.Assert;
      +
      +public class DeBruijnVertexUnitTest extends BaseTest {
      +    @Test
      +    public void testBasic() {
      +        // Sequence/suffix accessors of a 3-mer vertex must agree with the bytes it was built from.
      +        final String seq = "ACT";
      +        final DeBruijnVertex vertex = new DeBruijnVertex(seq.getBytes());
      +        Assert.assertEquals(vertex.getSequenceString(), seq);
      +        Assert.assertEquals(vertex.getSequence(), seq.getBytes());
      +        Assert.assertEquals(vertex.length(), seq.length());
      +        Assert.assertEquals(vertex.getSuffixString(), "T");
      +        Assert.assertEquals(vertex.getSuffix(), (byte)'T');
      +        Assert.assertEquals(vertex.getAdditionalSequence(true), seq.getBytes());
      +        final byte[] suffixOnly = vertex.getAdditionalSequence(false);
      +        Assert.assertEquals(suffixOnly, new byte[]{(byte)'T'});
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java
      new file mode 100644
      index 000000000..d1bae74b2
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java
      @@ -0,0 +1,495 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import net.sf.samtools.Cigar;
      +import net.sf.samtools.CigarElement;
      +import net.sf.samtools.CigarOperator;
      +import net.sf.samtools.TextCigarCodec;
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.util.*;
      +
      +/**
      + * Created with IntelliJ IDEA.
      + * User: rpoplin
      + * Date: 1/31/13
      + */
      +
      +public class KBestPathsUnitTest extends BaseTest {
      +    private final static boolean DEBUG = false;
      +
      +    @DataProvider(name = "BasicPathFindingData")
      +    public Object[][] makeBasicPathFindingData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
      +        for ( final boolean allowCycles : Arrays.asList(false, true)) {
      +            for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) {
      +                for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) {
      +                    for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) {
      +                        for ( final boolean addCycle : Arrays.asList(true, false) ) {
      +                            tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes, addCycle, allowCycles});
      +                        }
      +                    }
      +                }
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    private static int weight = 1;
+    final Set<SeqVertex> createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) {
+        final List<String> seqs = Arrays.asList("A", "C", "G", "T");
+        final Set<SeqVertex> vertices = new LinkedHashSet<SeqVertex>();
      +        for ( int i = 0; i < n; i++ ) {
      +            final SeqVertex v = new SeqVertex(seqs.get(i));
      +            graph.addVertex(v);
      +            vertices.add(v);
      +            if ( source != null ) graph.addEdge(source, v, new BaseEdge(false, weight++));
      +            if ( target != null ) graph.addEdge(v, target, new BaseEdge(false, weight++));
      +        }
      +        return vertices;
      +    }
      +
      +    @Test(dataProvider = "BasicPathFindingData", enabled = !DEBUG)
      +    public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle, final boolean allowCycles) {
      +        SeqGraph graph = new SeqGraph();
      +
      +        final SeqVertex middleTop = new SeqVertex("GTAC");
      +        final SeqVertex middleBottom = new SeqVertex("ACTG");
      +        graph.addVertices(middleTop, middleBottom);
+        final Set<SeqVertex> starts = createVertices(graph, nStartNodes, null, middleTop);
+        final Set<SeqVertex> bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom);
+        final Set<SeqVertex> ends = createVertices(graph, nEndNodes, middleBottom, null);
      +
      +        if ( addCycle ) graph.addEdge(middleBottom, middleBottom);
      +
      +        // enumerate all possible paths
+        final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>(allowCycles).getKBestPaths(graph, starts, ends);
      +
      +        final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle && allowCycles ? 2 : 1) * nEndNodes;
      +        Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths");
      +
      +        int lastScore = Integer.MAX_VALUE;
+        for ( final Path<SeqVertex,BaseEdge> path : paths ) {
      +            Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order.   Path " + path + " has score above previous " + lastScore);
      +            lastScore = path.getScore();
      +        }
      +
      +        // get the best path, and make sure it's the same as our optimal path overall
+        final Path<SeqVertex,BaseEdge> best = paths.get(0);
+        final List<Path<SeqVertex,BaseEdge>> justOne = new KBestPaths<SeqVertex,BaseEdge>(allowCycles).getKBestPaths(graph, 1, starts, ends);
      +        Assert.assertEquals(justOne.size(), 1);
      +        Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0));
      +    }
      +
      +    @Test(enabled = !DEBUG)
      +    public void testPathFindingComplexCycle() {
      +        SeqGraph graph = new SeqGraph();
      +
      +        final SeqVertex v1 = new SeqVertex("A");
      +        final SeqVertex v2 = new SeqVertex("C");
      +        final SeqVertex v3 = new SeqVertex("G");
      +        final SeqVertex v4 = new SeqVertex("T");
      +        final SeqVertex v5 = new SeqVertex("AA");
      +        graph.addVertices(v1, v2, v3, v4, v5);
      +        graph.addEdges(v1, v2, v3, v4, v5);
      +        graph.addEdges(v3, v3);
      +        graph.addEdges(v4, v2);
      +
      +        // enumerate all possible paths
+        final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>(false).getKBestPaths(graph, v1, v5);
      +
      +        Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths");
      +    }
      +
      +    @Test(enabled = !DEBUG)
      +    public void testPathFindingCycleLastNode() {
      +        SeqGraph graph = new SeqGraph();
      +
      +        final SeqVertex v1 = new SeqVertex("A");
      +        final SeqVertex v2 = new SeqVertex("C");
      +        final SeqVertex v3 = new SeqVertex("G");
      +        graph.addVertices(v1, v2, v3);
      +        graph.addEdges(v1, v2, v3, v3);
      +
      +        // enumerate all possible paths
+        final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>(false).getKBestPaths(graph, v1, v3);
      +
      +        Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths");
      +    }
      +
      +    @DataProvider(name = "BasicBubbleDataProvider")
      +    public Object[][] makeBasicBubbleDataProvider() {
+        List<Object[]> tests = new ArrayList<Object[]>();
      +        for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
      +            for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
      +                tests.add(new Object[]{refBubbleLength, altBubbleLength});
      +            }
      +        }
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "BasicBubbleDataProvider", enabled = !DEBUG)
      +    public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) {
      +        // Construct the assembly graph
      +        SeqGraph graph = new SeqGraph(3);
      +        final String preRef = "ATGG";
      +        final String postRef = "GGGGC";
      +
      +        SeqVertex v = new SeqVertex(preRef);
      +        SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength));
      +        SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T");
      +        SeqVertex v3 = new SeqVertex(postRef);
      +
      +        graph.addVertex(v);
      +        graph.addVertex(v2Ref);
      +        graph.addVertex(v2Alt);
      +        graph.addVertex(v3);
      +        graph.addEdge(v, v2Ref, new BaseEdge(true, 10));
      +        graph.addEdge(v2Ref, v3, new BaseEdge(true, 10));
      +        graph.addEdge(v, v2Alt, new BaseEdge(false, 5));
      +        graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
      +
      +        // Construct the test path
+        Path<SeqVertex,BaseEdge> path = new Path<SeqVertex,BaseEdge>(v, graph);
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v, v2Alt));
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v2Alt, v3));
      +
      +        // Construct the actual cigar string implied by the test path
      +        Cigar expectedCigar = new Cigar();
      +        expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
      +        if( refBubbleLength > altBubbleLength ) {
      +            expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
      +            expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M));
      +        } else if ( refBubbleLength < altBubbleLength ) {
      +            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      +            expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
      +        } else {
      +            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      +        }
      +        expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
      +
      +        Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
      +    }
      +
      +    @DataProvider(name = "GetBasesData")
      +    public Object[][] makeGetBasesData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
      +
+        final List<String> frags = Arrays.asList("ACT", "GAC", "CAT");
+
+        for ( int n = 1; n <= frags.size(); n++ ) {
+            for ( final List<String> comb : Utils.makePermutations(frags, n, false) ) {
      +                tests.add(new Object[]{comb});
      +            }
      +        }
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "GetBasesData", enabled = !DEBUG)
+    public void testGetBases(final List<String> frags) {
      +        // Construct the assembly graph
      +        SeqGraph graph = new SeqGraph(3);
      +
      +        SeqVertex prev = null;
      +        for ( int i = 0; i < frags.size(); i++ ) {
      +            SeqVertex v = new SeqVertex(frags.get(i));
      +            graph.addVertex(v);
      +            if ( prev != null )
      +                graph.addEdge(prev, v);
      +            prev = v;
      +        }
      +
      +        // enumerate all possible paths
+        final List<Path<SeqVertex,BaseEdge>> paths = new KBestPaths<SeqVertex,BaseEdge>().getKBestPaths(graph);
+        Assert.assertEquals(paths.size(), 1);
+        final Path<SeqVertex,BaseEdge> path = paths.get(0);
      +        Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence");
      +    }
      +
      +    @DataProvider(name = "TripleBubbleDataProvider")
      +    public Object[][] makeTripleBubbleDataProvider() {
+        List<Object[]> tests = new ArrayList<Object[]>();
      +        for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
      +            for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
      +                for ( final boolean offRefEnding : Arrays.asList(true, false) ) {
      +                    for ( final boolean offRefBeginning : Arrays.asList(false) ) {
      +                        tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding});
      +                    }
      +                }
      +            }
      +        }
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "TripleBubbleDataProvider", enabled = !DEBUG)
      +    public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) {
      +        // Construct the assembly graph
      +        SeqGraph graph = new SeqGraph();
      +        final String preAltOption = "ATCGATCGATCGATCGATCG";
      +        final String postAltOption = "CCCC";
      +        final String preRef = "ATGG";
      +        final String postRef = "GGCCG";
      +        final String midRef1 = "TTCCT";
      +        final String midRef2 = "CCCAAAAAAAAAAAA";
      +
      +        SeqVertex preV = new SeqVertex(preAltOption);
      +        SeqVertex v = new SeqVertex(preRef);
      +        SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength));
      +        SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T");
      +        SeqVertex v4Ref = new SeqVertex(Utils.dupString('C', refBubbleLength));
      +        SeqVertex v4Alt = new SeqVertex(Utils.dupString('C', altBubbleLength-1) + "T");
      +        SeqVertex v6Ref = new SeqVertex(Utils.dupString('G', refBubbleLength));
      +        SeqVertex v6Alt = new SeqVertex(Utils.dupString('G', altBubbleLength-1) + "T");
      +        SeqVertex v3 = new SeqVertex(midRef1);
      +        SeqVertex v5 = new SeqVertex(midRef2);
      +        SeqVertex v7 = new SeqVertex(postRef);
      +        SeqVertex postV = new SeqVertex(postAltOption);
      +
      +        graph.addVertex(preV);
      +        graph.addVertex(v);
      +        graph.addVertex(v2Ref);
      +        graph.addVertex(v2Alt);
      +        graph.addVertex(v3);
      +        graph.addVertex(v4Ref);
      +        graph.addVertex(v4Alt);
      +        graph.addVertex(v5);
      +        graph.addVertex(v6Ref);
      +        graph.addVertex(v6Alt);
      +        graph.addVertex(v7);
      +        graph.addVertex(postV);
      +        graph.addEdge(preV, v, new BaseEdge(false, 1));
      +        graph.addEdge(v, v2Ref, new BaseEdge(true, 10));
      +        graph.addEdge(v2Ref, v3, new BaseEdge(true, 10));
      +        graph.addEdge(v, v2Alt, new BaseEdge(false, 5));
      +        graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
      +        graph.addEdge(v3, v4Ref, new BaseEdge(true, 10));
      +        graph.addEdge(v4Ref, v5, new BaseEdge(true, 10));
      +        graph.addEdge(v3, v4Alt, new BaseEdge(false, 5));
      +        graph.addEdge(v4Alt, v5, new BaseEdge(false, 5));
      +        graph.addEdge(v5, v6Ref, new BaseEdge(true, 11));
      +        graph.addEdge(v6Ref, v7, new BaseEdge(true, 11));
      +        graph.addEdge(v5, v6Alt, new BaseEdge(false, 55));
      +        graph.addEdge(v6Alt, v7, new BaseEdge(false, 55));
      +        graph.addEdge(v7, postV, new BaseEdge(false, 1));
      +
      +        // Construct the test path
+        Path<SeqVertex,BaseEdge> path = new Path<SeqVertex,BaseEdge>( (offRefBeginning ? preV : v), graph);
+        if( offRefBeginning ) {
+            path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(preV, v));
+        }
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v, v2Alt));
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v2Alt, v3));
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v3, v4Ref));
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v4Ref, v5));
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v5, v6Alt));
+        path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v6Alt, v7));
+        if( offRefEnding ) {
+            path = new Path<SeqVertex,BaseEdge>(path, graph.getEdge(v7,postV));
+        }
      +
      +        // Construct the actual cigar string implied by the test path
      +        Cigar expectedCigar = new Cigar();
      +        if( offRefBeginning ) {
      +            expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I));
      +        }
      +        expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
      +        // first bubble
      +        if( refBubbleLength > altBubbleLength ) {
      +            expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
      +            expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M));
      +        } else if ( refBubbleLength < altBubbleLength ) {
      +            expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M));
      +            expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
      +        } else {
      +            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      +        }
      +        expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M));
      +        // second bubble is ref path
      +        expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      +        expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M));
      +        // third bubble
      +        if( refBubbleLength > altBubbleLength ) {
      +            expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
      +            expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M));
      +        } else if ( refBubbleLength < altBubbleLength ) {
      +            expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M));
      +            expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I));
      +        } else {
      +            expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
      +        }
      +        expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
      +        if( offRefEnding ) {
      +            expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I));
      +        }
      +
      +        Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch");
      +    }
      +
      +    @Test(enabled = !DEBUG)
      +    public void testIntraNodeInsertionDeletion() {
      +        // Construct the assembly graph
      +        SeqGraph graph = new SeqGraph();
      +        final SeqVertex top = new SeqVertex("T");
      +        final SeqVertex bot = new SeqVertex("T");
      +        final SeqVertex alt = new SeqVertex("AAACCCCC");
      +        final SeqVertex ref = new SeqVertex("CCCCCGGG");
      +
      +        graph.addVertices(top, bot, alt, ref);
      +        graph.addEdges(new BaseEdge(true, 1), top, ref, bot);
      +        graph.addEdges(new BaseEdge(false, 1), top, alt, bot);
      +
+        final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>();
+        final List<Path<SeqVertex,BaseEdge>> paths = pathFinder.getKBestPaths(graph, top, bot);
+
+        Assert.assertEquals(paths.size(), 2);
+
+        final Path<SeqVertex,BaseEdge> refPath = paths.get(0);
+        final Path<SeqVertex,BaseEdge> altPath = paths.get(1);
      +
      +        Assert.assertEquals(refPath.calculateCigar().toString(), "10M");
      +        Assert.assertEquals(altPath.calculateCigar().toString(), "1M3I5M3D1M");
      +    }
      +
      +    @Test(enabled = !DEBUG)
      +    public void testHardSWPath() {
      +        // Construct the assembly graph
      +        SeqGraph graph = new SeqGraph();
      +        final SeqVertex top = new SeqVertex( "NNN");
      +        final SeqVertex bot = new SeqVertex( "NNN");
      +        final SeqVertex alt = new SeqVertex(               "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" );
      +        final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" );
      +        graph.addVertices(top, bot, alt, ref);
      +        graph.addEdges(new BaseEdge(true, 1), top, ref, bot);
      +        graph.addEdges(new BaseEdge(false, 1), top, alt, bot);
      +
+        final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<SeqVertex,BaseEdge>();
+        final List<Path<SeqVertex,BaseEdge>> paths = pathFinder.getKBestPaths(graph, top, bot);
+
+        Assert.assertEquals(paths.size(), 2);
+
+        final Path<SeqVertex,BaseEdge> refPath = paths.get(0);
+        final Path<SeqVertex,BaseEdge> altPath = paths.get(1);
      +
      +        logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar());
      +        logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar());
      +
      +        Assert.assertEquals(refPath.calculateCigar().toString(), "51M");
      +        Assert.assertEquals(altPath.calculateCigar().toString(), "3M6I48M");
      +    }
      +
      +    // -----------------------------------------------------------------
      +    //
      +    // Systematic tests to ensure that we get the correct SW result for
      +    // a variety of variants in the ref vs alt bubble
      +    //
      +    // -----------------------------------------------------------------
      +
      +    @DataProvider(name = "SystematicRefAltSWTestData")
      +    public Object[][] makeSystematicRefAltSWTestData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
      +
+        final List<List<String>> allDiffs = Arrays.asList(
      +                Arrays.asList("G", "C", "1M"),
      +                Arrays.asList("G", "", "1D"),
      +                Arrays.asList("", "C", "1I"),
      +                Arrays.asList("AAA", "CGT", "3M"),
      +                Arrays.asList("TAT", "CAC", "3M"),
      +                Arrays.asList("GCTG", "GTCG", "4M"),
      +                Arrays.asList("AAAAA", "", "5D"),
      +                Arrays.asList("", "AAAAA", "5I"),
      +                Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I")
      +        );
      +
      +        for ( final String prefix : Arrays.asList("", "X", "XXXXXXXXXXXXX")) {
      +            for ( final String end : Arrays.asList("", "X", "XXXXXXXXXXXXX")) {
+                for ( final List<String> diffs : allDiffs )
      +                    tests.add(new Object[]{prefix, end, diffs.get(0), diffs.get(1), diffs.get(2)});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "SystematicRefAltSWTestData", enabled = !DEBUG)
      +    public void testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) {
      +        // Construct the assembly graph
      +        SeqGraph graph = new SeqGraph();
      +
      +        SeqVertex top = new SeqVertex("");
      +        SeqVertex ref = new SeqVertex(prefix + refMid + end);
      +        SeqVertex alt = new SeqVertex(prefix + altMid + end);
      +        SeqVertex bot = new SeqVertex("");
      +
      +        graph.addVertices(top, ref, alt, bot);
      +        graph.addEdges(new BaseEdge(true, 1), top, ref, bot);
      +        graph.addEdges(new BaseEdge(false, 1), top, alt, bot);
      +
      +        // Construct the test path
+        Path<SeqVertex,BaseEdge> path = Path.makePath(Arrays.asList(top, alt, bot), graph);
      +
      +        Cigar expected = new Cigar();
      +        if ( ! prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M));
      +        for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt);
      +        if ( ! end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M));
      +        expected = AlignmentUtils.consolidateCigar(expected);
      +
      +        final Cigar pathCigar = path.calculateCigar();
      +
      +        logger.warn("diffs: " + ref + " vs. " + alt + " cigar " + midCigar);
      +        logger.warn("Path " + path + " with cigar " + pathCigar);
      +        logger.warn("Expected cigar " + expected);
      +
      +        Assert.assertEquals(pathCigar, expected, "Cigar mismatch");
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java
      new file mode 100644
      index 000000000..bd2e3cc2c
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java
      @@ -0,0 +1,542 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.LinkedList;
      +import java.util.List;
      +
      +public class SeqGraphUnitTest extends BaseTest {
      +    private final static boolean DEBUG = false;
      +
+    private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
+        public byte[] sequence; // full haplotype sequence the kmers are drawn from
+        public int KMER_LENGTH; // kmer size used to build the de Bruijn graph
+
+        public MergeNodesWithNoVariationTestProvider(String seq, int kmer) {
+            super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq));
+            sequence = seq.getBytes();
+            KMER_LENGTH = kmer;
+        }
+
+        public SeqGraph calcGraph() { // builds the de Bruijn graph from adjacent kmer pairs, then simplifies it
+            final DeBruijnGraph deBruijnGraph = new DeBruijnGraph();
+            final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
+            for (int i = 0; i < kmersInSequence - 1; i++) {
+                // get the kmers
+                final byte[] kmer1 = new byte[KMER_LENGTH];
+                System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH);
+                final byte[] kmer2 = new byte[KMER_LENGTH];
+                System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); // kmer2 is kmer1 shifted right by one base
+
+                deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1); // non-ref edge with multiplicity 1
+            }
+            final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
+            seqGraph.simplifyGraph(); // with no variation the graph should collapse into a single vertex
+            return seqGraph;
+        }
+    }
      +
+    @DataProvider(name = "MergeNodesWithNoVariationTestProvider")
+    public Object[][] makeMergeNodesWithNoVariationTests() { // registers test cases via TestDataProvider side effects
+        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3); // same short sequence across a range of kmer sizes
+        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4);
+        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5);
+        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6);
+        new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7);
+        new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
+        new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
+        new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
+
+        return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class);
+    }
      +
+    @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG)
+    public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { // a variation-free sequence must collapse to one vertex
+        logger.warn(String.format("Test: %s", cfg.toString()));
+
+        final SeqGraph actual = cfg.calcGraph();
+        Assert.assertEquals(actual.vertexSet().size(), 1); // fully merged: exactly one vertex remains
+        final SeqVertex actualV = actual.vertexSet().iterator().next();
+        Assert.assertEquals(actualV.getSequence(), cfg.sequence); // and it carries the original sequence
+    }
      +
+    @DataProvider(name = "IsDiamondData")
+    public Object[][] makeIsDiamondData() throws Exception { // cases: {graph, vertex, expected isRootOfDiamond}
+        List tests = new ArrayList();
+
+        SeqGraph graph;
+        SeqVertex pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2;
+
+        graph = new SeqGraph();
+
+        pre1 = new SeqVertex("ACT");
+        pre2 = new SeqVertex("AGT");
+        top = new SeqVertex("A"); // root of the diamond
+        middle1 = new SeqVertex("CT");
+        middle2 = new SeqVertex("CG");
+        middle3 = new SeqVertex("CA");
+        bottom = new SeqVertex("AA"); // sink of the diamond
+        tail1 = new SeqVertex("GC");
+        tail2 = new SeqVertex("GC");
+
+        graph.addVertices(pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2);
+        graph.addEdges(pre1, top, middle1, bottom, tail1); // chain: pre1 -> top -> middle1 -> bottom -> tail1
+        graph.addEdges(pre2, top, middle2, bottom, tail1);
+        graph.addEdges(top, middle3, bottom); // third parallel middle branch
+        graph.addEdges(bottom, tail2);
+
+        for ( final SeqVertex no : Arrays.asList(pre1, pre2, middle1, middle2, middle3, bottom, tail1, tail2)) {
+            tests.add(new Object[]{graph, no, false}); // only top is a diamond root
+        }
+        tests.add(new Object[]{graph, top, true});
+
+        final SeqGraph danglingMiddleGraph = (SeqGraph)graph.clone(); // a middle child with no path to bottom breaks the diamond
+        final SeqVertex danglingMiddle = new SeqVertex("A");
+        danglingMiddleGraph.addVertex(danglingMiddle);
+        danglingMiddleGraph.addEdge(top, danglingMiddle);
+        tests.add(new Object[]{danglingMiddleGraph, top, false});
+
+        final SeqGraph strangerToBottom = (SeqGraph)graph.clone(); // bottom with an incoming edge from outside the diamond
+        final SeqVertex notAttachedToTop = new SeqVertex("A");
+        strangerToBottom.addVertex(notAttachedToTop);
+        strangerToBottom.addEdge(notAttachedToTop, bottom);
+        tests.add(new Object[]{strangerToBottom, top, false});
+
+        final SeqGraph strangerToMiddle = (SeqGraph)graph.clone(); // a middle with an incoming edge not from top
+        final SeqVertex attachedToMiddle = new SeqVertex("A");
+        strangerToMiddle.addVertex(attachedToMiddle);
+        strangerToMiddle.addEdge(attachedToMiddle, middle1);
+        tests.add(new Object[]{strangerToMiddle, top, false});
+
+        // middle1 has outgoing edge to non-bottom
+        final SeqGraph middleExtraOut = (SeqGraph)graph.clone();
+        final SeqVertex fromMiddle = new SeqVertex("A");
+        middleExtraOut.addVertex(fromMiddle);
+        middleExtraOut.addEdge(middle1, fromMiddle);
+        tests.add(new Object[]{middleExtraOut, top, false});
+
+        // top connects to bottom directly as well
+        {
+            final SeqGraph topConnectsToBottomToo = new SeqGraph();
+            final SeqVertex top2 = new SeqVertex("A");
+            final SeqVertex middle4 = new SeqVertex("C");
+            final SeqVertex bottom2 = new SeqVertex("G");
+            topConnectsToBottomToo.addVertices(top2, middle4, bottom2);
+            topConnectsToBottomToo.addEdges(top2, middle4, bottom2);
+            topConnectsToBottomToo.addEdges(top2, bottom2); // direct top -> bottom edge disqualifies the diamond
+            tests.add(new Object[]{topConnectsToBottomToo, top2, false});
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
      +
+    @Test(dataProvider = "IsDiamondData", enabled = !DEBUG)
+    public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) { // detection only, no mutation
+        final SeqGraph.MergeDiamonds merger = graph.new MergeDiamonds();
+        merger.setDontModifyGraphEvenIfPossible(); // check recognition without actually transforming the graph
+        Assert.assertEquals(merger.tryToTransform(v), isRootOfDiamond);
+    }
      +
+    @DataProvider(name = "MergingData")
+    public Object[][] makeMergingData() throws Exception { // cases: {input graph, expected graph after simplifyGraph(1)}
+        List tests = new ArrayList();
+
+        final SeqGraph graph = new SeqGraph(); // grown incrementally; cloned snapshots become test cases
+
+        SeqVertex pre1 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "CT");
+        SeqVertex pre2 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "GT");
+        SeqVertex top = new SeqVertex("A");
+        SeqVertex middle1 = new SeqVertex("GC");
+        SeqVertex middle2 = new SeqVertex("TC");
+        SeqVertex middle3 = new SeqVertex("AC");
+        SeqVertex middle4 = new SeqVertex("GCAC");
+        SeqVertex bottom = new SeqVertex("AA");
+        SeqVertex tail1 = new SeqVertex("GC");
+        SeqVertex tail2 = new SeqVertex("GC");
+
+        // just a single vertex
+        graph.addVertices(pre1);
+        tests.add(new Object[]{graph.clone(), graph.clone()}); // nothing to merge: graph is its own expectation
+
+        // pre1 -> top = pre1 + top
+        {
+            graph.addVertices(top);
+            graph.addEdges(pre1, top);
+            final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); // linear chain zips into one vertex
+            final SeqGraph expected = new SeqGraph();
+            expected.addVertex(pre1_top);
+            tests.add(new Object[]{graph.clone(), expected.clone()});
+        }
+
+        // pre1 -> top -> middle1 = pre1 + top + middle1
+        {
+            graph.addVertices(middle1);
+            graph.addEdges(top, middle1);
+            final SeqGraph expected = new SeqGraph();
+            final SeqVertex pre1_top_middle1 = new SeqVertex(pre1.getSequenceString() + top.getSequenceString() + middle1.getSequenceString());
+            expected.addVertex(pre1_top_middle1);
+            tests.add(new Object[]{graph.clone(), expected});
+        }
+
+        // pre1 -> top -> middle1 & top -> middle2 = pre1 + top -> middle1 & -> middle2
+        {
+            graph.addVertices(middle2);
+            graph.addEdges(top, middle2);
+            final SeqGraph expected = new SeqGraph();
+            final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); // branch point stops the zip
+            expected.addVertices(pre1_top, middle1, middle2);
+            expected.addEdges(pre1_top, middle1);
+            expected.addEdges(pre1_top, middle2);
+            tests.add(new Object[]{graph.clone(), expected});
+        }
+
+        // An actual diamond event to merge!
+        {
+            graph.addVertices(bottom);
+            graph.addEdges(middle1, bottom);
+            graph.addEdges(middle2, bottom);
+            final SeqGraph expected = new SeqGraph();
+            final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString());
+            final SeqVertex newMiddle1 = new SeqVertex("G"); // common suffix "C" of GC/TC is pushed down into bottom
+            final SeqVertex newMiddle2 = new SeqVertex("T");
+            final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString());
+            expected.addVertices(pre1_top, newMiddle1, newMiddle2, newBottom);
+            expected.addEdges(pre1_top, newMiddle1, newBottom);
+            expected.addEdges(pre1_top, newMiddle2, newBottom);
+            tests.add(new Object[]{graph.clone(), expected.clone()});
+
+            graph.addVertices(middle3);
+            graph.addEdges(top, middle3, bottom); // third branch "AC" also loses the shared "C"
+            final SeqVertex newMiddle3 = new SeqVertex("A");
+            expected.addVertices(newMiddle3);
+            expected.addEdges(pre1_top, newMiddle3, newBottom);
+            tests.add(new Object[]{graph.clone(), expected.clone()});
+
+            graph.addVertices(middle4);
+            graph.addEdges(top, middle4, bottom); // longer branch "GCAC" keeps its non-shared prefix
+            final SeqVertex newMiddle4 = new SeqVertex("GCA");
+            expected.addVertices(newMiddle4);
+            expected.addEdges(pre1_top, newMiddle4, newBottom);
+            tests.add(new Object[]{graph.clone(), expected.clone()});
+        }
+
+        { // all the nodes -> lots of merging and motion of nodes
+            final SeqGraph all = new SeqGraph();
+            all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2);
+            all.addEdges(pre1, top, middle1, bottom, tail1);
+            all.addEdges(pre2, top, middle2, bottom, tail2);
+
+            final SeqGraph expected = new SeqGraph();
+            final SeqVertex newMiddle1 = new SeqVertex("G");
+            final SeqVertex newMiddle2 = new SeqVertex("T");
+            final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString());
+            final SeqVertex newTop = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)); // shared source prefix of pre1/pre2
+            final SeqVertex newTopDown1 = new SeqVertex("G");
+            final SeqVertex newTopDown2 = new SeqVertex("C");
+            final SeqVertex newTopBottomMerged = new SeqVertex("TA");
+            expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2);
+            expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1);
+            expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2);
+            tests.add(new Object[]{all.clone(), expected.clone()});
+        }
+
+        // test the case where we delete a middle node away because the common sequence is all of its sequence
+        {
+            final SeqGraph graph2 = new SeqGraph();
+            final SeqVertex mytop = new SeqVertex("A");
+            final SeqVertex mid1 = new SeqVertex("AC");
+            final SeqVertex mid2 = new SeqVertex("C"); // entirely consumed by the shared suffix -> vertex disappears
+            final SeqVertex bot = new SeqVertex("G");
+            graph2.addVertices(mytop, mid1, mid2, bot);
+            graph2.addEdges(mytop, mid1, bot);
+            graph2.addEdges(mytop, mid2, bot);
+
+            final SeqGraph expected = new SeqGraph();
+            final SeqVertex newMid1 = new SeqVertex("A");
+            final SeqVertex newBottom = new SeqVertex("CG");
+            expected.addVertices(mytop, newMid1, newBottom);
+            expected.addEdges(mytop, newMid1, newBottom);
+            expected.addEdges(mytop, newBottom); // mid2's path becomes a direct top -> bottom edge
+            tests.add(new Object[]{graph2, expected});
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
      +
+    @Test(dataProvider = "MergingData", enabled = !DEBUG)
+    public void testMerging(final SeqGraph graph, final SeqGraph expected) { // simplify a clone and compare to the expected graph
+        final SeqGraph merged = (SeqGraph)graph.clone(); // clone so the shared provider graph is not mutated
+        merged.simplifyGraph(1); // single simplification iteration
+        try {
+            Assert.assertTrue(SeqGraph.graphEquals(merged, expected));
+        } catch (AssertionError e) {
+//            if ( ! SeqGraph.graphEquals(merged, expected) ) {
+//                graph.printGraph(new File("graph.dot"), 0);
+//                merged.printGraph(new File("merged.dot"), 0);
+//                expected.printGraph(new File("expected.dot"), 0);
+//            }
+            throw e; // rethrow after the (disabled) debug-dump hook
+        }
+    }
      +
      +    // A -> ACT -> C [non-ref]
      +    // A -> ACT -> C [non-ref]
      +    // A -> ACT -> C [ref]
      +    //
      +    // Should become A -> ACT -> C [ref and non-ref edges]
      +    //
+    @Test(enabled = !DEBUG)
+    public void testBubbleSameBasesWithRef() { // identical ref and non-ref branches must merge into one linear vertex
+        final SeqGraph graph = new SeqGraph();
+        final SeqVertex top = new SeqVertex("A");
+        final SeqVertex mid1 = new SeqVertex("ACT"); // ref branch
+        final SeqVertex mid2 = new SeqVertex("ACT"); // non-ref branch with the same bases
+        final SeqVertex bot = new SeqVertex("C");
+        graph.addVertices(top, mid1, mid2, bot);
+        graph.addEdges(top, mid2, bot); // non-ref path
+        graph.addEdge(top, mid1, new BaseEdge(true, 1)); // ref path (isRef = true)
+        graph.addEdge(mid1, bot, new BaseEdge(true, 1));
+
+        final SeqGraph expected = new SeqGraph();
+        expected.addVertex(new SeqVertex("AACTC")); // everything collapses into a single vertex
+        final SeqGraph actual = ((SeqGraph)graph.clone());
+        actual.simplifyGraph();
+        Assert.assertTrue(BaseGraph.graphEquals(actual, expected), "Wrong merging result after complete merging");
+    }
      +
+    @DataProvider(name = "LinearZipData")
+    public Object[][] makeLinearZipData() throws Exception { // cases: {input graph, expected graph after zipLinearChains()}
+        List tests = new ArrayList();
+
+        SeqGraph graph = new SeqGraph();
+        SeqGraph expected = new SeqGraph();
+
+        // empty graph => empty graph
+        tests.add(new Object[]{graph.clone(), expected.clone()});
+
+        SeqVertex a1 = new SeqVertex("A");
+        SeqVertex c1 = new SeqVertex("C");
+        SeqVertex ac1 = new SeqVertex("AC");
+
+        // just a single vertex
+        graph.addVertices(a1, c1); // two disconnected vertices: nothing to zip
+        expected.addVertices(a1, c1);
+
+        tests.add(new Object[]{graph.clone(), expected.clone()});
+
+        graph.addEdges(a1, c1); // A -> C zips to AC
+        expected = new SeqGraph();
+        expected.addVertices(ac1);
+        tests.add(new Object[]{graph.clone(), expected.clone()});
+
+        // three long chain merged corrected
+        SeqVertex g1 = new SeqVertex("G");
+        graph.addVertices(g1);
+        graph.addEdges(c1, g1); // A -> C -> G zips to ACG
+        expected = new SeqGraph();
+        expected.addVertex(new SeqVertex("ACG"));
+        tests.add(new Object[]{graph.clone(), expected.clone()});
+
+        // adding something that isn't connected isn't a problem
+        SeqVertex t1 = new SeqVertex("T");
+        graph.addVertices(t1); // isolated vertex survives untouched
+        expected = new SeqGraph();
+        expected.addVertices(new SeqVertex("ACG"), new SeqVertex("T"));
+        tests.add(new Object[]{graph.clone(), expected.clone()});
+
+        // splitting chain with branch produces the correct zipped subgraphs
+        final SeqVertex a2 = new SeqVertex("A");
+        final SeqVertex c2 = new SeqVertex("C");
+        graph = new SeqGraph();
+        graph.addVertices(a1, c1, g1, t1, a2, c2);
+        graph.addEdges(a1, c1, g1, t1, a2); // main chain A-C-G-T-A
+        graph.addEdges(g1, c2); // branch at G stops the zip there
+        expected = new SeqGraph();
+        SeqVertex acg = new SeqVertex("ACG");
+        SeqVertex ta = new SeqVertex("TA");
+        expected.addVertices(acg, ta, c2);
+        expected.addEdges(acg, ta);
+        expected.addEdges(acg, c2);
+        tests.add(new Object[]{graph.clone(), expected.clone()});
+
+        // Can merge chains with loops in them
+        {
+            graph = new SeqGraph();
+            graph.addVertices(a1, c1, g1);
+            graph.addEdges(a1, c1, g1);
+            graph.addEdges(a1, a1); // self-loop on the chain head: only C -> G can zip
+            expected = new SeqGraph();
+
+            SeqVertex ac = new SeqVertex("AC");
+            SeqVertex cg = new SeqVertex("CG");
+
+            expected.addVertices(a1, cg);
+            expected.addEdges(a1, cg);
+            expected.addEdges(a1, a1);
+            tests.add(new Object[]{graph.clone(), expected.clone()});
+
+            graph.removeEdge(a1, a1);
+            graph.addEdges(c1, c1); // self-loop in the middle: nothing zips
+            tests.add(new Object[]{graph.clone(), graph.clone()});
+
+            graph.removeEdge(c1, c1);
+            graph.addEdges(g1, g1); // self-loop on the tail: only A -> C can zip
+            expected = new SeqGraph();
+            expected.addVertices(ac, g1);
+            expected.addEdges(ac, g1, g1);
+            tests.add(new Object[]{graph.clone(), expected.clone()});
+        }
+
+        // check building n element long chains
+        {
+            final List bases = Arrays.asList("A", "C", "G", "T", "TT", "GG", "CC", "AA");
+            for ( final int len : Arrays.asList(1, 2, 10, 100, 1000)) {
+                graph = new SeqGraph();
+                expected = new SeqGraph();
+                SeqVertex last = null;
+                String expectedBases = "";
+                for ( int i = 0; i < len; i++ ) {
+                    final String seq = bases.get(i % bases.size()); // cycle through the base strings
+                    expectedBases += seq;
+                    SeqVertex a = new SeqVertex(seq);
+                    graph.addVertex(a);
+                    if ( last != null ) graph.addEdge(last, a);
+                    last = a;
+                }
+                expected.addVertex(new SeqVertex(expectedBases)); // whole chain zips into one vertex
+                tests.add(new Object[]{graph.clone(), expected.clone()});
+            }
+        }
+
+        // check that edge connections are properly maintained
+        {
+            int edgeWeight = 1; // unique weights so edge identity can be verified after zipping
+            for ( final int nIncoming : Arrays.asList(0, 2, 5, 10) ) {
+                for ( final int nOutgoing : Arrays.asList(0, 2, 5, 10) ) {
+                    graph = new SeqGraph();
+                    expected = new SeqGraph();
+
+                    graph.addVertices(a1, c1, g1);
+                    graph.addEdges(a1, c1, g1);
+                    expected.addVertex(acg);
+
+                    for ( final SeqVertex v : makeVertices(nIncoming) ) {
+                        final BaseEdge e = new BaseEdge(false, edgeWeight++);
+                        graph.addVertices(v);
+                        graph.addEdge(v, a1, e); // incoming edges must be rerouted to the zipped vertex
+                        expected.addVertex(v);
+                        expected.addEdge(v, acg, e);
+                    }
+
+                    for ( final SeqVertex v : makeVertices(nOutgoing) ) {
+                        final BaseEdge e = new BaseEdge(false, edgeWeight++);
+                        graph.addVertices(v);
+                        graph.addEdge(g1, v, e); // outgoing edges likewise
+                        expected.addVertex(v);
+                        expected.addEdge(acg, v, e);
+                    }
+
+                    tests.add(new Object[]{graph, expected});
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
      +
      +    /** Creates {@code n} fresh SeqVertex objects, cycling through a fixed set of base sequences. */
      +    private List<SeqVertex> makeVertices(final int n) {
      +        final List<SeqVertex> vs = new LinkedList<SeqVertex>();
      +        final List<String> bases = Arrays.asList("A", "C", "G", "T", "TT", "GG", "CC", "AA");
      +        for ( int i = 0; i < n; i++ )
      +            vs.add(new SeqVertex(bases.get(i % bases.size())));
      +        return vs;
      +    }
      +
      +    @Test(dataProvider = "LinearZipData", enabled = true)
      +    public void testLinearZip(final SeqGraph graph, final SeqGraph expected) {
      +        final SeqGraph merged = (SeqGraph)graph.clone();
      +        merged.zipLinearChains();
      +        try {
      +            Assert.assertTrue(SeqGraph.graphEquals(merged, expected));
      +        } catch (AssertionError e) {
      +            // The assertion above has already failed, so dump all three graphs in
      +            // dot format for debugging and rethrow (no need to re-test equality).
      +            graph.printGraph(new File("graph.dot"), 0);
      +            merged.printGraph(new File("merged.dot"), 0);
      +            expected.printGraph(new File("expected.dot"), 0);
      +            throw e;
      +        }
      +    }
      +
      +    @Test(timeOut = 10000)
      +    public void testInfiniteCycleFromEmpiricalRuns() {
      +        final SeqVertex v1 = new SeqVertex("CCCT");
      +        final SeqVertex v2 = new SeqVertex("CATCCTCCCTTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCC");
      +        final SeqVertex v3 = new SeqVertex("CTAGACTTCTCCTCCTCCTCC");
      +        final SeqVertex v4 = new SeqVertex("ACCATC");
      +        final SeqVertex v5 = new SeqVertex("CCTCCACCATCCTCCCCTCTAGGCTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATC");
      +        final SeqVertex v6 = new SeqVertex("CTCCCCT");
      +        // Regression topology taken from empirical runs (per the test name); presumably
      +        // simplifyGraph() previously never terminated on it -- hence the timeout guard.
      +        final SeqGraph graph = new SeqGraph();
      +        graph.addVertices(v1, v2, v3, v4, v5, v6);
      +        graph.addEdges(v1, v3, v4, v6, v3);
      +        graph.addEdges(v2, v4);
      +        graph.addEdges(v5, v6);
      +        // The failure mode is non-termination, not an assertion: finishing within timeOut is the pass.
      +        graph.simplifyGraph();
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java
      new file mode 100644
      index 000000000..eab9dfc27
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java
      @@ -0,0 +1,108 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.List;
      +
      +public class SeqVertexUnitTest extends BaseTest {
      +    @Test
      +    public void testBasic() {
      +        final byte[] bases = "ACT".getBytes();
      +        final SeqVertex v1 = new SeqVertex(bases);
      +        final SeqVertex v2 = new SeqVertex(bases);
      +        Assert.assertTrue(v1.getId() >= 0);
      +        Assert.assertTrue(v2.getId() >= 0);
      +        Assert.assertTrue(v2.getId() > v1.getId());
      +    }
      +
      +    @Test
      +    public void testEqualsAndHashCode() {
      +        final byte[] bases = "ACT".getBytes();
      +        final SeqVertex v1 = new SeqVertex(bases);
      +        final SeqVertex v1_neq = new SeqVertex(bases);
      +        final SeqVertex v1_eq = new SeqVertex(v1);
      +
      +        Assert.assertEquals(v1, v1);
      +        Assert.assertEquals(v1.hashCode(), v1.hashCode());
      +        Assert.assertEquals(v1, v1_eq);
      +        Assert.assertEquals(v1.hashCode(), v1_eq.hashCode());
      +        Assert.assertFalse(v1.equals(v1_neq));
      +        Assert.assertFalse(v1_neq.equals(v1));
      +        Assert.assertFalse(v1_neq.hashCode() == v1.hashCode());
      +    }
      +
      +    @DataProvider(name = "WithoutSuffixData")
      +    public Object[][] makeWithoutSuffixData() {
      +        List tests = new ArrayList();
      +
      +        final String bases = "ACGTACGTACGT";
      +        final int l = bases.length();
      +        for ( int suffixLength = 0; suffixLength <= l; suffixLength++ ) {
      +            final int suffixStart = l - suffixLength;
      +            final String prefix = suffixLength == l ? null : bases.substring(0, suffixStart);
      +            final String suffix = suffixStart == l ? "" : bases.substring(suffixStart, l);
      +            tests.add(new Object[]{bases, suffix, prefix});
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "WithoutSuffixData")
      +    public void testWithoutSuffix(final String bases, final String suffix, final String expected) {
      +        final SeqVertex basesSV = new SeqVertex(bases);
      +        if ( expected == null )
      +            Assert.assertNull(basesSV.withoutSuffix(suffix.getBytes()), "Failed for bases " + bases + " with suffix " + suffix + " != " + expected);
      +        else
      +            Assert.assertEquals(basesSV.withoutSuffix(suffix.getBytes()).getSequenceString(), expected, "Failed for bases " + bases + " with suffix " + suffix + " != " + expected);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java
      new file mode 100644
      index 000000000..2df783b19
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java
      @@ -0,0 +1,294 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.broadinstitute.sting.utils.collections.Pair;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.util.*;
      +
      +public class SharedVertexSequenceSplitterUnitTest extends BaseTest {
      +    private final static boolean PRINT_GRAPHS = false;
      +
      +    @DataProvider(name = "PrefixSuffixData")
      +    public Object[][] makePrefixSuffixData() {
      +        List tests = new ArrayList();
      +
      +        tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0});
      +        tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0});
      +        tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1});
      +        tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1});
      +        tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0});
      +        tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0});
      +        tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0});
      +        tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1});
      +        tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4});
      +
      +        tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0});
      +        tests.add(new Object[]{Arrays.asList("A", "A", "A"), 1, 0});
      +        tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0});
      +        tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0});
      +        tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1});
      +        tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1});
      +        tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0});
      +        tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3});
      +        tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3});
      +        tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0});
      +
      +        tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "PrefixSuffixData")
      +    public void testPrefixSuffix(final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
      +        final List<byte[]> bytes = new ArrayList<byte[]>();
      +        int min = Integer.MAX_VALUE;
      +        for ( final String s : strings ) {
      +            bytes.add(s.getBytes());
      +            min = Math.min(min, s.length());
      +        }
      +        // the prefix is computed first; the suffix search is bounded by the remaining bases
      +        final int actualPrefixLen = GraphUtils.compPrefixLen(bytes, min);
      +        Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test");
      +
      +        final int actualSuffixLen = GraphUtils.compSuffixLen(bytes, min - actualPrefixLen);
      +        Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test");
      +    }
      +
      +    @Test(dataProvider = "PrefixSuffixData")
      +    public void testPrefixSuffixVertices(final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
      +        final List<SeqVertex> v = new ArrayList<SeqVertex>();
      +        for ( final String s : strings ) {
      +            v.add(new SeqVertex(s));
      +        }
      +
      +        final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen);
      +        final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen);
      +        // first element of the pair is the common prefix vertex, second the common suffix vertex
      +        final Pair<SeqVertex, SeqVertex> result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v);
      +        Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed prefix test");
      +        Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test");
      +
      +        Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty());
      +        Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty());
      +    }
      +
      +    @Test(dataProvider = "PrefixSuffixData")
      +    public void testSplitter(final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
      +        final SeqGraph graph = new SeqGraph();
      +
      +        final List<SeqVertex> v = new ArrayList<SeqVertex>();
      +        for ( final String s : strings ) {
      +            v.add(new SeqVertex(s));
      +        }
      +
      +        graph.addVertices(v.toArray(new SeqVertex[]{}));
      +
      +        final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen);
      +        final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen);
      +
      +        final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
      +        splitter.split();
      +        // after splitting: prefixV is a pure source, suffixV a pure sink,
      +        // and every new middle vertex is wired prefixV -> mid -> suffixV
      +        Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix);
      +        Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix);
      +
      +        Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size());
      +        Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0);
      +
      +        Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size());
      +        Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0);
      +
      +        for ( final SeqVertex mid : splitter.newMiddles ) {
      +            Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid));
      +            Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV));
      +        }
      +    }
      +
      +    @DataProvider(name = "CompleteCycleData")
      +    public Object[][] makeCompleteCycleData() {
      +        List tests = new ArrayList();
      +
      +        for ( final boolean hasTop : Arrays.asList(true, false) ) {
      +            for ( final boolean hasBot : Arrays.asList(true, false) ) {
      +                if ( ! hasTop && ! hasBot ) continue;
      +                tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot});
      +
      +                tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot});
      +                tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot});
      +                // all merged
      +                tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot});
      +                // prefix and suffix
      +                tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot});
      +                // 2 -> prefix, leave C
      +                tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot});
      +                // 2 -> prefix, leave CCC
      +                tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot});
      +                // 2 -> suffix, leave A/T
      +                tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot});
      +                // 2 -> suffix, leave T, delete 1
      +                tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "CompleteCycleData")
      +    public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) {
      +        final SeqGraph graph = new SeqGraph();
      +
      +        int edgeWeight = 1;
      +        final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null;
      +        final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null;
      +        final List v = new ArrayList();
      +        for ( final String s : strings ) {
      +            v.add(new SeqVertex(s));
      +        }
      +        graph.addVertices(v.toArray(new SeqVertex[]{}));
      +        final SeqVertex first = v.get(0);
      +
      +        if ( hasTop ) {
      +            graph.addVertex(top);
      +            for ( final SeqVertex vi : v )
      +                graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++));
      +        }
      +
      +        if ( hasBot ) {
      +            graph.addVertex(bot);
      +            for ( final SeqVertex vi : v )
      +                graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++));
      +        }
      +
      +        final Set haplotypes = new HashSet();
      +        final List<Path> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone());
      +        for ( final Path path : originalPaths )
      +            haplotypes.add(new String(path.getBases()));
      +
      +        final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
      +        splitter.split();
      +        if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0);
      +        if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0);
      +        splitter.updateGraph(top, bot);
      +        if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0);
      +
      +        final List<Path> splitPaths = new KBestPaths().getKBestPaths(graph);
      +        for ( final Path path : splitPaths ) {
      +            final String h = new String(path.getBases());
      +            Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h);
      +        }
      +
      +        if ( splitPaths.size() == originalPaths.size() ) {
      +            for ( int i = 0; i < originalPaths.size(); i++ ) {
      +                Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i));
      +            }
      +        }
      +    }
      +
      +    @DataProvider(name = "MeetsMinSequenceData")
      +    public Object[][] makeMeetsMinSequenceData() {
      +        List tests = new ArrayList();
      +
      +        final boolean prefixBiased = SharedVertexSequenceSplitter.prefersPrefixMerging();
      +        tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true});
      +        tests.add(new Object[]{Arrays.asList("AC", "AC"), 1, prefixBiased, ! prefixBiased});
      +        tests.add(new Object[]{Arrays.asList("AC", "AC"), 2, prefixBiased, ! prefixBiased});
      +        tests.add(new Object[]{Arrays.asList("AC", "AC"), 3, false, false});
      +        tests.add(new Object[]{Arrays.asList("A", "AC"), 1, true, false});
      +        tests.add(new Object[]{Arrays.asList("A", "AC"), 2, false, false});
      +        tests.add(new Object[]{Arrays.asList("AT", "AC"), 1, true, false});
      +        tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 1, true, false});
      +        tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 2, true, false});
      +        tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 3, false, false});
      +        tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 1, true, true});
      +        tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 2, true, true});
      +        tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 3, false, true});
      +        tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 4, false, false});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "MeetsMinSequenceData")
      +    public void testMeetsMinSequence(final List mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) {
      +        final SeqGraph graph = new SeqGraph();
      +
      +        final SeqVertex top = new SeqVertex("AAAAAAAA");
      +        final SeqVertex bot = new SeqVertex("GGGGGGGG");
      +        final List v = new ArrayList();
      +        for ( final String s : mids ) { v.add(new SeqVertex(s)); }
      +        graph.addVertices(v.toArray(new SeqVertex[]{}));
      +        graph.addVertices(top, bot);
      +        for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); }
      +
      +        final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v);
      +        Assert.assertEquals(splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed");
      +        Assert.assertEquals(splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed");
      +        Assert.assertEquals(splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength), suffixMeets || prefixMeets, "Either prefix or suffix failed");
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java
      index ff9896307..a3d9121d0 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java
      @@ -79,7 +79,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseTestString(sampleNone + freqUnif + "--variant " + testfile),
                       1,
      -                Arrays.asList("b8a988757ac1f206d123140da5a3e778")
      +                Arrays.asList("658c70cbb93faed8ca18e51cd6dd593f")
               );
       
               executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec);
      @@ -91,7 +91,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseTestString(sampleNone + freqAF + "--variant " + testfile),
                       1,
      -                Arrays.asList("542d5d5ff8c64da7b077bab4b950a9a3")
      +                Arrays.asList("90411433ea42846352b767da735af53b")
               );
       
               executeTest("testNoSampleSelectionFreqAF--" + testfile, spec);
      @@ -103,7 +103,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseTestString(sampleGT + freqUnif + "--variant " + testfile),
                       1,
      -                Arrays.asList("7385b17eed7f4ff0f6e82e60c3334ce7")
      +                Arrays.asList("2afabd447185cf017f60c85380902117")
               );
       
               executeTest("testPolyGTFreqUniform--" + testfile, spec);
      @@ -115,7 +115,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseTestString(sampleGT + freqAF + "--variant " + testfile),
                       1,
      -                Arrays.asList("0ee4a565a0d4f6b6942abd72a373becd")
      +                Arrays.asList("381e1a2f0e1908b4d7cba5d6361cf5aa")
               );
       
               executeTest("testPolyGTFreqAF--" + testfile, spec);
      @@ -127,7 +127,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
               WalkerTestSpec spec = new WalkerTestSpec(
                       baseTestString(sampleGL + freqAF + "--variant " + testfile),
                       1,
      -                Arrays.asList("0ee4a565a0d4f6b6942abd72a373becd")
      +                Arrays.asList("381e1a2f0e1908b4d7cba5d6361cf5aa")
               );
       
               executeTest("testPolyGLFreqAF--" + testfile, spec);
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
      index a3fbbf68b..e7a3f23a4 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
      @@ -73,8 +73,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
       
           VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
                   "4d08c8eee61dd1bdea8c5765f34e41f0",  // tranches
      -            "ce396fe4045e020b61471f6737dff36e",  // recal file
      -            "4f59bd61be900b25c6ecedaa68b9c8de"); // cut VCF
      +            "ca7de32b6143cce58aa4bc59b311feb7",  // recal file
      +            "cc7f413ba50b3d12f11f95aaa31e67d1"); // cut VCF
       
           @DataProvider(name = "VRTest")
           public Object[][] createData1() {
      @@ -122,8 +122,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
       
           VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
                   "6a1eef4d02857dbb117a15420b5c0ce9",  // tranches
      -            "238366af66b05b6d21749e799c25353d",  // recal file
      -            "3928d6bc5007becf52312ade70f14c42"); // cut VCF
      +            "db9faaee11ee5427a81ddee328245f8c",  // recal file
      +            "42e0fcd8e048a5f6abc41a4d1c3e97a5"); // cut VCF
       
           @DataProvider(name = "VRBCFTest")
           public Object[][] createVRBCFTest() {
      @@ -174,14 +174,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
           VRTest indelUnfiltered = new VRTest(
                   validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
                   "b7589cd098dc153ec64c02dcff2838e4",  // tranches
      -            "a04a9001f62eff43d363f4d63769f3ee",  // recal file
      -            "b2c6827be592c24a4692b1753edc7d23"); // cut VCF
      +            "5a9ba210a3c68109289a71039a04509d",  // recal file
      +            "d816bd43c844069d65711a7975707437"); // cut VCF
       
           VRTest indelFiltered = new VRTest(
                   validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
                   "b7589cd098dc153ec64c02dcff2838e4",  // tranches
      -            "a04a9001f62eff43d363f4d63769f3ee",  // recal file
      -            "5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF
      +            "5a9ba210a3c68109289a71039a04509d",  // recal file
      +            "6bcb344511c727c28523825f73c7daee"); // cut VCF
       
           @DataProvider(name = "VRIndelTest")
           public Object[][] createTestVariantRecalibratorIndel() {
      @@ -239,7 +239,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
                               " -o %s" +
                               " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" +
                               " -recalFile " + privateTestDir + "VQSR.mixedTest.recal",
      -                Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b"));
      +                Arrays.asList("20c23643a78c5b95abd1526fdab8960d"));
               executeTest("testApplyRecalibrationSnpAndIndelTogether", spec);
           }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java
      index 2e31f6725..bca912d63 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java
      @@ -203,6 +203,27 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
               Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],1);
               Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0);
               Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0);
      +
      +        // now flip them around
      +
      +        eval = data.getSecond();
      +        truth = data.getFirst();
      +        codec = new VCFCodec();
      +        evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
      +        compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
      +        metrics = new ConcordanceMetrics(evalHeader,compHeader);
      +        metrics.update(eval,truth);
      +        Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2);
      +        Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2);
      +        Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),1);
      +        Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[1][2],0);
      +        Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[1][2],0);
      +        Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[3][2],1);
      +        Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1);
      +        Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],0);
      +        Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUBSET_TRUTH.ordinal()],1);
      +        Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0);
      +        Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0);
           }
       
           private Pair getData3() {
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java
      new file mode 100644
      index 000000000..a7d32d43b
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java
      @@ -0,0 +1,77 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.variantutils;
      +
      +import org.broadinstitute.sting.WalkerTest;
      +import org.testng.annotations.Test;
      +
      +import java.util.Arrays;
      +
      +/**
      + * Tests LeftAlignAndTrimVariants
      + */
      +public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
      +
      +    @Test
      +    public void testLeftAlignment() {
      +         WalkerTestSpec spec = new WalkerTestSpec(
      +                 "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header",
      +                 1,
      +                 Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2"));
      +         executeTest("test left alignment", spec);
      +    }
      +
      +    @Test
      +    public void testLeftAlignmentWithTrimmingAndMultiallelics() {
      +        WalkerTestSpec spec = new WalkerTestSpec(
      +                "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split",
      +                1,
      +                Arrays.asList("4ae03954f8bd66e73fd005c49ea301db"));
      +        executeTest("test left alignment with trimming and hard multiple alleles", spec);
      +
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java
      new file mode 100644
      index 000000000..a8739dac2
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java
      @@ -0,0 +1,177 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.variantutils;
      +
      +import net.sf.picard.reference.IndexedFastaSequenceFile;
      +import net.sf.samtools.SAMFileHeader;
      +import net.sf.samtools.SAMReadGroupRecord;
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
      +import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.GenomeLocParser;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.broadinstitute.sting.utils.collections.Pair;
      +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
      +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
      +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
      +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
      +import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
      +import org.testng.Assert;
      +import org.testng.annotations.BeforeClass;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.io.FileNotFoundException;
      +import java.util.*;
      +
       +/**
       + * Unit tests for left-aligning and trimming of indel-containing VariantContexts.
       + *
       + * User: delangel
       + * Date: 3/22/13
       + * Time: 6:09 PM
       + */
      +public class LeftAlignAndTrimVariantsUnitTest extends BaseTest {
      +    final String refBases1 = "ACAGAGCTGACCCTCCCTCCCCTCTCCCAGTGCAACAGCACGGGCGGCGACTGCTTTTACCGAGGCTACACGTCAGGCGTGGCGGCTGTCCAGGACTGGTACCACTTCCACTATGTGGATCTCTGCTGAGGACCAGGAAAGCCAGCACCCGCAGAGACTCTTCCCCAGTGCTCCATACGATCACCATTCTCTGCAGAAGG";
      +    final String longPiece = "AAAAAAAAAAAAAAAAAAAAAAAAAAAA"; // where we'll perform tests
      +    final String refBases = refBases1 + longPiece + refBases1;
      +
      +    final int contigStop = refBases.length();
      +    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigStop );
      +    final String artificialContig = "chr1";
      +    final int locStart = refBases1.length(); // start position where we desire artificial variant
      +    final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
      +    final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,1,refBases.length());
      +    final String windowBases = refBases;
      +
      +
      +
      +    @DataProvider(name = "LeftAlignDataProvider")
      +    public Object[][] makeLeftAlignDataProvider() {
      +        List tests = new ArrayList();
      +
       +        // generate all (offset, indelSize) pairs that keep the artificial indel inside the poly-A stretch
      +        for (  int offset = 1; offset < longPiece.length(); offset++ ) {
      +            for ( int indelSize = -longPiece.length()+offset; indelSize < longPiece.length()-offset; indelSize++ ) {
      +                tests.add(new Object[]{offset, indelSize});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "LeftAlignDataProvider")
      +    public void testLeftAlignNoTrimming(final int offset, final int indelSize) {
      +        if (indelSize == 0)
      +            return;
      +
      +        final List alleles = new ArrayList();
      +
      +        if (indelSize < 0) { // deletion
      +            alleles.add(Allele.create(Utils.dupString("A",Math.abs(indelSize)+1),true));
      +            alleles.add(Allele.create("A", false));
      +        }
      +        else {
      +            alleles.add(Allele.create("A", true));
      +            alleles.add(Allele.create(Utils.dupString("A",Math.abs(indelSize)+1),false));
      +
      +        }
      +        final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart+offset,locStart+offset);
      +        final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,windowBases.getBytes());
      +
      +        final VariantContext vc = new VariantContextBuilder("test", artificialContig, locStart+offset, locStart+offset+alleles.get(0).length()-1, alleles).make();
      +        final Pair result = LeftAlignAndTrimVariants.alignAndWrite(vc,referenceContext);
      +        Assert.assertTrue(result.second == (offset>0?1:0));
      +        Assert.assertEquals(result.first.getStart(), locStart);
      +
      +
      +    }
      +
      +    @DataProvider(name = "TrimDataProvider")
      +    public Object[][] makeTrimDataProvider() {
      +        List tests = new ArrayList();
      +
      +        for (  int offset = 1; offset < longPiece.length(); offset++ ) {
      +            for ( int indelSize = -longPiece.length()+offset; indelSize < longPiece.length()-offset; indelSize++ )
      +                tests.add(new Object[]{indelSize, offset});
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "TrimDataProvider")
      +    public void testTrimming(final int indelSize, final int offset) {
      +        if (indelSize == 0)
      +            return;
      +
      +        final List alleles = new ArrayList();
      +
      +        final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart+offset,locStart+offset);
      +        final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,windowBases.getBytes());
      +
      +        final int prefixLen = 10;
      +        final String prefix = refBases.substring(locStart+offset-prefixLen,locStart+offset);
      +        if (indelSize < 0) { // deletion
      +            alleles.add(Allele.create(prefix+Utils.dupString("A",Math.abs(indelSize)+1),true));
      +            alleles.add(Allele.create(prefix+"A", false));
      +        }
      +        else {
      +            alleles.add(Allele.create(prefix+"A", true));
      +            alleles.add(Allele.create(prefix+Utils.dupString("A",Math.abs(indelSize)+1),false));
      +
      +        }
      +
      +        final VariantContext vc = GATKVariantContextUtils.trimAlleles( new VariantContextBuilder("test", artificialContig, locStart + offset, locStart + offset + alleles.get(0).length() - 1, alleles).make(),true,true);
      +        if (indelSize>0)
      +            Assert.assertEquals(vc.getReference().length(),1);
      +        else
      +            Assert.assertEquals(vc.getReference().length(),Math.abs(indelSize)+1);
      +    }
      +}
      \ No newline at end of file
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
      index c97f0bf02..4b1483cb6 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
      @@ -144,7 +144,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
           }
       
           @Test
      -    public void testSampleExclusion() {
      +    public void testSampleExclusionFromFileAndSeparateSample() {
               String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
               String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
       
      @@ -158,6 +158,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
               executeTest("testSampleExclusion--" + testfile, spec);
           }
       
      +    @Test
      +    public void testSampleExclusionJustFromFile() {
      +        String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
      +        String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
      +
      +        WalkerTestSpec spec = new WalkerTestSpec(
      +                "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sf " + samplesFile + " --variant " + testfile,
      +                1,
      +                Arrays.asList("875d7e00ac8081e87ab9fb1b20c83677")
      +        );
      +        spec.disableShadowBCF();
      +
      +        executeTest("testSampleExclusion--" + testfile, spec);
      +    }
      +
           @Test
           public void testSampleInclusionWithNonexistingSamples() {
               String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
      @@ -242,6 +257,32 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
               executeTest("testRemoveMLE--" + testFile, spec);
           }
       
      +    @Test
      +    public void testKeepOriginalAC() {
      +        String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf";
      +
      +        WalkerTestSpec spec = new WalkerTestSpec(
      +                "-T SelectVariants --keepOriginalAC -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
      +                1,
      +                Arrays.asList("ad7e8b25e431a3229a78cec063876559")
      +        );
      +
      +        executeTest("testKeepOriginalAC--" + testFile, spec);
      +    }
      +
      +    @Test
      +    public void testKeepOriginalACAndENV() {
      +        String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf";
      +
      +        WalkerTestSpec spec = new WalkerTestSpec(
      +                "-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
      +                1,
      +                Arrays.asList("e9b8292212545684cdb163423329ee7e")
      +        );
      +
      +        executeTest("testKeepOriginalACAndENV--" + testFile, spec);
      +    }
      +
           @Test
           public void testMultipleRecordsAtOnePosition() {
               String testFile = privateTestDir + "selectVariants.onePosition.vcf";
      diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java
      similarity index 95%
      rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java
      rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java
      index 721eb2874..7b1b9b7d2 100644
      --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java
      @@ -52,16 +52,16 @@ import org.testng.annotations.Test;
       import java.util.Arrays;
       
       /**
      - * Tests LeftAlignVariants
      + * Tests VariantsToAllelicPrimitives
        */
      -public class LeftAlignVariantsIntegrationTest extends WalkerTest {
      +public class VariantsToAllelicPrimitivesIntegrationTest extends WalkerTest {
       
           @Test
      -    public void testLeftAlignment() {
      +    public void testMNPsToSNPs() {
                WalkerTestSpec spec = new WalkerTestSpec(
      -                 "-T LeftAlignVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header",
      +                 "-T VariantsToAllelicPrimitives -o %s -R " + b37KGReference + " -V " + privateTestDir + "vcfWithMNPs.vcf --no_cmdline_in_header",
                        1,
      -                 Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2"));
      -         executeTest("test left alignment", spec);
      +                 Arrays.asList("c5333d2e352312bdb7c5182ca3009594"));
      +         executeTest("test MNPs To SNPs", spec);
           }
       }
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java
      new file mode 100644
      index 000000000..08d82281e
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java
      @@ -0,0 +1,117 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
       +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.genotyper;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.variant.variantcontext.Allele;
      +import org.testng.Assert;
      +import org.testng.annotations.Test;
      +
      +public class MostLikelyAlleleUnitTest extends BaseTest {
      +    final Allele a = Allele.create("A");
      +    final Allele b = Allele.create("C");
      +
      +    @Test
      +    public void testBasicCreation() {
      +        final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1;
      +        MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
      +        Assert.assertEquals(mla.getMostLikelyAllele(), a);
      +        Assert.assertEquals(mla.getSecondMostLikelyAllele(), b);
      +        Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
      +        Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second);
      +
      +        Assert.assertEquals(mla.isInformative(), true);
      +        Assert.assertEquals(mla.isInformative(10), false);
      +        Assert.assertEquals(mla.isInformative(0), true);
      +        Assert.assertEquals(mla.getAlleleIfInformative(), a);
      +        Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL);
      +        Assert.assertEquals(mla.getAlleleIfInformative(0), a);
      +    }
      +
      +    @Test
      +    public void testNotDefaultInformative() {
      +        final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2);
      +        MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
      +        Assert.assertEquals(mla.isInformative(), false);
      +        Assert.assertEquals(mla.isInformative(10), false);
      +        Assert.assertEquals(mla.isInformative(0), true);
      +        Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL);
      +        Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL);
      +        Assert.assertEquals(mla.getAlleleIfInformative(0), a);
      +    }
      +
      +    @Test
      +    public void testCreationNoGoodSecond() {
      +        MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY);
      +        Assert.assertEquals(mla.getMostLikelyAllele(), a);
      +        Assert.assertEquals(mla.getSecondMostLikelyAllele(), null);
      +        Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
      +        Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
      +
      +        Assert.assertEquals(mla.isInformative(), true);
      +        Assert.assertEquals(mla.isInformative(10), true);
      +        Assert.assertEquals(mla.isInformative(0), true);
      +        Assert.assertEquals(mla.getAlleleIfInformative(), a);
      +        Assert.assertEquals(mla.getAlleleIfInformative(10), a);
      +        Assert.assertEquals(mla.getAlleleIfInformative(0), a);
      +    }
      +
      +    @Test
      +    public void testCreationNoAllele() {
      +        MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
      +        Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL);
      +        Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY);
      +        Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
      +
      +        Assert.assertEquals(mla.isInformative(), false);
      +        Assert.assertEquals(mla.isInformative(10), false);
      +        Assert.assertEquals(mla.isInformative(0), false);
      +        Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL);
      +        Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL);
      +        Assert.assertEquals(mla.getAlleleIfInformative(0), Allele.NO_CALL);
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
      index 84bdfd19b..9530ea41f 100644
      --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
      @@ -46,41 +46,24 @@
       
       package org.broadinstitute.sting.utils.genotyper;
       
      +import net.sf.samtools.*;
       import org.broadinstitute.sting.BaseTest;
       import org.broadinstitute.variant.variantcontext.Allele;
       import org.broadinstitute.sting.utils.BaseUtils;
      -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
       import org.broadinstitute.sting.utils.pileup.PileupElement;
       import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
       import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
       import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      -import org.broadinstitute.variant.variantcontext.Allele;
       import org.broadinstitute.sting.utils.Utils;
       import java.util.Map;
       import java.util.List;
       import org.testng.Assert;
       import org.testng.annotations.Test;
       import net.sf.picard.reference.IndexedFastaSequenceFile;
      -import net.sf.samtools.SAMFileHeader;
      -import net.sf.samtools.SAMFileReader;
      -import net.sf.samtools.SAMRecord;
       import org.broadinstitute.sting.utils.GenomeLoc;
       import org.broadinstitute.sting.utils.GenomeLocParser;
      -import org.broadinstitute.sting.utils.Utils;
      -import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
      -import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
       import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
      -import org.broadinstitute.sting.utils.pileup.PileupElement;
      -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
      -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
      -import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder;
       import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
      -import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
      -import org.broadinstitute.variant.variantcontext.Allele;
      -import org.broadinstitute.variant.variantcontext.VariantContext;
      -import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
      -import org.broadinstitute.variant.vcf.VCFCodec;
       import java.io.File;
       import java.io.FileNotFoundException;
       import java.util.*;
      @@ -228,14 +211,140 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
       
               Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10);
               Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60);
      -        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null);
      +        perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1);
               Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10)));
       
               Map<Allele,List<GATKSAMRecord>> downsampledStrat = perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap();
               Assert.assertEquals(downsampledStrat.get(base_A).size(),(int) (pileup.depthOfCoverage()/2) - 1);
               Assert.assertEquals(downsampledStrat.get(base_C).size(),(int) (pileup.depthOfCoverage()/2));
               Assert.assertEquals(downsampledStrat.get(base_T).size(),0);
      +    }
      +
      +    @DataProvider(name = "PoorlyModelledReadData")
      +    public Object[][] makePoorlyModelledReadData() {
      +        List<Object[]> tests = new ArrayList<Object[]>();
      +
      +        // this functionality can be adapted to provide input data for whatever you might want in your data
      +        tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0)});
      +        tests.add(new Object[]{10, 0.1, true, Arrays.asList(-10.0)});
      +        tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0, -10.0)});
      +        tests.add(new Object[]{10, 0.1, true, Arrays.asList(-5.0, -10.0)});
      +        tests.add(new Object[]{100, 0.1, false, Arrays.asList(-5.0, -10.0)});
      +        tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0)});
      +        tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -3.0)});
      +        tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -2.0)});
      +        tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0, -4.0)});
      +        tests.add(new Object[]{100, 0.001, true, Arrays.asList(-5.0, -10.0)});
      +        tests.add(new Object[]{100, 0.001, false, Arrays.asList(-5.0, -10.0, 0.0)});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "PoorlyModelledReadData")
      +    public void testPoorlyModelledRead(final int readLen, final double maxErrorRatePerBase, final boolean expected, final List log10likelihoods) {
      +        final byte[] bases = Utils.dupBytes((byte)'A', readLen);
      +        final byte[] quals = Utils.dupBytes((byte) 30, readLen);
      +
      +        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M");
      +
      +        final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();
      +        final boolean actual = map.readIsPoorlyModelled(read, log10likelihoods, maxErrorRatePerBase);
      +        Assert.assertEquals(actual, expected);
      +    }
       
       
      +    @DataProvider(name = "RemovingPoorlyModelledReadData")
      +    public Object[][] makeRemovingPoorlyModelledReadData() {
      +        List<Object[]> tests = new ArrayList<Object[]>();
      +
      +        // this functionality can be adapted to provide input data for whatever you might want in your data
      +        final int readLen = 10;
      +        for ( int nReads = 0; nReads < 4; nReads++ ) {
      +            for ( int nBad = 0; nBad <= nReads; nBad++ ) {
      +                final int nGood = nReads - nBad;
      +                tests.add(new Object[]{readLen, nReads, nBad, nGood});
      +            }
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "RemovingPoorlyModelledReadData")
      +    public void testRemovingPoorlyModelledReads(final int readLen, final int nReads, final int nBad, final int nGood) {
      +        final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();
      +        final Set<GATKSAMRecord> goodReads = new HashSet<GATKSAMRecord>();
      +        final Set<GATKSAMRecord> badReads = new HashSet<GATKSAMRecord>();
      +        for ( int readI = 0; readI < nReads; readI++ ) {
      +            final boolean bad = readI < nBad;
      +            final double likelihood = bad ? -100.0 : 0.0;
      +
      +            final byte[] bases = Utils.dupBytes((byte)'A', readLen);
      +            final byte[] quals = Utils.dupBytes((byte) 30, readLen);
      +
      +            final Allele allele = Allele.create(Utils.dupString("A", readI+1));
      +
      +            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M");
      +            read.setReadName("readName" + readI);
      +            map.add(read, allele, likelihood);
      +            (bad ? badReads : goodReads).add(read);
      +        }
      +
      +        final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(0.01);
      +        Assert.assertEquals(removedReads.size(), nBad, "nBad " + nBad + " nGood " + nGood);
      +        Assert.assertEquals(new HashSet<GATKSAMRecord>(removedReads), badReads, "nBad " + nBad + " nGood " + nGood);
      +        Assert.assertEquals(map.size(), nGood, "nBad " + nBad + " nGood " + nGood);
      +        Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood);
      +        Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood);
      +    }
      +
      +    @DataProvider(name = "MostLikelyAlleleData")
      +    public Object[][] makeMostLikelyAlleleData() {
      +        List<Object[]> tests = new ArrayList<Object[]>();
      +
      +        final Allele a = Allele.create("A");
      +        final Allele c = Allele.create("C");
      +        final Allele g = Allele.create("G");
      +
      +        tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a});
      +        tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a});
      +        tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c});
      +        tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
      +        tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
      +        tests.add(new Object[]{Arrays.asList(a, c, g),
      +                Arrays.asList(
      +                        Arrays.asList(0.0, -10.0, -10.0),
      +                        Arrays.asList(-100.0, 0.0, -10.0)),
      +                c, a});
      +        tests.add(new Object[]{Arrays.asList(a, c, g),
      +                Arrays.asList(
      +                        Arrays.asList(0.0, -10.0, -10.0),
      +                        Arrays.asList(-20.0, 0.0, -100.0)),
      +                c, a});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "MostLikelyAlleleData")
      +    public void testMostLikelyAllele(final List<Allele> alleles, final List<List<Double>> perReadlikelihoods, final Allele best, final Allele second) {
      +        final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();
      +
      +        for ( int readI = 0; readI < perReadlikelihoods.size(); readI++ ) {
      +            final List<Double> likelihoods = perReadlikelihoods.get(readI);
      +
      +            final byte[] bases = Utils.dupBytes((byte)'A', 10);
      +            final byte[] quals = Utils.dupBytes((byte) 30, 10);
      +            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "10M");
      +            read.setReadName("readName" + readI);
      +
      +            for ( int i = 0; i < alleles.size(); i++ ) {
      +                final Allele allele = alleles.get(i);
      +                final double likelihood = likelihoods.get(i);
      +                map.add(read, allele, likelihood);
      +            }
      +        }
      +
      +        final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles();
      +        Assert.assertEquals(mla.getMostLikelyAllele(), best);
      +        Assert.assertEquals(mla.getSecondMostLikelyAllele(), second);
           }
       }
      \ No newline at end of file
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java
      new file mode 100644
      index 000000000..26384c190
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java
      @@ -0,0 +1,77 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotype;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.testng.Assert;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.Collections;
      +import java.util.List;
      +
      +public class HaplotypeBaseComparatorUnitTest extends BaseTest {
      +    @Test
      +    public void testComparison() {
      +        final List<String> rawStrings = Arrays.asList("A", "C", "AC", "CT", "GTC", "ACGT");
      +        final List<String> lexStrings = new ArrayList<String>(rawStrings);
      +        Collections.sort(lexStrings);
      +
      +        for ( final List<String> seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) {
      +            final List<Haplotype> haps = new ArrayList<Haplotype>(seqs.size());
      +            for ( final String seq : seqs ) {
      +                haps.add(new Haplotype(seq.getBytes(), false));
      +            }
      +
      +            Collections.sort(haps, new HaplotypeBaseComparator());
      +            for ( int i = 0; i < lexStrings.size(); i++ )
      +                Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings);
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java
      new file mode 100644
      index 000000000..3c3452bbf
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java
      @@ -0,0 +1,118 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotype;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.BeforeMethod;
      +import org.testng.annotations.Test;
      +
      +public class HaplotypeLDCalculatorUnitTest extends BaseTest {
      +    HaplotypeLDCalculator calculator;
      +
      +    @BeforeMethod
      +    public void setUp() throws Exception {
      +        calculator = new HaplotypeLDCalculator();
      +    }
      +
      +    /**
      +     * Tests that we get the right values from the R^2 calculation
      +     */
      +    @Test
      +    public void computeProbOfBeingPhased() {
      +        logger.warn("Executing testCalculateR2LD");
      +
      +        // See AA, AB, and BA in population
      +        Assert.assertEquals(calculator.pPhasedTest(0, 0, 0, -100), 0, 0.00001);
      +
      +        // See AA, AB, BB in population
      +        Assert.assertTrue(calculator.pPhasedTest(0, 0, -100, 0) < 0.5);
      +
      +        // See AA and BB in population
      +        Assert.assertEquals(calculator.pPhasedTest(0, -100, -100, 0), 1, 0.00001);
      +
      +        // See AA, AB, and BA but no BBs in population
      +        Assert.assertEquals(calculator.pPhasedTest(0, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001);
      +
      +        // See BB, AB, and BA but no AAs in population, so BB is the best explanation
      +        Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, 0), 1, 0.00001);
      +
      +        // See only AB and BA but no AAs nor BBs in population
      +        Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001);
      +
      +        // Previously bad input
      +        Assert.assertEquals(calculator.pPhasedTest(-400, -600, -1200, Double.NEGATIVE_INFINITY), 0, 0.00001);
      +
      +        // first variant is just bad, so BA and BB are both very bad, shouldn't be phased
      +        Assert.assertEquals(calculator.pPhasedTest(0, -1000, -100, -10000), 0, 0.00001);
      +
      +        // second variant is just bad, so AB and BB are both very bad, shouldn't be phased
      +        Assert.assertEquals(calculator.pPhasedTest(0, -100, -1000, -10000), 0, 0.00001);
      +
      +        // AA is very good, and all others are quite poor.  Shouldn't be phased
      +        Assert.assertEquals(calculator.pPhasedTest(0, -1000, -1000, -10000), 0, 0.00001);
      +
      +
      +        for ( int i = -10; i > -10000; i -= 10 ) {
      +            // only bad het states
      +            Assert.assertTrue(calculator.pPhasedTest(0, i, i, 0) > 0.99, "Failed for " + i);
      +
      +            // BB state is terrible
      +            Assert.assertTrue(calculator.pPhasedTest(0, 0, 0, i) < 0.5, "Failed for " + i);
      +
      +            // truth is AB, BA, and BB
      +            Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, 0) < 0.5, "Failed for " + i);
      +
      +            // truth is AB, BA
      +            Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, i) < 0.5, "Failed for " + i);
      +
      +            // Only good signal is AB, so we shouldn't be phased
      +            Assert.assertTrue(calculator.pPhasedTest(i, i, 0, i) < 0.5, "Failed for " + i);
      +            Assert.assertTrue(calculator.pPhasedTest(i, 0, i, i) < 0.5, "Failed for " + i);
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java
      new file mode 100644
      index 000000000..64a62bc02
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java
      @@ -0,0 +1,76 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotype;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.testng.Assert;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.Collections;
      +import java.util.List;
      +
      +public class HaplotypeScoreComparatorUnitTest extends BaseTest {
      +    @Test
      +    public void testComparison() {
      +        final List scores = Arrays.asList(3.0, 2.0, 1.0);
      +        for ( final List myScores : Utils.makePermutations(scores, scores.size(), false) ) {
      +            final List haps = new ArrayList(myScores.size());
      +            for ( final double score : myScores ) {
      +                final Haplotype h = new Haplotype("ACT".getBytes(), false);
      +                h.setScore(score);
      +                haps.add(h);
      +            }
      +
      +            Collections.sort(haps, new HaplotypeScoreComparator());
      +            for ( int i = 0; i < myScores.size(); i++ )
      +                Assert.assertEquals(haps.get(i).getScore(), scores.get(i));
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java
      new file mode 100644
      index 000000000..a2c69e535
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java
      @@ -0,0 +1,334 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotype;
      +
      +import net.sf.samtools.TextCigarCodec;
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.*;
      +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
      +import org.broadinstitute.variant.variantcontext.VariantContext;
      +import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
      +import org.testng.Assert;
      +import org.testng.annotations.BeforeClass;
      +import org.testng.annotations.BeforeMethod;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.io.File;
      +import java.io.FileNotFoundException;
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.List;
      +import java.util.TreeSet;
      +
      +public class LDMergerUnitTest extends BaseTest {
      +    LDMerger merger;
      +    GenomeLocParser genomeLocParser;
      +
      +    @BeforeClass
      +    public void init() throws FileNotFoundException {
      +        genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference)));
      +    }
      +
      +    @BeforeMethod
      +    public void setUp() throws Exception {
      +        merger = new LDMerger();
      +    }
      +
      +    @Test
      +    public void testCreateMergedVariantContext() {
      +        logger.warn("Executing testCreateMergedVariantContext");
      +
      +        final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes();
      +        final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length);
      +
      +        // SNP + SNP = simple MNP
      +        VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      +        VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make();
      +        VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make();
      +        VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // SNP + ref + SNP = MNP with ref base gap
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // insertion + SNP
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // SNP + insertion
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // deletion + SNP
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // SNP + deletion
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // insertion + deletion = MNP
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // insertion + deletion
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // insertion + insertion
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // deletion + deletion
      +        thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // deletion + insertion (abutting)
      +        thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +
      +        // complex + complex
      +        thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make();
      +        nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make();
      +        truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make();
      +        mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        logger.warn(truthVC + " == " + mergedVC);
      +        Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
      +        Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
      +        Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
      +    }
      +
      +    @Test
      +    public void testInsertionDeletionBecomingNullAllele() {
      +        final byte[] ref = "CAAA".getBytes();
      +        final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length);
      +
      +        // insertion + deletion results in a null allele, should return false
      +        final VariantContext thisVC = new VariantContextBuilder().loc("2", 1700, 1701).alleles("CA","C").make();
      +        final VariantContext nextVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("A","AA").make();
      +        final VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
      +        Assert.assertNull(mergedVC,  "Insertion deletion becoming a null allele should return a null variant context");
      +    }
      +
      +    /**
      +     * Just returns a given R2 value for testing
      +     */
      +    private static class MockLDCalculator extends HaplotypeLDCalculator {
      +        private final double R2;
      +
      +        private MockLDCalculator(double r2) {
      +            R2 = r2;
      +        }
      +
      +        @Override
      +        protected double computeProbOfBeingPhased(VariantContext first, VariantContext second) {
      +            return R2;
      +        }
      +    }
      +
      +    @DataProvider(name = "R2MergerData")
      +    public Object[][] makeR2MergerData() {
      +        List tests = new ArrayList();
      +
+        // Exercise r2 values on both sides of the merge threshold to verify the merge decision flips correctly.
      +        final double thres = LDMerger.MERGE_EVENTS_PROB_PHASED_THRESHOLD;
      +        for ( final double r2 : Arrays.asList(0.0, thres - 0.01, thres + 0.01, 1.0) ) {
      +            tests.add(new Object[]{"ACGT", "CCGC", 2, "4M", "ACGT", "CCGC", r2, r2 >= thres});
      +            tests.add(new Object[]{"ACGT", "AGGC", 2, "4M", "CGT", "GGC", r2, r2 >= thres});
      +            tests.add(new Object[]{"ACGT", "ACCC", 2, "4M", "GT", "CC", r2, r2 >= thres});
      +            tests.add(new Object[]{"ACGT", "ACCGTT", 2, "2M1I1M1I1M", "CG", "CCGT", r2, r2 >= thres});
      +            tests.add(new Object[]{"ACGT", "AGCT", 2, "4M", "CG", "GC", r2, r2 >= thres});
      +            tests.add(new Object[]{"ACAGT", "AAGC", 2, "1M1D3M", "ACAGT", "AAGC", r2, r2 >= thres});
      +            tests.add(new Object[]{"ACAGT", "AAT", 2, "1M1D1M1D1M", "ACAG", "AA", r2, r2 >= thres});
      +
      +            // cannot be merged -- only 1 event
      +            tests.add(new Object[]{"AAA", "ACA", 1, "3M", null, null, r2, false});
      +
      +            final int dist = LDMerger.MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE + 2;
      +            tests.add(new Object[]{Utils.dupString("A", dist), "C" + Utils.dupString("A", dist - 2) + "C", 2, dist + "M", null, null, r2, false});
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "R2MergerData")
      +    public void testR2Merger(final String refS, final String hapS, int nEvents, final String cigar, final String expectedMergedRef, final String expectedMergedAlt, final double r2, final boolean expectMerge) {
      +        final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M"));
      +        final Haplotype hap = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
      +        final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length());
      +
      +        final List haplotypes = Arrays.asList(ref, hap);
      +        final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false);
      +        final MockLDCalculator r2Calc = new MockLDCalculator(r2);
      +
      +        Assert.assertEquals(vcStarts.size(), nEvents);
      +        final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc);
      +        Assert.assertEquals(merged, expectMerge);
      +        Assert.assertEquals(vcStarts.size(), expectMerge ? 1 : nEvents);
      +        if ( expectMerge ) {
      +            final VariantContext vc = hap.getEventMap().getVariantContexts().iterator().next();
      +            Assert.assertTrue(vc.isBiallelic());
      +            Assert.assertEquals(vc.getReference().getDisplayString(), expectedMergedRef);
      +            Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), expectedMergedAlt);
      +        }
      +    }
      +
      +    @Test
      +    public void testR2MergerWithThirdHapWithoutEvent() {
      +        final String refS = "ACGT";
      +        final String hapS = "CCGA";
      +        final String cigar = "4M";
      +        final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M"));
      +        final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
      +        final Haplotype hap2 = new Haplotype("ACGA".getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
      +        final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length());
      +
      +        final List haplotypes = Arrays.asList(ref, hap1, hap2);
      +        final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false);
      +        final MockLDCalculator r2Calc = new MockLDCalculator(1.0);
      +
      +        Assert.assertEquals(vcStarts.size(), 2);
      +        final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc);
      +        Assert.assertEquals(merged, true);
      +        Assert.assertEquals(vcStarts.size(), 1);
      +
      +        final VariantContext vc = hap1.getEventMap().getVariantContexts().iterator().next();
      +        Assert.assertTrue(vc.isBiallelic());
      +        Assert.assertEquals(vc.getReference().getDisplayString(), "ACGT");
      +        Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), "CCGA");
      +
      +        Assert.assertEquals(hap2.getEventMap().size(), 0);
      +    }
      +
      +    @Test
      +    public void testR2MergerWithMultipleAllelesAtSites() {
      +        final String refS = "ACGT";
      +        final String hapS = "TCGA";
      +        final String cigar = "4M";
      +        final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M"));
      +        final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
      +
      +        final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length());
      +        for (final String hap2S : Arrays.asList("GCGA", "TCGG")) {
      +            final Haplotype hap2 = new Haplotype(hap2S.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
      +
      +            final List haplotypes = Arrays.asList(ref, hap1, hap2);
      +            final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false);
      +            final MockLDCalculator r2Calc = new MockLDCalculator(1.0);
      +
      +            Assert.assertEquals(vcStarts.size(), 2);
      +            final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc);
      +            Assert.assertEquals(merged, false);
      +            Assert.assertEquals(vcStarts.size(), 2);
      +        }
      +    }
      +}
      \ No newline at end of file
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java
      new file mode 100644
      index 000000000..91a2988aa
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java
      @@ -0,0 +1,302 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.haplotypeBAMWriter;
      +
      +import net.sf.samtools.*;
      +import org.broadinstitute.sting.BaseTest;
      +import org.broadinstitute.sting.utils.haplotype.Haplotype;
      +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
      +import org.broadinstitute.sting.utils.Utils;
      +import org.broadinstitute.sting.utils.sam.AlignmentUtils;
      +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
      +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.Collections;
      +import java.util.List;
      +
      +public class HaplotypeBAMWriterUnitTest extends BaseTest {
      +    private final static boolean DEBUG = false;
      +    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
      +
      +    private GATKSAMRecord makeRead(final String baseString) {
      +        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, 10);
      +        final byte[] bases = baseString.getBytes();
      +        read.setReadBases(bases.clone());
      +        read.setBaseQualities(Utils.dupBytes((byte)30, read.getReadLength()));
      +        return read;
      +    }
      +
      +    private Haplotype makeHaplotype(final String bases, final String cigar) {
      +        final Haplotype hap = new Haplotype(bases.getBytes());
      +        hap.setCigar(TextCigarCodec.getSingleton().decode(cigar));
      +        return hap;
      +    }
      +
      +    private static class MockBAMWriter implements SAMFileWriter {
      +        @Override
      +        public void addAlignment(SAMRecord alignment) {
+            // Intentionally a no-op: this mock silently discards all alignments written to it.
      +        }
      +
      +        @Override
      +        public SAMFileHeader getFileHeader() {
+            return null;  // No header is needed by the tests using this mock.
      +        }
      +
      +        @Override
      +        public void close() {
+            // Intentionally a no-op: this mock holds no resources to release.
      +        }
      +    }
      +
      +    @Test
      +    public void testCreate() throws Exception {
      +        final SAMFileWriter writer = new MockBAMWriter();
      +        Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, writer) instanceof CalledHaplotypeBAMWriter);
      +        Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, writer) instanceof AllHaplotypeBAMWriter);
      +    }
      +
      +
      +    //////////////////////////////////////////
      +    // Test HaplotypeBAMWriter.createReadAlignedToRef() //
      +    //////////////////////////////////////////
      +
      +    @DataProvider(name = "ReadAlignedToRefData")
      +    public Object[][] makeReadAlignedToRefData() {
      +        List tests = new ArrayList();
      +
      +        final String hapBases = "ACTGAAGGTTCC";
      +        final Haplotype allM = makeHaplotype(hapBases, hapBases.length() + "M");
      +
      +        // make sure we get back a cigar of the right length
      +        for ( int i = -1; i < hapBases.length(); i++ ) {
      +            final GATKSAMRecord read = makeRead(hapBases);
      +            if ( i != -1 ) read.getReadBases()[i] = (byte)'A';
      +            tests.add(new Object[]{read, allM, 10, 10, allM.getCigar().toString()});
      +        }
      +
      +        // make sure insertions at the front are correctly handled
      +        for ( int padFront = 1; padFront < 10; padFront++ ) {
      +            final GATKSAMRecord read = makeRead(Utils.dupString("N", padFront) + hapBases);
      +            tests.add(new Object[]{read, allM, 10, 10, padFront + "I" + allM.getCigar().toString()});
      +        }
      +
      +        // make sure insertions at the back are correctly handled
      +        for ( int padBack = 1; padBack < 10; padBack++ ) {
      +            final GATKSAMRecord read = makeRead(hapBases + Utils.dupString("N", padBack));
      +            tests.add(new Object[]{read, allM, 10, 10, allM.getCigar().toString() + padBack + "I"});
      +        }
      +
      +        // make sure refStart and hapStart are respected
      +        for ( int refStart = 1; refStart < 10; refStart++ ) {
      +            for ( int hapStart = refStart; hapStart < 10 + refStart; hapStart++ ) {
      +                final Haplotype hap = new Haplotype(allM.getBases());
      +                hap.setCigar(allM.getCigar());
      +                hap.setAlignmentStartHapwrtRef(hapStart);
      +
      +                final GATKSAMRecord read = makeRead(new String(hap.getBases()));
      +                tests.add(new Object[]{read, hap, refStart, refStart + hapStart, allM.getCigar().toString()});
      +            }
      +        }
      +
      +        // example case of bad alignment because SW doesn't necessarily left-align indels
      +        {
      +            final String hap = "ACTGTGGGTTCCTCTTATTTTATTTCTACATCAATGTTCATATTTAACTTATTATTTTATCTTATTTTTAAATTTCTTTTATGTTGAGCCTTGATGAAAGCCATAGGTTCTCTCATATAATTGTATGTGTATGTATGTATATGTACATAATATATACATATATGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATATATACATATATG";
      +            final String hapCigar = "399M";
      +            final String readBases = "ATGTACATAATATATACATATATGTATATGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATAT";
      +            final GATKSAMRecord read = makeRead(readBases);
      +            final int refStart = 10130100;
      +            final int hapStart = 500;
      +            final String badCigar = "31M6D211M";
      +            final String goodCigar = "28M6D214M";
      +            final Haplotype badHap = new Haplotype(hap.getBytes());
      +            badHap.setCigar(TextCigarCodec.getSingleton().decode(hapCigar));
      +            badHap.setAlignmentStartHapwrtRef(hapStart);
      +
      +            final int expectedPos = 10130740;
      +            tests.add(new Object[]{read, badHap, refStart, expectedPos, goodCigar});
      +        }
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +
      +
      +    @Test(dataProvider = "ReadAlignedToRefData", enabled = true)
      +    public void testReadAlignedToRef(final GATKSAMRecord read, final Haplotype haplotype, final int refStart, final int expectedReadStart, final String expectedReadCigar) throws Exception {
      +        final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter());
      +        final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone();
      +
      +        if ( expectedReadCigar == null ) {
      +            Assert.assertNull(writer.createReadAlignedToRef(read, haplotype, refStart));
      +        } else {
      +            final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedReadCigar);
      +            final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, refStart);
      +
      +            Assert.assertEquals(alignedRead.getReadName(), originalReadCopy.getReadName());
      +            Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart);
      +            Assert.assertEquals(alignedRead.getReadBases(), originalReadCopy.getReadBases());
      +            Assert.assertEquals(alignedRead.getBaseQualities(), originalReadCopy.getBaseQualities());
      +            Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart);
      +            Assert.assertEquals(alignedRead.getCigar(), expectedCigar);
      +            Assert.assertNotNull(alignedRead.getAttribute("HC"));
      +        }
      +
      +        Assert.assertEquals(read, originalReadCopy, "createReadAlignedToRef seems be modifying the original read!");
      +    }
      +
      +    private static class Mutation implements Comparable {
      +        int pos, len;
      +        CigarOperator operator;
      +
      +        private Mutation(int pos, int len, CigarOperator operator) {
      +            this.pos = pos;
      +            this.len = len;
      +            this.operator = operator;
      +        }
      +        public int getNMismatches() { return len; }
      +
      +        @Override
      +        public int compareTo(Mutation o) {
      +            return Integer.valueOf(pos).compareTo(o.pos);
      +        }
      +
      +        private String apply(final String seq) {
      +            switch ( operator ) {
      +                case M:
      +                    final byte[] bases = seq.getBytes();
      +                    if ( pos < seq.length() )
      +                        bases[pos] = (byte)(bases[pos] == 'A' ? 'C' : 'A');
      +                    return new String(bases);
      +                case I: {
      +                    final String prefix = seq.substring(0, pos);
      +                    final String postfix = seq.substring(pos, seq.length());
      +                    return prefix + "GTCAGTTA".substring(0, len) + postfix;
      +                } case D: {
      +                    final String prefix = seq.substring(0, pos);
      +                    final String postfix = seq.substring(pos + len, seq.length());
      +                    return prefix + postfix;
      +                }default:
      +                    throw new IllegalStateException("Unexpected operator " + operator);
      +            }
      +        }
      +    }
      +
      +    private static class MutatedSequence {
      +        int numMismatches;
      +        String seq;
      +
      +        private MutatedSequence(int numMismatches, String seq) {
      +            this.numMismatches = numMismatches;
      +            this.seq = seq;
      +        }
      +    }
      +
      +    private MutatedSequence mutateSequence(final String hapIn, final List mutations) {
      +        Collections.sort(mutations);
      +        int mismatches = 0;
      +        String hap = hapIn;
      +        for ( final Mutation mut : mutations ) {
      +            hap = mut.apply(hap);
      +            mismatches += mut.getNMismatches();
      +        }
      +        return new MutatedSequence(mismatches, hap);
      +    }
      +
      +    @DataProvider(name = "ComplexReadAlignedToRef")
      +    public Object[][] makeComplexReadAlignedToRef() {
      +        List tests = new ArrayList();
      +
      +        final List allMutations = Arrays.asList(
      +                new Mutation(1, 1, CigarOperator.M),
      +                new Mutation(2, 1, CigarOperator.M),
      +                new Mutation(3, 1, CigarOperator.I),
      +                new Mutation(7, 1, CigarOperator.D)
      +        );
      +
      +        int i = 0;
      +        final String referenceBases  = "ACTGACTGACTG";
      +        final String paddedReference = "NNNN" + referenceBases + "NNNN";
      +        for ( final List mutations : Utils.makePermutations(allMutations, 3, false) ) {
      +            final MutatedSequence hap = mutateSequence(referenceBases, mutations);
      +            final Haplotype haplotype = new Haplotype(hap.seq.getBytes());
      +            final SWPairwiseAlignment align = new SWPairwiseAlignment(paddedReference.getBytes(), hap.seq.getBytes());
      +            haplotype.setAlignmentStartHapwrtRef(align.getAlignmentStart2wrt1());
      +            haplotype.setCigar(align.getCigar());
      +
      +            for ( final List readMutations : Utils.makePermutations(allMutations, 3, false) ) {
      +                final MutatedSequence readBases = mutateSequence(hap.seq, readMutations);
      +                final GATKSAMRecord read = makeRead(readBases.seq);
      +                tests.add(new Object[]{i++, read, paddedReference, haplotype, hap.numMismatches + readBases.numMismatches});
      +            }
      +        }
      +
      +        // for convenient testing of a single failing case
      +        //tests.add(new Object[]{makeRead("ACCGGGACTGACTG"), reference, makeHaplotype("AAAGGACTGACTG", "1M1I11M"), 2});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +
      +    @Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG)
      +    public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception {
      +        final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter());
      +        final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, 1);
      +        if ( alignedRead != null ) {
      +            final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches;
      +            Assert.assertTrue(mismatches <= expectedMaxMismatches,
      +                    "Alignment of read to ref looks broken.  Expected at most " + expectedMaxMismatches + " but saw " + mismatches
      +                            + " for readBases " + new String(read.getReadBases()) + " with cigar " + read.getCigar() + " reference " + reference + " haplotype "
      +                            + haplotype + " with cigar " + haplotype.getCigar() + " aligned read cigar " + alignedRead.getCigarString() + " @ " + alignedRead.getAlignmentStart());
      +        }
      +    }
      +}
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java
      index 555c02cde..f9a4985b0 100644
      --- a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java
      @@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
                   for ( final int nct : Arrays.asList(1, 2) ) {
       //                tests.add(new Object[]{ "SNP",   "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
       ////                tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
      -                tests.add(new Object[]{ "BOTH",  "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct });
      +                tests.add(new Object[]{ "BOTH",  "aad3a398273ec795e363268997247bd8", nt, nct });
                   }
       
               return tests.toArray(new Object[][]{});
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java
      index 9de562aa5..2499183a6 100644
      --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java
      +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java
      @@ -70,7 +70,7 @@ public class PairHMMUnitTest extends BaseTest {
           final static boolean EXTENSIVE_TESTING = true;
           final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
           final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
      -    final PairHMM loglessHMM = new LoglessCachingPairHMM();
      +    final PairHMM loglessHMM = new LoglessPairHMM();
       
           private List getHMMs() {
               return Arrays.asList(exactHMM, originalHMM, loglessHMM);
      @@ -82,11 +82,12 @@ public class PairHMMUnitTest extends BaseTest {
           //
           // --------------------------------------------------------------------------------
       
      -    private class BasicLikelihoodTestProvider extends TestDataProvider {
      +    private class BasicLikelihoodTestProvider {
               final String ref, read;
               final byte[] refBasesWithContext, readBasesWithContext;
               final int baseQual, insQual, delQual, gcp;
               final int expectedQual;
      +        final boolean left, right;
               final static String CONTEXT = "ACGTAATGACGATTGCA";
               final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
               final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
      @@ -96,7 +97,6 @@ public class PairHMMUnitTest extends BaseTest {
               }
       
               public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
      -            super(BasicLikelihoodTestProvider.class, String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
                   this.baseQual = baseQual;
                   this.delQual = delQual;
                   this.insQual = insQual;
      @@ -104,18 +104,24 @@ public class PairHMMUnitTest extends BaseTest {
                   this.read = read;
                   this.ref = ref;
                   this.expectedQual = expectedQual;
      +            this.left = left;
      +            this.right = right;
       
                   refBasesWithContext = asBytes(ref, left, right);
                   readBasesWithContext = asBytes(read, false, false);
               }
       
      -        public double expectedLogL(final PairHMM hmm) {
      -            return (expectedQual / -10.0) + 0.03 +
      -                    hmm.getNPotentialXStartsLikelihoodPenaltyLog10(refBasesWithContext.length, readBasesWithContext.length);
      +        @Override
      +        public String toString() {
      +            return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual);
      +        }
      +
      +        public double expectedLogL() {
      +            return (expectedQual / -10.0) + 0.03 + Math.log10(1.0/refBasesWithContext.length);
               }
       
               public double getTolerance(final PairHMM hmm) {
      -            if ( hmm instanceof LoglessCachingPairHMM )
      +            if ( hmm instanceof LoglessPairHMM)
                       return toleranceFromExact();
                   if ( hmm instanceof Log10PairHMM ) {
                       return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? toleranceFromExact() : toleranceFromReference();
      @@ -140,10 +146,10 @@ public class PairHMMUnitTest extends BaseTest {
                   return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
                           refBasesWithContext, readBasesWithContext,
                           qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
      -                    qualAsBytes(gcp, false, anchorIndel), 0, true);
      +                    qualAsBytes(gcp, false, anchorIndel), true);
               }
       
      -        private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
      +        private byte[] asBytes(final String bases, final boolean left, final boolean right) {
                   return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
               }
       
      @@ -156,7 +162,7 @@ public class PairHMMUnitTest extends BaseTest {
       
                       // update just the bases corresponding to the provided micro read with the quality scores
                       if( doGOP ) {
      -                    phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
      +                    phredQuals[CONTEXT.length()] = (byte)phredQual;
                       } else {
                           for ( int i = 0; i < read.length(); i++)
                               phredQuals[i + CONTEXT.length()] = (byte)phredQual;
      @@ -178,6 +184,8 @@ public class PairHMMUnitTest extends BaseTest {
               final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
               final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
       
      +        final List tests = new ArrayList();
      +
               for ( final int baseQual : baseQuals ) {
                   for ( final int indelQual : indelQuals ) {
                       for ( final int gcp : gcps ) {
      @@ -188,7 +196,7 @@ public class PairHMMUnitTest extends BaseTest {
                                   final String ref  = new String(new byte[]{refBase});
                                   final String read = new String(new byte[]{readBase});
                                   final int expected = refBase == readBase ? 0 : baseQual;
      -                            new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
      +                            tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)});
                               }
                           }
       
      @@ -204,10 +212,10 @@ public class PairHMMUnitTest extends BaseTest {
                                       final String ref = insertionP ? small : big;
                                       final String read = insertionP ? big : small;
       
      -                                new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
      -                                new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
      -                                new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
      -                                new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
      +                                tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)});
      +                                tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false)});
      +                                tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true)});
      +                                tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true)});
                                   }
                               }
                           }
      @@ -215,7 +223,7 @@ public class PairHMMUnitTest extends BaseTest {
                   }
               }
       
      -        return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
      +        return tests.toArray(new Object[][]{});
           }
       
           @DataProvider(name = "OptimizedLikelihoodTestProvider")
      @@ -227,6 +235,8 @@ public class PairHMMUnitTest extends BaseTest {
               final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
               final List sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
       
      +        final List tests = new ArrayList();
      +
               for ( final int baseQual : baseQuals ) {
                   for ( final int indelQual : indelQuals ) {
                       for ( final int gcp : gcps ) {
      @@ -243,14 +253,14 @@ public class PairHMMUnitTest extends BaseTest {
       
                                   for ( final boolean leftFlank : Arrays.asList(true, false) )
                                       for ( final boolean rightFlank : Arrays.asList(true, false) )
      -                                    new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank);
      +                                    tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank)});
                               }
                           }
                       }
                   }
               }
       
      -        return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
      +        return tests.toArray(new Object[][]{});
           }
       
           @Test(enabled = !DEBUG, dataProvider = "BasicLikelihoodTestProvider")
      @@ -258,8 +268,8 @@ public class PairHMMUnitTest extends BaseTest {
               if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) {
                   final double exactLogL = cfg.calcLogL( exactHMM, true );
                   for ( final PairHMM hmm : getHMMs() ) {
      -                double actualLogL = cfg.calcLogL( hmm, true );
      -                double expectedLogL = cfg.expectedLogL(hmm);
      +                final double actualLogL = cfg.calcLogL( hmm, true );
      +                final double expectedLogL = cfg.expectedLogL();
       
                       // compare to our theoretical expectation with appropriate tolerance
                       Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm);
      @@ -273,10 +283,10 @@ public class PairHMMUnitTest extends BaseTest {
           @Test(enabled = !DEBUG, dataProvider = "OptimizedLikelihoodTestProvider")
           public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
               if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) {
      -            double exactLogL = cfg.calcLogL( exactHMM, false );
      +            final double exactLogL = cfg.calcLogL( exactHMM, false );
       
                   for ( final PairHMM hmm : getHMMs() ) {
      -                double calculatedLogL = cfg.calcLogL( hmm, false );
      +                final double calculatedLogL = cfg.calcLogL( hmm, false );
                       // compare to the exact reference implementation with appropriate tolerance
                       Assert.assertEquals(calculatedLogL, exactLogL, cfg.getTolerance(hmm), String.format("Test: logL calc=%.2f expected=%.2f for %s with hmm %s", calculatedLogL, exactLogL, cfg.toString(), hmm));
                       Assert.assertTrue(MathUtils.goodLog10Probability(calculatedLogL), "Bad log10 likelihood " + calculatedLogL);
      @@ -286,65 +296,56 @@ public class PairHMMUnitTest extends BaseTest {
       
           @Test(enabled = !DEBUG)
           public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() {
      -        byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
      -
      +        final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
      +        final byte matchQual = 90;
      +        final byte mismatchQual = 20;
      +        final byte indelQual = 80;
               final int offset = 2;
      -        byte[] gop = new byte[haplotype1.length - 2 * offset];
      -        Arrays.fill(gop, (byte) 80);
      -        byte[] gcp = new byte[haplotype1.length - 2 * offset];
      -        Arrays.fill(gcp, (byte) 80);
      +        final byte[] gop = new byte[haplotype1.length - 2 * offset];
      +        Arrays.fill(gop, indelQual);
      +        final byte[] gcp = new byte[haplotype1.length - 2 * offset];
      +        Arrays.fill(gcp, indelQual);
      +        loglessHMM.initialize(gop.length, haplotype1.length);
       
               for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) {
      -            byte[] quals = new byte[haplotype1.length - 2 * offset];
      -            Arrays.fill(quals, (byte) 90);
      -            // one read mismatches the haplotype
      -            quals[k] = 20;
      -
      -            byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
      +            final byte[] quals = new byte[haplotype1.length - 2 * offset];
      +            Arrays.fill(quals, matchQual);
      +            // one base mismatches the haplotype
      +            quals[k] = mismatchQual;
      +            final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
                   // change single base at position k to C. If it's a C, change to T
                   mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
      -            originalHMM.initialize(mread.length, haplotype1.length);
      -            double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
      -                    haplotype1, mread,
      -                    quals, gop, gop,
      -                    gcp, 0, false);
      -
      -            System.out.format("H:%s\nR:  %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
      -
      -            // - log10 is because of number of start positions
      -            Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2);
      +            final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false);
      +            final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual));
      +            Assert.assertEquals(res1, expected, 1e-2);
               }
           }
       
           @Test(enabled = ! DEBUG)
           public void testMismatchInEveryPositionInTheRead() {
      -        byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
      +        final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
      +        final byte matchQual = 90;
      +        final byte mismatchQual = 20;
      +        final byte indelQual = 80;
       
               final int offset = 2;
      -        byte[] gop = new byte[haplotype1.length - offset];
      -        Arrays.fill(gop, (byte) 80);
      -        byte[] gcp = new byte[haplotype1.length - offset];
      -        Arrays.fill(gcp, (byte) 80);
      +        final byte[] gop = new byte[haplotype1.length - offset];
      +        Arrays.fill(gop, indelQual);
      +        final byte[] gcp = new byte[haplotype1.length - offset];
      +        Arrays.fill(gcp, indelQual);
      +        loglessHMM.initialize(gop.length, haplotype1.length);
       
               for( int k = 0; k < haplotype1.length - offset; k++ ) {
      -            byte[] quals = new byte[haplotype1.length - offset];
      -            Arrays.fill(quals, (byte) 90);
      -            // one read mismatches the haplotype
      -            quals[k] = 20;
      -
      -            byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
      +            final byte[] quals = new byte[haplotype1.length - offset];
      +            Arrays.fill(quals, matchQual);
      +            // one base mismatches the haplotype with low qual
      +            quals[k] = mismatchQual;
      +            final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
                   // change single base at position k to C. If it's a C, change to T
                   mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
      -            originalHMM.initialize(mread.length, haplotype1.length);
      -            double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
      -                    haplotype1, mread,
      -                    quals, gop, gop,
      -                    gcp, 0, false);
      -
      -            System.out.format("H:%s\nR:  %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
      -
      -            // - log10 is because of number of start positions
      -            Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2);
      +            final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false);
      +            final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual));
      +            Assert.assertEquals(res1, expected, 1e-2);
               }
           }
       
      @@ -366,37 +367,43 @@ public class PairHMMUnitTest extends BaseTest {
       
           @Test(enabled = !DEBUG, dataProvider = "HMMProvider")
           void testMultipleReadMatchesInHaplotype(final PairHMM hmm, final int readSize, final int refSize) {
      -        byte[] readBases =  Utils.dupBytes((byte)'A', readSize);
      -        byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes();
      -        byte baseQual = 20;
      -        byte insQual = 37;
      -        byte delQual = 37;
      -        byte gcp = 10;
      +        final byte[] readBases =  Utils.dupBytes((byte)'A', readSize);
      +        final byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes();
      +        final byte baseQual = 20;
      +        final byte insQual = 37;
      +        final byte delQual = 37;
      +        final byte gcp = 10;
               hmm.initialize(readBases.length, refBases.length);
      -        double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      +        final double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
                       Utils.dupBytes(baseQual, readBases.length),
                       Utils.dupBytes(insQual, readBases.length),
                       Utils.dupBytes(delQual, readBases.length),
      -                Utils.dupBytes(gcp, readBases.length), 0, true);
      +                Utils.dupBytes(gcp, readBases.length), true);
               Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d);
           }
       
           @Test(enabled = !DEBUG, dataProvider = "HMMProvider")
           void testAllMatchingRead(final PairHMM hmm, final int readSize, final int refSize) {
      -        byte[] readBases =  Utils.dupBytes((byte)'A', readSize);
      -        byte[] refBases = Utils.dupBytes((byte)'A', refSize);
      -        byte baseQual = 20;
      -        byte insQual = 100;
      -        byte delQual = 100;
      -        byte gcp = 100;
      +        final byte[] readBases =  Utils.dupBytes((byte)'A', readSize);
      +        final byte[] refBases = Utils.dupBytes((byte)'A', refSize);
      +        final byte baseQual = 20;
      +        final byte insQual = 100;
      +        final byte delQual = 100;
      +        final byte gcp = 100;
               hmm.initialize(readBases.length, refBases.length);
               double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
                       Utils.dupBytes(baseQual, readBases.length),
                       Utils.dupBytes(insQual, readBases.length),
                       Utils.dupBytes(delQual, readBases.length),
      -                Utils.dupBytes(gcp, readBases.length), 0, true);
      -        final double expected = Math.log10(Math.pow(1.0 - QualityUtils.qualToErrorProb(baseQual), readBases.length));
      -        Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read");
      +                Utils.dupBytes(gcp, readBases.length), true);
      +        double expected =  0;
      +        final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length;
      +        if (readBases.length < refBases.length) {
      +            expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), readBases.length));
      +        } else if (readBases.length > refBases.length) {
      +            expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), refBases.length) * Math.pow(QualityUtils.qualToErrorProb(insQual), readBases.length - refBases.length));
      +        }
      +        Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read " + String.format("readSize=%d refSize=%d", readSize, refSize));
           }
       
           @DataProvider(name = "HMMProviderWithBigReads")
      @@ -423,45 +430,42 @@ public class PairHMMUnitTest extends BaseTest {
       
           @Test(enabled = !DEBUG, dataProvider = "HMMProviderWithBigReads")
           void testReallyBigReads(final PairHMM hmm, final String read, final String ref) {
      -        byte[] readBases =  read.getBytes();
      -        byte[] refBases = ref.getBytes();
      -        byte baseQual = 30;
      -        byte insQual = 40;
      -        byte delQual = 40;
      -        byte gcp = 10;
      +        final byte[] readBases =  read.getBytes();
      +        final byte[] refBases = ref.getBytes();
      +        final byte baseQual = 30;
      +        final byte insQual = 40;
      +        final byte delQual = 40;
      +        final byte gcp = 10;
               hmm.initialize(readBases.length, refBases.length);
      -        double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      +        hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
                       Utils.dupBytes(baseQual, readBases.length),
                       Utils.dupBytes(insQual, readBases.length),
                       Utils.dupBytes(delQual, readBases.length),
      -                Utils.dupBytes(gcp, readBases.length), 0, true);
      -        Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + read.length() + " bases and ref with " + ref.length() + " bases");
      +                Utils.dupBytes(gcp, readBases.length), true);
           }
       
           @Test(enabled = !DEBUG)
           void testPreviousBadValue() {
      -        byte[] readBases = "A".getBytes();
      -        byte[] refBases =  "AT".getBytes();
      -        byte baseQual = 30;
      -        byte insQual = 40;
      -        byte delQual = 40;
      -        byte gcp = 10;
      +        final byte[] readBases = "A".getBytes();
      +        final byte[] refBases =  "AT".getBytes();
      +        final byte baseQual = 30;
      +        final byte insQual = 40;
      +        final byte delQual = 40;
      +        final byte gcp = 10;
       
               exactHMM.initialize(readBases.length, refBases.length);
      -        double d = exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      +        exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
                       Utils.dupBytes(baseQual, readBases.length),
                       Utils.dupBytes(insQual, readBases.length),
                       Utils.dupBytes(delQual, readBases.length),
      -                Utils.dupBytes(gcp, readBases.length), 0, true);
      -        //exactHMM.dumpMatrices();
      +                Utils.dupBytes(gcp, readBases.length), true);
       
               loglessHMM.initialize(readBases.length, refBases.length);
      -        double logless = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      +        loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
                       Utils.dupBytes(baseQual, readBases.length),
                       Utils.dupBytes(insQual, readBases.length),
                       Utils.dupBytes(delQual, readBases.length),
      -                Utils.dupBytes(gcp, readBases.length), 0, true);
      -        loglessHMM.dumpMatrices();
      +                Utils.dupBytes(gcp, readBases.length), true);
           }
       
           @DataProvider(name = "JustHMMProvider")
      @@ -477,25 +481,16 @@ public class PairHMMUnitTest extends BaseTest {
       
           @Test(enabled = !DEBUG, dataProvider = "JustHMMProvider")
           void testMaxLengthsBiggerThanProvidedRead(final PairHMM hmm) {
      +        final byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes();
      +        final byte[] refBases =  "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes();
      +
      +        final byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29};
      +        final byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45};
      +        final byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43};
      +        final byte[] gcp = Utils.dupBytes((byte) 10, delQual.length);
      +        hmm.initialize(readBases.length + 100, refBases.length + 100);
               for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) {
      -            byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes();
      -            byte[] refBases =  "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes();
      -            byte gcp = 10;
      -
      -            byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29};
      -            byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45};
      -            byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43};
      -
      -            final int maxHaplotypeLength = refBases.length + nExtraMaxSize;
      -            final int maxReadLength = readBases.length + nExtraMaxSize;
      -
      -            hmm.initialize(maxReadLength, maxHaplotypeLength);
      -            double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      -                    quals,
      -                    insQual,
      -                    delQual,
      -                    Utils.dupBytes(gcp, readBases.length), 0, true);
      -            Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + readBases.length + " bases and ref with " + refBases.length + " bases");
      +            hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true);
               }
           }
       
      @@ -558,9 +553,7 @@ public class PairHMMUnitTest extends BaseTest {
               final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length);
               final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length);
               final byte[] gcp = Utils.dupBytes((byte)10, readBases.length);
      -        double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(
      -                hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp,
      -                hapStart, recache);
      +        double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache);
               Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart);
               return d;
           }
      @@ -599,7 +592,7 @@ public class PairHMMUnitTest extends BaseTest {
           public Object[][] makeUninitializedHMMs() {
               List tests = new ArrayList();
       
      -        tests.add(new Object[]{new LoglessCachingPairHMM()});
      +        tests.add(new Object[]{new LoglessPairHMM()});
               tests.add(new Object[]{new Log10PairHMM(true)});
       
               return tests.toArray(new Object[][]{});
      @@ -613,7 +606,7 @@ public class PairHMMUnitTest extends BaseTest {
       
               // didn't call initialize => should exception out
               double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      -                baseQuals, baseQuals, baseQuals, baseQuals, 0, true);
      +                baseQuals, baseQuals, baseQuals, baseQuals, true);
           }
       
           @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider")
      @@ -624,7 +617,7 @@ public class PairHMMUnitTest extends BaseTest {
       
               hmm.initialize(3, 3);
               double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      -                baseQuals, baseQuals, baseQuals, baseQuals, 0, true);
      +                baseQuals, baseQuals, baseQuals, baseQuals, true);
           }
       
           @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider")
      @@ -635,6 +628,6 @@ public class PairHMMUnitTest extends BaseTest {
       
               hmm.initialize(2, 3);
               double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
      -                baseQuals, baseQuals, baseQuals, baseQuals, 0, true);
      +                baseQuals, baseQuals, baseQuals, baseQuals, true);
           }
       }
      \ No newline at end of file
      diff --git a/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java
      new file mode 100644
      index 000000000..c8fc458e8
      --- /dev/null
      +++ b/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java
      @@ -0,0 +1,120 @@
      +/*
      +*  By downloading the PROGRAM you agree to the following terms of use:
      +*  
      +*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
      +*  
      +*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
      +*  
      +*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
      +*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
      +*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
      +*  
      +*  1. DEFINITIONS
      +*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
      +*  
      +*  2. LICENSE
      +*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
      +*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
      +*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
      +*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.  
      +*  
      +*  3. OWNERSHIP OF INTELLECTUAL PROPERTY 
      +*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
      +*  Copyright 2012 Broad Institute, Inc.
      +*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
      +*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
      +*  
      +*  4. INDEMNIFICATION
      +*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
      +*  
      +*  5. NO REPRESENTATIONS OR WARRANTIES
      +*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
      +*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
      +*  
      +*  6. ASSIGNMENT
      +*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
      +*  
      +*  7. MISCELLANEOUS
      +*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
      +*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
      +*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
      +*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt. 
      +*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
      +*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
      +*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
      +*/
      +
      +package org.broadinstitute.sting.utils.smithwaterman;
      +
      +import org.broadinstitute.sting.BaseTest;
      +import org.testng.Assert;
      +import org.testng.annotations.DataProvider;
      +import org.testng.annotations.Test;
      +
      +import java.util.ArrayList;
      +import java.util.List;
      +
      +public class SWPairwiseAlignmentUnitTest extends BaseTest {
      +    @DataProvider(name = "ComplexReadAlignedToRef")
      +    public Object[][] makeComplexReadAlignedToRef() {
      +        List tests = new ArrayList();
      +
      +        final String ref1     = "ACTGACTGACTG";
      +        tests.add(new Object[]{"AAAGGACTGACTG", ref1, 1, "12M"});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "ComplexReadAlignedToRef", enabled = true)
      +    public void testReadAlignedToRefComplexAlignment(final String reference, final String read, final int expectedStart, final String expectedCigar) {
      +        final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes());
      +        Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
      +        Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
      +    }
      +
      +    @DataProvider(name = "OddNoAlignment")
      +    public Object[][] makeOddNoAlignment() {
      +        List tests = new ArrayList();
      +
      +        final String ref1     = "AAAGACTACTG";
      +        final String read1    = "AACGGACACTG";
      +        tests.add(new Object[]{ref1, read1, 5.0, -10.0, -22.0, -1.2, 1, "2M2I3M1D4M"});
      +        tests.add(new Object[]{ref1, read1, 20.0, -5.0, -30.0, -2.2, 0, "11M"});
      +
      +        return tests.toArray(new Object[][]{});
      +    }
      +
      +    @Test(dataProvider = "OddNoAlignment", enabled = true)
      +    public void testOddNoAlignment(final String reference, final String read, final double match, final double mismatch, final double gap, final double gap_extend,
      +                                   final int expectedStart, final String expectedCigar) {
      +        final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), match, mismatch, gap, gap_extend);
      +        Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
      +        Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testIndelsAtStartAndEnd() {
      +        final String match     = "CCCCC";
      +        final String reference = "AAA" + match;
      +        final String read      = match + "GGG";
      +        final int expectedStart = 3;
      +        final String expectedCigar = "5M3S";
      +        final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes());
      +        sw.printAlignment(reference.getBytes(), read.getBytes());
      +        Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
      +        Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
      +    }
      +
      +    @Test(enabled = true)
      +    public void testDegenerateAlignmentWithIndelsAtBothEnds() {
      +        logger.warn("testDegenerateAlignmentWithIndelsAtBothEnds");
      +        final String ref = "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA";
      +        final String alt =               "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA";
      +        final int expectedStart = 14;
      +        final String expectedCigar = "31M20S";
      +        final SWPairwiseAlignment sw = new SWPairwiseAlignment(ref.getBytes(), alt.getBytes(), SWParameterSet.STANDARD_NGS);
      +        sw.printAlignment(ref.getBytes(), alt.getBytes());
      +        Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
      +        Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
      +    }
      +}
      diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd
      index dc7a08287..4a49cf932 100644
      --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd
      +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd
      @@ -19,9 +19,11 @@ Medical and Population Genetics Program
       Maintainer: Kiran Garimella
       }
       \references{
      -GSA wiki page: http://www.broadinstitute.org/gatk
      +GATK website: http://www.broadinstitute.org/gatk
       
      -GATK help forum: http://www.broadinstitute.org/gatk
      +GATK documentation guide: http://www.broadinstitute.org/gatk/guide
      +
      +GATK help forum: http://gatkforums.broadinstitute.org
       }
       \examples{
       ## get script arguments in interactive and non-interactive mode
      diff --git a/public/doc/README b/public/doc/README
      index ec5fa8500..e70ced0df 100644
      --- a/public/doc/README
      +++ b/public/doc/README
      @@ -59,7 +59,7 @@ index (.fasta.fai).
       
       Instructions for preparing input files are available here:
       
      -http://www.broadinstitute.org/gsa/wiki/index.php/Preparing_input_files
      +http://www.broadinstitute.org/gatk/guide/article?id=1204
       
       The bundled 'resources' directory  contains an example BAM and fasta.
       
      @@ -69,7 +69,7 @@ The GATK is distributed with a few standard analyses, including PrintReads,
       Pileup, and DepthOfCoverage.  More information on the included walkers is
       available here:
       
      -http://www.broadinstitute.org/gsa/wiki/index.php/Built-in_walkers
      +http://www.broadinstitute.org/gatk/gatkdocs
       
       To print the reads of the included sample data, untar the package into
       the GenomeAnalysisTK directory and run the following command:
      @@ -81,6 +81,6 @@ java -jar GenomeAnalysisTK/GenomeAnalysisTK.jar \
       
       Support
       -------
      -Documentation for the GATK is available at http://www.broadinstitute.org/gsa/wiki.  
      +Documentation for the GATK is available at http://www.broadinstitute.org/gatk/guide.
       For help using the GATK, developing analyses with the GATK, bug reports, 
      -or feature requests, please email gsadevelopers@broadinstitute.org.
      +or feature requests, please visit our support forum at http://gatkforums.broadinstitute.org/
      diff --git a/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java b/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java
      index 93b4d5e6f..d313f35ce 100644
      --- a/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java
      +++ b/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java
      @@ -42,9 +42,14 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       import java.util.Iterator;
       
       /**
      - * Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them
      + * Validates consistency of the aligner interface
      + *
      + * 

      Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them * of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original - * alignment from the input file. + * alignment from the input file.

      + * + *

      Caveat

      + *

      This tool requires that BWA be available on the java path.

      * * @author mhanna * @version 0.1 diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java index 828f10fcb..e354601da 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java @@ -276,10 +276,10 @@ public class ArgumentMatch implements Iterable { * @return A collection of the string representation of these value. */ public List values() { - List values = new ArrayList(); - for( ArgumentMatchSite site: sites.keySet() ) { - if( sites.get(site) != null ) - values.addAll(sites.get(site)); + final List values = new ArrayList(); + for ( final List siteValue : sites.values() ) { + if ( siteValue != null ) + values.addAll(siteValue); } return values; } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java index b9c785879..efacde231 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java @@ -175,6 +175,14 @@ public class ArgumentSource { return field.isAnnotationPresent(Deprecated.class); } + /** + * Returns whether the field should default to stdout if not provided explicitly on the command-line. + * @return True if field should default to stdout. + */ + public boolean defaultsToStdout() { + return field.isAnnotationPresent(Output.class) && (Boolean)CommandLineUtils.getValue(ArgumentTypeDescriptor.getArgumentAnnotation(this),"defaultToStdout"); + } + /** * Returns false if a type-specific default can be employed. * @return True to throw in a type specific default. False otherwise. 
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index 08aa5f8b3..cf11bb61c 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -370,7 +370,7 @@ public abstract class CommandLineProgram { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); errorPrintf("%n"); - errorPrintf("Please visit the wiki to see if this is a known problem%n"); + errorPrintf("Please check the documentation guide to see if this is a known problem%n"); errorPrintf("If not, please post the error, with stack trace, to the GATK forum%n"); printDocumentationReference(); if ( msg == null ) // some exceptions don't have detailed messages diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java index 7f419abb2..9253e1ee5 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -98,6 +98,7 @@ public final class IntervalBinding { intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); } + Collections.sort(intervals); return intervals; } diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java index 6c2b143c4..0db870f2e 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Output.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java @@ -64,7 +64,14 @@ public @interface Output { * fail if the type can't be populated. 
* @return True if the argument is required. False otherwise. */ - boolean required() default true; + boolean required() default false; + + /** + * If this argument is not required, should it default to use stdout if no + * output file is explicitly provided on the command-line? + * @return True if the argument should default to stdout. False otherwise. + */ + boolean defaultToStdout() default true; /** * Should this command-line argument be exclusive of others. Should be diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ba25ac957..82bee7826 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -67,6 +67,9 @@ import java.io.File; import java.util.*; import java.util.concurrent.TimeUnit; +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker; + /** * A GenomeAnalysisEngine that runs a specified walker. */ @@ -288,40 +291,6 @@ public class GenomeAnalysisEngine { //return result; } - // TODO -- Let's move this to a utility class in unstable - but which one? 
- // ************************************************************************************** - // * Handle Deprecated Walkers * - // ************************************************************************************** - - // Mapping from walker name to major version number where the walker first disappeared - private static Map deprecatedGATKWalkers = new HashMap(); - static { - deprecatedGATKWalkers.put("CountCovariates", "2.0"); - deprecatedGATKWalkers.put("TableRecalibration", "2.0"); - deprecatedGATKWalkers.put("AlignmentWalker", "2.2"); - deprecatedGATKWalkers.put("CountBestAlignments", "2.2"); - } - - /** - * Utility method to check whether a given walker has been deprecated in a previous GATK release - * - * @param walkerName the walker class name (not the full package) to check - */ - public static boolean isDeprecatedWalker(final String walkerName) { - return deprecatedGATKWalkers.containsKey(walkerName); - } - - /** - * Utility method to check whether a given walker has been deprecated in a previous GATK release - * - * @param walkerName the walker class name (not the full package) to check - */ - public static String getDeprecatedMajorVersionNumber(final String walkerName) { - return deprecatedGATKWalkers.get(walkerName); - } - - // ************************************************************************************** - /** * Retrieves an instance of the walker based on the walker name. 
* @@ -333,7 +302,7 @@ public class GenomeAnalysisEngine { return walkerManager.createByName(walkerName); } catch ( UserException e ) { if ( isDeprecatedWalker(walkerName) ) { - e = new UserException.DeprecatedWalker(walkerName, getDeprecatedMajorVersionNumber(walkerName)); + e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); } throw e; } @@ -372,7 +341,8 @@ public class GenomeAnalysisEngine { * @param walker the walker we need to apply read transformers too */ public void initializeReadTransformers(final Walker walker) { - final List activeTransformers = new ArrayList(); + // keep a list of the active read transformers sorted based on priority ordering + List activeTransformers = new ArrayList(); final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; @@ -392,9 +362,41 @@ public class GenomeAnalysisEngine { return readTransformers; } - private void setReadTransformers(final List readTransformers) { + /* + * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). 
+ * + * @param readTransformers the active read transformers + */ + protected void checkActiveReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new IllegalArgumentException("read transformers cannot be null"); + + ReadTransformer sawMustBeFirst = null; + ReadTransformer sawMustBeLast = null; + + for ( final ReadTransformer r : readTransformers ) { + if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { + if ( sawMustBeFirst != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); + sawMustBeFirst = r; + } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { + if ( sawMustBeLast != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); + sawMustBeLast = r; + } + } + } + + protected void setReadTransformers(final List readTransformers) { if ( readTransformers == null ) throw new ReviewedStingException("read transformers cannot be null"); + + // sort them in priority order + Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); + + // make sure we don't have an invalid set of active read transformers + checkActiveReadTransformers(readTransformers); + this.readTransformers = readTransformers; } @@ -532,6 +534,8 @@ public class GenomeAnalysisEngine { if ( intervals != null && intervals.isEmpty() ) { logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); } + + // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome } /** @@ -558,7 +562,7 @@ public class GenomeAnalysisEngine { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); } @@ -566,7 +570,7 @@ public class GenomeAnalysisEngine { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); } @@ -721,6 +725,15 @@ public class GenomeAnalysisEngine { rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); } + /** + * Purely for testing purposes. 
Do not use unless you absolutely positively know what you are doing (or + * need to absolutely positively kill everyone in the room) + * @param dataSource + */ + public void setReadsDataSource(final SAMDataSource dataSource) { + this.readsDataSource = dataSource; + } + /** * Entry-point function to initialize the samples database from input data and pedigree arguments */ @@ -852,7 +865,8 @@ public class GenomeAnalysisEngine { SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) { - final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType); + final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, + getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); final List dataSources = new ArrayList(); for (RMDTriplet fileDescriptor : referenceMetaDataFiles) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index a3e19b944..8d1fa4638 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -104,8 +104,9 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; - @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. 
To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.") - public boolean disableRandomization = false; + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.") + public boolean disableDithering = false; @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false) public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; @@ -206,7 +207,7 @@ public class GATKArgumentCollection { * Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool. * Please be aware that one should only run recalibration with the covariates file created on the same input bam(s). */ - @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration") + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration (intended for use with BaseRecalibrator and PrintReads)") public File BQSR_RECAL_FILE = null; /** @@ -274,6 +275,17 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. 
We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; + @Hidden + @Advanced + @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", + doc = "UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking " + + "when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index " + + "generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it " + + "for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general " + + "because it allows reading from index files without first acquiring a lock.", + required = false) + public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; + // -------------------------------------------------------------------------------------------------------------- // // Multi-threading arguments diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8d7cfbaa7..adb668ff9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; -import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -53,14 +52,15 @@ public class BAMScheduler implements Iterator { private PeekableIterator 
locusIterator; private GenomeLoc currentLocus; - public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { - BAMScheduler scheduler = new BAMScheduler(dataSource); - GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); - for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { - // Match only on sequence name; trust startup validation to make sure all the sequences match. - if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) - intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); - } + /* + * Creates BAMScheduler using contigs from the given BAM data source. + * + * @param dataSource BAM source + * @return non-null BAM scheduler + */ + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { + final BAMScheduler scheduler = new BAMScheduler(dataSource); + final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); scheduler.populateFilteredIntervalList(intervals); return scheduler; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 57b409dcd..6c7a6c867 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -25,16 +25,17 @@ package org.broadinstitute.sting.gatk.datasources.reads; +import net.sf.samtools.Bin; +import net.sf.samtools.GATKBin; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.LinearIndex; import net.sf.samtools.seekablestream.SeekableBufferedStream; import net.sf.samtools.seekablestream.SeekableFileStream; - -import net.sf.samtools.*; - import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import java.io.*; +import java.io.File; +import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.ArrayList; @@ -86,6 +87,7 @@ public class GATKBAMIndex { private SeekableFileStream fileStream; private SeekableBufferedStream bufferedStream; + private long fileLength; public GATKBAMIndex(final File file) { mFile = file; @@ -307,6 +309,7 @@ public class GATKBAMIndex { try { fileStream = new SeekableFileStream(mFile); bufferedStream = new SeekableBufferedStream(fileStream,BUFFERED_STREAM_BUFFER_SIZE); + fileLength=bufferedStream.length(); } catch (IOException exc) { throw new ReviewedStingException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); @@ -317,6 +320,7 @@ public class GATKBAMIndex { try { bufferedStream.close(); fileStream.close(); + fileLength = -1; } catch (IOException exc) { throw new ReviewedStingException("Unable to close index file " + mFile, exc); @@ -368,7 +372,7 @@ public class GATKBAMIndex { // We have a rigid expectation here to read in exactly the number of bytes we've limited // our buffer to -- if there isn't enough data in the file, the index // must be truncated or otherwise corrupt: - if(bytesRequested > bufferedStream.length() - bufferedStream.position()){ + if(bytesRequested > fileLength - bufferedStream.position()){ throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. 
" + "It's likely that this file is truncated or corrupt -- " + "Please try re-indexing the corresponding BAM file.", diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index f7ca7593f..048ce17f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -51,8 +50,8 @@ public class IntervalSharder implements Iterator { return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); } - public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { - return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser); } public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d52e55d6d..1223dd2af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -1060,10 +1060,12 @@ public class SAMDataSource { /** 
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any * read that has been assigned - * @return + * + * @param shardBalancer shard balancer object + * @return non-null initialized version of the shard balancer */ - public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); return shardBalancer; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 14bec213e..66463e576 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -26,12 +26,10 @@ package org.broadinstitute.sting.gatk.datasources.reads.utilities; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; @@ -98,14 +96,11 @@ public class FindLargeShards extends CommandLineProgram { SAMDataSource dataSource = new SAMDataSource(bamReaders,new 
ThreadAllocation(),null,genomeLocParser); // intervals - GenomeLocSortedSet intervalSortedSet = null; - if(intervals != null) + final GenomeLocSortedSet intervalSortedSet; + if ( intervals != null ) intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); - else { - intervalSortedSet = new GenomeLocSortedSet(genomeLocParser); - for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences()) - intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength())); - } + else + intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index 79100e89a..01edd44ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -25,10 +25,7 @@ package org.broadinstitute.sting.gatk.datasources.reference; -import net.sf.picard.reference.FastaSequenceIndex; -import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.sam.CreateSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; @@ -36,11 +33,8 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import 
org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.file.FSLockWithShared; -import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; import java.util.ArrayList; @@ -77,128 +71,25 @@ public class ReferenceDataSource { final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta"; final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict")); - /* - * if index file does not exist, create it manually - */ + // It's an error if either the fai or dict file does not exist. The user is now responsible + // for creating these files. if (!indexFile.exists()) { - - logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath())); - FSLockWithShared indexLock = new FSLockWithShared(indexFile,true); - try { - // get exclusive lock - if (!indexLock.exclusiveLock()) - throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile); - FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, true); - FastaSequenceIndex sequenceIndex = faiBuilder.createIndex(); - FastaSequenceIndexBuilder.saveAsFaiFile(sequenceIndex, indexFile); - } - catch(FileSystemInabilityToLockException ex) { - logger.info("Unable to create write lock: " + ex.getMessage()); - logger.info("Skipping index creation."); - } - catch(UserException e) { - // Rethrow all user exceptions as-is; there should be more details in the UserException itself. - throw e; - } - catch (Exception e) { - // If lock creation succeeded, the failure must have been generating the index. - // If lock creation failed, just skip over index creation entirely. 
- throw new UserException.CouldNotCreateReferenceIndexFile(indexFile, e); - } - finally { - indexLock.unlock(); - } + throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile); } - - /* - * If dict file doesn't exist, try to create it using Picard's CreateSequenceDictionary - * Currently, dictionary cannot be created without running CreateSequenceDictionary's main routine, hence the - * argument string - * This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary - */ if (!dictFile.exists()) { - - logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath())); - - /* - * Please note another hack here: we have to create a temporary file b/c CreateSequenceDictionary cannot - * create a dictionary file if that file is locked. - */ - - // get read lock on dict file so nobody else can read it - FSLockWithShared dictLock = new FSLockWithShared(dictFile,true); - try { - // get shared lock on dict file so nobody else can start creating it - if (!dictLock.exclusiveLock()) - throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile); - // dict will be written to random temporary file in same directory (see note above) - File tempFile = File.createTempFile("dict", null, dictFile.getParentFile()); - tempFile.deleteOnExit(); - - // create dictionary by calling main routine. Temporary fix - see comment above. 
- String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()), - String.format("o=%s", tempFile.getAbsolutePath())}; - new CreateSequenceDictionary().instanceMain(args); - - if (!tempFile.renameTo(dictFile)) - throw new UserException("Error transferring temp file " + tempFile + " to dict file " + dictFile); - } - catch(FileSystemInabilityToLockException ex) { - logger.info("Unable to create write lock: " + ex.getMessage()); - logger.info("Skipping dictionary creation."); - } - catch (Exception e) { - // If lock creation succeeded, the failure must have been generating the index. - // If lock creation failed, just skip over index creation entirely. - throw new UserException.CouldNotCreateReferenceIndexFile(dictFile, e); - } - finally { - dictLock.unlock(); - } + throw new UserException.MissingReferenceDictFile(dictFile, fastaFile); } - /* - * Read reference data by creating an IndexedFastaSequenceFile. - * A note about thread safety: IndexFastaSequenceFile reads the fasta using dictionary and index files. It will - * fail if either does not exist, but not if either is currently being written (in which case it exists - * but is incomplete). To avoid this, obtain shared locks on both files before creating IndexedFastaSequenceFile. - */ - - FSLockWithShared dictLock = new FSLockWithShared(dictFile,true); - FSLockWithShared indexLock = new FSLockWithShared(indexFile,true); + // Read reference data by creating an IndexedFastaSequenceFile. 
try { - try { - if (!dictLock.sharedLock()) { - throw new ReviewedStingException("Could not open dictionary file because a lock could not be obtained."); - } - } - catch(FileSystemInabilityToLockException ex) { - logger.info(String.format("Unable to create a lock on dictionary file: %s",ex.getMessage())); - logger.info("Treating existing dictionary file as complete."); - } - - try { - if (!indexLock.sharedLock()) { - throw new ReviewedStingException("Could not open index file because a lock could not be obtained."); - } - } - catch(FileSystemInabilityToLockException ex) { - logger.info(String.format("Unable to create a lock on index file: %s",ex.getMessage())); - logger.info("Treating existing index file as complete."); - } - reference = new CachingIndexedFastaSequenceFile(fastaFile); - - } catch (IllegalArgumentException e) { + } + catch (IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); } catch (Exception e) { throw new UserException.CouldNotReadInputFile(fastaFile, e); } - finally { - dictLock.unlock(); - indexLock.unlock(); - } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 6785375ba..fb7a16bfd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.downsampling; -import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele; import 
java.io.File; import java.io.IOException; -import java.io.PrintStream; import java.util.*; import org.apache.log4j.Logger; public class AlleleBiasedDownsamplingUtils { + // define this class so that we can use Java generics below + private final static class PileupElementList extends ArrayList {} + /** * Computes an allele biased version of the given pileup * * @param pileup the original pileup * @param downsamplingFraction the fraction of total reads to remove per allele - * @param log logging output * @return allele biased pileup */ - public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { + public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { // special case removal of all or no reads if ( downsamplingFraction <= 0.0 ) return pileup; if ( downsamplingFraction >= 1.0 ) return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); - final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new ArrayList(); + alleleStratifiedElements[i] = new PileupElementList(); // start by stratifying the reads by the alleles they represent at this position + boolean sawReducedRead = false; for ( final PileupElement pe : pileup ) { - // we do not want to remove a reduced read - if ( !pe.getRead().isReducedRead() ) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); - } + if ( pe.getRead().isReducedRead() ) + sawReducedRead = true; + + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); } - // make a listing of allele counts - final int[] alleleCounts = new int[4]; - for ( int i = 
0; i < 4; i++ ) - alleleCounts[i] = alleleStratifiedElements[i].size(); + // make a listing of allele counts and calculate the total count + final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead); + final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); // do smart down-sampling - int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor + final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); final HashSet readsToRemove = new HashSet(numReadsToRemove); for ( int i = 0; i < 4; i++ ) { - final ArrayList alleleList = alleleStratifiedElements[i]; + final PileupElementList alleleList = alleleStratifiedElements[i]; // if we don't need to remove any reads, then don't - if ( alleleList.size() > targetAlleleCounts[i] ) - readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); + if ( alleleCounts[i] > targetAlleleCounts[i] ) + readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); } - // clean up pointers so memory can be garbage collected if needed - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i].clear(); - // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise - final List readsToKeep = new ArrayList(pileup.getNumberOfElements() - numReadsToRemove); + final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); for ( final PileupElement pe : pileup ) { if ( !readsToRemove.contains(pe) ) { readsToKeep.add(pe); @@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils { return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); } + /** + * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are 
present) + * + * @param alleleStratifiedElements pileup elements stratified by allele + * @param sawReducedRead is at least one read a reduced read? + * @return non-null int array representing allele counts + */ + private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) { + final int[] alleleCounts = new int[alleleStratifiedElements.length]; + for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { + if ( !sawReducedRead ) { + alleleCounts[i] = alleleStratifiedElements[i].size(); + } else { + for ( final PileupElement pe : alleleStratifiedElements[i] ) + alleleCounts[i] += pe.getRepresentativeCount(); + } + } + return alleleCounts; + } + private static int scoreAlleleCounts(final int[] alleleCounts) { if ( alleleCounts.length < 2 ) return 0; @@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils { } /** - * Computes an allele biased version of the given pileup + * Computes an allele biased version of the allele counts for a given pileup * - * @param alleleCounts the original pileup - * @param numReadsToRemove fraction of total reads to remove per allele - * @return allele biased pileup + * @param alleleCounts the allele counts for the original pileup + * @param numReadsToRemove number of total reads to remove per allele + * @return non-null array of new counts needed per allele */ protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { final int numAlleles = alleleCounts.length; @@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils { /** * Performs allele biased down-sampling on a pileup and computes the list of elements to remove * - * @param elements original list of records + * @param elements original list of pileup elements + * @param originalElementCount original count of elements (taking reduced reads into account) * @param numElementsToRemove the number of records to remove - * @param log logging output * @return the 
list of pileup elements TO REMOVE */ - private static List downsampleElements(final List elements, final int numElementsToRemove, final PrintStream log) { - ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - + protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { // are there no elements to remove? if ( numElementsToRemove == 0 ) - return elementsToRemove; + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); // should we remove all of the elements? - final int pileupSize = elements.size(); - if ( numElementsToRemove == pileupSize ) { - logAllElements(elements, log); + if ( numElementsToRemove >= originalElementCount ) { elementsToRemove.addAll(elements); return elementsToRemove; } // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(pileupSize); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - for ( int i = 0; i < pileupSize; i++ ) { - if ( itemsToRemove.get(i) ) { - final T element = elements.get(i); - logElement(element, log); + int currentBitSetIndex = 0; + for ( final PileupElement element : elements ) { + + final int representativeCount = element.getRepresentativeCount(); + + // if it's a reduced read, we need to be smart about how we down-sample + if ( representativeCount > 1 ) { + // count how many bits are set over the span represented by this read + int setBits = 0; + for ( int i = 0; i < representativeCount; i++ ) + setBits += itemsToRemove.get(currentBitSetIndex++) ? 
1 : 0; + + // remove that count from the count of the reduced read + if ( setBits == representativeCount ) + elementsToRemove.add(element); + else + element.adjustRepresentativeCount(-1 * setBits); + } + // otherwise it's trivial: remove if the corresponding bit is set + else if ( itemsToRemove.get(currentBitSetIndex++) ) { elementsToRemove.add(element); } } @@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils { * * @param alleleReadMap original list of records per allele * @param downsamplingFraction the fraction of total reads to remove per allele - * @param log logging output * @return list of reads TO REMOVE from allele biased down-sampling */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction, final PrintStream log) { + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { int totalReads = 0; for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); @@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils { final List alleles = new ArrayList(alleleReadMap.keySet()); alleles.remove(Allele.NO_CALL); // ignore the no-call bin final int numAlleles = alleles.size(); + + // TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead final int[] alleleCounts = new int[numAlleles]; for ( int i = 0; i < numAlleles; i++ ) alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); @@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils { final List readsToRemove = new ArrayList(numReadsToRemove); for ( int i = 0; i < numAlleles; i++ ) { - final List alleleBin = alleleReadMap.get(alleles.get(i)); - - if ( alleleBin.size() > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); + if ( alleleCounts[i] > targetAlleleCounts[i] ) { + 
readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); } } return readsToRemove; } - private static void logAllElements(final List elements, final PrintStream log) { - if ( log != null ) { - for ( final T obj : elements ) { - logElement(obj, log); - } + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List reads, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + final int originalElementCount = reads.size(); + + // should we remove all of the elements? + if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(reads); + return elementsToRemove; } - } - private static void logElement(final T obj, final PrintStream log) { - if ( log != null ) { - - final GATKSAMRecord read; - if ( obj instanceof PileupElement ) - read = ((PileupElement)obj).getRead(); - else - read = (GATKSAMRecord)obj; - - final SAMReadGroupRecord readGroup = read.getReadGroup(); - log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit())); + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); } - } + int currentBitSetIndex = 0; + for ( final GATKSAMRecord read : reads ) { + if ( read.isReducedRead() ) + throw new IllegalStateException("Allele-biased downsampling of reduced reads has 
not been implemented for a list of GATKSAMRecords"); + + if ( itemsToRemove.get(currentBitSetIndex++) ) + elementsToRemove.add(read); + } + + return elementsToRemove; + } /** * Create sample-contamination maps from file @@ -288,17 +333,17 @@ public class AlleleBiasedDownsamplingUtils { continue; } - StringTokenizer st = new StringTokenizer(line); + StringTokenizer st = new StringTokenizer(line,"\t"); String fields[] = new String[2]; try { fields[0] = st.nextToken(); fields[1] = st.nextToken(); } catch(NoSuchElementException e){ - throw new UserException.MalformedFile("Contamination file must have exactly two columns. Offending line:\n" + line); + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); } if(st.hasMoreTokens()) { - throw new UserException.MalformedFile("Contamination file must have exactly two columns. Offending line:\n" + line); + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); } if (fields[0].length() == 0 || fields[1].length() == 0) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index bfac08d35..23b16cff2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -94,6 +94,17 @@ public interface Downsampler { */ public T peekPending(); + /** + * Get the current number of items in this downsampler + * + * This should be the best estimate of the total number of elements that will come out of the downsampler + * were consumeFinalizedItems() to be called immediately after this call. In other words it should + * be number of finalized items + estimate of number of pending items that will ultimately be included as well. 
+ * + * @return a positive integer + */ + public int size(); + /** * Returns the number of items discarded (so far) during the downsampling process * diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java new file mode 100644 index 000000000..877083829 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Utilities for using the downsamplers for common tasks + * + * User: depristo + * Date: 3/6/13 + * Time: 4:26 PM + */ +public class DownsamplingUtils { + private DownsamplingUtils() { } + + /** + * Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing + * coverage at any read start to less than minReadsPerAlignmentStart + * + * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and + * want to reduce the coverage of the big peak down without removing the many reads at the edge of this + * interval that are in fact good + * + * This algorithm separately operates on the reads for each sample independently. + * + * @param reads a sorted list of reads + * @param downsampleTo the targeted number of reads we want from reads per sample + * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start + * to below this. 
That is, if this value is 2, we'll never reduce the number + * of reads starting at a specific start site to less than 2 + * @return a sorted list of reads + */ + public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { + if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); + + final List downsampled = new ArrayList(reads.size()); + + final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); + for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { + final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); + downsampler.submit(readsByPosMap.values()); + downsampler.signalEndOfInput(); + for ( final List downsampledReads : downsampler.consumeFinalizedItems()) + downsampled.addAll(downsampledReads); + } + + return ReadUtils.sortReadsByCoordinate(downsampled); + } + + /** + * Build the data structure mapping for each sample -> (position -> reads at position) + * + * Note that the map position -> reads isn't ordered in any meaningful way + * + * @param reads a list of sorted reads + * @return a map containing the list of reads at each start location, for each sample independently + */ + private static Map>> partitionReadsBySampleAndStart(final List reads) { + final Map>> readsBySampleByStart = new LinkedHashMap>>(); + + for ( final GATKSAMRecord read : reads ) { + Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); + + if ( readsByStart == null ) { + readsByStart = new LinkedHashMap>(); + readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); + } + + List readsAtStart = readsByStart.get(read.getAlignmentStart()); + if ( readsAtStart == null ) { + readsAtStart = new LinkedList(); + readsByStart.put(read.getAlignmentStart(), readsAtStart); + } + + readsAtStart.add(read); + } + + return readsBySampleByStart; + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java index 266148178..1cede9c33 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -109,6 +109,11 @@ public class FractionalDownsampler implements ReadsDownsamp return numDiscardedItems; } + @Override + public int size() { + return selectedReads.size(); + } + public void signalEndOfInput() { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java index 9b4b2adcb..4ff729537 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -47,8 +47,8 @@ import java.util.*; * @author David Roazen */ public class LevelingDownsampler, E> implements Downsampler { - - private int targetSize; + private final int minElementsPerStack; + private final int targetSize; private List groups; @@ -59,12 +59,32 @@ public class LevelingDownsampler, E> implements Downsampler /** * Construct a LevelingDownsampler * + * Uses the default minElementsPerStack of 1 + * * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed * this value -- if it does, items are removed from Lists evenly until the total size * is <= this value */ public LevelingDownsampler( int targetSize ) { + this(targetSize, 1); + } + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + * @param 
minElementsPerStack no stack will be reduced below this size during downsampling. That is, + * if a stack has only 3 elements and minElementsPerStack is 3, no matter what + * we'll not reduce this stack below 3. + */ + public LevelingDownsampler(final int targetSize, final int minElementsPerStack) { + if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); + if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); + this.targetSize = targetSize; + this.minElementsPerStack = minElementsPerStack; clear(); reset(); } @@ -108,6 +128,15 @@ public class LevelingDownsampler, E> implements Downsampler return numDiscardedItems; } + @Override + public int size() { + int s = 0; + for ( final List l : groups ) { + s += l.size(); + } + return s; + } + public void signalEndOfInput() { levelGroups(); groupsAreFinalized = true; @@ -148,7 +177,7 @@ public class LevelingDownsampler, E> implements Downsampler // remove any more items without violating the constraint that all groups must // be left with at least one item while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > 1 ) { + if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { groupSizes[currentGroupIndex]--; numItemsToRemove--; numConsecutiveUmodifiableGroups = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index b06d5f5b4..3aaed6c73 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -89,6 +89,11 @@ public class PassThroughDownsampler implements ReadsDownsam return 0; } + @Override + public int size() { + return selectedReads.size(); + } 
+ public void signalEndOfInput() { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index 4331fd723..0e6bbfcb6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -156,6 +156,11 @@ public class ReservoirDownsampler implements ReadsDownsampl return numDiscardedItems; } + @Override + public int size() { + return reservoir.size(); + } + public void signalEndOfInput() { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java index 3da18b2bb..7c6c043c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -112,6 +112,11 @@ public class SimplePositionalDownsampler implements ReadsDo return numDiscardedItems; } + @Override + public int size() { + return finalizedReads.size() + reservoir.size(); + } + public void signalEndOfInput() { finalizeReservoir(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java index 362cb202e..fcae3cc68 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java @@ -41,17 +41,17 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; * [Functionality of this walker] *

      * - *

      Input

      + *

      Input

      *

      * [Input description] *

      * - *

      Output

      + *

      Output

      *

      * [Output description] *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java
      index 7b56852d3..07ec088cf 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java
      @@ -40,7 +40,8 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
       import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
       
       import java.io.PrintStream;
      -
      +import java.util.Arrays;
      +import java.util.Comparator;
       
       /**
        * A simple Bayesian genotyper, that outputs a text based call format. Intended to be used only as an
      @@ -95,7 +96,7 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre
                       likelihoods[genotype.ordinal()] += Math.log10(p / genotype.toString().length());
                   }
       
      -        Integer sortedList[] = MathUtils.sortPermutation(likelihoods);
      +        Integer sortedList[] = sortPermutation(likelihoods);
       
               // create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString())
               // and calculate the LOD score from best - next best (9 and 8 in the sorted list, since the best likelihoods are closest to zero)
      @@ -110,6 +111,29 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre
               return 0;
           }
       
      +    private static Integer[] sortPermutation(final double[] A) {
      +        class comparator implements Comparator {
      +            public int compare(Integer a, Integer b) {
      +                if (A[a.intValue()] < A[b.intValue()]) {
      +                    return -1;
      +                }
      +                if (A[a.intValue()] == A[b.intValue()]) {
      +                    return 0;
      +                }
      +                if (A[a.intValue()] > A[b.intValue()]) {
      +                    return 1;
      +                }
      +                return 0;
      +            }
      +        }
      +        Integer[] permutation = new Integer[A.length];
      +        for (int i = 0; i < A.length; i++) {
      +            permutation[i] = i;
      +        }
      +        Arrays.sort(permutation, new comparator());
      +        return permutation;
      +    }
      +
           /**
            * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
            * appropriately.
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java
      index 0f2353ce5..f7d1d0297 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java
      @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.filters;
       import net.sf.samtools.SAMFileHeader;
       import net.sf.samtools.SAMRecord;
       import net.sf.samtools.SAMSequenceRecord;
      +import net.sf.samtools.SAMTagUtil;
       import org.broadinstitute.sting.commandline.Argument;
       import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
       import org.broadinstitute.sting.utils.exceptions.UserException;
      @@ -44,6 +45,9 @@ public class MalformedReadFilter extends ReadFilter {
           @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false)
           boolean filterMismatchingBaseAndQuals = false;
       
      +    @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false)
      +    boolean filterBasesNotStored = false;
      +
           @Override
           public void initialize(GenomeAnalysisEngine engine) {
               this.header = engine.getSAMFileHeader();
      @@ -56,12 +60,18 @@ public class MalformedReadFilter extends ReadFilter {
                       !checkAlignmentDisagreesWithHeader(this.header,read) ||
                       !checkHasReadGroup(read) ||
                       !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) ||
      -                !checkCigarDisagreesWithAlignment(read);
      +                !checkCigarDisagreesWithAlignment(read) ||
      +                !checkSeqStored(read, filterBasesNotStored);
           }
       
      -    private static boolean checkHasReadGroup(SAMRecord read) {
      -        if ( read.getReadGroup() == null )
      -            throw new UserException.ReadMissingReadGroup(read);
      +    private static boolean checkHasReadGroup(final SAMRecord read) {
      +        if ( read.getReadGroup() == null ) {
      +            // there are 2 possibilities: either the RG tag is missing or it is not defined in the header
      +            final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG);
      +            if ( rgID == null )
      +                throw new UserException.ReadMissingReadGroup(read);
      +            throw new UserException.ReadHasUndefinedReadGroup(read, rgID);
      +        }
               return true;
           }
       
      @@ -140,4 +150,20 @@ public class MalformedReadFilter extends ReadFilter {
       
               return result;
           }
      +
      +    /**
      +     * Check if the read has its base sequence stored
      +     * @param read the read to validate
      +     * @return true if the sequence is stored and false otherwise ("*" in the SEQ field).
      +     */
      +    protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) {
      +
      +        if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE )
      +            return true;
      +
      +        if ( filterBasesNotStored )
      +            return false;
      +
      +        throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. Offender: %s", read.getReadName()));
      +    }
       }
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java
      similarity index 79%
      rename from public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java
      rename to public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java
      index df1c11a2b..80224b786 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java
      @@ -29,18 +29,20 @@ import net.sf.samtools.SAMRecord;
       import org.broadinstitute.sting.commandline.Argument;
       
       /**
      - * Filters out reads whose length is >= some value.
      + * Filters out reads whose length is > some value or < some value.
        *
        * @author mhanna
        * @version 0.1
        */
      -public class MaxReadLengthFilter extends ReadFilter {
      +public class ReadLengthFilter extends ReadFilter {
           @Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true)
           private int maxReadLength;
      -    
      +
      +    @Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true)
      +    private int minReadLength = 1;
           public boolean filterOut(SAMRecord read) {
               // check the length
      -        return read.getReadLength() > maxReadLength;
      +        return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength;
           }
       
       }
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java
      index e0166ab38..41ab59845 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java
      @@ -37,18 +37,18 @@ import org.broadinstitute.sting.commandline.Argument;
        *  

      * * - *

      Input

      + *

      Input

      *

      * BAM file(s) *

      * * - *

      Output

      + *

      Output

      *

      * BAM file(s) with all reads mapping qualities reassigned *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java
      index c894dd801..f31313a86 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java
      @@ -47,18 +47,18 @@ import org.broadinstitute.sting.commandline.Argument;
        *  

      * * - *

      Input

      + *

      Input

      *

      * BAM file(s) *

      * * - *

      Output

      + *

      Output

      *

      * BAM file(s) with one read mapping quality selectively reassigned as desired *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java
      index fbcc32d78..18185f12e 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java
      @@ -66,7 +66,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor {
       
           @Override
           public boolean createsTypeDefault(ArgumentSource source) {
      -        return source.isRequired();
      +        return !source.isRequired() && source.defaultsToStdout();
           }
       
           @Override
      @@ -76,7 +76,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor {
       
           @Override
           public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) {
      -        if(!source.isRequired())
      +        if(source.isRequired() || !source.defaultsToStdout())
                   throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default.");
               OutputStreamStub stub = new OutputStreamStub(defaultOutputStream);
               engine.addOutput(stub);
      @@ -90,7 +90,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor {
       
               // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
               // therefore, the user must have failed to specify a type default
      -        if(fileName == null && !source.isRequired())
      +        if(fileName == null && source.isRequired())
                   throw new MissingArgumentValueException(definition);
       
               OutputStreamStub stub = new OutputStreamStub(new File(fileName));
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java
      index 34a7f967f..458846db0 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java
      @@ -89,7 +89,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
       
           @Override
           public boolean createsTypeDefault(ArgumentSource source) {
      -        return source.isRequired();
      +        return !source.isRequired() && source.defaultsToStdout();
           }
       
           @Override
      @@ -99,7 +99,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
       
           @Override
           public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) {
      -        if(!source.isRequired())
      +        if(source.isRequired() || !source.defaultsToStdout())
                   throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default.");
               SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream);
               engine.addOutput(stub);
      @@ -162,7 +162,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
                                              DEFAULT_ARGUMENT_FULLNAME,
                                              DEFAULT_ARGUMENT_SHORTNAME,
                                              ArgumentDefinition.getDoc(annotation),
      -                                       false,
      +                                       source.isRequired(),
                                              false,
                                              source.isMultiValued(),
                                              source.isHidden(),
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
      index 5b03859f5..91013673f 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
      @@ -110,7 +110,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
            */
           @Override
           public boolean createsTypeDefault(ArgumentSource source) {
      -        return source.isRequired();
      +        return !source.isRequired() && source.defaultsToStdout();
           }
       
           @Override
      @@ -119,8 +119,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
           }
       
           @Override
      -    public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) {
      -        if(!source.isRequired())
      +    public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) {
      +        if(source.isRequired() || !source.defaultsToStdout())
                   throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default.");        
               VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
               engine.addOutput(stub);
      @@ -143,7 +143,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
       
               // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
               // therefore, the user must have failed to specify a type default
      -        if(writerFile == null && !source.isRequired())
      +        if(writerFile == null && source.isRequired())
                   throw new MissingArgumentValueException(defaultArgumentDefinition);
       
               // Create a stub for the given object.
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java
      index f026b8f6c..799014cd4 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java
      @@ -31,6 +31,8 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
       import org.broadinstitute.sting.gatk.walkers.Walker;
       import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       
      +import java.util.Comparator;
      +
       /**
        * Baseclass used to describe a read transformer like BAQ and BQSR
        *
      @@ -65,6 +67,11 @@ abstract public class ReadTransformer {
       
           protected ReadTransformer() {}
       
      +    /**
      +     * @return the ordering constraint for the given read transformer
      +     */
      +    public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.DO_NOT_CARE; }
      +
           /**
            * Master initialization routine.  Called to setup a ReadTransform, using it's overloaded initializeSub routine.
            *
      @@ -166,4 +173,33 @@ abstract public class ReadTransformer {
                */
               HANDLED_IN_WALKER
           }
      +
      +    /**
      +     * This enum specifies the constraints that the given read transformer has relative to any other read transformers being used
      +     */
      +    public enum OrderingConstraint {
      +        /*
      +         * If 2 read transformers are both active and MUST_BE_FIRST, then an error will be generated
      +         */
      +        MUST_BE_FIRST,
      +
      +        /*
      +         * No constraints on the ordering for this read transformer
      +         */
      +        DO_NOT_CARE,
      +
      +        /*
      +         * If 2 read transformers are both active and MUST_BE_LAST, then an error will be generated
      +         */
      +        MUST_BE_LAST
      +    }
      +
      +    public static class ReadTransformerComparator implements Comparator {
      +
      +        public int compare(final ReadTransformer r1, final ReadTransformer r2) {
      +            if ( r1.getOrderingConstraint() == r2.getOrderingConstraint() )
      +                return 0;
      +            return ( r1.getOrderingConstraint() == OrderingConstraint.MUST_BE_FIRST || r2.getOrderingConstraint() == OrderingConstraint.MUST_BE_LAST ) ? -1 : 1;
      +        }
      +    }
       }
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
      index 02f2f9f02..de84809bd 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
      @@ -78,17 +78,11 @@ public class GATKRunReport {
       
           private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss");
       
      -    /**
      -     * number of milliseconds before the S3 put operation is timed-out:
      -     */
      -    private static final long S3_PUT_TIME_OUT = 30 * 1000;
      -
           /**
            * The root file system directory where we keep common report data
            */
           private final static File REPORT_DIR = new File("/humgen/gsa-hpprojects/GATK/reports");
       
      -
           /**
            * The full path to the direct where submitted (and uncharacterized) report files are written
            */
      @@ -105,6 +99,17 @@ public class GATKRunReport {
            */
           protected static final Logger logger = Logger.getLogger(GATKRunReport.class);
       
      +    /**
      +     * Default value for the number of milliseconds before an S3 put operation is timed-out.
      +     * Can be overridden via a constructor argument.
      +     */
      +    private static final long S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS = 30 * 1000;
      +
      +    /**
      +     * Number of milliseconds before an S3 put operation is timed-out.
      +     */
      +    private long s3PutTimeOutInMilliseconds = S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS;
      +
           // -----------------------------------------------------------------
           // elements captured for the report
           // -----------------------------------------------------------------
      @@ -230,13 +235,31 @@ public class GATKRunReport {
           }
       
           /**
      -     * Create a new RunReport and population all of the fields with values from the walker and engine
      +     * Create a new RunReport and populate all of the fields with values from the walker and engine.
      +     * Allows the S3 put timeout to be explicitly set.
            *
            * @param walker the GATK walker that we ran
            * @param e the exception caused by running this walker, or null if we completed successfully
            * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc
      +     * @param type the GATK phone home setting
      +     * @param s3PutTimeOutInMilliseconds number of milliseconds to wait before timing out an S3 put operation
            */
      -    public GATKRunReport(Walker walker, Exception e, GenomeAnalysisEngine engine, PhoneHomeOption type) {
      +    public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type,
      +                         final long s3PutTimeOutInMilliseconds) {
      +        this(walker, e, engine, type);
      +        this.s3PutTimeOutInMilliseconds = s3PutTimeOutInMilliseconds;
      +    }
      +
      +    /**
      +     * Create a new RunReport and populate all of the fields with values from the walker and engine.
      +     * Leaves the S3 put timeout set to the default value of S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS.
      +     *
      +     * @param walker the GATK walker that we ran
      +     * @param e the exception caused by running this walker, or null if we completed successfully
      +     * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc
      +     * @param type the GATK phone home setting
      +     */
      +    public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type) {
               if ( type == PhoneHomeOption.NO_ET )
                   throw new ReviewedStingException("Trying to create a run report when type is NO_ET!");
       
      @@ -563,7 +586,7 @@ public class GATKRunReport {
                               throw new IllegalStateException("We are throwing an exception for testing purposes");
                           case TIMEOUT:
                               try {
      -                            Thread.sleep(S3_PUT_TIME_OUT * 100);
      +                            Thread.sleep(s3PutTimeOutInMilliseconds * 100);
                               } catch ( InterruptedException e ) {
                                   // supposed to be empty
                               }
      @@ -625,7 +648,7 @@ public class GATKRunReport {
                   s3thread.setName("S3Put-Thread");
                   s3thread.start();
       
      -            s3thread.join(S3_PUT_TIME_OUT);
      +            s3thread.join(s3PutTimeOutInMilliseconds);
       
                   if(s3thread.isAlive()){
                       s3thread.interrupt();
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java
      index c5f87d625..4c50cfaae 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java
      @@ -44,7 +44,6 @@ import org.broadinstitute.sting.utils.collections.Pair;
       import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
       import org.broadinstitute.sting.utils.exceptions.UserException;
       import org.broadinstitute.sting.utils.file.FSLockWithShared;
      -import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
       import org.broadinstitute.sting.utils.instrumentation.Sizeof;
       
       import java.io.File;
      @@ -83,6 +82,10 @@ public class RMDTrackBuilder { // extends PluginManager {
       
           private final FeatureManager featureManager;
       
      +    // If true, do not attempt to create index files if they don't exist or are outdated, and don't
      +    // make any file lock acquisition calls on the index files.
      +    private final boolean disableAutoIndexCreation;
      +
           /**
            * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact.  This is generally
            * used when walkers want to directly manage the ROD system for whatever reason.  Before using this constructor,
      @@ -90,14 +93,19 @@ public class RMDTrackBuilder { // extends PluginManager {
            * @param dict Sequence dictionary to use.
            * @param genomeLocParser Location parser to use.
            * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification.
      +     * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files.
      +     *                                 UNSAFE in general (because it causes us not to lock index files before reading them) --
      +     *                                 suitable only for test suite use.
            */
           public RMDTrackBuilder(final SAMSequenceDictionary dict,
                                  final GenomeLocParser genomeLocParser,
      -                           ValidationExclusion.TYPE validationExclusionType) {
      +                           final ValidationExclusion.TYPE validationExclusionType,
      +                           final boolean disableAutoIndexCreation) {
               this.dict = dict;
               this.validationExclusionType = validationExclusionType;
               this.genomeLocParser = genomeLocParser;
               this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType));
      +        this.disableAutoIndexCreation = disableAutoIndexCreation;
           }
       
           /**
      @@ -208,12 +216,15 @@ public class RMDTrackBuilder { // extends PluginManager {
       
                       // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
                       if (sequenceDictionary.size() == 0 && dict != null) {
      -                    File indexFile = Tribble.indexFile(inputFile);
                           validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
      -                    try { // re-write the index
      -                        writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
      -                    } catch (IOException e) {
      -                        logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
      +
      +                    if ( ! disableAutoIndexCreation ) {
      +                        File indexFile = Tribble.indexFile(inputFile);
      +                        try { // re-write the index
      +                            writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
      +                        } catch (IOException e) {
      +                            logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK");
      +                        }
                           }
       
                           sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
      @@ -225,7 +236,7 @@ public class RMDTrackBuilder { // extends PluginManager {
                       throw new UserException(e.getMessage());
                   }
                   catch (IOException e) {
      -                throw new UserException.CouldNotCreateOutputFile(inputFile, "unable to write Tribble index", e);
      +                throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e);
                   }
               }
               else {
      @@ -242,25 +253,36 @@ public class RMDTrackBuilder { // extends PluginManager {
            * @return a linear index for the specified type
            * @throws IOException if we cannot write the index file
            */
      -    public synchronized Index loadIndex(File inputFile, FeatureCodec codec) throws IOException {
      -        // create the index file name, locking on the index file name
      -        File indexFile = Tribble.indexFile(inputFile);
      -        FSLockWithShared lock = new FSLockWithShared(indexFile);
      -
      -        // acquire a lock on the file
      +    public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException {
      +        final File indexFile = Tribble.indexFile(inputFile);
      +        final FSLockWithShared lock = new FSLockWithShared(indexFile);
               Index idx = null;
      -        if (indexFile.canRead())
      -            idx = attemptIndexFromDisk(inputFile, codec, indexFile, lock);
       
      -        // if we managed to make an index, return
      +        // If the index file exists and is readable, attempt to load it from disk. We'll get null back
      +        // if a problem was discovered with the index file when it was inspected, and we'll get an
      +        // in-memory index back in the case where the index file could not be locked.
      +        if (indexFile.canRead()) {
      +            idx = disableAutoIndexCreation ? loadFromDisk(inputFile, indexFile)  // load without locking if we're in disableAutoIndexCreation mode
      +                                           : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock);
      +        }
      +
      +        // If we have an index, it means we either loaded it from disk without issue or we created an in-memory
      +        // index due to not being able to acquire a lock.
               if (idx != null) return idx;
       
      -        // we couldn't read the file, or we fell out of the conditions above, continue on to making a new index
      -        return writeIndexToDisk(createIndexInMemory(inputFile, codec), indexFile, lock);
      +        // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index
      +        idx = createIndexInMemory(inputFile, codec);
      +        if ( ! disableAutoIndexCreation ) {
      +            writeIndexToDisk(idx, indexFile, lock);
      +        }
      +        return idx;
           }
       
           /**
      -     * attempt to read the index from disk
      +     * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if
      +     * a lock could not be obtained. Returns null if a problem was discovered with the index file when it
      +     * was examined (eg., it was out-of-date).
      +     *
            * @param inputFile the input file
            * @param codec the codec to read from
            * @param indexFile the index file itself
      @@ -268,20 +290,21 @@ public class RMDTrackBuilder { // extends PluginManager {
            * @return an index, or null if we couldn't load one
            * @throws IOException if we fail for FS issues
            */
      -    protected Index attemptIndexFromDisk(File inputFile, FeatureCodec codec, File indexFile, FSLockWithShared lock) throws IOException {
      -        boolean locked;
      +    protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException {
      +        boolean locked = false;
      +        Index idx = null;
      +
               try {
                   locked = lock.sharedLock();
      -        }
      -        catch(FileSystemInabilityToLockException ex) {
      -            throw new UserException.MissortedFile(inputFile, "Unexpected inability to lock exception", ex);
      -        }
      -        Index idx;
      -        try {
      -            if (!locked) // can't lock file
      +
      +            if ( ! locked ) { // can't lock file
      +                logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.",
      +                                          indexFile.getAbsolutePath()));
                       idx = createIndexInMemory(inputFile, codec);
      -            else
      +            }
      +            else {
                       idx = loadFromDisk(inputFile, indexFile);
      +            }
               } finally {
                   if (locked) lock.unlock();
               }
      @@ -294,7 +317,7 @@ public class RMDTrackBuilder { // extends PluginManager {
            * @param indexFile the input file, plus the index extension
            * @return an Index, or null if we're unable to load
            */
      -    public static Index loadFromDisk(File inputFile, File indexFile) {
      +    protected Index loadFromDisk( final File inputFile, final File indexFile ) {
               logger.info("Loading Tribble index from disk for file " + inputFile);
               Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath());
       
      @@ -302,14 +325,17 @@ public class RMDTrackBuilder { // extends PluginManager {
               if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified())
                   return index;
               else if (indexFile.lastModified() < inputFile.lastModified())
      -            logger.warn("Index file " + indexFile + " is out of date (index older than input file), deleting and updating the index file");
      +            logger.warn("Index file " + indexFile + " is out of date (index older than input file), " +
      +                        (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file"));
               else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable
      -            logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file");
      +            logger.warn("Index file " + indexFile + " is out of date (old version), " +
      +                        (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file"));
       
      -        // however we got here, remove the index and return null
      -        boolean deleted = indexFile.delete();
      +        if ( ! disableAutoIndexCreation ) {
      +            boolean deleted = indexFile.delete();
      +            if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)");
      +        }
       
      -        if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)");
               return null;
           }
       
      @@ -319,13 +345,18 @@ public class RMDTrackBuilder { // extends PluginManager {
            * @param index the index to write to disk
            * @param indexFile the index file location
            * @param lock the locking object
      -     * @return the index object
            * @throws IOException when unable to create the new index
            */
      -    private static Index writeIndexToDisk(Index index, File indexFile, FSLockWithShared lock) throws IOException {
      -        boolean locked = false; // could we exclusive lock the file?
      +    private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException {
      +        if ( disableAutoIndexCreation ) {
      +            return;
      +        }
      +
      +        boolean locked = false;
      +
               try {
      -            locked = lock.exclusiveLock(); // handle the case where we aren't locking anything
      +            locked = lock.exclusiveLock();
      +
                   if (locked) {
                       logger.info("Writing Tribble index to disk for file " + indexFile);
                       LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
      @@ -337,11 +368,6 @@ public class RMDTrackBuilder { // extends PluginManager {
       
                   try { logger.info(String.format("  Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); }
                   catch ( ReviewedStingException e) { }
      -
      -            return index;
      -        }
      -        catch(FileSystemInabilityToLockException ex) {
      -            throw new UserException.CouldNotCreateOutputFile(indexFile,"Unexpected inability to lock exception", ex);
               }
               finally {
                   if (locked) lock.unlock();
      @@ -356,7 +382,7 @@ public class RMDTrackBuilder { // extends PluginManager {
            * @return a LinearIndex, given the file location
            * @throws IOException when unable to create the index in memory
            */
      -    private Index createIndexInMemory(File inputFile, FeatureCodec codec) {
      +    protected Index createIndexInMemory(File inputFile, FeatureCodec codec) {
               // this can take a while, let them know what we're doing
               logger.info("Creating Tribble index in memory for file " + inputFile);
               Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java
      new file mode 100644
      index 000000000..80da8f8eb
      --- /dev/null
      +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java
      @@ -0,0 +1,126 @@
      +/*
      + * Copyright (c) 2012 The Broad Institute
      + *
      + * Permission is hereby granted, free of charge, to any person
      + * obtaining a copy of this software and associated documentation
      + * files (the "Software"), to deal in the Software without
      + * restriction, including without limitation the rights to use,
      + * copy, modify, merge, publish, distribute, sublicense, and/or sell
      + * copies of the Software, and to permit persons to whom the
      + * Software is furnished to do so, subject to the following
      + * conditions:
      + *
      + * The above copyright notice and this permission notice shall be
      + * included in all copies or substantial portions of the Software.
      + *
      + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
      + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
      + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
      + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
      + * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      + */
      +
      +package org.broadinstitute.sting.gatk.traversals;
      +
      +import org.broadinstitute.sting.gatk.downsampling.Downsampler;
      +import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
      +import org.broadinstitute.sting.utils.sam.AlignmentStartComparator;
      +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      +
      +import java.util.ArrayList;
      +import java.util.Collections;
      +import java.util.List;
      +
      +/**
      + * Subsystem to track a list of all reads currently live in the TraverseActiveRegions system,
      + * while limiting the total number of reads to a maximum capacity.
      + *
      + * User: depristo
      + * Date: 4/7/13
      + * Time: 11:23 AM
      + */
      +public class TAROrderedReadCache {
      +    final int maxCapacity;
      +    final Downsampler downsampler;
      +
      +    /**
      +     * Create a new empty ReadCache
      +     * @param maxCapacity the max capacity of the read cache.
      +     */
      +    public TAROrderedReadCache(int maxCapacity) {
      +        if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity);
      +        this.maxCapacity = maxCapacity;
      +        this.downsampler = new ReservoirDownsampler(maxCapacity);
      +    }
      +
      +    /**
      +     * What's the maximum number of reads we'll store in the cache?
       +     * @return a non-negative integer (0 is permitted by the constructor)
      +     */
      +    public int getMaxCapacity() {
      +        return maxCapacity;
      +    }
      +
      +    /**
      +     * Add a single read to this cache.  Assumed to be in sorted order w.r.t. the previously added reads
      +     * @param read a read to add
      +     */
      +    public void add(final GATKSAMRecord read) {
      +        if ( read == null ) throw new IllegalArgumentException("Read cannot be null");
      +        downsampler.submit(read);
      +    }
      +
      +    /**
      +     * Add a collection of reads to this cache.  Assumed to be in sorted order w.r.t. the previously added reads and each other
      +     * @param reads a collection of reads to add
      +     */
      +    public void addAll(final List reads) {
      +        if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null");
      +        downsampler.submit(reads);
      +    }
      +
      +    /**
      +     * How many reads are currently in the cache?
       +     * @return a non-negative integer (0 when the cache is empty)
      +     */
      +    public int size() {
      +        return downsampler.size();
      +    }
      +
      +    /**
      +     * How many reads were discarded since the last call to popCurrentReads
       +     * @return the number of reads discarded since the last call to popCurrentReads()
      +     */
      +    public int getNumDiscarded() {
      +        return downsampler.getNumberOfDiscardedItems();
      +    }
      +
      +    /**
      +     * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart)
      +     *
      +     * Flushes this cache, so after this call the cache will contain no reads and all downsampling stats will
      +     * be reset.
      +     *
      +     * @return a list of GATKSAMRecords in this cache
      +     */
      +    public List popCurrentReads() {
      +        final List maybeUnordered = downsampler.consumeFinalizedItems();
      +
      +        final List ordered;
      +        if ( downsampler.getNumberOfDiscardedItems() == 0 ) {
      +            // haven't discarded anything, so the reads are ordered properly
      +            ordered = maybeUnordered;
      +        } else {
      +            // we need to sort these damn things: O(n log n)
      +            ordered = new ArrayList(maybeUnordered);
      +            Collections.sort(ordered, new AlignmentStartComparator());
      +        }
      +
      +        // reset the downsampler stats so getNumberOfDiscardedItems is 0
      +        downsampler.reset();
      +        return ordered;
      +    }
      +}
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
      index 64c6d5094..1daaaf1da 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
      @@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
       import org.broadinstitute.sting.gatk.walkers.DataSource;
       import org.broadinstitute.sting.gatk.walkers.Walker;
       import org.broadinstitute.sting.utils.GenomeLoc;
      +import org.broadinstitute.sting.utils.SampleUtils;
       import org.broadinstitute.sting.utils.Utils;
       import org.broadinstitute.sting.utils.activeregion.*;
       import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
      @@ -70,7 +71,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine workQueue = new LinkedList();
       
      -    private LinkedList myReads = new LinkedList();
      +    private TAROrderedReadCache myReads = null;
      +
           private GenomeLoc spanOfLastReadSeen = null;
           private ActivityProfile activityProfile = null;
           int maxReadsInMemory = 0;
      @@ -112,11 +114,15 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine walker) {
      -        final Iterator liveReads = myReads.iterator();
      -        while ( liveReads.hasNext() ) {
      +        final List stillLive = new LinkedList();
      +        for ( final GATKSAMRecord read : myReads.popCurrentReads() ) {
                   boolean killed = false;
      -            final GATKSAMRecord read = liveReads.next();
                   final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
       
                   if( activeRegion.getLocation().overlapsP( readLoc ) ) {
                       activeRegion.add(read);
       
                       if ( ! walker.wantsNonPrimaryReads() ) {
      -                    liveReads.remove();
                           killed = true;
                       }
                   } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
                       activeRegion.add( read );
                   }
       
      +            // if the read hasn't already been killed, check if it cannot occur in any more active regions, and maybe kill it
                   if ( ! killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) {
      -                liveReads.remove();
      +                killed = true;
                   }
      +
       +            // keep track of all of the still-live reads
      +            if ( ! killed ) stillLive.add(read);
               }
      +        myReads.addAll(stillLive);
       
               if ( logger.isDebugEnabled() ) {
                   logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc());
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java
      index cdb45db7b..5560946ea 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java
      @@ -78,4 +78,20 @@ public @interface ActiveRegionTraversalParameters {
            * @return the breadth of the band pass gaussian kernel we want for our traversal
            */
           public double bandPassSigma() default BandPassActivityProfile.DEFAULT_SIGMA;
      +
      +    /**
      +     * What is the maximum number of reads we're willing to hold in memory per sample
      +     * during the traversal?  This limits our exposure to unusually large amounts
      +     * of coverage in the engine.
      +     * @return the maximum number of reads we're willing to hold in memory
      +     */
      +    public int maxReadsToHoldInMemoryPerSample() default 3000;
      +
      +    /**
      +     * No matter what the per sample value says, we will never hold more than this
      +     * number of reads in memory at any time.  Provides an upper bound on the total number
      +     * of reads in the case where we have a lot of samples.
      +     * @return the maximum number of reads to hold in memory
      +     */
      +    public int maxReadsToHoldTotal() default 1000000;
       }
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
      index e14e50b1a..9595b8f42 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
      @@ -67,7 +67,7 @@ public abstract class ActiveRegionWalker extends Walker extends Walker extends WalkerThe allele balance is the fraction of ref bases over ref + alt bases.

      + * + *

      Caveats

      + *

      Note that this annotation will only work properly for biallelic het-called samples.

      + *

      This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

      */ public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java deleted file mode 100644 index 17a33bdca..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 1/3/13 - * Time: 11:36 AM - * To change this template use File | Settings | File Templates. 
- */ -public class AverageAltAlleleLength extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation, ExperimentalAnnotation { - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Average Allele Length")); - } - - public List getKeyNames() { return Arrays.asList("AAL"); } - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map perReadAlleleLikelihoodMap ) { - if ( !vc.hasLog10PError() ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - Map map = new HashMap(); - - double length = getMeanAltAlleleLength(vc); - map.put(getKeyNames().get(0),String.format("%.2f",length)); - return map; - } - - public static double getMeanAltAlleleLength(VariantContext vc) { - double averageLength = 1.0; - if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { - // adjust for the event length - int averageLengthNum = 0; - int averageLengthDenom = 0; - int refLength = vc.getReference().length(); - for ( Allele a : vc.getAlternateAlleles() ) { - int numAllele = vc.getCalledChrCount(a); - int alleleSize; - if ( a.length() == refLength ) { - // SNP or MNP - byte[] a_bases = a.getBases(); - byte[] ref_bases = vc.getReference().getBases(); - int n_mismatch = 0; - for ( int idx = 0; idx < a_bases.length; idx++ ) { - if ( a_bases[idx] != ref_bases[idx] ) - n_mismatch++; - } - alleleSize = n_mismatch; - } - else if ( a.isSymbolic() ) { - alleleSize = 1; - } else { - alleleSize = Math.abs(refLength-a.length()); - } - averageLengthNum += alleleSize*numAllele; - averageLengthDenom += numAllele; - } - averageLength = ( (double) averageLengthNum )/averageLengthDenom; - } - - return averageLength; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java deleted file mode 100644 index 65d2f0757..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ /dev/null @@ -1,85 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Fraction of all reads across samples that have mapping quality zero - */ -public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) - return null; - - int mq0 = 0; 
- int depth = 0; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - depth += context.size(); - final ReadBackedPileup pileup = context.getBasePileup(); - for (PileupElement p : pileup ) { - if ( p.getMappingQual() == 0 ) - mq0++; - } - } - if (depth > 0) { - double mq0f = (double)mq0 / (double )depth; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%1.4f", mq0f)); - return map; - } - else - return null; - } - - public List getKeyNames() { return Arrays.asList("MQ0Fraction"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Fraction of Mapping Quality Zero Reads")); } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 17002ba39..288196d1b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -42,14 +42,16 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; +import java.util.regex.Pattern; /** * A set of genomic annotations based on the output of the SnpEff variant effect predictor tool - * (http://snpeff.sourceforge.net/). * - * For each variant, chooses one of the effects of highest biological impact from the SnpEff + *

      See http://snpeff.sourceforge.net/ for more information on the SnpEff tool

      . + * + *

      For each variant, this tool chooses one of the effects of highest biological impact from the SnpEff * output file (which must be provided on the command line via --snpEffFile filename.vcf), - * and adds annotations on that effect. + * and adds annotations on that effect.

      * * @author David Roazen */ @@ -62,6 +64,8 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.5" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; + public static final String SNPEFF_GATK_COMPATIBILITY_ARGUMENT = "-o gatk"; + public static final Pattern SNPEFF_GATK_COMPATIBILITY_ARGUMENT_PATTERN = Pattern.compile("-o\\s+gatk"); // When we write the SnpEff version number and command line to the output VCF, we change // the key name slightly so that the output VCF won't be confused in the future for an @@ -218,8 +222,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY); VCFHeaderLine snpEffCommandLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_COMMAND_LINE_KEY); - checkSnpEffVersion(snpEffVersionLine); - checkSnpEffCommandLine(snpEffCommandLine); + checkSnpEffVersionAndCommandLine(snpEffVersionLine, snpEffCommandLine); // If everything looks ok, add the SnpEff version number and command-line header lines to the // header of the VCF output file, changing the key names so that our output file won't be @@ -266,37 +269,45 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio } } - private void checkSnpEffVersion ( VCFHeaderLine snpEffVersionLine ) { + private void checkSnpEffVersionAndCommandLine( final VCFHeaderLine snpEffVersionLine, final VCFHeaderLine snpEffCommandLine ) { if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) { - throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " + - "input file, and so could not verify that the file 
was generated by a supported version of SnpEff (" + - Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); + throw new UserException(String.format("Could not find a %s entry in the VCF header for the SnpEff input file, " + + "and so could not verify that the file was generated by a supported version of SnpEff (%s)", + SNPEFF_VCF_HEADER_VERSION_LINE_KEY, supportedSnpEffVersionsString())); + } + + if ( snpEffCommandLine == null || snpEffCommandLine.getValue() == null || snpEffCommandLine.getValue().trim().length() == 0 ) { + throw new UserException(String.format("Could not find a %s entry in the VCF header for the SnpEff input file, " + + "which should be added by all supported versions of SnpEff (%s)", + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY, supportedSnpEffVersionsString())); } String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0]; - if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) { - throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " + - "is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS)); + if ( ! isSupportedSnpEffVersion(snpEffVersionString, snpEffCommandLine.getValue()) ) { + throw new UserException(String.format("The version of SnpEff used to generate the SnpEff input file (%s) " + + "is not currently supported by the GATK, and was not run in GATK " + + "compatibility mode. 
Supported versions are: %s", + snpEffVersionString, supportedSnpEffVersionsString())); } } - private void checkSnpEffCommandLine ( VCFHeaderLine snpEffCommandLine ) { - if ( snpEffCommandLine == null || snpEffCommandLine.getValue() == null || snpEffCommandLine.getValue().trim().length() == 0 ) { - throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY + " entry in the VCF header for the SnpEff " + - "input file, which should be added by all supported versions of SnpEff (" + - Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); - } - } - - private boolean isSupportedSnpEffVersion ( String versionString ) { + private boolean isSupportedSnpEffVersion( final String versionString, final String commandLine ) { + // first check to see if it's an officially-supported version for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) { if ( supportedVersion.equals(versionString) ) { return true; } } - return false; + // if it's not an officially-supported version, check to see whether the + // "-o gatk" compatibility option was specified + return SNPEFF_GATK_COMPATIBILITY_ARGUMENT_PATTERN.matcher(commandLine).find(); + } + + private String supportedSnpEffVersionsString() { + return String.format("%s, as well as later versions when run with the option %s", + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS), SNPEFF_GATK_COMPATIBILITY_ARGUMENT); } private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java deleted file mode 100644 index dbaafb1ed..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and 
associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; - 
-import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Counts of bases from Illumina, 454, and SOLiD at this site - */ -@Hidden -public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation { - private String nIllumina = "NumIllumina"; - private String n454 ="Num454"; - private String nSolid = "NumSOLiD"; - private String nOther = "NumOther"; - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) - return null; - - int readsIllumina = 0; - int readsSolid = 0; - int reads454 = 0; - int readsOther = 0; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - for ( PileupElement p : pileup ) { - if(ReadUtils.is454Read(p.getRead())) - reads454++; - else if (ReadUtils.isSOLiDRead(p.getRead())) - readsSolid++; - else if (ReadUtils.isIlluminaRead(p.getRead())) - readsIllumina++; - else - readsOther++; - } - } - - Map map = new HashMap(); - map.put(nIllumina, String.format("%d", readsIllumina)); - map.put(n454, String.format("%d", reads454)); - map.put(nSolid, String.format("%d", readsSolid)); - map.put(nOther, String.format("%d", readsOther)); - return map; - } - - public List getKeyNames() { return Arrays.asList(nIllumina,n454,nSolid,nOther); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(nIllumina, 1, VCFHeaderLineType.Integer, "Number of Illumina reads"), - new VCFInfoHeaderLine(n454, 1, VCFHeaderLineType.Integer, "Number of 454 reads"), - new VCFInfoHeaderLine(nSolid, 1, VCFHeaderLineType.Integer, "Number of SOLiD reads"), - new VCFInfoHeaderLine(nOther, 1, VCFHeaderLineType.Integer, "Number of Other technology 
reads")); } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 826dc9f22..f2bd6c14c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -36,18 +36,18 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import java.util.*; - /** * Annotates variant calls with context information. * @@ -55,17 +55,17 @@ import java.util.*; * VariantAnnotator is a GATK tool for annotating variant calls based on their context. * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself. * - *

      Input

      + *

      Input

      *

      * A variant set to annotate and optionally one or more BAM files. *

      * - *

      Output

      + *

      Output

      *

      * An annotated VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -125,7 +125,7 @@ public class VariantAnnotator extends RodWalker implements Ann
           public List> resources = Collections.emptyList();
           public List> getResourceRodBindings() { return resources; }
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter vcfWriter = null;
       
           /**
      @@ -142,7 +142,8 @@ public class VariantAnnotator extends RodWalker implements Ann
           protected List annotationsToExclude = new ArrayList();
       
           /**
      -     * See the -list argument to view available groups.
      +     * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups.
      +     * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs.
            */
           @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
           protected List annotationGroupsToUse = new ArrayList();
      @@ -155,7 +156,7 @@ public class VariantAnnotator extends RodWalker implements Ann
            * If multiple records in the rod overlap the given position, one is chosen arbitrarily.
            */
           @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
      -    protected List expressionsToUse = new ArrayList();
      +    protected Set expressionsToUse = new ObjectOpenHashSet();
       
           /**
            * Note that the -XL argument can be used along with this one to exclude annotations.
      @@ -164,19 +165,19 @@ public class VariantAnnotator extends RodWalker implements Ann
           protected Boolean USE_ALL_ANNOTATIONS = false;
       
           /**
      -     * Note that the --list argument requires a fully resolved and correct command-line to work.
      +     * Note that the --list argument requires a fully resolved and correct command-line to work. As a simpler alternative, you can use ListAnnotations (see Help Utilities).
            */
      -    @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
      +    @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit", required=false)
           protected Boolean LIST = false;
       
           /**
            * By default, the dbSNP ID is added only when the ID field in the variant VCF is empty.
            */
      -    @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated")
      +    @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated", required=false)
           protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
           public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
       
      -    @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio")
      +    @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio")
           public double minGenotypeQualityP = 0.0;
       
           @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false)
      @@ -184,33 +185,15 @@ public class VariantAnnotator extends RodWalker implements Ann
       
           private VariantAnnotatorEngine engine;
       
      -
      -    private void listAnnotationsAndExit() {
      -        System.out.println("\nStandard annotations in the list below are marked with a '*'.");
      -        List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins();
      -        System.out.println("\nAvailable annotations for the VCF INFO field:");
      -        for (int i = 0; i < infoAnnotationClasses.size(); i++)
      -            System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? "*" : "") + infoAnnotationClasses.get(i).getSimpleName());
      -        System.out.println();
      -        List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins();
      -        System.out.println("\nAvailable annotations for the VCF FORMAT field:");
      -        for (int i = 0; i < genotypeAnnotationClasses.size(); i++)
      -            System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName());
      -        System.out.println();
      -        System.out.println("\nAvailable classes/groups of annotations:");
      -        for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() )
      -            System.out.println("\t" + c.getSimpleName());
      -        System.out.println();
      -        System.exit(0);
      -    }
      -
           /**
            * Prepare the output file and the list of available features.
            */
           public void initialize() {
       
      -        if ( LIST )
      -            listAnnotationsAndExit();
      +        if ( LIST ) {
      +            HelpUtils.listAnnotations();
      +            System.exit(0);
      +        }
       
               // get the list of all sample names from the variant VCF input rod, if applicable
               List rodName = Arrays.asList(variantCollection.variants.getName());
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
      index c5703afc8..695868bb1 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
      @@ -104,7 +104,7 @@ public class VariantAnnotatorEngine {
           }
       
           // select specific expressions to use
      -    public void initializeExpressions(List expressionsToUse) {
      +    public void initializeExpressions(Set expressionsToUse) {
               // set up the expressions
               for ( String expression : expressionsToUse )
                   requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java
      index 221887158..59b4b1b3b 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java
      @@ -25,6 +25,7 @@
       
       package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
       
      +import org.broadinstitute.sting.utils.DeprecatedToolChecks;
       import org.broadinstitute.sting.utils.classloader.PluginManager;
       import org.broadinstitute.sting.utils.exceptions.UserException;
       
      @@ -58,7 +59,7 @@ public class AnnotationInterfaceManager {
                       if ( interfaceClass == null )
                           interfaceClass = classMap.get(group + "Annotation");
                       if ( interfaceClass == null )
      -                    throw new UserException.BadArgumentValue("group", "Class " + group + " is not found; please check that you have specified the class name correctly");
      +                    throw new UserException.BadArgumentValue("group", "Annotation group " + group + " was not found; please check that you have specified the group name correctly");
                   }
               }
       
      @@ -67,8 +68,13 @@ public class AnnotationInterfaceManager {
                   Class annotationClass = classMap.get(annotation);
                   if ( annotationClass == null )
                       annotationClass = classMap.get(annotation + "Annotation");
      -            if ( annotationClass == null )
      -                throw new UserException.BadArgumentValue("annotation", "Class " + annotation + " is not found; please check that you have specified the class name correctly");
      +            if ( annotationClass == null ) {
      +                if (DeprecatedToolChecks.isDeprecatedAnnotation(annotation) ) {
      +                    throw new UserException.DeprecatedAnnotation(annotation, DeprecatedToolChecks.getAnnotationDeprecationInfo(annotation));
      +                } else {
      +                    throw new UserException.BadArgumentValue("annotation", "Annotation " + annotation + " was not found; please check that you have specified the annotation name correctly");
      +                }
      +            }
               }
           }
       
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java
      index 2e85fe8f9..15bd79586 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java
      @@ -61,7 +61,7 @@ import static java.lang.Math.log10;
        * Note that this walker requires all input files produced by Beagle.
        *
        *
      - * 

      Example

      + *

      Example

      *
        *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
        *      -R reffile.fasta -T BeagleOutputToVCF \
      @@ -106,7 +106,7 @@ public class BeagleOutputToVCF extends RodWalker {
           @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true)
           public RodBinding beaglePhased;
       
      -    @Output(doc="VCF File to which variants should be written",required=true)
      +    @Output(doc="VCF File to which variants should be written")
           protected VariantContextWriter vcfWriter = null;
       
           /**
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java
      index 937c3abc0..6e5aa250f 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java
      @@ -57,7 +57,7 @@ import java.util.*;
        *  Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
        * 

      * - *

      Input

      + *

      Input

      *

      * A VCF with variants to convert to Beagle format *

      @@ -70,7 +70,7 @@ import java.util.*; * Optional: A file with a list of markers *

      * - *

      Examples

      + *

      Examples

      *
        *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
        *      -R reffile.fasta -T ProduceBeagleInput \
      @@ -89,11 +89,11 @@ public class ProduceBeagleInput extends RodWalker {
           public RodBinding validation;
       
       
      -    @Output(doc="File to which BEAGLE input should be written",required=true)
      +    @Output(doc="File to which BEAGLE input should be written")
           protected PrintStream  beagleWriter = null;
       
           @Hidden
      -     @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
      +    @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false, defaultToStdout = false)
           protected PrintStream  markers = null;
           int markerCounter = 1;
       
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java
      index ab0ce79fd..646c57a2b 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java
      @@ -62,7 +62,7 @@ public class VariantsToBeagleUnphased extends RodWalker {
           @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true)
           public RodBinding variants;
       
      -    @Output(doc="File to which BEAGLE unphased genotypes should be written",required=true)
      +    @Output(doc="File to which BEAGLE unphased genotypes should be written")
           protected PrintStream  beagleWriter = null;
       
           @Argument(fullName = "bootstrap_fraction", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
      index 0681ebf1e..6af6723f2 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
      @@ -40,7 +40,6 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
       import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
       import org.broadinstitute.sting.utils.help.HelpConstants;
       import org.broadinstitute.sting.utils.pileup.PileupElement;
      -import org.broadinstitute.sting.utils.BaseUtils;
       
       import java.io.File;
       import java.io.FileNotFoundException;
      @@ -70,12 +69,12 @@ import java.io.PrintStream;
        * 
        * 

      *

      - *

      Input

      + *

      Input

      *

      * A BAM file containing exactly one sample. *

      *

      - *

      Output

      + *

      Output

      *

      *

        *
      • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
      • @@ -83,7 +82,7 @@ import java.io.PrintStream; *
      *

      *

      - *

      Examples

      + *

      Examples

      *
        *     -T CallableLociWalker \
        *     -I my.bam \
      @@ -314,13 +313,14 @@ public class CallableLoci extends LocusWalker= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
      -                    QCDepth++;
      +                    QCDepth += depth;
                       }
                   }
       
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
      index 3bd114aa1..c4ef4d23b 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
      @@ -66,7 +66,7 @@ import java.util.*;
        * and/or percentage of bases covered to or beyond a threshold.
        * Additionally, reads and bases can be filtered by mapping or base quality score.
        *
      - * 

      Input

      + *

      Input

      *

      * One or more bam files (with proper headers) to be analyzed for coverage statistics *

      @@ -75,7 +75,7 @@ import java.util.*; *

      * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) *

      - *

      Output

      + *

      Output

      *

      * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: *

      @@ -98,7 +98,7 @@ import java.util.*; * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -117,7 +117,7 @@ import java.util.*;
       // todo -- alter logarithmic scaling to spread out bins more
       // todo -- allow for user to set linear binning (default is logarithmic)
       // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
      -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
      +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
       @By(DataSource.REFERENCE)
       @PartitionBy(PartitionType.NONE)
       @Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE)
      @@ -577,7 +577,8 @@ public class DepthOfCoverage extends LocusWalkerInput
      + * 

      Input

      *

      * A reference file *

      * - *

      Output

      + *

      Output

      *

      * GC content calculations per interval. *

      * - *

      Examples

      + *

      Example

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      - *   -R ref.fasta \
        *   -T GCContentByInterval \
      + *   -R ref.fasta \
        *   -o output.txt \
        *   -L input.intervals
        * 
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java index a5a8edb0c..506ef2c72 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java @@ -29,12 +29,15 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -44,23 +47,26 @@ import java.io.*; import java.util.Collection; /** - * print intervals file with all the variant sites that have "most" ( >= 90% by default) of the samples with "good" (>= 10 by default)coverage ("most" and "good" can be set in the command line). + * Print intervals file with all the variant sites for which most of the samples have good coverage * *

      - * CoveredByNSamplesSites is a GATK tool for filter out sites based on their coverage. + * CoveredByNSamplesSites is a GATK tool for filtering out sites based on their coverage. * The sites that pass the filter are printed out to an intervals file. * - *

      Input

      + * See argument defaults for what constitutes "most" samples and "good" coverage. These parameters can be modified from the command line. + *

      + * + *

      Input

      *

      * A variant file and optionally min coverage and sample percentage values. *

      * - *

      Output

      + *

      Output

      *

      * An intervals file. *

      * - *

      Examples

      + *

      Example

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -71,20 +77,20 @@ import java.util.Collection;
        * 
      * */ - +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @By(DataSource.REFERENCE_ORDERED_DATA) public class CoveredByNSamplesSites extends RodWalker implements TreeReducible { - @Output(fullName = "OutputIntervals", shortName = "out", doc = "Name of file for output intervals", required = true) + @Output(fullName = "OutputIntervals", shortName = "out", doc = "Name of file for output intervals") PrintStream outputStream; @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(fullName = "minCoverage", shortName = "minCov",doc = "only samples that have covarage bigger then minCoverage will be counted",required = false) + @Argument(fullName = "minCoverage", shortName = "minCov",doc = "only samples that have coverage bigger than minCoverage will be counted",required = false) int minCoverage = 10; - @Argument(fullName = "precentageOfSamples", shortName = "percentage", doc = "only sites where at list percentageOfSamples of the samples have good coverage, will be emited", required = false) + @Argument(fullName = "percentageOfSamples", shortName = "percentage", doc = "only sites where at least percentageOfSamples of the samples have good coverage, will be emitted", required = false) double percentageOfSamples = 0.9; @Override @@ -95,8 +101,6 @@ public class CoveredByNSamplesSites extends RodWalker implem Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); if ( VCs.size() == 0 ) return null; - if(VCs.size() != 1) - throw new RuntimeException("there are more then one vc: "+VCs.size()); boolean emitSite = false; for(VariantContext vc : VCs){ @@ -135,12 +139,11 @@ public class CoveredByNSamplesSites extends RodWalker implem } /** - * Tell the user the number of sites processed and how many passed. Close out the new intervals file. 
* - * @param result pair of *the number of sites seen and number of sites passed the filter. + * @param result the number of sites that passed the filter. */ public void onTraversalDone(Integer result) { - logger.info(result+" sites that have "+(percentageOfSamples*100)+"% of the samples with at list "+minCoverage+" coverage.\n"); + logger.info(result+" sites that have "+(percentageOfSamples*100)+"% of the samples with at least "+minCoverage+" coverage.\n"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index 76f5478a4..86676ca54 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -49,12 +49,12 @@ import java.io.PrintStream; * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS. * - *

      Input

      + *

      Input

      *

      * Any number of BAM files *

      * - *

      Output

      + *

      Output

      *

      * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. * @@ -82,7 +82,7 @@ import java.io.PrintStream; *

      *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
      index de7ac3e41..0af1dbed5 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
      @@ -53,12 +53,12 @@ import java.util.Map;
        * the median statistics are well determined.  It is safe to run it WG and it'll finish in an appropriate
        * timeframe.
        *
      - * 

      Input

      + *

      Input

      *

      * Any number of BAM files *

      * - *

      Output

      + *

      Output

      *

      * GATKReport containing read group, sample, library, platform, center, median insert size and median read length. * @@ -86,7 +86,7 @@ import java.util.Map; *

      *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
      index ccad7f0b2..a269a94bc 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
      @@ -49,17 +49,17 @@ import java.util.List;
        *  

      * * - *

      Input

      + *

      Input

      *

      * A BAM file. *

      * - *

      Output

      + *

      Output

      *

      * A human/R readable table of tab separated values with one column per sample and one row per read. *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
      index 7ac59790c..c909eb2d5 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
      @@ -83,7 +83,7 @@ public class DiffEngine {
                   DiffElement masterElt = master.getElement(name);
                   DiffElement testElt = test.getElement(name);
                   if ( masterElt == null && testElt == null ) {
      -                throw new ReviewedStingException("BUG: unexceptedly got two null elements for field: " + name);
      +                throw new ReviewedStingException("BUG: unexpectedly got two null elements for field: " + name);
                   } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value
                       // todo -- should one of these be a special MISSING item?
                       diffs.add(new Difference(masterElt, testElt));
      @@ -283,8 +283,7 @@ public class DiffEngine {
               // now that we have a specific list of values we want to show, display them
               GATKReport report = new GATKReport();
               final String tableName = "differences";
      -        // TODO for Geraldine -- link needs to be updated below
      -        report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", 3);
      +        report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gatk/guide/article?id=1299 for more information", 3);
               final GATKReportTable table = report.getTable(tableName);
               table.addColumn("Difference");
               table.addColumn("NumberOfOccurrences");
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java
      index d1903c2bb..524f5c250 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java
      @@ -68,12 +68,12 @@ import java.util.List;
        *      The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them.  This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.
        * 

      * - *

      Input

      + *

      Input

      *

      * The DiffObjectsWalker works with BAM or VCF files. *

      * - *

      Output

      + *

      Output

      *

      * The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named * nodes. Suppose I have two trees: @@ -132,6 +132,10 @@ import java.util.List; [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1

      * + *

      Caveat

      + *

      Because this is a walker, it requires that you pass a reference file. However the reference is not actually used, so it does not matter what you pass as reference.

      + * + * * @author Mark DePristo * @since 7/4/11 */ @@ -140,10 +144,9 @@ public class DiffObjects extends RodWalker { /** * Writes out a file of the DiffEngine format: * - * TODO for Geraldine -- link needs to be updated below (and also in SelectVariants and RefSeqCodec GATK docs) - * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine + * See http://www.broadinstitute.org/gatk/guide/article?id=1299 for details. */ - @Output(doc="File to which results should be written",required=true) + @Output(doc="File to which results should be written") protected PrintStream out; /** @@ -169,7 +172,7 @@ public class DiffObjects extends RodWalker { @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) int MAX_OBJECTS_TO_READ = -1; - @Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) + @Argument(fullName="maxRawDiffsToSummarize", shortName="maxRawDiffsToSummarize", doc="Max. number of differences to include in the summary. 
-1 [default] means unlimited", required=false) int maxRawDiffsToSummary = -1; @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index e881315b9..d2f2e32b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -60,17 +60,17 @@ import java.util.List; * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * - *

      Input

      + *

      Input

      *

      * The reference, requested intervals, and any number of variant rod files. *

      * - *

      Output

      + *

      Output

      *

      * A fasta file representing the requested intervals. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java
      index f2f5fb5fe..fb7941fec 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java
      @@ -48,17 +48,17 @@ import java.io.PrintStream;
        * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
        * separate fasta sequence (named numerically in order).
        *
      - * 

      Input

      + *

      Input

      *

      * The reference and requested intervals. *

      * - *

      Output

      + *

      Output

      *

      * A fasta file representing the requested intervals. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
      index 9fbaca14e..8883523d9 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
      @@ -38,7 +38,27 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
       import java.io.PrintStream;
       
       /**
      - * Calculates basic statistics about the reference sequence itself
      + * Calculate basic statistics about the reference sequence itself
      + *
      + * 

      These are very basic statistics: total number of bases and number of "regular" bases (i.e. A, C, T or G).

      + * + *

      Input

      + *

      + * A FASTA reference file. + *

      + * + *

      Output

      + *

      + * Base counts are written to file if an output file name is given (with -o), otherwise output to stdout. + *

      + * + *

      Example

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -T FastaStats \
      + *   -R ref.fasta \
      + *   [-o output.txt]
      + * 
      */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class FastaStats extends RefWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index 61a847f4c..83d4d81d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -55,17 +55,17 @@ import java.util.*; * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria. * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. * - *

      Input

      + *

      Input

      *

      * A variant set to filter. *

      * - *

      Output

      + *

      Output

      *

      * A filtered VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -87,12 +87,13 @@ public class VariantFiltration extends RodWalker {
           protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
       
           /**
      -     * Any variant which overlaps entries from the provided mask rod will be filtered.
      +     * Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed,
      +     * i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used.
            */
      -    @Input(fullName="mask", doc="Input ROD mask", required=false)
      +    @Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false)
           public RodBinding mask;
       
      -    @Output(doc="File to which variants should be written", required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter writer = null;
       
           /**
      @@ -114,7 +115,7 @@ public class VariantFiltration extends RodWalker {
            * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
            * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
            */
      -    @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
      +    @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
           protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList();
       
           /**
      @@ -140,6 +141,14 @@ public class VariantFiltration extends RodWalker {
           @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
           protected String MASK_NAME = "Mask";
       
      +    /**
      +     * By default, if the -mask argument is used, any variant falling in a mask will be filtered.
      +     * If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered.
      +     * Use case is, for example, if we have an interval list or BED file with "good" sites.
      +     */
      +    @Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false)
      +    protected boolean filterRecordsNotInMask = false;
      +
           /**
            * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
            * Use this argument to have it evaluate as failing filters instead for these cases.
      @@ -177,16 +186,22 @@ public class VariantFiltration extends RodWalker {
               if ( clusterWindow > 0 )
                   hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
       
      -        for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
      -            hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
      -        for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
      -            hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
      -
               if ( genotypeFilterExps.size() > 0 )
                   hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
       
      -        if ( mask.isBound() ) {
      -            hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
      +        try {
      +            for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
      +                hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
      +            for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
      +                hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
      +
      +            if ( mask.isBound() ) {
      +                if (filterRecordsNotInMask)
      +                    hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask"));
      +                else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
      +            }
      +        } catch (IllegalArgumentException e) {
      +            throw new UserException.BadInput(e.getMessage());
               }
       
               writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
      @@ -199,6 +214,8 @@ public class VariantFiltration extends RodWalker {
               if ( MASK_EXTEND < 0 )
                    throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
       
      +        if (filterRecordsNotInMask && !mask.isBound())
      +            throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided");
               filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS);
               genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS);
       
      @@ -223,7 +240,7 @@ public class VariantFiltration extends RodWalker {
               Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation());
       
               // is there a SNP mask present?
      -        boolean hasMask = tracker.hasValues(mask);
      +        boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
               if ( hasMask )
                   previousMaskPosition = ref.getLocus();  // multi-base masks will get triggered over all bases of the mask
       
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java
      new file mode 100644
      index 000000000..21b66986a
      --- /dev/null
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java
      @@ -0,0 +1,54 @@
      +/*
      +* Copyright (c) 2012 The Broad Institute
      +* 
      +* Permission is hereby granted, free of charge, to any person
      +* obtaining a copy of this software and associated documentation
      +* files (the "Software"), to deal in the Software without
      +* restriction, including without limitation the rights to use,
      +* copy, modify, merge, publish, distribute, sublicense, and/or sell
      +* copies of the Software, and to permit persons to whom the
      +* Software is furnished to do so, subject to the following
      +* conditions:
      +* 
      +* The above copyright notice and this permission notice shall be
      +* included in all copies or substantial portions of the Software.
      +* 
      +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
      +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
      +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
      +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
      +* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
      +
      +import net.sf.samtools.SAMRecord;
      +import org.apache.log4j.Logger;
      +import org.broadinstitute.sting.commandline.Argument;
      +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
      +import org.broadinstitute.sting.gatk.filters.ReadFilter;
      +
      +/**
      + * Filter out reads with low mapping qualities.
      + *
      + * @author mdepristo
      + */
      +public class HCMappingQualityFilter extends ReadFilter {
      +    private final static Logger logger = Logger.getLogger(HCMappingQualityFilter.class);
      +
+    @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for analysis with the HaplotypeCaller", required = false)
+    public int MIN_MAPPING_QUALITY_SCORE = 20;
+
+    @Override
+    public void initialize(GenomeAnalysisEngine engine) {
+        if ( MIN_MAPPING_QUALITY_SCORE > 0 )
+            logger.info("Filtering out reads with MAPQ < " + MIN_MAPPING_QUALITY_SCORE);
+    }
+
+    public boolean filterOut(SAMRecord rec) {
+        return (rec.getMappingQuality() < MIN_MAPPING_QUALITY_SCORE);
+    }
      +}
      \ No newline at end of file
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java
      index 503cdb6d6..8b82e50a7 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java
      @@ -38,17 +38,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       /**
        * Walks over the input data set, calculating the number of bases seen for diagnostic purposes.
        *
      - * 

      Input

      + *

      Input

      *

      * One or more BAM files. *

      * - *

      Output

      + *

      Output

      *

      * Number of bases seen. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
      index 3b8eba398..e7b6df623 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
      @@ -45,9 +45,42 @@ import java.util.Collections;
       import java.util.List;
       
       /**
      - * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
      - * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
      - * This was its very first use.
      + * Count contiguous regions in an interval list.
      + *
      + * 

      When the GATK reads in intervals from an intervals list, any intervals that overlap each other get merged into + * a single interval spanning the original ones. For example, if you have the following intervals: + *

      • + * 20:1-2000 + *
      • + * 20:1500-3000 + *
      + * They will be merged into a single interval: + *
      • 20:1-3000
      + * + * This tool allows you to check, for a given list of intervals, how many separate intervals the GATK will actually + * distinguish at runtime. + *

      + * + *

      Input

      + *

      + * One or more rod files containing intervals to check. + *

      + * + *

      Output

      + *

      + * Number of separate intervals identified by GATK after merging overlapping intervals. + *

      + * + * You can use the -numOverlaps argument to find out how many cases you have of a specific number of overlaps. + * + *

      Example

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -T CountIntervals \
      + *   -R ref.fasta \
+ *   -o output.txt \
      + *   -check intervals.list
      + * 
      */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class CountIntervals extends RefWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java index f2bd791c1..d999dfebf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java @@ -42,33 +42,34 @@ import java.io.PrintStream; * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. * *

      - * Simplest example of a locus walker. + * This is the simplest example of a locus walker. + *

      * - * - *

      Input

      + *

      Input

      *

      * One or more BAM files. *

      * - *

      Output

      + *

      Output

      *

      - * Number of loci traversed. + * Number of loci traversed. If an output file name is provided, then the result will be written to that file. + * Otherwise it will be sent to standard console output. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      - *   -R ref.fasta \
        *   -T CountLoci \
      - *   -o output.txt \
      + *   -R ref.fasta \
        *   -I input.bam \
      + *   -o output.txt \
        *   [-L input.intervals]
        * 
      * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class CountLoci extends LocusWalker implements TreeReducible, NanoSchedulable { - @Output(doc="Write count to this file instead of STDOUT") + @Output PrintStream out; public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java index 6fb4b84d6..7279a64a4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -37,12 +38,36 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; + /** * Walks over the input data set, calculating the number of reads seen from male samples for diagnostic purposes. + * + *

      Input

      + *

      + * One or more BAM files. + *

      + * + *

      Output

      + *

      + * Number of reads seen from male samples. + *

      + * + *

      Examples

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -T CountMales \
      + *   -R ref.fasta \
      + *   -I samples.bam \
      + *   -o output.txt
      + * 
      */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMales extends ReadWalker { + @Output + public PrintStream out; + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { Sample sample = getSampleDB().getSample(read); return sample.getGender() == Gender.MALE ? 1 : 0; @@ -53,4 +78,8 @@ public class CountMales extends ReadWalker { public Integer reduce(Integer value, Integer sum) { return value + sum; } + + public void onTraversalDone( Integer c ) { + out.println(c); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index c01a1df89..65f82efe4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -53,22 +53,32 @@ import java.util.*; /** * Prints out counts of the number of reference ordered data objects encountered. * + *

      CountRods is a RODWalker, and so traverses the data by ROD. For example if the ROD passed to it is a VCF file, + * it will count the variants in the file.

      * - *

      Input

      + *

      Note that this tool is different from CountRodsByRef which is a RefWalker, and so traverses the data by + * position along the reference. CountRodsByRef can count ROD elements (such as, but not limited to, variants) found + * at each position or within specific intervals if you use the -L argument (see CommandLineGATK).

      + * + *

      Both these tools are different from CountVariants in that they are more generic (they can also count RODs that + * are not variants) and CountVariants is more detailed, in that it computes additional statistics (type of variants + * being indels vs. SNPs etc).

      + * + *

      Input

      *

      * One or more rod files. *

      * - *

      Output

      + *

      Output

      *

      * Number of rods seen. *

      * - *

      Examples

      + *

      Example

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      - *   -R ref.fasta \
        *   -T CountRODs \
      + *   -R ref.fasta \
        *   -o output.txt \
        *   --rod input.vcf
        * 
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java index 303f1704f..594ca239d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java @@ -43,24 +43,34 @@ import java.util.Collections; import java.util.List; /** - * Prints out counts of the number of reference ordered data objects encountered. + * Prints out counts of the number of reference ordered data objects encountered along the reference. * + *

      CountRodsByRef is a RefWalker, and so traverses the data by position along the reference. It counts ROD + * elements (such as, but not limited to, variants) found at each position or within specific intervals if you use + * the -L argument (see CommandLineGATK).

      * - *

      Input

      + *

      Note that this tool is different from the basic CountRods, which is a RODWalker, and so traverses the data by + * ROD. For example if the ROD passed to it is a VCF file, CountRods will simply count the variants in the file.

      + * + *

      Both these tools are different from CountVariants in that they are more generic (they can also count RODs that + * are not variants) and CountVariants is more detailed, in that it computes additional statistics (type of variants + * being indels vs. SNPs etc).

      + * + *

      Input

      *

      * One or more rod files. *

      * - *

      Output

      + *

      Output

      *

      * Number of rods seen. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      - *   -R ref.fasta \
        *   -T CountRODsByRef \
      + *   -R ref.fasta \
        *   -o output.txt \
        *   --rod input.vcf
        * 
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java index 8b0646092..cfb7325a9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java @@ -47,22 +47,22 @@ import java.util.Map; /** * Walks over the input data set, counting the number of read events (from the CIGAR operator) * - *

      Input

      + *

      Input

      *

      * One or more BAM files. *

      * - *

      Output

      + *

      Output

      *

      - * Number of reads events for each category + * Number of read events for each category, formatted as a GATKReport table. * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      - *   -R ref.fasta \
        *   -T CountReadEvents \
      - *   -o output.grp \
      + *   -R ref.fasta \
        *   -I input.bam \
      + *   -o output.grp \
        *   [-L input.intervals]
        * 
      */ @@ -70,7 +70,7 @@ import java.util.Map; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadEvents extends ReadWalker> , Map>> { - @Output (doc = "GATKReport table output") + @Output PrintStream out; public Map> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 1a3984014..825fcac90 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -44,17 +44,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * --read-filter command line argument). Simplest example of a read-backed analysis. * * - *

      Input

      + *

      Input

      *

      * One or more BAM files. *

      * - *

      Output

      + *

      Output

      *

      * Number of reads seen. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java
      index 40b78588f..54562aa43 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java
      @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.qc;
       
       import net.sf.samtools.CigarElement;
       import net.sf.samtools.CigarOperator;
      +import org.broadinstitute.sting.commandline.Output;
       import org.broadinstitute.sting.gatk.CommandLineGATK;
       import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
       import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
      @@ -39,22 +40,23 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
       import org.broadinstitute.sting.utils.help.HelpConstants;
       import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
       
      +import java.io.PrintStream;
       import java.util.List;
       
       /**
        * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
        *
      - * 

      Input

      + *

      Input

      *

      * One or more BAM files. *

      * - *

      Output

      + *

      Output

      *

      * Number of reads ending in each category. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -67,6 +69,9 @@ import java.util.List;
       @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
       @Requires({DataSource.READS, DataSource.REFERENCE})
       public class CountTerminusEvent extends ReadWalker, Pair> {
      +    @Output
      +    public PrintStream out;
      +
           public Pair map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) {
               List cigarElements = read.getCigar().getCigarElements();
       
      @@ -94,6 +99,6 @@ public class CountTerminusEvent extends ReadWalker, Pair result) {
      -        System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond()));
      +        out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond()));
           }
       }
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java
      index 8902773f7..5db67a7f0 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java
      @@ -73,7 +73,7 @@ public class DocumentationTest extends RodWalker {
           @Input(fullName="featureArg", shortName = "featureArg", doc="A RodBinding of feature", required=false)
           private RodBinding featureArg = null;
       
      -    @Output(doc="VCFWriter",required=true)
      +    @Output(doc="VCFWriter")
           protected VariantContextWriter vcfWriter = null;
       
           @Advanced
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java
      index d0a3f3508..17fb4e322 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java
      @@ -41,36 +41,31 @@ import java.io.PrintStream;
       import java.text.DecimalFormat;
       import java.text.NumberFormat;
       
      -
      -/*
      - * Copyright (c) 2009 The Broad Institute
      - *
      - * Permission is hereby granted, free of charge, to any person
      - * obtaining a copy of this software and associated documentation
      - * files (the "Software"), to deal in the Software without
      - * restriction, including without limitation the rights to use,
      - * copy, modify, merge, publish, distribute, sublicense, and/or sell
      - * copies of the Software, and to permit persons to whom the
      - * Software is furnished to do so, subject to the following
      - * conditions:
      - *
      - * The above copyright notice and this permission notice shall be
      - * included in all copies or substantial portions of the Software.
      - *
      - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
      - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
      - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
      - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
      - * OTHER DEALINGS IN THE SOFTWARE.
      - */
      -
       /**
      - * A reimplementation of the 'samtools flagstat' subcommand in the GATK.  Walks
      - * over all input data, accumulating statistics such as total number of reads,
      - * reads with QC failure flag set, number of duplicates, percentage mapped, etc.
      + * A reimplementation of the 'samtools flagstat' subcommand in the GATK
      + *
      + * 

      This tool walks over all input data, accumulating statistics such as total number of reads, + * reads with QC failure flag set, number of duplicates, percentage mapped, etc.

      + * + *

      Input

      + *

      + * A BAM file containing the sequence data. + *

      + * + *

      Output

      + *

      + * Resulting stats are written to file if an output file name is given (with -o), otherwise output to stdout. + *

      + * + *

      Example

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -T FlagStat \
      + *   -R ref.fasta \
      + *   -I reads.bam \
      + *   [-o output.txt]
      + * 
      + * * @author aaron */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java index 0790f2ced..bc98c670a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java @@ -49,11 +49,33 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * Emulates the samtools pileup command to print aligned reads + * + *

      Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. * - * Associated command: + * Emulated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] + * + *

      Input

      + *

      + * A BAM file and the interval to print. + *

      + * + *

      Output

      + *

      + * Formatted pileup-style alignment of reads. + *

      + * + *

      Example

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -T Pileup \
      + *   -R ref.fasta \
      + *   -I aligned_reads.bam \
      + *   -o output.txt
      + * 
      + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { @@ -68,7 +90,7 @@ public class Pileup extends LocusWalker implements TreeReducibl * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per * read items are delimited with an '@' character. */ - @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output") + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) public boolean SHOW_VERBOSE = false; @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java index 395945f03..48bd6feba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java @@ -44,17 +44,17 @@ import java.io.PrintStream; * Quality control for the reference fasta * * - *

      Input

      + *

      Input

      *

      * One reference file only. And optionally -L intervals *

      * - *

      Output

      + *

      Output

      *

      * If ok, nothing, else will throw an exception at the site where there's been a problem *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
      index f7b125828..879022299 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
      @@ -88,12 +88,12 @@ import java.util.regex.Pattern;
        *
        * 

      * - *

      Input

      + *

      Input

      *

      * Any number of BAM files. *

      * - *

      Output

      + *

      Output

      *

      * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping * operation applied to each read. @@ -145,7 +145,7 @@ import java.util.regex.Pattern; *

      *

      * - *

      Examples

      + *

      Examples

      *
        *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
        *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
      @@ -161,13 +161,13 @@ public class ClipReads extends ReadWalker> implements NanoSchedulable {
      +    @Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
      +    StingSAMFileWriter out;
      +
      +    @Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignmentStart", doc ="", required = false)
      +    private int minReadsPerAlignmentStart = 5;
      +
      +    @Argument(fullName = "downsampleTo", shortName = "downsampleTo", doc ="", required = false)
      +    private int downsampleTo = 1000;
      +
      +    /**
      +     * The initialize function.
      +     */
      +    public void initialize() {
      +//        final boolean preSorted = true;
      +//        if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
      +//            Utils.setupWriter(out, getToolkit(), getToolkit().getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
      +//        }
      +    }
      +
      +    /**
      +     * The reads map function.
      +     *
      +     * @param ref  the reference bases that correspond to our read, if a reference was provided
      +     * @param readIn the read itself, as a GATKSAMRecord
      +     * @return the read itself
      +     */
      +    public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
      +        return readIn;
      +    }
      +
      +    /**
      +     * reduceInit is called once before any calls to the map function.  We use it here to setup the output
      +     * bam file, if it was specified on the command line
      +     *
      +     * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
      +     */
      +    public Collection reduceInit() {
      +        return new LinkedList();
      +    }
      +
      +    /**
      +     * given a read and a output location, reduce by emitting the read
      +     *
      +     * @param read   the read itself
      +     * @param output the output source
      +     * @return the SAMFileWriter, so that the next reduce can emit to the same source
      +     */
      +    public Collection reduce( GATKSAMRecord read, Collection output ) {
      +        output.add(read);
      +        return output;
      +    }
      +
      +    @Override
      +    public void onTraversalDone(Collection result) {
      +        for ( final GATKSAMRecord read : DownsamplingUtils.levelCoverageByPosition(new ArrayList(result), downsampleTo, minReadsPerAlignmentStart) )
      +            out.addAlignment(read);
      +    }
      +}
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java
      index 8a1178574..a28523369 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java
      @@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
       import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
       import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
       import org.broadinstitute.sting.gatk.walkers.*;
      -import org.broadinstitute.sting.utils.GenomeLoc;
       import org.broadinstitute.sting.utils.SampleUtils;
       import org.broadinstitute.sting.utils.Utils;
       import org.broadinstitute.sting.utils.baq.BAQ;
      @@ -56,18 +55,24 @@ import java.util.*;
        * PrintReads can dynamically merge the contents of multiple input BAM files, resulting
        * in merged output sorted in coordinate order.  Can also optionally filter reads based on the
        * --read_filter command line argument.
      + * 

      * - *

      Input

      + *

      + * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, + * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. + *

      + * + *

      Input

      *

      * One or more bam files. *

      * - *

      Output

      + *

      Output

      *

      * A single processed bam file. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -101,7 +106,7 @@ import java.util.*;
       @Requires({DataSource.READS, DataSource.REFERENCE})
       public class PrintReads extends ReadWalker implements NanoSchedulable {
       
      -    @Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
      +    @Output(doc="Write output to this BAM filename instead of STDOUT")
           StingSAMFileWriter out;
       
           @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
      @@ -145,7 +150,7 @@ public class PrintReads extends ReadWalker impleme
       
           @Hidden
           @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
      -    private boolean NO_PG_TAG = false;
      +    public boolean NO_PG_TAG = false;
       
           List readTransformers = Collections.emptyList();
           private TreeSet samplesToChoose = new TreeSet();
      @@ -160,7 +165,6 @@ public class PrintReads extends ReadWalker impleme
            * The initialize function.
            */
           public void initialize() {
      -        final boolean keep_records = true;
               final GenomeAnalysisEngine toolkit = getToolkit();
       
               if  ( platform != null )
      @@ -186,7 +190,7 @@ public class PrintReads extends ReadWalker impleme
       
               final boolean preSorted = true;
               if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
      -            Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
      +            Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME);
               }
       
           }
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java
      new file mode 100644
      index 000000000..43a1ddd74
      --- /dev/null
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java
      @@ -0,0 +1,392 @@
      +/*
      +* Copyright (c) 2012 The Broad Institute
      +* 
      +* Permission is hereby granted, free of charge, to any person
      +* obtaining a copy of this software and associated documentation
      +* files (the "Software"), to deal in the Software without
      +* restriction, including without limitation the rights to use,
      +* copy, modify, merge, publish, distribute, sublicense, and/or sell
      +* copies of the Software, and to permit persons to whom the
      +* Software is furnished to do so, subject to the following
      +* conditions:
      +* 
      +* The above copyright notice and this permission notice shall be
      +* included in all copies or substantial portions of the Software.
      +* 
      +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
      +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
      +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
      +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
      +* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.readutils;
      +
      +import com.google.java.contract.Ensures;
      +import com.google.java.contract.Requires;
      +import net.sf.samtools.SAMFileWriter;
      +import org.apache.log4j.Logger;
      +import org.broadinstitute.sting.commandline.Advanced;
      +import org.broadinstitute.sting.commandline.Argument;
      +import org.broadinstitute.sting.commandline.Hidden;
      +import org.broadinstitute.sting.commandline.Output;
      +import org.broadinstitute.sting.gatk.CommandLineGATK;
      +import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
      +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
      +import org.broadinstitute.sting.gatk.walkers.*;
      +import org.broadinstitute.sting.utils.BaseUtils;
      +import org.broadinstitute.sting.utils.collections.Pair;
      +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
      +import org.broadinstitute.sting.utils.help.HelpConstants;
      +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
      +
      +import java.util.ArrayList;
      +import java.util.Arrays;
      +import java.util.List;
      +/**
      + * Utility tool to blindly strip base adaptors. Main application is for FASTQ/unaligned BAM pre-processing where libraries
      + * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present.
      + * 

      + * By design, tool will only work for Illumina-like library constructs, where the typical library architecture is: + * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)] + *

      + * It is assumed that when data is paired, one read will span the forward strand and one read will span the reverse strand. + * Hence, when specifying adaptors they should be specified as both forward and reverse-complement to make sure they're removed in all cases. + * By design, as well, "circular" constructions where a read can have an insert, then adaptor, then more genomic insert, are not supported. + * When an adaptor is detected, all bases downstream from it (i.e. in the 3' direction) will be removed. + * Adaptor detection is carried out by looking for overlaps between forward and reverse reads in a pair. + * If a sufficiently high overlap is found, the insert size is computed and if insert size < read lengths adaptor bases are removed from reads. + * + * Advantages over ReadClipper: + * - No previous knowledge of adaptors or library structure is necessary + * + * Advantages over 3rd party tools like SeqPrep: + * - Can do BAM streaming instead of having to convert to fastq + * - No need to merge reads - merging reads can have some advantages, but complicates downstream processing and loses information that can be used, + * e.g. in variant calling + *

      + * + *

      Input

      + *

      + * The input read data in BAM format. Read data MUST be in query name ordering as produced, for example with Picard's FastqToBam + * + *

      Output

      + *

      + * A merged BAM file with unaligned reads + *

      + * + *

      Examples

      + *
      + * java -Xmx4g -jar GenomeAnalysisTK.jar \
      + *   -T ReadAdaptorTrimmer \
      + *   -I my_reads.bam \
      + *   -R resources/Homo_sapiens_assembly18.fasta \
      + *   -o trimmed_Reads.bam
      + * 
      + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@PartitionBy(PartitionType.READ) +public class ReadAdaptorTrimmer extends ReadWalker, SAMFileWriter> implements NanoSchedulable { + @Output(doc="Write output to this BAM filename instead of STDOUT", required = false) + SAMFileWriter out; + + /** + * Only prints the first n reads of the file - for short testing + */ + @Hidden + @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) + int nReadsToPrint = -1; + + /** + * Argument to control strictness of match between forward and reverse reads - by default, we require 15 matches between them to declare + * an overlap. + */ + @Advanced + @Argument(fullName = "minMatches", shortName = "minMatches", doc="Minimum number of substring matches to detect pair overlaps", required = false) + int minMatchesForOverlap = 15; + + + /** + * If true, this argument will make the walker discard unpaired reads instead of erroring out. + */ + @Advanced + @Argument(fullName = "removeUnpairedReads", shortName = "removeUnpairedReads", doc="Remove unpaired reads instead of erroring out", required = false) + boolean cleanUnpairedReads = false; + + /** + * private class members + */ + private GATKSAMRecord firstReadInPair; + private TrimStats trimStats = new TrimStats(); + + static class TrimStats { + long numReadsProcessed; + long numReadsWithAdaptorTrimmed; + long numUnpairedReadsFound; + } + + /** + * The reads filter function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param read the read itself, as a GATKSAMRecord + * @return true if the read passes the filter, false if it doesn't + */ + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + // check if we've reached the output limit + if ( nReadsToPrint == 0 ) { + return false; // n == 0 means we've printed all we needed. 
+ } + else if (nReadsToPrint > 0) { + nReadsToPrint--; // n > 0 means there are still reads to be printed. + } + return true; + } + /** + * reduceInit is called once before any calls to the map function. We use it here to setup the output + * bam file, if it was specified on the command line + * + * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise + */ + public SAMFileWriter reduceInit() { + return out; + } + + public List map( final ReferenceContext ref, final GATKSAMRecord readIn, final RefMetaDataTracker metaDataTracker ) { + + + final List readsToEmit = new ArrayList(); + + + // cache first read in pair if flag set. + if (readIn.getFirstOfPairFlag()) { + firstReadInPair = GATKSAMRecord.emptyRead(readIn); + firstReadInPair.setReadString(readIn.getReadString()); + firstReadInPair.setReadName(readIn.getReadName()); + firstReadInPair.setBaseQualities(readIn.getBaseQualities()); + } + else { + if (!readIn.getReadName().equals(firstReadInPair.getReadName())) { + if (cleanUnpairedReads) { + trimStats.numUnpairedReadsFound++; + return readsToEmit; + } + else // by default require that reads be completely paired + throw new IllegalStateException("Second read in pair must follow first read in pair: data not ordered?"); + } + + final int oldLength1 = firstReadInPair.getReadLength(); + final int oldLength2 = readIn.getReadLength(); + // try to strip any adaptor sequence in read pair + final Integer result = trimReads(firstReadInPair, readIn, minMatchesForOverlap, logger); + + if (logger.isDebugEnabled()) { + if (result == null) + logger.debug("No overlap found, insert size cannot be computed"); + else + logger.debug("Insert size estimate = " + result); + + } + + + readsToEmit.add(firstReadInPair); + readsToEmit.add(readIn); + + if (oldLength1 != firstReadInPair.getReadLength()) + trimStats.numReadsWithAdaptorTrimmed++; + if (oldLength2 != readIn.getReadLength()) + trimStats.numReadsWithAdaptorTrimmed++; + + } + + + 
trimStats.numReadsProcessed++; + return readsToEmit; + + } + + /** + * given a read and a output location, reduce by emitting the read + * + * @param readsToEmit the read itself + * @param output the output source + * @return the SAMFileWriter, so that the next reduce can emit to the same source + */ + public SAMFileWriter reduce( final List readsToEmit, final SAMFileWriter output ) { + for (final GATKSAMRecord read : readsToEmit) + output.addAlignment(read); + + return output; + } + + @Override + public void onTraversalDone(SAMFileWriter output) { + + logger.info("Finished Trimming:"); + logger.info("Number of processed reads: "+ trimStats.numReadsProcessed); + logger.info("Number of reads with adaptor sequence trimmed: "+ trimStats.numReadsWithAdaptorTrimmed); + if (cleanUnpairedReads) + logger.info("Number of unpaired reads thrown out: "+ trimStats.numUnpairedReadsFound); + } + + + /** + * + * Workhorse routines... + * + */ + /** + * Core routine that does most underlying work for walker. Takes two reads and looks for overlaps in them. + * An overlap is defined as a contiguous chunk of N bases that matches reverse-complement between reads. + * Currently, the only insert structure that it will look for overlaps is as follows: + * CASE 1: Insert shorter than read length: + * 3' XXXXXXXXXXXXXXXX 5' (second read) + * 5' YYYYYYYYYYYYYYYY 3' (first read) + * *********** + * + * In this case, if X and Y are complements at the 11 positions marked by *, routine will do the following + * iff minMatchesForOverlap <= 11: + * a) Cleave adaptor from end of second read (leftmost dangling part in diagram above) + * b) Cleave adaptor from end of first read (rightmost part in diagram). 
+ * + * CASE 2: Insert size >= read length: + * 3' XXXXXXXXXXXXXXXX 5' (second read) + * 5' YYYYYYYYYYYYYYYY 3' (first read) + * ********* (overlap) + * + * In this case, no trimming is done and reads are left unchanged + * @param first (I/O) First read in pair - read contents (bases/quals) can be modified if adaptor is detected + * @param second (I/O) Second read in pair - read contents (bases/quals) can be modified if adaptor is detected + * @param minMatchesForOverlap Reads need to match in these # of bases to be joined + * @return Offset between second and first read. + * If there's no detectable offset, return Null + */ + @Requires({"first != null","second != null","minMatchesForOverlap>0"}) + protected static Integer trimReads(final GATKSAMRecord first, + final GATKSAMRecord second, + final int minMatchesForOverlap, + final Logger logger) { + + final Integer insertSize = estimateInsertSize(first.getReadBases(), second.getReadBases(), + minMatchesForOverlap, logger); + + if (insertSize == null) + return insertSize; + if (insertSize < first.getReadLength()) { + // trim adaptor sequence from read + first.setReadBases(Arrays.copyOfRange(first.getReadBases(),0,insertSize)); + first.setBaseQualities(Arrays.copyOfRange(first.getBaseQualities(),0,insertSize)); + } + if (insertSize < second.getReadLength()) { + // trim adaptor sequence from read + second.setReadBases(Arrays.copyOfRange(second.getReadBases(),0,insertSize)); + second.setBaseQualities(Arrays.copyOfRange(second.getBaseQualities(),0,insertSize)); + } + return insertSize; + } + + /** + * Brain-dead implementation of an aligner of two sequences, where it's assumed that there might be an overlap + * from the first into the second. From this, an estimate of insert size is performed and returned + * Assumes that reads come in reverse direction, so one of the base sequences needs to be reverse-complemented.] 
+ * + * @param firstRead Bytes from first read + * @param secondRead Bytes from second read (reverse direction) + * @return Estimated insert size based on offset between first and second read. + * If no overlap can be detected, return null + */ + + @Requires({"firstRead != null","secondRead != null","minMatches>0","firstRead.length == secondRead.length"}) + protected static Integer estimateInsertSize(final byte[] firstRead, + final byte[] secondRead, + final int minMatches, + final Logger logger) { + final byte[] firstBases = firstRead; + final byte[] secondBases = BaseUtils.simpleReverseComplement(secondRead); + + final Pair overlaps = findOverlappingSequence(firstBases, secondBases); + final int bestOffset = overlaps.first; + final int maxScore = overlaps.second; + if ( logger.isDebugEnabled()) { + String sb="", s1 = new String(firstBases), s2 = new String(secondBases); + for (int k=0; k < Math.abs(bestOffset); k++) sb+=" "; + if (maxScore >= minMatches) { + logger.debug(String.format("Match, Max Score = %d, best offset = %d\n",maxScore, bestOffset)); + if (bestOffset>0) + s2 = sb+s2; + else + s1 = sb+s1; + } + else logger.debug("NoMatch:"); + logger.debug("R1:"+s1); + logger.debug("R2:"+s2); + + + } + + if (maxScore < minMatches) + return null; // no overlap detected + + return bestOffset+secondRead.length; + + + } + + + /** + * Tries to find overlapping sequence between two reads, and computes offset between them + * For each possible offset, computes matching score, which is = MATCH_SCORE*Num_matches + MISMATCH_SCORE*num_mismatches + * (like SW with infinite gap penalties). + * @param first First read bytes + * @param second Second read bytes + * @return Pair of integers (x,y). 
x = best offset between reads, y = corresponding score + */ + @Requires({"first != null","second != null"}) + @Ensures("result != null") + protected static Pair findOverlappingSequence(final byte[] first, + final byte[] second) { + final int MATCH_SCORE = 1; + final int MISMATCH_SCORE = -1; + // try every possible offset - O(N^2) algorithm + + // In case of following structure, + // 111111111 + // 222222222 + // computed offset will be negative (=-5 in this case). + // If however, + // 111111111 + // 222222222 + // then offset will be positive (=3 in this case) + int maxScore = 0, bestOffset =0; + for (int offset = -second.length; offset < first.length; offset++) { + int score = 0; + // compute start index for each array + int ind1 = (offset<0)?0:offset; + int ind2 = (offset<0)?-offset:0; + for (int k=0; k < Math.min(first.length, second.length) ; k++) { + if (ind1 >= first.length) + break; + if (ind2 >= second.length ) + break; + if (first[ind1] != 'N' && second[ind2] != 'N') { + if (first[ind1] == second[ind2]) + score += MATCH_SCORE; + else + score += MISMATCH_SCORE; + } + ind1++; + ind2++; + } + if (score > maxScore) { + maxScore = score; + bestOffset = offset; + } + } + return new Pair(bestOffset,maxScore); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 45c5fe090..c75997e67 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -65,13 +65,13 @@ import java.util.List; * reasons why the site may fail validation (nearby variation, for example). *

      * - *

      Input

      + *

      Input

      *

      * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an * interval list defining the size of the amplicons around the sites to be validated *

      * - *

      Output

      + *

      Output

      *

      * Output is a FASTA-formatted file with some modifications at probe sites. For instance: *

      @@ -100,7 +100,7 @@ import java.util.List;
        * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
        * 

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
      index a3e480bd0..06fa455be 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
      @@ -77,12 +77,12 @@ import java.util.*;
        * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
        * and stratification modules.
        *
      - * 

      Input

      + *

      Input

      *

      * One or more variant sets to evaluate plus any number of comparison sets. *

      * - *

      Output

      + *

      Output

      *

      * Evaluation tables detailing the results of the eval modules which were applied. * For example: @@ -103,7 +103,7 @@ import java.util.*; *

      *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
      index e5fe46a07..45dbc937d 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
      @@ -68,7 +68,7 @@ import java.util.*;
        * can be exacted using JEXL expressions on the set attribute using SelectVariants.  If you want to extract just
        * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
        * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
      - * in the detailed example on the wiki.
      + * in the detailed example in the documentation guide.
        *
        * Note that CombineVariants supports multi-threaded parallelism (8/15/12).  This is particularly useful
        * when converting from VCF to BCF2, which can be expensive.  In this case each thread spends CPU time
      @@ -83,17 +83,17 @@ import java.util.*;
        *      max QUAL, which resulted in sometime strange downstream confusion
        *   
        *
      - * 

      Input

      + *

      Input

      *

      * One or more variant sets to combine. *

      * - *

      Output

      + *

      Output

      *

      * A combined VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -133,7 +133,7 @@ public class CombineVariants extends RodWalker implements Tree
           @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
           public List> variants;
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter vcfWriter = null;
       
           @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false)
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java
      index efb84edef..b3b4857b6 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java
      @@ -102,15 +102,16 @@ public class ConcordanceMetrics {
           public void update(VariantContext eval, VariantContext truth) {
               overallSiteConcordance.update(eval,truth);
               Set alleleTruth = new HashSet(8);
      -        alleleTruth.add(truth.getReference().getBaseString());
      +        String truthRef = truth.getReference().getBaseString();
      +        alleleTruth.add(truthRef);
               for ( Allele a : truth.getAlternateAlleles() ) {
                   alleleTruth.add(a.getBaseString());
               }
               for ( String sample : perSampleGenotypeConcordance.keySet() ) {
                   Genotype evalGenotype = eval.getGenotype(sample);
                   Genotype truthGenotype = truth.getGenotype(sample);
      -            perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth);
      -            overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth);
      +            perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef);
      +            overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef);
               }
           }
       
      @@ -170,10 +171,14 @@ public class ConcordanceMetrics {
               }
       
               @Requires({"eval!=null","truth != null","truthAlleles != null"})
      -        public void update(Genotype eval, Genotype truth, Set truthAlleles) {
      -            // this is slow but correct
      +        public void update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) {
      +            // this is slow but correct.
      +
      +            // NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles
      +            // that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will
      +            // all match, so long as A and T are alleles in the truth callset.
                   boolean matchingAlt = true;
      -            if ( eval.isCalled() && truth.isCalled() ) {
      +            if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) {
                       // by default, no-calls "match" between alleles, so if
                       // one or both sites are no-call or unavailable, the alt alleles match
                       // otherwise, check explicitly: if the eval has an allele that's not ref, no-call, or present in truth
      @@ -181,6 +186,17 @@ public class ConcordanceMetrics {
                       for ( Allele evalAllele : eval.getAlleles() ) {
                           matchingAlt &= truthAlleles.contains(evalAllele.getBaseString());
                       }
      +            } else if ( eval.isCalled() && truth.isCalled() ) {
      +                // otherwise, the eval genotype has to match either the alleles in the truth genotype, or the truth reference allele
      +                // todo -- this can be sped up by caching the truth allele sets
      +                Set genoAlleles = new HashSet(3);
      +                genoAlleles.add(truthRef);
      +                for ( Allele truthGenoAl : truth.getAlleles() ) {
      +                    genoAlleles.add(truthGenoAl.getBaseString());
      +                }
      +                for ( Allele evalAllele : eval.getAlleles() ) {
      +                    matchingAlt &= genoAlleles.contains(evalAllele.getBaseString());
      +                }
                   }
       
                   if ( matchingAlt ) {
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java
      index f285fb797..e61cda765 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java
      @@ -56,7 +56,7 @@ public class FilterLiftedVariants extends RodWalker {
       
           private static final int MAX_VARIANT_SIZE = 100;
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter writer = null;
       
           private long failedLocs = 0, totalLocs = 0;
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java
      index 048c7ef77..35213af34 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java
      @@ -26,6 +26,7 @@
       package org.broadinstitute.sting.gatk.walkers.variantutils;
       
       import org.broadinstitute.sting.commandline.*;
      +import org.broadinstitute.sting.gatk.CommandLineGATK;
       import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
       import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
       import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
      @@ -33,6 +34,8 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
       import org.broadinstitute.sting.gatk.report.GATKReportTable;
       import org.broadinstitute.sting.gatk.walkers.RodWalker;
       import org.broadinstitute.sting.utils.collections.Pair;
      +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
      +import org.broadinstitute.sting.utils.help.HelpConstants;
       import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
       import org.broadinstitute.variant.variantcontext.*;
       import org.broadinstitute.variant.vcf.VCFHeader;
      @@ -41,29 +44,30 @@ import java.io.PrintStream;
       import java.util.*;
       
       /**
      - * A simple walker for performing genotype concordance calculations between two callsets. Outputs a GATK table with
      - * per-sample and aggregate counts and frequencies, a summary table for NRD/NRS, and a table for site allele overlaps.
      + * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets
        *
        * 

      - * Genotype concordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, + * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, * and for each sample, the genotype-by-genotype counts (for instance, the number of sites at which a sample was * called homozygous reference in the EVAL callset, but homozygous variant in the COMP callset). It outputs these * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in * the COMP) and metrics (such as NRD and NRS). * - *

      INPUT

      + *

      Input

      *

      * Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via - * the -eval and -comp arguments - *

      + * the -eval and -comp arguments. + * * (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and * -cfe arguments, respectively. + *

      * - *

      OUTPUT

      - * Genotype Concordance writes a GATK report to the specified (via -o) file, consisting of multiple tables of counts + *

      Output

      + * Genotype Concordance writes a GATK report to the specified file (via -o) , consisting of multiple tables of counts * and proportions. These tables may be optionally moltenized via the -moltenize argument. * */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java similarity index 63% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java index 65ec7a4f0..9168d17f0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -38,8 +41,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; import 
org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; @@ -55,43 +61,59 @@ import java.util.*; * Left-aligns indels from a variants file. * *

      - * LeftAlignVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be + * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. + * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. * - *

      Input

      + *

      Input

      *

      - * A variant set to left-align. + * A variant set to left-align and trim. *

      * - *

      Output

      + *

      Output

      *

      * A left-aligned VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      - *   -T LeftAlignVariants \
      + *   -T LeftAlignAndTrimVariants \
        *   --variant input.vcf \
        *   -o output.vcf
        * 
      * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-200,stop=200)) -public class LeftAlignVariants extends RodWalker { +@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! +public class LeftAlignAndTrimVariants extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + /** + * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. + */ + @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) + protected boolean trimAlleles = false; + + /** + * If this argument is set, split multiallelic records and left-align individual alleles. + * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. + */ + @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) + protected boolean splitMultiallelics = false; + + + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; private VariantContextWriter writer; + private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! 
public void initialize() { String trackName = variantCollection.variants.getName(); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); @@ -110,8 +132,29 @@ public class LeftAlignVariants extends RodWalker { Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); int changedSites = 0; - for ( VariantContext vc : VCs ) - changedSites += alignAndWrite(vc, ref); + for ( final VariantContext vc : VCs ) { + // split first into biallelics, and optionally trim alleles to minimal representation + Pair result = new Pair(vc,0); // default value + if (splitMultiallelics) { + final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics( vc); + for (final VariantContext biallelicVC: vcList) { + final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true):biallelicVC); + result = alignAndWrite(v, ref); + writer.add(result.first); + changedSites += result.second; + } + } + else { + if (trimAlleles) + result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); + else + result = alignAndWrite(vc,ref); + writer.add(result.first); + changedSites += result.second; + + } + + } return changedSites; } @@ -127,18 +170,21 @@ public class LeftAlignVariants extends RodWalker { System.out.println(result + " variants were aligned"); } + /** + * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be + * handled by calling routine. + * @param vc Input VC with variants to left align + * @param ref Reference context + * @return # of records left-aligned (0 or 1) and new VC. 
+ */ + @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) + @Ensures({"result != null","result.first != null", "result.second >=0"}) + protected static Pair alignAndWrite(final VariantContext vc, final ReferenceContext ref) { - private int alignAndWrite(VariantContext vc, final ReferenceContext ref) { - if ( vc.isBiallelic() && vc.isIndel() && !vc.isComplexIndel() ) - return writeLeftAlignedIndel(vc, ref); - else { - writer.add(vc); - return 0; + final Pair retValue = new Pair(vc,0); + if (!vc.isIndel() || vc.isComplexIndel() ) { + return retValue; } - } - - private int writeLeftAlignedIndel(final VariantContext vc, final ReferenceContext ref) { - final byte[] refSeq = ref.getBases(); // get the indel length final int indelLength; @@ -147,13 +193,20 @@ public class LeftAlignVariants extends RodWalker { else indelLength = vc.getAlternateAllele(0).length() - 1; - if ( indelLength > 200 ) { - writer.add(vc); - return 0; - } + if ( indelLength > MAX_INDEL_LENGTH ) + return retValue; + + if (vc.getReference().getBases()[0] != vc.getAlternateAllele(0).getBases()[0]) + return retValue; + + final byte[] refSeq = ref.getBases(); + + // create an indel haplotype. + // + final int originalIndex = vc.getStart() - ref.getWindow().getStart() + 1; + if (originalIndex < 0 || originalIndex >= ref.getBases().length) + return retValue; - // create an indel haplotype - final int originalIndex = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); // create a CIGAR string to represent the event @@ -178,15 +231,24 @@ public class LeftAlignVariants extends RodWalker { System.arraycopy((vc.isSimpleDeletion() ? 
refSeq : originalIndel), indelIndex, newBases, 1, indelLength); final Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); newVC = updateAllele(newVC, newAllele); + // overwrite default return value with new left-aligned VC + retValue.first = newVC; + retValue.second = 1; - writer.add(newVC); - return 1; - } else { - writer.add(vc); - return 0; } + return retValue; } + /** + * Make a haplotype from a given alt allele, using bases in input reference, index of an input reference + * @param vc Input VC - will use only alt allele from it + * @param ref Ref bases + * @param indexOfRef Index in ref where to create indel + * @param indelLength Indel length + * @return + */ + @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) + @Ensures("result != null") private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 0a7ad5b7b..17d50f101 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -62,7 +62,7 @@ public class LiftoverVariants extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected File file = null; protected VariantContextWriter writer = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java index 17aaa7513..478bba846 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -58,17 +58,17 @@ import java.util.*; * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the * file (based on a complete header name or a pattern match). *

      - *

      Input

      + *

      Input

      *

      * A set of VCFs. *

      *

      - *

      Output

      + *

      Output

      *

      * A header selected VCF. *

      *

      - *

      Examples

      + *

      Examples

      *
        * Select only the FILTER, FORMAT, and INFO headers:
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      @@ -110,7 +110,7 @@ public class SelectHeaders extends RodWalker implements TreeRe
           @ArgumentCollection
           protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
       
      -    @Output(doc = "File to which variants should be written", required = true)
      +    @Output(doc = "File to which variants should be written")
           protected VariantContextWriter vcfWriter;
       
           @Argument(fullName = "header_name", shortName = "hn", doc = "Include header. Can be specified multiple times", required = false)
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
      index 9c209ae2c..1f2b6d09b 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
      @@ -62,20 +62,20 @@ import java.util.*;
        * Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a
        * pattern match).  Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of
        * coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25).  These JEXL expressions are
      - * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions).
      + * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gatk/guide/article?id=1255).
        * One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
        *
      - * 

      Input

      + *

      Input

      *

      * A variant set to select from. *

      * - *

      Output

      + *

      Output

      *

      * A selected VCF. *

      * - *

      Examples

      + *

      Examples

      *
        * Select two samples out of a VCF with many samples:
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
      @@ -199,7 +199,7 @@ public class SelectVariants extends RodWalker implements TreeR
           @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
           protected RodBinding concordanceTrack;
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter vcfWriter = null;
       
           @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
      @@ -377,10 +377,10 @@ public class SelectVariants extends RodWalker implements TreeR
               }
       
               // now, exclude any requested samples
      -        Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
      +        final Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
               samples.removeAll(XLsamplesFromFile);
               samples.removeAll(XLsampleNames);
      -        NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty();
      +        NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty();
       
               if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
                   throw new UserException("All samples requested to be included were also requested to be excluded.");
      @@ -406,8 +406,8 @@ public class SelectVariants extends RodWalker implements TreeR
               headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
       
               if (KEEP_ORIGINAL_CHR_COUNTS) {
      -            headerLines.add(new VCFInfoHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
      -            headerLines.add(new VCFInfoHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
      +            headerLines.add(new VCFInfoHeaderLine("AC_Orig", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Original AC"));
      +            headerLines.add(new VCFInfoHeaderLine("AF_Orig", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Original AF"));
                   headerLines.add(new VCFInfoHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
               }
               headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
      @@ -507,7 +507,7 @@ public class SelectVariants extends RodWalker implements TreeR
                   if (!selectedTypes.contains(vc.getType()))
                       continue;
       
      -            if ( badIndelSize(vc) )
      +            if ( containsIndelLargerThan(vc, maxIndelSize) )
                       continue;
       
                   VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
      @@ -531,12 +531,20 @@ public class SelectVariants extends RodWalker implements TreeR
               return 1;
           }
       
      -    private boolean badIndelSize(final VariantContext vc) {
      -        List lengths = vc.getIndelLengths();
      +    /*
      +     * Determines if any of the alternate alleles are greater than the max indel size
      +     *
      +     * @param vc            the variant context to check
      +     * @param maxIndelSize  the maximum size of allowed indels
      +     * @return true if the VC contains an indel larger than maxIndelSize and false otherwise
      +     */
      +    protected static boolean containsIndelLargerThan(final VariantContext vc, final int maxIndelSize) {
      +        final List lengths = vc.getIndelLengths();
               if ( lengths == null )
      -            return false; // VC does not harbor indel
      -        for ( Integer indelLength : vc.getIndelLengths() ) {
      -            if ( indelLength > maxIndelSize )
      +            return false;
      +
      +        for ( Integer indelLength : lengths ) {
      +            if ( Math.abs(indelLength) > maxIndelSize )
                       return true;
               }
       
      @@ -662,7 +670,8 @@ public class SelectVariants extends RodWalker implements TreeR
               GenotypesContext newGC = sub.getGenotypes();
       
               // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate)
      -        if ( vc.getAlleles().size() != sub.getAlleles().size() )
      +        final boolean lostAllelesInSelection = vc.getAlleles().size() != sub.getAlleles().size();
      +        if ( lostAllelesInSelection )
                   newGC = GATKVariantContextUtils.stripPLsAndAD(sub.getGenotypes());
       
               // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
      @@ -689,15 +698,22 @@ public class SelectVariants extends RodWalker implements TreeR
       
               builder.genotypes(newGC);
       
      -        addAnnotations(builder, sub);
      +        addAnnotations(builder, sub, lostAllelesInSelection);
       
               return builder.make();
           }
       
      -    private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) {
      +    /*
      +     * Add annotations to the new VC
      +     *
      +     * @param builder     the new VC to annotate
      +     * @param originalVC  the original -- but post-selection -- VC
      +     * @param lostAllelesInSelection  true if the original (pre-selection) VC has more alleles than the new one
      +     */
      +    private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final boolean lostAllelesInSelection) {
               if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data
       
      -        if (KEEP_ORIGINAL_CHR_COUNTS) {
      +        if ( KEEP_ORIGINAL_CHR_COUNTS && !lostAllelesInSelection ) {
                   if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) )
                       builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
                   if ( originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) )
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
      index a242f9310..d11cf5aee 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
      @@ -60,12 +60,12 @@ import java.util.Set;
        *
        * If you are looking simply to test the adherence to the VCF specification, use --validationType NONE.
        *
      - * 

      Input

      + *

      Input

      *

      * A variant set to validate. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
      index 02089eb6c..d189459c0 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
      @@ -55,12 +55,12 @@ import java.util.*;
        * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability.
        * If you have .ped files, please first convert them to VCF format.
        *
      - * 

      Input

      + *

      Input

      *

      * A validation VCF to annotate. *

      * - *

      Output

      + *

      Output

      *

      * An annotated VCF. Additionally, a table like the following will be output: *

      @@ -74,7 +74,7 @@ import java.util.*;
        * 
      *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -91,7 +91,7 @@ public class VariantValidationAssessor extends RodWalker
           @ArgumentCollection
           protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter vcfwriter = null;
       
           @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false)
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java
      new file mode 100644
      index 000000000..e25f158f2
      --- /dev/null
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java
      @@ -0,0 +1,140 @@
      +/*
      +* Copyright (c) 2012 The Broad Institute
      +* 
      +* Permission is hereby granted, free of charge, to any person
      +* obtaining a copy of this software and associated documentation
      +* files (the "Software"), to deal in the Software without
      +* restriction, including without limitation the rights to use,
      +* copy, modify, merge, publish, distribute, sublicense, and/or sell
      +* copies of the Software, and to permit persons to whom the
      +* Software is furnished to do so, subject to the following
      +* conditions:
      +* 
      +* The above copyright notice and this permission notice shall be
      +* included in all copies or substantial portions of the Software.
      +* 
      +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
      +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
      +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
      +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
      +* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      +*/
      +
      +package org.broadinstitute.sting.gatk.walkers.variantutils;
      +
      +import com.google.java.contract.Requires;
      +import org.broadinstitute.sting.commandline.ArgumentCollection;
      +import org.broadinstitute.sting.commandline.Output;
      +import org.broadinstitute.sting.gatk.CommandLineGATK;
      +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
      +import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
      +import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
      +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
      +import org.broadinstitute.sting.gatk.walkers.RodWalker;
      +import org.broadinstitute.sting.utils.SampleUtils;
      +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
      +import org.broadinstitute.sting.utils.help.HelpConstants;
      +import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
      +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
      +import org.broadinstitute.variant.variantcontext.*;
      +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
      +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory;
      +import org.broadinstitute.variant.vcf.VCFHeader;
      +import org.broadinstitute.variant.vcf.VCFHeaderLine;
      +
      +import java.util.*;
      +
      +/**
      + * Takes alleles from a variants file and breaks them up (if possible) into more basic/primitive alleles.
      + *
      + * 

      + * For now this tool modifies only multi-nucleotide polymorphisms (MNPs) and leaves SNPs, indels, and complex substitutions as is, + * although one day it may be extended to handle the complex substitution case. + * + * This tool will take an MNP (e.g. ACCCA -> TCCCG) and break it up into separate records for each component part (A-T and A->G). + * + * Note that this tool modifies only bi-allelic variants. + * + *

      Input

      + *

      + * A variant set with any type of alleles. + *

      + * + *

      Output

      + *

      + * A VCF with alleles broken into primitive types. + *

      + * + *

      Examples

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -R ref.fasta \
      + *   -T VariantsToAllelicPrimitives \
      + *   --variant input.vcf \
      + *   -o output.vcf
      + * 
      + * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +public class VariantsToAllelicPrimitives extends RodWalker { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc="File to which variants should be written") + protected VariantContextWriter baseWriter = null; + + private VariantContextWriter vcfWriter; + + public void initialize() { + final String trackName = variantCollection.variants.getName(); + final Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + + final Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + final Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); + + baseWriter.writeHeader(new VCFHeader(headerLines, samples)); + + vcfWriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + + int changedSites = 0; + for ( final VariantContext vc : VCs ) + changedSites += writeVariants(vc); + + return changedSites; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + System.out.println(result + " MNPs were broken up into primitives"); + vcfWriter.close(); + } + + @Requires("vc != null") + private int writeVariants(final VariantContext vc) { + // for now, we modify only bi-allelic MNPs; update docs above if this changes + if ( vc.isBiallelic() && vc.isMNP() ) { + for ( final VariantContext splitVC : GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc) ) + vcfWriter.add(splitVC); + return 1; 
+ } else { + vcfWriter.add(vc); + return 0; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index b12f51a1e..f1f93f1f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -62,14 +62,13 @@ import java.util.*; * genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of * non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples), * GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC - * (is the record from a multi-allelic site). Note that this tool does not support capturing any - * GENOTYPE field values. If a VCF record is missing a value, then the tool by + * (is the record from a multi-allelic site). Note that if a VCF record is missing a value, then the tool by * default throws an error, but the special value NA can be emitted instead with * appropriate tool arguments. * *

      * - *

      Input

      + *

      Input

      *

      *

        *
      • A VCF file
      • @@ -77,12 +76,12 @@ import java.util.*; *
      *

      * - *

      Output

      + *

      Output

      *

      * A tab-delimited file containing the values of the requested fields in the VCF file *

      * - *

      Examples

      + *

      Examples

      *
        *     java -jar GenomeAnalysisTK.jar \
        *     -R reference.fasta
      @@ -112,7 +111,7 @@ public class VariantsToTable extends RodWalker {
           @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
           public List> variants;
       
      -    @Output(doc="File to which results should be written",required=true)
      +    @Output(doc="File to which results should be written")
           protected PrintStream out;
       
           /**
      diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
      index ffe61f76d..60809134a 100644
      --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
      +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
      @@ -62,17 +62,17 @@ import java.util.*;
        * 

      * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. * - *

      Input

      + *

      Input

      *

      * A variant file to filter. *

      * - *

      Output

      + *

      Output

      *

      * A VCF file. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      @@ -87,7 +87,7 @@ import java.util.*;
       @Reference(window=@Window(start=-40,stop=40))
       public class VariantsToVCF extends RodWalker {
       
      -    @Output(doc="File to which variants should be written",required=true)
      +    @Output(doc="File to which variants should be written")
           protected VariantContextWriter baseWriter = null;
           private VariantContextWriter vcfwriter; // needed because hapmap/dbsnp indel records move
       
      @@ -193,7 +193,10 @@ public class VariantsToVCF extends RodWalker {
                   if ( dbsnp == null )
                       throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records");
       
      -            RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
      +            RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
      +                                                          getToolkit().getGenomeLocParser(),
      +                                                          getToolkit().getArguments().unsafe,
      +                                                          getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods);
                   dbsnpIterator = builder.createInstanceOfTrack(VCFCodec.class, new File(dbsnp.dbsnp.getSource())).getIterator();
                   // Note that we should really use some sort of seekable iterator here so that the search doesn't take forever
                   // (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to)
      diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java
      index 10fb606f9..ad77b2548 100644
      --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java
      +++ b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java
      @@ -35,6 +35,8 @@ import org.broadinstitute.sting.commandline.Argument;
       import org.broadinstitute.sting.commandline.Input;
       import org.broadinstitute.sting.commandline.Output;
       import org.broadinstitute.sting.commandline.CommandLineProgram;
      +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
      +import org.broadinstitute.sting.utils.help.HelpConstants;
       import org.broadinstitute.variant.bcf2.BCF2Codec;
       import org.broadinstitute.sting.utils.collections.Pair;
       import org.broadinstitute.variant.vcf.VCFCodec;
      @@ -51,12 +53,52 @@ import java.util.*;
       
       /**
        *
      - * Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants    [sorted (optional)]");
      - * The input files can be of type: VCF (ends in .vcf or .VCF)");
      - *                                 BCF2 (ends in .bcf or .BCF)");
      - * Output file must be vcf or bcf file (.vcf or .bcf)");
      - * If the input files are already sorted, the last argument can indicate that");
      + * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples
      + *
      + * 

      + * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. + * This tool concatenates the scattered output VCF files. It assumes that: + * - All the input VCFs (or BCFs) contain the same samples in the same order. + * - The variants in each input file are from non-overlapping (scattered) intervals. + * + * When the input files are already sorted based on the intervals start positions, use -assumeSorted. + * + * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. + * + *

      + * + *

      Input

      + *

      + * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). + * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'. + * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + *

      + * + *

      Output

      + *

      + * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'. + * <\p> + * + *

      Important note

      + *

      This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to + * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the + * classic "CommandLineGATK" arguments.

      + * + *

      Example

      + *
      + * java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
      + *    -R ref.fasta \
      + *    -V input1.vcf \
      + *    -V input2.vcf \
      + *    -out output.vcf \
      + *    -assumeSorted
      + * 
      + * + * @author Ami Levy Moonshine + * @since Jan 2012 */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP ) public class CatVariants extends CommandLineProgram { // setup the logging system, used by some codecs private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); @@ -64,6 +106,14 @@ public class CatVariants extends CommandLineProgram { @Input(fullName = "reference", shortName = "R", doc = "genome reference file .fasta", required = true) private File refFile = null; + /** + * The VCF or BCF files to merge together + * + * CatVariants can take any number of -V arguments on the command line. Each -V argument + * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more + * efficiently if they are sorted based on the intervals and the assumeSorted argument is used. + * + */ @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true) private List variant = null; @@ -77,7 +127,7 @@ public class CatVariants extends CommandLineProgram { * print usage information */ private static void printUsage() { - System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.AppendVariants [sorted (optional)]"); + System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants [sorted (optional)]"); System.err.println(" The input files can be of type: VCF (ends in .vcf or .VCF)"); System.err.println(" BCF2 (ends in .bcf or .BCF)"); System.err.println(" Output file must be vcf or bcf file (.vcf or .bcf)"); diff --git a/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java b/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java new file mode 100644 index 000000000..fabcf828a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java @@ -0,0 +1,85 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* 
Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.tools; + +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.help.HelpUtils; + +/** + * Utility program to print a list of available annotations + * + *

      This is a very simple utility tool that retrieves available annotations for use with tools such as + * UnifiedGenotyper, HaplotypeCaller and VariantAnnotator.

      + * + *

      Important note

      + *

      This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to + * invoke it is a little different from other GATK tools (see usage below), and it does not accept any of the + * classic "CommandLineGATK" arguments.

      + * + *

      Usage

      + *
      java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.ListAnnotations
      + * + * @author vdauwera + * @since 3/14/13 + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_HELPUTILS ) +public class ListAnnotations extends CommandLineProgram { + + /* + * Print usage information + * + * TODO: would be more convenient if we could just call the program by name instead of the full classpath + */ + private static void printUsage() { + System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.ListAnnotations"); + System.err.println(" Prints a list of available annotations and exits."); + } + + // TODO: override CommandLineProgram bit that offers version, logging etc arguments. We don't need that stuff here and it makes the doc confusing. + + @Override + protected int execute() throws Exception { + + HelpUtils.listAnnotations(); + return 0; + } + + public static void main(String[] args){ + try { + ListAnnotations instance = new ListAnnotations(); + start(instance, args); + System.exit(CommandLineProgram.result); + } catch ( UserException e ) { + printUsage(); + exitSystemWithUserError(e); + } catch ( Exception e ) { + exitSystemWithError(e); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java new file mode 100644 index 000000000..e20872c5b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java @@ -0,0 +1,95 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice 
and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import it.unimi.dsi.fastutil.objects.Object2ObjectMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; + +import java.util.*; + +/** + * Utility class for handling deprecated tools gracefully + * + * @author vdauwera + * @since 3/11/13 + */ +public class DeprecatedToolChecks { + + // Mapping from walker name to major version number where the walker first disappeared and optional replacement options + private static Object2ObjectMap deprecatedGATKWalkers = new Object2ObjectOpenHashMap(); + static { + // Indicate recommended replacement in parentheses if applicable + deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); + deprecatedGATKWalkers.put("AnalyzeCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); + deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)"); + deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)"); + deprecatedGATKWalkers.put("CountBestAlignments", "2.2 (no replacement)"); + } + + // Mapping from annotation name to major version number where the annotation first disappeared and optional replacement options + private static Object2ObjectMap deprecatedGATKAnnotations = new Object2ObjectOpenHashMap(); + static { + // Same comments as for walkers + 
deprecatedGATKAnnotations.put("DepthOfCoverage", "2.4 (renamed to Coverage)"); + } + + /** + * Utility method to check whether a given walker has been deprecated in a previous GATK release + * + * @param walkerName the walker class name (not the full package) to check + */ + public static boolean isDeprecatedWalker(final String walkerName) { + return deprecatedGATKWalkers.containsKey(walkerName); + } + + /** + * Utility method to check whether a given annotation has been deprecated in a previous GATK release + * + * @param annotationName the annotation class name (not the full package) to check + */ + public static boolean isDeprecatedAnnotation(final String annotationName) { + return deprecatedGATKAnnotations.containsKey(annotationName); + } + + /** + * Utility method to pull up the version number at which a walker was deprecated and the suggested replacement, if any + * + * @param walkerName the walker class name (not the full package) to check + */ + public static String getWalkerDeprecationInfo(final String walkerName) { + return deprecatedGATKWalkers.get(walkerName).toString(); + } + + /** + * Utility method to pull up the version number at which an annotation was deprecated and the suggested replacement, if any + * + * @param annotationName the annotation class name (not the full package) to check + */ + public static String getAnnotationDeprecationInfo(final String annotationName) { + return deprecatedGATKAnnotations.get(annotationName).toString(); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 5adef5cdf..28cdaaf56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -266,80 +266,96 @@ public class GenomeLocSortedSet extends AbstractSet { } /** - * add a genomeLoc to the collection, simply inserting in order into the set + * 
Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. + * Throws an exception if the loc overlaps another loc already in the set. * - * TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or - * TODO -- other locations already in the set. This code should check to see if - * TODO -- e is overlapping with its nearby elements and merge them or alternatively - * TODO -- throw an exception + * @param loc the GenomeLoc to add * - * @param e the GenomeLoc to add - * - * @return true + * @return true if the loc was added or false otherwise (if the loc was null) */ - public boolean add(GenomeLoc e) { - // assuming that the intervals coming arrive in order saves us a fair amount of time (and it's most likely true) - if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) { - mArray.add(e); - return true; - } else { - final int loc = Collections.binarySearch(mArray,e); - if (loc >= 0) { - throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); - } else { - mArray.add((loc+1) * -1,e); - return true; - } - } + public boolean add(final GenomeLoc loc) { + return add(loc, false); } /** * Adds a GenomeLoc to the collection, merging it if it overlaps another region. - * If it's not overlapping then we add it in sorted order. + * If it's not overlapping then we insert it at the correct sorted position into the set. * - * TODO TODO TODO -- this function is buggy and will not properly create a sorted - * TODO TODO TODO -- genome loc is addRegion is called sequentially where the second - * TODO TODO TODO -- loc added is actually before the first. So when creating - * TODO TODO TODO -- sets make sure to sort the input locations first! 
+ * @param loc the GenomeLoc to add * - * @param e the GenomeLoc to add to the collection - * - * @return true, if the GenomeLoc could be added to the collection + * @return true if the loc was added or false otherwise (if the loc was null) */ - public boolean addRegion(GenomeLoc e) { - if (e == null) { - return false; - } - // have we added it to the collection? - boolean haveAdded = false; + public boolean addRegion(final GenomeLoc loc) { + return add(loc, true); + } - /** - * check if the specified element overlaps any current locations, if so - * we should merge the two. - */ - for (GenomeLoc g : mArray) { - if (g.contiguousP(e)) { - GenomeLoc c = g.merge(e); - mArray.set(mArray.indexOf(g), c); - haveAdded = true; - } else if ((g.getContigIndex() == e.getContigIndex()) && - (e.getStart() < g.getStart()) && !haveAdded) { - mArray.add(mArray.indexOf(g), e); - return true; - } else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) || - (g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) { - return true; - } + /** + * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. 
+ * + * @param loc the GenomeLoc to add + * @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception + * + * @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate) + */ + public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) { + if ( loc == null ) + return false; + + // if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the + // case because locs are generally added in order) then be extra efficient and just add the loc to the end + if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) { + return mArray.add(loc); } - /** we're at the end and we haven't found locations that should fall after it, - * so we'll put it at the end - */ - if (!haveAdded) { - mArray.add(e); + + // find where in the list the new loc belongs + final int binarySearchIndex = Collections.binarySearch(mArray,loc); + + // if it already exists in the list, return or throw an exception as needed + if ( binarySearchIndex >= 0 ) { + if ( mergeIfIntervalOverlaps ) + return false; + throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc); } + + // if it overlaps a loc already in the list merge or throw an exception as needed + final int insertionIndex = -1 * (binarySearchIndex + 1); + if ( ! 
mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) { + // it does not overlap any current intervals, so add it to the set + mArray.add(insertionIndex, loc); + } + return true; } + /* + * If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested) + * + * @param loc the GenomeLoc to add + * @param insertionIndex the index in the sorted set to add the new loc + * @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them + * + * @return true if the loc was added or false otherwise + */ + private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) { + // try merging with the previous index + if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) { + if ( throwExceptionIfOverlapping ) + throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString())); + mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc)); + return true; + } + + // try merging with the following index + if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) { + if ( throwExceptionIfOverlapping ) + throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString())); + mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc)); + return true; + } + + return false; + } + public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) { LinkedList good = new LinkedList(); Stack toProcess = new Stack(); @@ -401,11 +417,11 @@ public class GenomeLocSortedSet extends AbstractSet { * * @return the GenomeLocSet of all references sequences as GenomeLoc's */ - public 
static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) { - GenomeLocParser parser = new GenomeLocParser(dict); - GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); - for (SAMSequenceRecord record : dict.getSequences()) { - returnSortedSet.add(parser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength())); + public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) { + final GenomeLocParser parser = new GenomeLocParser(dict); + final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); + for ( final SAMSequenceRecord sequence : dict.getSequences() ) { + returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName())); } return returnSortedSet; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 2459c1d36..38c131bc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -55,7 +54,7 @@ public class MathUtils { private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; private static final double MAX_JACOBIAN_TOLERANCE = 8.0; private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; - private static final int MAXN = 50000; + private static final int MAXN = 70000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients /** 
@@ -63,6 +62,7 @@ public class MathUtils { * where the real-space value is 0.0. */ public final static double LOG10_P_OF_ZERO = -1000000.0; + public final static double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); static { log10Cache = new double[LOG10_CACHE_SIZE]; @@ -70,6 +70,7 @@ public class MathUtils { jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; + log10FactorialCache[0] = 0.0; for (int k = 1; k < LOG10_CACHE_SIZE; k++) { log10Cache[k] = Math.log10(k); log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; @@ -88,14 +89,14 @@ public class MathUtils { * @param max upper bound of the range * @return a random int >= min and <= max */ - public static int randomIntegerInRange( int min, int max ) { + public static int randomIntegerInRange( final int min, final int max ) { return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; } // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). - public static int fastRound(double d) { + public static int fastRound(final double d) { return (d > 0.0) ? 
(int) (d + 0.5d) : (int) (d - 0.5d); } @@ -123,7 +124,7 @@ public class MathUtils { return approxSum; } - public static double approximateLog10SumLog10(double a, double b, double c) { + public static double approximateLog10SumLog10(final double a, final double b, final double c) { return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); } @@ -152,97 +153,53 @@ public class MathUtils { return big + MathUtils.jacobianLogTable[ind]; } - public static double sum(Collection numbers) { - return sum(numbers, false); - } - - public static double sum(Collection numbers, boolean ignoreNan) { - double sum = 0; - for (Number n : numbers) { - if (!ignoreNan || !Double.isNaN(n.doubleValue())) { - sum += n.doubleValue(); - } - } - - return sum; - } - - public static int nonNanSize(Collection numbers) { - int size = 0; - for (Number n : numbers) { - size += Double.isNaN(n.doubleValue()) ? 0 : 1; - } - - return size; - } - - public static double average(Collection x) { - return sum(x) / x.size(); - } - - public static double average(Collection numbers, boolean ignoreNan) { - if (ignoreNan) { - return sum(numbers, true) / nonNanSize(numbers); - } - else { - return sum(numbers, false) / nonNanSize(numbers); - } - } - - public static double variance(Collection numbers, Number mean, boolean ignoreNan) { - double mn = mean.doubleValue(); - double var = 0; - for (Number n : numbers) { - var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? 
(n.doubleValue() - mn) * (n.doubleValue() - mn) : 0; - } - if (ignoreNan) { - return var / (nonNanSize(numbers) - 1); - } - return var / (numbers.size() - 1); - } - - public static double variance(Collection numbers, Number mean) { - return variance(numbers, mean, false); - } - - public static double variance(Collection numbers, boolean ignoreNan) { - return variance(numbers, average(numbers, ignoreNan), ignoreNan); - } - - public static double variance(Collection numbers) { - return variance(numbers, average(numbers, false), false); - } - - public static double sum(double[] values) { + public static double sum(final double[] values) { double s = 0.0; for (double v : values) s += v; return s; } - public static long sum(int[] x) { + public static long sum(final int[] x) { long total = 0; for (int v : x) total += v; return total; } - public static int sum(byte[] x) { + public static int sum(final byte[] x) { int total = 0; for (byte v : x) total += (int)v; return total; } - /** - * Calculates the log10 cumulative sum of an array with log10 probabilities - * - * @param log10p the array with log10 probabilities - * @param upTo index in the array to calculate the cumsum up to - * @return the log10 of the cumulative sum - */ - public static double log10CumulativeSumLog10(double[] log10p, int upTo) { - return log10sumLog10(log10p, 0, upTo); + public static double percentage(int x, int base) { + return (base > 0 ? 
((double) x / (double) base) * 100.0 : 0); + } + + public static double ratio(final int num, final int denom) { + if ( denom > 0 ) { + return ((double) num)/denom; + } else { + if ( num == 0 && denom == 0) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + public static double ratio(final long num, final long denom) { + if ( denom > 0L ) { + return ((double) num)/denom; + } else { + if ( num == 0L && denom == 0L ) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } } /** @@ -251,18 +208,18 @@ public class MathUtils { * @param prRealSpace * @return */ - public static double[] toLog10(double[] prRealSpace) { + public static double[] toLog10(final double[] prRealSpace) { double[] log10s = new double[prRealSpace.length]; for (int i = 0; i < prRealSpace.length; i++) log10s[i] = Math.log10(prRealSpace[i]); return log10s; } - public static double log10sumLog10(double[] log10p, int start) { + public static double log10sumLog10(final double[] log10p, final int start) { return log10sumLog10(log10p, start, log10p.length); } - public static double log10sumLog10(double[] log10p, int start, int finish) { + public static double log10sumLog10(final double[] log10p,final int start,final int finish) { double sum = 0.0; double maxValue = arrayMax(log10p, finish); @@ -276,56 +233,42 @@ public class MathUtils { return Math.log10(sum) + maxValue; } - public static double sumDoubles(List values) { - double s = 0.0; - for (double v : values) - s += v; - return s; - } - - public static int sumIntegers(List values) { - int s = 0; - for (int v : values) - s += v; - return s; - } - - public static double sumLog10(double[] log10values) { + public static double sumLog10(final double[] log10values) { return Math.pow(10.0, log10sumLog10(log10values)); // double s 
= 0.0; // for ( double v : log10values) s += Math.pow(10.0, v); // return s; } - public static double log10sumLog10(double[] log10values) { + public static double log10sumLog10(final double[] log10values) { return log10sumLog10(log10values, 0); } - public static boolean wellFormedDouble(double val) { + public static boolean wellFormedDouble(final double val) { return !Double.isInfinite(val) && !Double.isNaN(val); } - public static double bound(double value, double minBoundary, double maxBoundary) { + public static double bound(final double value, final double minBoundary, final double maxBoundary) { return Math.max(Math.min(value, maxBoundary), minBoundary); } - public static boolean isBounded(double val, double lower, double upper) { + public static boolean isBounded(final double val, final double lower, final double upper) { return val >= lower && val <= upper; } - public static boolean isPositive(double val) { + public static boolean isPositive(final double val) { return !isNegativeOrZero(val); } - public static boolean isPositiveOrZero(double val) { + public static boolean isPositiveOrZero(final double val) { return isBounded(val, 0.0, Double.POSITIVE_INFINITY); } - public static boolean isNegativeOrZero(double val) { + public static boolean isNegativeOrZero(final double val) { return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); } - public static boolean isNegative(double val) { + public static boolean isNegative(final double val) { return !isPositiveOrZero(val); } @@ -336,7 +279,7 @@ public class MathUtils { * @param b the second double value * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. 
*/ - public static byte compareDoubles(double a, double b) { + public static byte compareDoubles(final double a, final double b) { return compareDoubles(a, b, 1e-6); } @@ -348,7 +291,7 @@ public class MathUtils { * @param epsilon the precision within which two double values will be considered equal * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b, double epsilon) { + public static byte compareDoubles(final double a, final double b, final double epsilon) { if (Math.abs(a - b) < epsilon) { return 0; } @@ -358,45 +301,31 @@ public class MathUtils { return 1; } - /** - * Compares float values for equality (within 1e-6), or inequality. - * - * @param a the first float value - * @param b the second float value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. - */ - public static byte compareFloats(float a, float b) { - return compareFloats(a, b, 1e-6f); - } - - /** - * Compares float values for equality (within epsilon), or inequality. - * - * @param a the first float value - * @param b the second float value - * @param epsilon the precision within which two float values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. - */ - public static byte compareFloats(float a, float b, float epsilon) { - if (Math.abs(a - b) < epsilon) { - return 0; - } - if (a > b) { - return -1; - } - return 1; - } - - public static double NormalDistribution(double mean, double sd, double x) { + public static double NormalDistribution(final double mean, final double sd, final double x) { double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); return a * b; } - public static double binomialCoefficient(int n, int k) { + /** + * Calculates the log10 of the binomial coefficient. 
Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k number of successes + * @return the log10 of the binomial coefficient + */ + public static double binomialCoefficient(final int n, final int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } + /** + * @see #binomialCoefficient(int, int) with log10 applied to result + */ + public static double log10BinomialCoefficient(final int n, final int k) { + return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); + } + /** * Computes a binomial probability. This is computed using the formula *

      @@ -409,27 +338,52 @@ public class MathUtils { * @param p probability of success * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. */ - public static double binomialProbability(int n, int k, double p) { + public static double binomialProbability(final int n, final int k, final double p) { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } + /** + * @see #binomialProbability(int, int, double) with log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k, final double log10p) { + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 + */ + public static double binomialProbability(final int n, final int k) { + return Math.pow(10, log10BinomialProbability(n, k)); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k) { + return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); + } + /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * Assumes that the probability of a successful hit is fair (i.e. 0.5). 
* - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) - * @param total - number of attempts for the number of hits - * @param probHit - probability of a successful hit + * @param n number of attempts for the number of hits + * @param k_start start (inclusive) of the cumulant sum (over hits) + * @param k_end end (inclusive) of the cumulant sum (over hits) * @return - returns the cumulative probability */ - public static double binomialCumulativeProbability(int start, int end, int total, double probHit) { + public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { + if ( k_end > n ) + throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); + double cumProb = 0.0; double prevProb; BigDecimal probCache = BigDecimal.ZERO; - for (int hits = start; hits < end; hits++) { + for (int hits = k_start; hits <= k_end; hits++) { prevProb = cumProb; - double probability = binomialProbability(total, hits, probHit); + final double probability = binomialProbability(n, hits); cumProb += probability; if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); @@ -442,6 +396,41 @@ public class MathUtils { return probCache.add(new BigDecimal(cumProb)).doubleValue(); } + /** + * Calculates the log10 of the multinomial coefficient. Designed to prevent + * overflows even with very large numbers. 
+ * + * @param n total number of trials + * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) + * @return + */ + public static double log10MultinomialCoefficient(final int n, final int[] k) { + double denominator = 0.0; + for (int x : k) { + denominator += log10Factorial(x); + } + return log10Factorial(n) - denominator; + } + + /** + * Computes the log10 of the multinomial distribution probability given a vector + * of log10 probabilities. Designed to prevent overflows even with very large numbers. + * + * @param n number of trials + * @param k array of number of successes for each possibility + * @param log10p array of log10 probabilities + * @return + */ + public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { + if (log10p.length != k.length) + throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); + double log10Prod = 0.0; + for (int i = 0; i < log10p.length; i++) { + log10Prod += log10p[i] * k[i]; + } + return log10MultinomialCoefficient(n, k) + log10Prod; + } + /** * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. * This is computed using the formula: @@ -454,7 +443,7 @@ public class MathUtils { * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed * @return the multinomial of the specified configuration. */ - public static double multinomialCoefficient(int[] k) { + public static double multinomialCoefficient(final int[] k) { int n = 0; for (int xi : k) { n += xi; @@ -477,7 +466,7 @@ public class MathUtils { * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur * @return the multinomial probability of the specified configuration. 
*/ - public static double multinomialProbability(int[] k, double[] p) { + public static double multinomialProbability(final int[] k, final double[] p) { if (p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); @@ -496,7 +485,7 @@ public class MathUtils { * @param x an byte[] of numbers * @return the RMS of the specified numbers. */ - public static double rms(byte[] x) { + public static double rms(final byte[] x) { if (x.length == 0) return 0.0; @@ -513,7 +502,7 @@ public class MathUtils { * @param x an int[] of numbers * @return the RMS of the specified numbers. */ - public static double rms(int[] x) { + public static double rms(final int[] x) { if (x.length == 0) return 0.0; @@ -530,7 +519,7 @@ public class MathUtils { * @param x a double[] of numbers * @return the RMS of the specified numbers. */ - public static double rms(Double[] x) { + public static double rms(final Double[] x) { if (x.length == 0) return 0.0; @@ -541,7 +530,7 @@ public class MathUtils { return Math.sqrt(rms); } - public static double rms(Collection l) { + public static double rms(final Collection l) { if (l.size() == 0) return 0.0; @@ -560,7 +549,7 @@ public class MathUtils { return dist; } - public static double round(double num, int digits) { + public static double round(final double num, final int digits) { double result = num * Math.pow(10.0, (double) digits); result = Math.round(result); result = result / Math.pow(10.0, (double) digits); @@ -574,7 +563,7 @@ public class MathUtils { * @param takeLog10OfOutput if true, the output will be transformed back into log10 units * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed */ - public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) { + public static double[] normalizeFromLog10(final double[] array, final boolean 
takeLog10OfOutput) { return normalizeFromLog10(array, takeLog10OfOutput, false); } @@ -587,7 +576,7 @@ public class MathUtils { * * @return */ - public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) { + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. double maxValue = arrayMax(array); @@ -630,7 +619,7 @@ public class MathUtils { * @param array the array to be normalized * @return a newly allocated array corresponding the normalized values in array */ - public static double[] normalizeFromLog10(double[] array) { + public static double[] normalizeFromLog10(final double[] array) { return normalizeFromLog10(array, false); } @@ -683,7 +672,7 @@ public class MathUtils { return maxElementIndex(array, array.length); } - public static int maxElementIndex(final int[] array, int endIndex) { + public static int maxElementIndex(final int[] array, final int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -696,7 +685,7 @@ public class MathUtils { return maxI; } - public static int maxElementIndex(final byte[] array, int endIndex) { + public static int maxElementIndex(final byte[] array, final int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -709,7 +698,7 @@ public class MathUtils { return maxI; } - public static byte arrayMax(final byte[] array) { + public static int arrayMax(final int[] array) { return array[maxElementIndex(array)]; } @@ -722,19 +711,19 @@ public class MathUtils { return array[maxElementIndex(array, endIndex)]; } - public static double arrayMin(double[] array) { + public static double arrayMin(final double[] array) { return 
array[minElementIndex(array)]; } - public static int arrayMin(int[] array) { + public static int arrayMin(final int[] array) { return array[minElementIndex(array)]; } - public static byte arrayMin(byte[] array) { + public static byte arrayMin(final byte[] array) { return array[minElementIndex(array)]; } - public static int minElementIndex(double[] array) { + public static int minElementIndex(final double[] array) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -747,7 +736,7 @@ public class MathUtils { return minI; } - public static int minElementIndex(byte[] array) { + public static int minElementIndex(final byte[] array) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -760,7 +749,7 @@ public class MathUtils { return minI; } - public static int minElementIndex(int[] array) { + public static int minElementIndex(final int[] array) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -773,7 +762,7 @@ public class MathUtils { return minI; } - public static int arrayMaxInt(List array) { + public static int arrayMaxInt(final List array) { if (array == null) throw new IllegalArgumentException("Array cannot be null!"); if (array.size() == 0) @@ -785,19 +774,15 @@ public class MathUtils { return m; } - public static double arrayMaxDouble(List array) { - if (array == null) - throw new IllegalArgumentException("Array cannot be null!"); - if (array.size() == 0) - throw new IllegalArgumentException("Array size cannot be 0!"); - - double m = array.get(0); - for (double e : array) - m = Math.max(m, e); - return m; + public static int sum(final List list ) { + int sum = 0; + for ( Integer i : list ) { + sum += i; + } + return sum; } - public static double average(List vals, int maxI) { + public static double average(final List vals, final int maxI) { long sum = 0L; int i = 0; @@ -814,201 +799,11 @@ public class 
MathUtils { return (1.0 * sum) / i; } - public static double averageDouble(List vals, int maxI) { - double sum = 0.0; - - int i = 0; - for (double x : vals) { - if (i > maxI) - break; - sum += x; - i++; - } - return (1.0 * sum) / i; - } - - public static double average(List vals) { + public static double average(final List vals) { return average(vals, vals.size()); } - public static double average(int[] x) { - int sum = 0; - for (int v : x) - sum += v; - return (double) sum / x.length; - } - - public static byte average(byte[] vals) { - int sum = 0; - for (byte v : vals) { - sum += v; - } - return (byte) (sum / vals.length); - } - - public static double averageDouble(List vals) { - return averageDouble(vals, vals.size()); - } - - // Java Generics can't do primitive types, so I had to do this the simplistic way - - public static Integer[] sortPermutation(final int[] A) { - class comparator implements Comparator { - public int compare(Integer a, Integer b) { - if (A[a.intValue()] < A[b.intValue()]) { - return -1; - } - if (A[a.intValue()] == A[b.intValue()]) { - return 0; - } - if (A[a.intValue()] > A[b.intValue()]) { - return 1; - } - return 0; - } - } - Integer[] permutation = new Integer[A.length]; - for (int i = 0; i < A.length; i++) { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - - public static Integer[] sortPermutation(final double[] A) { - class comparator implements Comparator { - public int compare(Integer a, Integer b) { - if (A[a.intValue()] < A[b.intValue()]) { - return -1; - } - if (A[a.intValue()] == A[b.intValue()]) { - return 0; - } - if (A[a.intValue()] > A[b.intValue()]) { - return 1; - } - return 0; - } - } - Integer[] permutation = new Integer[A.length]; - for (int i = 0; i < A.length; i++) { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - - public static Integer[] sortPermutation(List A) { - final Object[] data = A.toArray(); - - class 
comparator implements Comparator { - public int compare(Integer a, Integer b) { - return ((T) data[a]).compareTo(data[b]); - } - } - Integer[] permutation = new Integer[A.size()]; - for (int i = 0; i < A.size(); i++) { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - - public static int[] permuteArray(int[] array, Integer[] permutation) { - int[] output = new int[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static double[] permuteArray(double[] array, Integer[] permutation) { - double[] output = new double[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static Object[] permuteArray(Object[] array, Integer[] permutation) { - Object[] output = new Object[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static String[] permuteArray(String[] array, Integer[] permutation) { - String[] output = new String[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static List permuteList(List list, Integer[] permutation) { - List output = new ArrayList(); - for (int i = 0; i < permutation.length; i++) { - output.add(list.get(permutation[i])); - } - return output; - } - - /** - * Draw N random elements from list. - */ - public static List randomSubset(List list, int N) { - if (list.size() <= N) { - return list; - } - - int idx[] = new int[list.size()]; - for (int i = 0; i < list.size(); i++) { - idx[i] = GenomeAnalysisEngine.getRandomGenerator().nextInt(); - } - - Integer[] perm = sortPermutation(idx); - - List ans = new ArrayList(); - for (int i = 0; i < N; i++) { - ans.add(list.get(perm[i])); - } - - return ans; - } - - /** - * Draw N random elements from an array. 
- * - * @param array your objects - * @param n number of elements to select at random from the list - * @return a new list with the N randomly chosen elements from list - */ - @Requires({"array != null", "n>=0"}) - @Ensures({"result != null", "result.length == Math.min(n, array.length)"}) - public static Object[] randomSubset(final Object[] array, final int n) { - if (array.length <= n) - return array.clone(); - - Object[] shuffledArray = arrayShuffle(array); - Object[] result = new Object[n]; - System.arraycopy(shuffledArray, 0, result, 0, n); - return result; - } - - public static double percentage(double x, double base) { - return (base > 0 ? (x / base) * 100.0 : 0); - } - - public static double percentage(int x, int base) { - return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); - } - - public static double percentage(long x, long base) { - return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); - } - - public static int countOccurrences(char c, String s) { + public static int countOccurrences(final char c, final String s) { int count = 0; for (int i = 0; i < s.length(); i++) { count += s.charAt(i) == c ? 1 : 0; @@ -1036,27 +831,6 @@ public class MathUtils { return count; } - /** - * Returns the top (larger) N elements of the array. Naive n^2 implementation (Selection Sort). 
- * Better than sorting if N (number of elements to return) is small - * - * @param array the array - * @param n number of top elements to return - * @return the n larger elements of the array - */ - public static Collection getNMaxElements(double[] array, int n) { - ArrayList maxN = new ArrayList(n); - double lastMax = Double.MAX_VALUE; - for (int i = 0; i < n; i++) { - double max = Double.MIN_VALUE; - for (double x : array) { - max = Math.min(lastMax, Math.max(x, max)); - } - maxN.add(max); - lastMax = max; - } - return maxN; - } /** * Returns n random indices drawn with replacement from the range 0..(k-1) @@ -1065,7 +839,7 @@ public class MathUtils { * @param k the number of random indices to draw (with replacement) * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates */ - static public ArrayList sampleIndicesWithReplacement(int n, int k) { + static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { ArrayList chosen_balls = new ArrayList(k); for (int i = 0; i < k; i++) { @@ -1084,7 +858,7 @@ public class MathUtils { * @param k the number of random indices to draw (without replacement) * @return a list of k random indices ranging from 0 to (n-1) without duplicates */ - static public ArrayList sampleIndicesWithoutReplacement(int n, int k) { + static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { ArrayList chosen_balls = new ArrayList(k); for (int i = 0; i < n; i++) { @@ -1105,7 +879,7 @@ public class MathUtils { * @param the template type of the ArrayList * @return a new ArrayList consisting of the elements at the specified indices */ - static public ArrayList sliceListByIndices(List indices, List list) { + static public ArrayList sliceListByIndices(final List indices, final List list) { ArrayList subset = new ArrayList(); for (int i : indices) { @@ -1115,35 +889,6 @@ public class MathUtils { return subset; } - public static Comparable orderStatisticSearch(int orderStat, List 
list) { - // this finds the order statistic of the list (kth largest element) - // the list is assumed *not* to be sorted - - final Comparable x = list.get(orderStat); - ArrayList lessThanX = new ArrayList(); - ArrayList equalToX = new ArrayList(); - ArrayList greaterThanX = new ArrayList(); - - for (Comparable y : list) { - if (x.compareTo(y) > 0) { - lessThanX.add(y); - } - else if (x.compareTo(y) < 0) { - greaterThanX.add(y); - } - else - equalToX.add(y); - } - - if (lessThanX.size() > orderStat) - return orderStatisticSearch(orderStat, lessThanX); - else if (lessThanX.size() + equalToX.size() >= orderStat) - return orderStat; - else - return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); - - } - /** * Given two log-probability vectors, compute log of vector product of them: * in Matlab notation, return log10(10.*x'*10.^y) @@ -1151,7 +896,7 @@ public class MathUtils { * @param y vector 2 * @return a double representing log (dotProd(10.^x,10.^y) */ - public static double logDotProduct(double [] x, double[] y) { + public static double logDotProduct(final double [] x, final double[] y) { if (x.length != y.length) throw new ReviewedStingException("BUG: Vectors of different lengths"); @@ -1165,57 +910,6 @@ public class MathUtils { - } - public static Object getMedian(List list) { - return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); - } - - public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { - // version of the order statistic calculator for SAMRecord/Integer lists, where the - // list index maps to a q-score only through the offset index - // returns the kth-largest q-score. 
- - if (reads.size() == 0) { - return 0; - } - - ArrayList lessThanQReads = new ArrayList(); - ArrayList equalToQReads = new ArrayList(); - ArrayList greaterThanQReads = new ArrayList(); - ArrayList lessThanQOffsets = new ArrayList(); - ArrayList greaterThanQOffsets = new ArrayList(); - - final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; - - for (int iter = 0; iter < reads.size(); iter++) { - SAMRecord read = reads.get(iter); - int offset = offsets.get(iter); - byte quality = read.getBaseQualities()[offset]; - - if (quality < qk) { - lessThanQReads.add(read); - lessThanQOffsets.add(offset); - } - else if (quality > qk) { - greaterThanQReads.add(read); - greaterThanQOffsets.add(offset); - } - else { - equalToQReads.add(reads.get(iter)); - } - } - - if (lessThanQReads.size() > k) - return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); - else if (equalToQReads.size() + lessThanQReads.size() >= k) - return qk; - else - return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); - - } - - public static byte getQScoreMedian(List reads, List offsets) { - return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } /** @@ -1336,29 +1030,6 @@ public class MathUtils { // // useful common utility routines // - public static double rate(long n, long d) { - return n / (1.0 * Math.max(d, 1)); - } - - public static double rate(int n, int d) { - return n / (1.0 * Math.max(d, 1)); - } - - public static long inverseRate(long n, long d) { - return n == 0 ? 0 : d / Math.max(n, 1); - } - - public static long inverseRate(int n, int d) { - return n == 0 ? 
0 : d / Math.max(n, 1); - } - - public static double ratio(int num, int denom) { - return ((double) num) / (Math.max(denom, 1)); - } - - public static double ratio(long num, long denom) { - return ((double) num) / (Math.max(denom, 1)); - } static public double max(double x0, double x1, double x2) { double a = Math.max(x0, x1); @@ -1371,8 +1042,8 @@ public class MathUtils { * @param ln log(x) * @return log10(x) */ - public static double lnToLog10(double ln) { - return ln * Math.log10(Math.exp(1)); + public static double lnToLog10(final double ln) { + return ln * Math.log10(Math.E); } /** @@ -1384,7 +1055,7 @@ public class MathUtils { * Efficient rounding functions to simplify the log gamma function calculation * double to long with 32 bit shift */ - private static final int HI(double x) { + private static final int HI(final double x) { return (int) (Double.doubleToLongBits(x) >> 32); } @@ -1392,7 +1063,7 @@ public class MathUtils { * Efficient rounding functions to simplify the log gamma function calculation * double to long without shift */ - private static final int LO(double x) { + private static final int LO(final double x) { return (int) Double.doubleToLongBits(x); } @@ -1400,7 +1071,7 @@ public class MathUtils { * Most efficent implementation of the lnGamma (FDLIBM) * Use via the log10Gamma wrapper method. */ - private static double lnGamma(double x) { + private static double lnGamma(final double x) { double t, y, z, p, p1, p2, p3, q, r, w; int i; @@ -1521,68 +1192,16 @@ public class MathUtils { * @param x the x parameter * @return the log10 of the gamma function at x. */ - public static double log10Gamma(double x) { + public static double log10Gamma(final double x) { return lnToLog10(lnGamma(x)); } - /** - * Calculates the log10 of the binomial coefficient. Designed to prevent - * overflows even with very large numbers. 
- * - * @param n total number of trials - * @param k number of successes - * @return the log10 of the binomial coefficient - */ - public static double log10BinomialCoefficient(int n, int k) { - return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); - } - - public static double log10BinomialProbability(int n, int k, double log10p) { - double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); - return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); - } - - /** - * Calculates the log10 of the multinomial coefficient. Designed to prevent - * overflows even with very large numbers. - * - * @param n total number of trials - * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) - * @return - */ - public static double log10MultinomialCoefficient(int n, int[] k) { - double denominator = 0.0; - for (int x : k) { - denominator += log10Factorial(x ); - } - return log10Factorial(n) - denominator; - } - - /** - * Computes the log10 of the multinomial distribution probability given a vector - * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
- * - * @param n number of trials - * @param k array of number of successes for each possibility - * @param log10p array of log10 probabilities - * @return - */ - public static double log10MultinomialProbability(int n, int[] k, double[] log10p) { - if (log10p.length != k.length) - throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); - double log10Prod = 0.0; - for (int i = 0; i < log10p.length; i++) { - log10Prod += log10p[i] * k[i]; - } - return log10MultinomialCoefficient(n, k) + log10Prod; - } - - public static double factorial(int x) { + public static double factorial(final int x) { // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value return (double)Math.round(Math.pow(10, log10Factorial(x))); } - public static double log10Factorial(int x) { + public static double log10Factorial(final int x) { if (x >= log10FactorialCache.length || x < 0) return log10Gamma(x + 1); else @@ -1598,57 +1217,20 @@ public class MathUtils { */ @Requires("a.length == b.length") @Ensures("result.length == a.length") - public static int[] addArrays(int[] a, int[] b) { + public static int[] addArrays(final int[] a, final int[] b) { int[] c = new int[a.length]; for (int i = 0; i < a.length; i++) c[i] = a[i] + b[i]; return c; } - /** - * Quick implementation of the Knuth-shuffle algorithm to generate a random - * permutation of the given array. 
- * - * @param array the original array - * @return a new array with the elements shuffled - */ - public static Object[] arrayShuffle(Object[] array) { - int n = array.length; - Object[] shuffled = array.clone(); - for (int i = 0; i < n; i++) { - int j = i + GenomeAnalysisEngine.getRandomGenerator().nextInt(n - i); - Object tmp = shuffled[i]; - shuffled[i] = shuffled[j]; - shuffled[j] = tmp; - } - return shuffled; - } - - /** - * Vector operations - * - * @param v1 first numerical array - * @param v2 second numerical array - * @return a new array with the elements added - */ - public static Double[] vectorSum(E v1[], E v2[]) { - if (v1.length != v2.length) - throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); - - Double[] result = new Double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = v1[k].doubleValue() + v2[k].doubleValue(); - - return result; - } - /** Same routine, unboxed types for efficiency * * @param x First vector * @param y Second vector * @return Vector of same length as x and y so that z[k] = x[k]+y[k] */ - public static double[] vectorSum(double[]x, double[] y) { + public static double[] vectorSum(final double[]x, final double[] y) { if (x.length != y.length) throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); @@ -1665,24 +1247,7 @@ public class MathUtils { * @param y Second vector * @return Vector of same length as x and y so that z[k] = x[k]-y[k] */ - public static double[] vectorDiff(double[]x, double[] y) { - if (x.length != y.length) - throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); - - double[] result = new double[x.length]; - for (int k=0; k Double[] scalarTimesVector(E a, E[] v1) { - - Double result[] = new Double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = a.doubleValue() * v1[k].doubleValue(); - - return result; - } - - public static Double dotProduct(E[] v1, E[] v2) { - if (v1.length != v2.length) - throw new 
UserException("BUG: vectors v1, v2 of different size in vectorSum()"); - - Double result = 0.0; - for (int k = 0; k < v1.length; k++) - result += v1[k].doubleValue() * v2[k].doubleValue(); - - return result; - } - - public static double dotProduct(double[] v1, double[] v2) { - if (v1.length != v2.length) - throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); - - double result = 0.0; - for (int k = 0; k < v1.length; k++) - result += v1[k] * v2[k]; - - return result; - } - - public static double[] vectorLog10(double v1[]) { - double result[] = new double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = Math.log10(v1[k]); - - return result; - - } - - // todo - silly overloading, just because Java can't unbox/box arrays of primitive types, and we can't do generics with primitive types! - public static Double[] vectorLog10(Double v1[]) { - Double result[] = new Double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = Math.log10(v1[k]); - - return result; - - } - /** * Returns a series of integer values between start and stop, inclusive, * expontentially distributed between the two. That is, if there are @@ -1796,4 +1311,18 @@ public class MathUtils { return Double.isInfinite(d) || d > 0.0 ? 
0.0 : d; } } + + /** + * Draw N random elements from list + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSubset(final List list, final int N) { + if (list.size() <= N) { + return list; + } + + return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java index f08564644..029dfad31 100644 --- a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java @@ -28,6 +28,9 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.LinkedList; +import java.util.List; + /** * A canonical, master list of the standard NGS platforms. These values * can be obtained (efficiently) from a GATKSAMRecord object with the @@ -117,4 +120,17 @@ public enum NGSPlatform { public static boolean isKnown(final String platform) { return fromReadGroupPL(platform) != UNKNOWN; } + + /** + * Get a human-readable list of platform names + * @return the list of platform names + */ + public static String knownPlatformsString() { + final List names = new LinkedList(); + for ( final NGSPlatform pl : values() ) { + for ( final String name : pl.BAM_PL_NAMES ) + names.add(name); + } + return Utils.join(",", names); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java deleted file mode 100644 index 7bd937af9..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ /dev/null @@ -1,785 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining 
a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 23, 2009 - * Time: 1:54:54 PM - * To change this template use File | Settings | File Templates. 
- */ -public class SWPairwiseAlignment { - private int alignment_offset; // offset of s2 w/respect to s1 - private Cigar alignmentCigar; - - private final double w_match; - private final double w_mismatch; - private final double w_open; - private final double w_extend; - - private static final int MSTATE = 0; - private static final int ISTATE = 1; - private static final int DSTATE = 2; - private static final int CLIP = 3; - - private static boolean cutoff = false; - private static boolean DO_SOFTCLIP = true; - - double[] SW; - -// private double [] best_gap_v ; -// private int [] gap_size_v ; -// private double [] best_gap_h ; -// private int [] gap_size_h ; - - - // private static double [][] sw = new double[500][500]; - // private static int [][] btrack = new int[500][500]; - - // ************************************************************************ - // **** IMPORTANT NOTE: **** - // **** This class assumes that all bytes come from UPPERCASED chars! **** - // ************************************************************************ - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { - w_match = match; - w_mismatch = mismatch; - w_open = open; - w_extend = extend; - align(seq1,seq2); - } - - - public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { - this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) - } - - - public Cigar getCigar() { return alignmentCigar ; } - - public int getAlignmentStart2wrt1() { return alignment_offset; } - - public void align(final byte[] a, final byte[] b) { - final int n = a.length; - final int m = b.length; - double [] sw = new double[(n+1)*(m+1)]; - SW = sw; - int [] btrack = new int[(n+1)*(m+1)]; - -// best_gap_v = new double[m+1]; -// Arrays.fill(best_gap_v,-1.0e40); -// gap_size_v = new int[m+1]; -// best_gap_h = new double[n+1]; -// Arrays.fill(best_gap_h,-1.0e40); -// gap_size_h = new int[n+1]; - - calculateMatrix(a, b, 
sw, btrack); - calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) - } - - - private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { - final int n = a.length+1; - final int m = b.length+1; - - //final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff - final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff - if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0; - else MATRIX_MIN_CUTOFF = -1e100; - - double [] best_gap_v = new double[m+1]; - Arrays.fill(best_gap_v,-1.0e40); - int [] gap_size_v = new int[m+1]; - double [] best_gap_h = new double[n+1]; - Arrays.fill(best_gap_h,-1.0e40); - int [] gap_size_h = new int[n+1]; - - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop - byte a_base = a[i-1]; // letter in a at the current pos - - final int row_offset = row_offset_1 + m; - - // On the entrance into the loop, row_offset_1 is the (linear) offset - // of the first element of row (i-1) and row_offset is the linear offset of the - // start of row i - - for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) { - - // data_offset_1 is linearized offset of element [i-1][j-1] - - final byte b_base = b[j-1]; // letter in b at the current pos - - // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_diag = sw[data_offset_1] + wd(a_base,b_base); - - // optimized "traversal" of all the matrix cells above the current one (i.e. traversing - // all 'step down' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
- - // if a gap (length 1) was just opened above, this is the cost of arriving to the current cell: - double prev_gap = sw[data_offset_1+1]+w_open; - - best_gap_v[j] += w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend - - if ( prev_gap > best_gap_v[j] ) { - // opening a gap just before the current cell results in better score than extending by one - // the best previously opened gap. This will hold for ALL cells below: since any gap - // once opened always costs w_extend to extend by another base, we will always get a better score - // by arriving to any cell below from the gap we just opened (prev_gap) rather than from the previous best gap - best_gap_v[j] = prev_gap; - gap_size_v[j] = 1; // remember that the best step-down gap from above has length 1 (we just opened it) - } else { - // previous best gap is still the best, even after extension by another base, so we just record that extension: - gap_size_v[j]++; - } - - final double step_down = best_gap_v[j] ; - final int kd = gap_size_v[j]; - -/* - for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) { - // data_offset_k is linearized offset of element [i-k][j] - // in other words, trial = sw[i-k][j]+gap_penalty: - final double trial = sw[data_offset_k]+wk(k); - if ( step_down < trial ) { - step_down=trial; - kd = k; - } - } -*/ - - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing - // all 'step right' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
- - final int data_offset = row_offset + j; // linearized offset of element [i][j] - prev_gap = sw[data_offset-1]+w_open; // what would it cost us to open length 1 gap just to the left from current cell - best_gap_h[i] += w_extend; // previous best gap would cost us that much if extended by another base - - if ( prev_gap > best_gap_h[i] ) { - // newly opened gap is better (score-wise) than any previous gap with the same row index i; since - // gap penalty is linear with k, this new gap location is going to remain better than any previous ones - best_gap_h[i] = prev_gap; - gap_size_h[i] = 1; - } else { - gap_size_h[i]++; - } - - final double step_right = best_gap_h[i]; - final int ki = gap_size_h[i]; - -/* - for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) { - // data_offset is linearized offset of element [i][j-k] - // in other words, step_right=sw[i][j-k]+gap_penalty; - final double trial = sw[data_offset]+wk(k); - if ( step_right < trial ) { - step_right=trial; - ki = k; - } - } - - final int data_offset = row_offset + j; // linearized offset of element [i][j] -*/ - - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down); - btrack[data_offset] = kd ; // positive=vertical - } else { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } else { - // step_down <= step_right - if ( step_right > step_diag ) { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_right); - btrack[data_offset] = -ki; // negative = horizontal - } else { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } - -// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - - // IMPORTANT, IMPORTANT, IMPORTANT: - // note that we update this (secondary) outer loop variable here, - // so that we DO NOT need to update it - // in the for() statement itself. 
- row_offset_1 = row_offset; - } -// print(sw,a,b); - } - - - private void calculateCigar(int n, int m, double [] sw, int [] btrack) { - // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - //PrimitivePair.Int p = new PrimitivePair.Int(); - int p1 = 0, p2 = 0; - - double maxscore = 0.0; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) - - // look for largest score. we use >= combined with the traversal direction - // to ensure that if two scores are equal, the one closer to diagonal gets picked - for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) { - // data_offset is the offset of [i][m] - if ( sw[data_offset] >= maxscore ) { - p1 = i; p2 = m ; maxscore = sw[data_offset]; - } - } - - for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) { - // data_offset is the offset of [n][j] - if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { - p1 = n; - p2 = j ; -// maxscore = sw[n][j]; - maxscore = sw[data_offset]; - segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment - } - } -// System.out.println(" Found max score="+maxscore+" at p1="+p1+ " p2="+p2); - - List lce = new ArrayList(5); - - if ( segment_length > 0 && DO_SOFTCLIP ) { - lce.add(makeElement(CLIP, segment_length)); - segment_length = 0; - } - - // we will be placing all insertions and deletions into sequence b, so the states are named w/regard - // to that sequence - - int state = MSTATE; - - int data_offset = p1*(m+1)+p2; // offset of element [p1][p2] - // System.out.println("Backtracking: starts at "+p1+":"+p2+" ("+sw[data_offset]+")"); - do { -// int btr = btrack[p1][p2]; - int btr = btrack[data_offset]; - - int new_state; - int step_length = 1; - - // System.out.print(" backtrack value: "+btr); - - if ( btr > 0 ) { - new_state = DSTATE; - step_length = btr; - } else 
if ( btr < 0 ) { - new_state = ISTATE; - step_length = (-btr); - } else new_state = MSTATE; // and step_length =1, already set above - - - // move to next best location in the sw matrix: - switch( new_state ) { - case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in th esw matrix - case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left - case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up - } - // System.out.println("; backtracked to p1="+p1+" p2="+p2); - /* - switch( new_state ) { - case MSTATE: System.out.println(" diag (match) to "+ sw[data_offset]); break; // equivalent to p1--; p2-- - case ISTATE: System.out.println(" left (insertion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p2-=step_length; - case DSTATE: System.out.println(" up (deletion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p1 -= step_up - } - */ - // now let's see if the state actually changed: - if ( new_state == state ) segment_length+=step_length; - else { -// System.out.println(" emitting "+segment_length+makeElement(state,segment_length).getOperator().toString()); - // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). - lce.add(makeElement(state, segment_length)); - segment_length = step_length; - state = new_state; - } -// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2: - } while ( p1 > 0 && p2 > 0 ); - - // post-process the last segment we are still keeping; - // NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting - // those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false; - // otherwise they will be softclipped. For instance, - // if read length is 5 and alignment starts at offset -2 (i.e. 
read starts before the ref, and only - // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if - // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true. - // The consumers need to check for the alignment offset and deal with it properly. - if (DO_SOFTCLIP ) { - lce.add(makeElement(state, segment_length)); - if ( p2> 0 ) lce.add(makeElement(CLIP, p2)); - alignment_offset = p1 ; - } else { - lce.add(makeElement(state, segment_length + p2)); - alignment_offset = p1 - p2; - } - - Collections.reverse(lce); - alignmentCigar = new Cigar(lce); - - } - - - private CigarElement makeElement(int state, int segment_length) { - CigarOperator o = null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - case CLIP: o = CigarOperator.S; break; - } - return new CigarElement(segment_length,o); - } - - private double wd(byte x, byte y) { - return (x == y ? w_match : w_mismatch); - } - - private double wk(int k) { - return w_open+(k-1)*w_extend; // gap - } - - private void print(double[] s, byte[] a, byte[] b) { - int n = a.length+1; - int m = b.length+1; - System.out.print(" "); - for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ; - System.out.println(); - - for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) { - if ( i > 0 ) System.out.print((char)a[i-1]); - else System.out.print(' '); - System.out.print(" "); - for ( int j = 0; j < m ; j++ ) { - System.out.printf(" %5.1f",s[row_offset+j]); - } - System.out.println(); - } - } - - static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read) { - printAlignment(a,ref,read,100); - } - - static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read, int width) { - StringBuilder bread = new StringBuilder(); - StringBuilder bref = new StringBuilder(); - StringBuilder match = new StringBuilder(); - - int i = 0; - int j = 0; - - final int offset = 
a.getAlignmentStart2wrt1(); - - Cigar cigar = a.getCigar(); - - if ( ! DO_SOFTCLIP ) { - - // we need to go through all the hassle below only if we do not do softclipping; - // otherwise offset is never negative - if ( offset < 0 ) { - for ( ; j < (-offset) ; j++ ) { - bread.append((char)read[j]); - bref.append(' '); - match.append(' '); - } - // at negative offsets, our cigar's first element carries overhanging bases - // that we have just printed above. Tweak the first element to - // exclude those bases. Here we create a new list of cigar elements, so the original - // list/original cigar are unchanged (they are unmodifiable anyway!) - - List tweaked = new ArrayList(); - tweaked.addAll(cigar.getCigarElements()); - tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset, - cigar.getCigarElement(0).getOperator())); - cigar = new Cigar(tweaked); - } - } - - if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. offset = 0 - for ( ; i < a.getAlignmentStart2wrt1() ; i++ ) { - bref.append((char)ref[i]); - bread.append(' '); - match.append(' '); - } - } - - for ( CigarElement e : cigar.getCigarElements() ) { - switch (e.getOperator()) { - case M : - for ( int z = 0 ; z < e.getLength() ; z++, i++, j++ ) { - bref.append((i= s.length() ) { - System.out.println(); - return; - } - int end = Math.min(start+width,s.length()); - System.out.println(s.substring(start,end)); - - } - -// BELOW: main() method for testing; old implementations of the core methods are commented out below; -// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed. 
- - public static void main(String argv[]) { -// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT"; -// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG"; - - String ref = null; - String read = null; - - Map> args = processArgs(argv); - - List l = args.get("SEQ"); - args.remove("SEQ"); - if ( l == null ) { - System.err.println("SEQ argument is missing. Two input sequences must be provided"); - System.exit(1); - } - if ( l.size() != 2 ) { - System.err.println("Two input sequences (SEQ arguments) must be provided. Found "+l.size()+" instead"); - System.exit(1); - } - - ref = l.get(0); - read = l.get(1); - - Double m = extractSingleDoubleArg("MATCH",args); - Double mm = extractSingleDoubleArg("MISMATCH",args); - Double open = extractSingleDoubleArg("OPEN",args); - Double ext = extractSingleDoubleArg("EXTEND",args); - - Boolean reverse = extractSingleBooleanArg("REVERSE",args); - if ( reverse != null && reverse.booleanValue() == true ) { - ref = Utils.reverse(ref); - read = Utils.reverse(read); - } - - Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args); - Boolean cut = extractSingleBooleanArg("CUTOFF",args); - if ( cut != null ) SWPairwiseAlignment.cutoff = cut; - - if ( args.size() != 0 ) { - System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next()); - System.exit(1); - } - - double w_match; - double w_mismatch; - double w_open; - double w_extend; - - w_match = (m == null ? 30.0 : m.doubleValue()); - w_mismatch = (mm == null ? -10.0 : mm.doubleValue()); - w_open = (open == null ? -10.0 : open.doubleValue()); - w_extend = (ext == null ? 
-2.0 : ext.doubleValue()); - - - SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); - - System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ - " length1="+ref.length()+" length2="+read.length()); - - - System.out.println(); - printAlignment(a,ref.getBytes(),read.getBytes()); - - System.out.println(); - if ( print_mat != null && print_mat == true ) { - a.print(a.SW,ref.getBytes(),read.getBytes()); - } - } - - - static Pair getArg(String prefix, String argv[], int i) { - String arg = null; - if ( argv[i].startsWith(prefix) ) { - arg = argv[i].substring(prefix.length()); - if( arg.length() == 0 ) { - i++; - if ( i < argv.length ) arg = argv[i]; - else { - System.err.println("No value found after " + prefix + " argument tag"); - System.exit(1); - } - } - i++; - } - return new Pair(arg,i); - } - - static Map> processArgs(String argv[]) { - Map> args = new HashMap>(); - - for ( int i = 0; i < argv.length ; i++ ) { - String arg = argv[i]; - int pos = arg.indexOf('='); - if ( pos < 0 ) { - System.err.println("Argument "+arg+" is not of the form ="); - System.exit(1); - } - String val = arg.substring(pos+1); - if ( val.length() == 0 ) { - // there was a space between '=' and the value - i++; - if ( i < argv.length ) val = argv[i]; - else { - System.err.println("No value found after " + arg + " argument tag"); - System.exit(1); - } - } - arg = arg.substring(0,pos); - - List l = args.get(arg); - if ( l == null ) { - l = new ArrayList(); - args.put(arg,l); - } - l.add(val); - } - return args; - } - - static Double extractSingleDoubleArg(String argname, Map> args) { - List l = args.get(argname); - args.remove(argname); - if ( l == null ) return null; - - if ( l.size() > 1 ) { - System.err.println("Only one "+argname+" argument is allowed"); - System.exit(1); - } - double d=0; - try { - d = Double.parseDouble(l.get(0)); - } catch ( NumberFormatException e) { - System.err.println("Can not 
parse value provided for "+argname+" argument ("+l.get(0)+")"); - System.exit(1); - } - System.out.println("Argument "+argname+" set to "+d); - return new Double(d); - } - - - static Boolean extractSingleBooleanArg(String argname, Map> args) { - List l = args.get(argname); - args.remove(argname); - if ( l == null ) return null; - - if ( l.size() > 1 ) { - System.err.println("Only one "+argname+" argument is allowed"); - System.exit(1); - } - if ( l.get(0).equals("true") ) return Boolean.valueOf(true); - if ( l.get(0).equals("false") ) return Boolean.valueOf(false); - System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed"); - System.exit(1); - return Boolean.valueOf(false); // This value isn't used because it is preceded by System.exit(1) - } - -/* ############################################## - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend, boolean runOld ) { - w_match = match; - w_mismatch = mismatch; - w_open = open; - w_extend = extend; - if ( runOld ) align_old(seq1,seq2); - else align(seq1,seq2); - } - - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, boolean runOld) { - this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0,runOld); // match=1, mismatch = -1/3, gap=-(1+k/3) - } - - public void align_old(final byte[] a, final byte[] b) { - final int n = a.length; - final int m = b.length; - double [] sw = new double[(n+1)*(m+1)]; - int [] btrack = new int[(n+1)*(m+1)]; - calculateMatrix_old(a, b, sw, btrack); - calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) - } - - private void calculateMatrix_old(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { - final int n = a.length+1; - final int m = b.length+1; - - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at 
the end of this outer loop - byte a_base = a[i-1]; // letter in a at the current pos - - final int row_offset = row_offset_1 + m; - - // On the entrance into the loop, row_offset_1 is the (linear) offset - // of the first element of row (i-1) and row_offset is the linear offset of the - // start of row i - - for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) { - - // data_offset_1 is linearized offset of element [i-1][j-1] - - final byte b_base = b[j-1]; // letter in b at the current pos - - // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_diag = sw[data_offset_1] + wd(a_base,b_base); - int kd = 0; - - double step_down = 0; - - for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) { - // data_offset_k is linearized offset of element [i-k][j] - // in other words, trial = sw[i-k][j]+gap_penalty: - final double trial = sw[data_offset_k]+wk(k); - if ( step_down < trial ) { - step_down=trial; - kd = k; - } - } - - int ki = 0; - - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing - // all 'step right' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
- - double step_right = 0; - - for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) { - // data_offset is linearized offset of element [i][j-k] - // in other words, step_right=sw[i][j-k]+gap_penalty; - final double trial = sw[data_offset]+wk(k); - if ( step_right < trial ) { - step_right=trial; - ki = k; - } - } - - final int data_offset = row_offset + j; // linearized offset of element [i][j] - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[data_offset] = Math.max(0,step_down); - btrack[data_offset] = kd ; // positive=vertical - } else { - sw[data_offset] = Math.max(0,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } else { - // step_down <= step_right - if ( step_right > step_diag ) { - sw[data_offset] = Math.max(0,step_right); - btrack[data_offset] = -ki; // negative = horizontal - } else { - sw[data_offset] = Math.max(0,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } - -// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - - // IMPORTANT, IMPORTANT, IMPORTANT: - // note that we update this (secondary) outer loop variable here, - // so that we DO NOT need to update it - // in the for() statement itself. 
- row_offset_1 = row_offset; - } -// print(sw,a,b); - } -##################### -END COMMENTED OUT SECTION -*/ - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index d009ba5bc..ff0ea958c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -29,7 +29,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.util.StringUtil; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; @@ -54,6 +53,17 @@ public class Utils { public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + /** + * Boolean xor operation. Only true if x != y. + * + * @param x a boolean + * @param y a boolean + * @return true if x != y + */ + public static boolean xor(final boolean x, final boolean y) { + return x != y; + } + /** * Calculates the optimum initial size for a hash table given the maximum number * of elements it will need to hold. The optimum size is the smallest size that @@ -76,9 +86,7 @@ public class Utils { * @return True if the two objects are equal, false otherwise. 
*/ public static boolean equals(Object lhs, Object rhs) { - if (lhs == null && rhs == null) return true; - else if (lhs == null) return false; - else return lhs.equals(rhs); + return lhs == null && rhs == null || lhs != null && lhs.equals(rhs); } public static List cons(final T elt, final List l) { @@ -117,35 +125,6 @@ public class Utils { logger.warn(String.format("* %s", builder)); } - public static ArrayList subseq(char[] fullArray) { - byte[] fullByteArray = new byte[fullArray.length]; - StringUtil.charsToBytes(fullArray, 0, fullArray.length, fullByteArray, 0); - return subseq(fullByteArray); - } - - public static ArrayList subseq(byte[] fullArray) { - return subseq(fullArray, 0, fullArray.length - 1); - } - - public static ArrayList subseq(byte[] fullArray, int start, int end) { - assert end < fullArray.length; - ArrayList dest = new ArrayList(end - start + 1); - for (int i = start; i <= end; i++) { - dest.add(fullArray[i]); - } - return dest; - } - - public static String baseList2string(List bases) { - byte[] basesAsbytes = new byte[bases.size()]; - int i = 0; - for (Byte b : bases) { - basesAsbytes[i] = b; - i++; - } - return new String(basesAsbytes); - } - /** * join the key value pairs of a map into one string, i.e. myMap = [A->1,B->2,C->3] with a call of: * joinMap("-","*",myMap) -> returns A-1*B-2*C-3 @@ -244,7 +223,6 @@ public class Utils { * Create a new list that contains the elements of left along with elements elts * @param left a non-null list of elements * @param elts a varargs vector for elts to append in order to left - * @param * @return A newly allocated linked list containing left followed by elts */ public static List append(final List left, T ... 
elts) { @@ -256,9 +234,9 @@ public class Utils { /** * Returns a string of the values in joined by separator, such as A,B,C * - * @param separator - * @param doubles - * @return + * @param separator separator character + * @param doubles the array with values + * @return a string with the values separated by the separator */ public static String join(String separator, double[] doubles) { if ( doubles == null || doubles.length == 0) @@ -404,6 +382,24 @@ public class Utils { return C; } + /** + * Concatenates byte arrays + * @return a concat of all bytes in allBytes in order + */ + public static byte[] concat(final byte[] ... allBytes) { + int size = 0; + for ( final byte[] bytes : allBytes ) size += bytes.length; + + final byte[] c = new byte[size]; + int offset = 0; + for ( final byte[] bytes : allBytes ) { + System.arraycopy(bytes, 0, c, offset, bytes.length); + offset += bytes.length; + } + + return c; + } + /** * Appends String(s) B to array A. * @param A First array. @@ -457,7 +453,7 @@ public class Utils { return rcbases; } - static public final List reverse(final List l) { + static public List reverse(final List l) { final List newL = new ArrayList(l); Collections.reverse(newL); return newL; @@ -496,10 +492,8 @@ public class Utils { /** * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, * unresolvable gets returned instead. - * - * @return */ - public static final String resolveHostname() { + public static String resolveHostname() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } @@ -526,17 +520,15 @@ public class Utils { * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets * up the writer with the header and presorted status. 
* - * @param toolkit the engine * @param originalHeader original header - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param programRecord the program record for this program */ - public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) { - SAMFileHeader header = originalHeader.clone(); - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); + public static SAMFileHeader setupWriter(final SAMFileHeader originalHeader, final SAMProgramRecord programRecord) { + final SAMFileHeader header = originalHeader.clone(); + final List oldRecords = header.getProgramRecords(); + final List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId()))) newRecords.add(record); if (programRecord != null) { @@ -551,14 +543,13 @@ public class Utils { * the new header to be added to the BAM writer. 
* * @param toolkit the engine - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag * @return a pre-filled header for the bam writer */ - public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord); + return setupWriter(originalHeader, programRecord); } /** @@ -568,12 +559,11 @@ public class Utils { * @param writer BAM file writer * @param toolkit the engine * @param preSorted whether or not the writer can assume reads are going to be added are already sorted - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag */ - public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { - SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME); + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME); writer.writeHeader(header); 
writer.setPresorted(preSorted); } @@ -600,23 +590,11 @@ public class Utils { return programRecord; } - public static Collection makeCollection(Iterable iter) { - Collection list = new ArrayList(); - for (E item : iter) { - list.add(item); - } - return list; - } - /** * Returns the number of combinations represented by this collection * of collection of options. * * For example, if this is [[A, B], [C, D], [E, F, G]] returns 2 * 2 * 3 = 12 - * - * @param options - * @param - * @return */ @Requires("options != null") public static int nCombinations(final Collection[] options) { @@ -647,21 +625,18 @@ public class Utils { * if N = 1 => [[A], [B], [C]] * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]] * - * @param objects - * @param n - * @param + * @param objects list of objects + * @param n size of each combination * @param withReplacement if false, the resulting permutations will only contain unique objects from objects - * @return + * @return a list with all combinations with size n of objects. 
*/ public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { final List> combinations = new ArrayList>(); - if ( n <= 0 ) - ; - else if ( n == 1 ) { + if ( n == 1 ) { for ( final T o : objects ) combinations.add(Collections.singletonList(o)); - } else { + } else if (n > 1) { final List> sub = makePermutations(objects, n - 1, withReplacement); for ( List subI : sub ) { for ( final T a : objects ) { @@ -709,9 +684,6 @@ public class Utils { /** * Create a constant map that maps each value in values to itself - * @param values - * @param - * @return */ public static Map makeIdentityFunctionMap(Collection values) { Map map = new HashMap(values.size()); @@ -727,9 +699,6 @@ public class Utils { * groupSize = 2 * result = [[A, B], [C, D], [E]] * - * @param list - * @param groupSize - * @return */ public static List> groupList(final List list, final int groupSize) { if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); @@ -766,4 +735,17 @@ public class Utils { while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 return md5String; } + + /** + * Does big end with the exact sequence of bytes in suffix? 
+ * + * @param big a non-null byte[] to test if it a prefix + suffix + * @param suffix a non-null byte[] to test if it's a suffix of big + * @return true if big is proper byte[] composed of some prefix + suffix + */ + public static boolean endsWith(final byte[] big, final byte[] suffix) { + if ( big == null ) throw new IllegalArgumentException("big cannot be null"); + if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null"); + return new String(big).endsWith(new String(suffix)); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index b38d6575e..2f4c1b55d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -149,7 +149,7 @@ public class ActiveRegion implements HasGenomeLocation { @Override public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size() + " "; + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); } /** @@ -374,6 +374,8 @@ public class ActiveRegion implements HasGenomeLocation { * * Note that the returned list may be empty, if this active region doesn't overlap the set at all * + * Note that the resulting regions are all empty, regardless of whether the current active region has reads + * * @param intervals a non-null set of intervals that are allowed * @return an ordered list of active region where each interval is contained within intervals */ @@ -383,14 +385,59 @@ public class ActiveRegion implements HasGenomeLocation { final List clippedRegions = new LinkedList(); for ( final GenomeLoc overlapping : allOverlapping ) { - final GenomeLoc subLoc = getLocation().intersect(overlapping); - final int subStart = subLoc.getStart() - getLocation().getStart(); 
- final int subEnd = subStart + subLoc.size(); - final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); - final ActiveRegion clipped = new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension ); - clippedRegions.add(clipped); + clippedRegions.add(trim(overlapping, extension)); } return clippedRegions; } + + /** + * Trim this active region to just the newExtent, producing a new active region without any reads that has only + * the extent of newExtent intersected with the current extent + * @param newExtent the new extent of the active region we want + * @param newExtension the extension size we want for the newly trimmed active region + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc newExtent, final int newExtension) { + if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + + final GenomeLoc subLoc = getLocation().intersect(newExtent); + final int subStart = subLoc.getStart() - getLocation().getStart(); + final int subEnd = subStart + subLoc.size(); + final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); + return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, newExtension ); + } + + /** + * Trim this active region to no more than the newExtent, producing a new active region without any reads that + * attempts to provide the best possible representation of this active region covering the newExtent. + * + * The challenge here is that newExtent may (1) be larger than can be represented by this active region + * + its original extension and (2) the extension must be symmetric on both sides. This algorithm + * therefore determines how best to represent newExtent as a subset of the span of this + * region with a padding value that captures as much of the newExtent as possible. 
+ * + * For example, suppose this active region is + * + * Active: 100-200 with extension of 50, so that the true span is 50-250 + * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225 + * + * Here we represent the active region as an active region from 150-200 with 25 bp of padding. + * + * The overall constraint is that the active region can never exceed the original active region, and + * the extension is chosen to maximize overlap with the desired region + * + * @param newExtent the new extent of the active region we want + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc newExtent) { + if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + + final GenomeLoc subActive = getLocation().intersect(newExtent); + final int requiredOnRight = Math.max(newExtent.getStop() - subActive.getStop(), 0); + final int requiredOnLeft = Math.max(subActive.getStart() - newExtent.getStart(), 0); + final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); + + return new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 25948a857..ed541b070 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -41,7 +41,7 @@ import java.util.*; * @since Date created */ public class ActivityProfile { - private final static int MAX_PROB_PROPOGATION_DISTANCE = 50; + private final static int MAX_PROB_PROPAGATION_DISTANCE = 50; protected final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author protected final List stateList; 
@@ -98,7 +98,7 @@ public class ActivityProfile { */ @Ensures("result >= 0") public int getMaxProbPropagationDistance() { - return MAX_PROB_PROPOGATION_DISTANCE; + return MAX_PROB_PROPAGATION_DISTANCE; } /** @@ -210,7 +210,7 @@ public class ActivityProfile { contigLength = parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength(); } else { if ( regionStopLoc.getStart() != loc.getStart() - 1 ) - throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); + throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediately after last loc " + regionStopLoc ); regionStopLoc = loc; } @@ -239,7 +239,7 @@ public class ActivityProfile { throw new IllegalArgumentException("Must add state contiguous to existing states: adding " + stateToAdd); if ( position >= 0 ) { - // ignore states starting before this regions start + // ignore states starting before this region's start if ( position < size() ) { stateList.get(position).isActiveProb += stateToAdd.isActiveProb; } else { @@ -259,7 +259,7 @@ public class ActivityProfile { * Can be overridden by subclasses to transform states in any way * * There's no particular contract for the output states, except that they can never refer to states - * beyond the current end of the stateList unless the explictly include preceding states before + * beyond the current end of the stateList unless the explicitly include preceding states before * the reference. So for example if the current state list is [1, 2, 3] this function could return * [1,2,3,4,5] but not [1,2,3,5]. 
* @@ -352,6 +352,12 @@ public class ActivityProfile { if ( stateList.isEmpty() ) return null; + // If we are flushing the activity profile we need to trim off the excess states so that we don't create regions outside of our current processing interval + if( forceConversion ) { + final List statesToTrimAway = new ArrayList(stateList.subList(getSpan().size(), stateList.size())); + stateList.removeAll(statesToTrimAway); + } + final ActivityProfileState first = stateList.get(0); final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD; final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index fe1a386fb..f51881e0b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -378,7 +378,13 @@ public class ClippingOp { hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); } - + + if (read.isReducedRead()) { + final int[] reducedCounts = new int[newLength]; + System.arraycopy(read.getReducedReadCounts(), copyStart, reducedCounts, 0, newLength); + hardClippedRead.setReducedReadCounts(reducedCounts); + } + return hardClippedRead; } @@ -581,8 +587,8 @@ public class ClippingOp { if (cigarElement.getOperator() == CigarOperator.INSERTION) return -clippedLength; - // Deletions should be added to the total hard clip count - else if (cigarElement.getOperator() == CigarOperator.DELETION) + // Deletions and Ns should be added to the total hard clip count (because we want to maintain the original alignment start) + else if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) 
return cigarElement.getLength(); // There is no shift if we are not clipping an indel diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 45dd55af7..eaefa3aba 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -28,8 +28,8 @@ package org.broadinstitute.sting.utils.clipping; import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -534,7 +534,7 @@ public class ReadClipper { throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); if ( start > stop ) - throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop)); + throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)", start, stop, read, read.getCigarString())); if ( start > 0 && stop < read.getReadLength() - 1) throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index fb26f6c37..82ee76a81 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ 
-45,8 +45,8 @@ import java.util.ArrayList; *

      * *

      - * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here - * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the documentation guide here + * http://www.broadinstitute.org/gatk/guide/article?id=1329 *

      *

      Usage

      * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java deleted file mode 100644 index 9f330f226..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java +++ /dev/null @@ -1,132 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.collections; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Dec 29, 2009 - */ - -public class NestedHashMap { - - public final Map data = new HashMap(); - - public Object get( final Object... 
keys ) { - Map map = this.data; - final int nestedMaps = keys.length - 1; - for( int iii = 0; iii < nestedMaps; iii++ ) { - map = (Map) map.get(keys[iii]); - if( map == null ) { return null; } - } - return map.get(keys[nestedMaps]); - } - - public synchronized void put( final Object value, final Object... keys ) { // WARNING! value comes before the keys! - this.put(value, false, keys ); - } - - public synchronized Object put( final Object value, boolean keepOldBindingIfPresent, final Object... keys ) { - Map map = this.data; - final int keysLength = keys.length; - for( int iii = 0; iii < keysLength; iii++ ) { - if( iii == keysLength - 1 ) { - if ( keepOldBindingIfPresent && map.containsKey(keys[iii]) ) { - // this code test is for parallel protection when you call put() multiple times in different threads - // to initialize the map. It returns the already bound key[iii] -> value - return map.get(keys[iii]); - } else { - // we are a new binding, put it in the map - map.put(keys[iii], value); - return value; - } - } else { - Map tmp = (Map) map.get(keys[iii]); - if( tmp == null ) { - tmp = new HashMap(); - map.put(keys[iii], tmp); - } - map = tmp; - } - } - - return value; // todo -- should never reach this point - } - - public List getAllValues() { - final List result = new ArrayList(); - fillAllValues(data, result); - return result; - } - - private void fillAllValues(final Map map, final List result) { - for ( Object value : map.values() ) { - if ( value == null ) - continue; - if ( value instanceof Map ) - fillAllValues((Map)value, result); - else - result.add(value); - } - } - - public static class Leaf { - public final List keys; - public final Object value; - - public Leaf(final List keys, final Object value) { - this.keys = keys; - this.value = value; - } - } - - public List getAllLeaves() { - final List result = new ArrayList(); - final List path = new ArrayList(); - fillAllLeaves(data, path, result); - return result; - } - - private void fillAllLeaves(final 
Map map, final List path, final List result) { - for ( final Object key : map.keySet() ) { - final Object value = map.get(key); - if ( value == null ) - continue; - final List newPath = new ArrayList(path); - newPath.add(key); - if ( value instanceof Map ) { - fillAllLeaves((Map) value, newPath, result); - } else { - result.add(new Leaf(newPath, value)); - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 241eb6e10..3abe5a7f4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -75,6 +75,12 @@ public class UserException extends ReviewedStingException { } } + public static class IncompatibleReadFiltersException extends CommandLineException { + public IncompatibleReadFiltersException(final String filter1, final String filter2) { + super(String.format("Two read filters are enabled that are incompatible and cannot be used simultaneously: %s and %s", filter1, filter2)); + } + } + public static class MalformedWalkerArgumentsException extends CommandLineException { public MalformedWalkerArgumentsException(String message) { super(String.format("Malformed walker argument: %s",message)); @@ -276,8 +282,14 @@ public class UserException extends ReviewedStingException { } public static class ReadMissingReadGroup extends MalformedBAM { - public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. 
Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + public ReadMissingReadGroup(final SAMRecord read) { + super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + } + } + + public static class ReadHasUndefinedReadGroup extends MalformedBAM { + public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { + super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); } } @@ -359,14 +371,18 @@ public class UserException extends ReviewedStingException { } } - - public static class DeprecatedWalker extends UserException { public DeprecatedWalker(String walkerName, String version) { super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); } } + public static class DeprecatedAnnotation extends UserException { + public DeprecatedAnnotation(String annotationName, String version) { + super(String.format("Annotation %s is no longer available in the GATK; it has been deprecated since version %s", annotationName, version)); + } + } + public static class CannotExecuteQScript extends UserException { public CannotExecuteQScript(String message) { super(String.format("Unable to execute QScript: " + message)); @@ -376,29 +392,25 @@ public class UserException extends ReviewedStingException { } } - public static class CouldNotCreateReferenceIndexFile extends UserException { - public CouldNotCreateReferenceIndexFile(File f, Exception e) { - this(f, "", e); - } - - public CouldNotCreateReferenceIndexFile(File f, String message, Exception 
e) { - super(String.format("Index file %s does not exist but could not be created because: %s. ", f, message) - + (e == null ? "" : getMessage(e))); - } - } - public static class CannotHandleGzippedRef extends UserException { - public CannotHandleGzippedRef() { - super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); - } + public CannotHandleGzippedRef() { + super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); + } } - public static class CouldNotCreateReferenceIndexFileBecauseOfLock extends UserException.CouldNotCreateReferenceIndexFile { - public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) { - super(f, "could not be written because an exclusive file lock could not be obtained. " + - "If you are running multiple instances of GATK, another GATK process is " + - "probably creating this file now, and has locked it. Please wait until this process finishes " + - "and try again.", null); + public static class MissingReferenceFaiFile extends UserException { + public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) { + super(String.format("Fasta index file %s for reference %s does not exist. Please see %s for help creating it.", + indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); + } + } + + public static class MissingReferenceDictFile extends UserException { + public MissingReferenceDictFile( final File dictFile, final File fastaFile ) { + super(String.format("Fasta dict file %s for reference %s does not exist. 
Please see %s for help creating it.", + dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); } } @@ -428,4 +440,21 @@ public class UserException extends ReviewedStingException { f.getAbsolutePath(), PHONE_HOME_DOCS_URL)); } } + + /** + * A special exception that happens only in the case where + * the filesystem, by design or configuration, is completely unable + * to handle locking. This exception will specifically NOT be thrown + * in the case where the filesystem handles locking but is unable to + * acquire a lock due to concurrency. + */ + public static class FileSystemInabilityToLockException extends UserException { + public FileSystemInabilityToLockException( String message ) { + super(message); + } + + public FileSystemInabilityToLockException( String message, Exception innerException ) { + super(message,innerException); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java b/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java index 3813cfc85..87e89e0f1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java +++ b/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java @@ -26,15 +26,13 @@ package org.broadinstitute.sting.utils.file; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.nio.channels.ClosedChannelException; -import java.nio.channels.FileChannel; -import java.nio.channels.FileLock; -import java.nio.channels.OverlappingFileLockException; +import java.nio.channels.*; +import java.util.concurrent.*; /** * a quick implementation of a file based lock, using the Java NIO classes @@ -52,125 +50,244 @@ public class FSLockWithShared 
{ // the file channel we open private FileChannel channel = null; - /** - * A bit of experimental code for Siva at Partners. Conditionally throw an - * exception in the case where an unknown failure occurs, in an effort to stave - * off disabled nfs file locks. - */ - private boolean throwExceptionOnUnknownFailure = false; + // Timeout (in milliseconds) before we give up during non-blocking lock-acquisition calls. + // Necessary because these "non-blocking" calls can hang if there's a problem with the + // OS file locking support. + private int lockAcquisitionTimeout; + + // Default value for lockAcquisitionTimeout when none is explicitly provided + public static final int DEFAULT_LOCK_ACQUISITION_TIMEOUT_IN_MILLISECONDS = 30 * 1000; + + // Amount of time to wait when trying to shut down the lock-acquisition thread before giving up + public static final int THREAD_TERMINATION_TIMEOUT_IN_MILLISECONDS = 30 * 1000; /** - * create a file system, given a base file to which a lock string gets appended. - * @param baseFile File descriptor of file to lock + * Create a lock associated with the specified File. Use the default lock + * acquisition timeout of 30 seconds. + * + * @param file file to lock */ - public FSLockWithShared(File baseFile) { - file = baseFile; - } - - public FSLockWithShared(File baseFile,boolean throwExceptionOnUnknownFailure) { - this(baseFile); - this.throwExceptionOnUnknownFailure = throwExceptionOnUnknownFailure; + public FSLockWithShared( final File file ) { + this.file = file; + lockAcquisitionTimeout = DEFAULT_LOCK_ACQUISITION_TIMEOUT_IN_MILLISECONDS; } /** - * Get a shared (read) lock on a file - * Cannot get shared lock if it does not exist - * @return boolean true if we obtained a lock - * @throws FileSystemInabilityToLockException in cases of unexpected failure to capture lock. + * Create a lock associated with the specified File, and set a custom lock + * acquisition timeout. 
+ * + * @param file file to lock + * @param lockAcquisitionTimeout maximum number of milliseconds to wait during non-blocking + * lock acquisition calls before concluding that there's a + * problem with the OS file locking support and throwing an error. */ - public boolean sharedLock() throws FileSystemInabilityToLockException { + public FSLockWithShared( final File file, final int lockAcquisitionTimeout ) { + this.file = file; + this.lockAcquisitionTimeout = lockAcquisitionTimeout; + } + + /** + * Get a shared (read) lock on a file. Does not block, and returns immediately + * under normal conditions with the result of the lock acquisition attempt. Will + * throw an exception if there's a problem with the OS file locking support. + * + * @return boolean true if we obtained a lock, false if we failed to obtain one + */ + public boolean sharedLock() { + return acquireLockWithTimeout(true); + } + + /** + * Get an exclusive (read-write) lock on a file. Does not block, and returns immediately + * under normal conditions with the result of the lock acquisition attempt. Will + * throw an exception if there's a problem with the OS file locking support. + * + * @return boolean true if we obtained a lock, false if we failed to obtain one + */ + public boolean exclusiveLock() { + return acquireLockWithTimeout(false); + } + + /** + * Attempt to acquire a lock of the specified type on the file in a background thread. + * Uses non-blocking lock-acquisition calls that should return immediately, but may + * get stuck if there's a problem with the OS file locking support. If the call gets + * stuck and the timeout elapses, throws a UserException, since it's not safe to + * proceed with a stuck lock acquisition thread (and there's no way to reliably + * interrupt it once the underlying system call hangs). 
+ * + * @param acquireSharedLock if true, request a shared lock rather than an exclusive lock + * @return true if a lock was acquired, false if we failed + */ + private boolean acquireLockWithTimeout( final boolean acquireSharedLock ) { + // Use daemon threads so that hopelessly stuck lock acquisition threads won't prevent the JVM from exiting + final ExecutorService executor = Executors.newSingleThreadExecutor(new ThreadFactory() { + public Thread newThread( Runnable r ) { + Thread lockAcquisitionThread = new Thread(r); + lockAcquisitionThread.setDaemon(true); + return lockAcquisitionThread; + } + }); + final FutureTask lockAcquisitionTask = new FutureTask(new LockAcquisitionTask(acquireSharedLock)); + boolean lockAcquired = false; - // get read-only file channel try { - channel = new RandomAccessFile(file, "r").getChannel(); + executor.execute(lockAcquisitionTask); + + // Wait at most lockAcquisitionTimeout milliseconds for the lock acquisition task to finish. + lockAcquired = lockAcquisitionTask.get(lockAcquisitionTimeout, TimeUnit.MILLISECONDS); } - catch (IOException e) { - logger.warn(String.format("WARNING: Unable to lock file %s (could not open read only file channel)",file.getAbsolutePath())); - return false; + // Lock acquisition timeout elapsed. Since we're using NON-BLOCKING lock-acquisition calls, + // this implies that there's a problem with the OS locking daemon, or locks are not supported. + // Since it's not safe to proceed with a potentially stuck lock acquisition thread, we need to + // shut down the JVM in order to kill it. + catch ( TimeoutException e ) { + throw new UserException.FileSystemInabilityToLockException( + String.format("Timeout of %d milliseconds was reached while trying to acquire a lock on file %s. 
" + + "Since the GATK uses non-blocking lock acquisition calls that are not supposed to wait, " + + "this implies a problem with the file locking support in your operating system.", + lockAcquisitionTimeout, file.getAbsolutePath())); } - // get shared lock (third argument is true) + // Lock acquisition thread threw an exception. Need to unpack it via e.getCause() + catch ( ExecutionException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because exception %s occurred with error message %s", + file.getAbsolutePath(), + e.getCause() != null ? e.getCause().getClass().getSimpleName() : "unknown", + e.getCause() != null ? e.getCause().getMessage() : "none")); + lockAcquired = false; + } + // Interrupted while waiting for the lock acquisition thread -- not likely to happen + catch ( InterruptedException e ) { + logger.warn(String.format("WARNING: interrupted while attempting to acquire a lock for file %s", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( Exception e ) { + logger.warn(String.format("WARNING: error while attempting to acquire a lock for file %s. Error message: %s", + file.getAbsolutePath(), e.getMessage())); + lockAcquired = false; + } + + shutdownLockAcquisitionTask(executor); + + // Upon failure to acquire a lock, we always call unlock() to close the FileChannel if it was opened + // and to deal with very hypothetical edge cases where a lock might actually have been acquired despite the + // lock acquisition thread returning false. + if ( ! lockAcquired ) { + unlock(); + } + + return lockAcquired; + } + + /** + * Ensures that the lock acquisition task running in the provided executor has cleanly terminated. + * Throws a UserException if unable to shut it down within the period defined by the THREAD_TERMINATION_TIMEOUT. 
+ * + * @param executor ExecutorService executing the lock-acquisition thread + */ + private void shutdownLockAcquisitionTask( final ExecutorService executor ) { + boolean shutdownAttemptSucceeded; + try { - lock = channel.tryLock(0, Long.MAX_VALUE, true); - if (lock == null) { - logger.warn(String.format("WARNING: Unable to lock file %s because there is already a lock active.",file.getAbsolutePath())); + executor.shutdownNow(); + shutdownAttemptSucceeded = executor.awaitTermination(THREAD_TERMINATION_TIMEOUT_IN_MILLISECONDS, TimeUnit.MILLISECONDS); + } + catch ( InterruptedException e ) { + shutdownAttemptSucceeded = false; + } + + if ( ! shutdownAttemptSucceeded ) { + throw new UserException(String.format("Failed to terminate lock acquisition thread while trying to lock file %s. " + + "Exiting because it's not safe to proceed with this run of the GATK.", + file.getAbsolutePath())); + } + } + + /** + * Background task that attempts to acquire a lock of the specified type, and returns a boolean + * indicating success/failure. Uses a non-blocking tryLock() call that should return immediately + * (but may get stuck if there's a problem with the OS locking daemon). + */ + private class LockAcquisitionTask implements Callable { + private final boolean acquireSharedLock; + + public LockAcquisitionTask( final boolean acquireSharedLock ) { + this.acquireSharedLock = acquireSharedLock; + } + + public Boolean call() { + // Get a read-only or read-write file channel, depending on the type of lock + try { + channel = new RandomAccessFile(file, acquireSharedLock ? 
"r" : "rw").getChannel(); + } + catch ( IOException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because we could not open a file channel", file.getAbsolutePath())); return false; } - } - catch (ClosedChannelException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because the file channel is closed.",file.getAbsolutePath())); - return false; - } - catch (OverlappingFileLockException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because you already have a lock on this file.",file.getAbsolutePath())); - return false; - } - catch (IOException e) { - logger.warn(String.format("WARNING: Unable to lock file %s: %s.",file.getAbsolutePath(),e.getMessage())); - if(throwExceptionOnUnknownFailure) - throw new FileSystemInabilityToLockException(e.getMessage(),e); - else - return false; - } - return true; - } - /** - * Get an exclusive lock on a file - * @return boolean true if we obtained a lock - * @throws FileSystemInabilityToLockException in cases of unexpected failure to capture lock. - */ - public boolean exclusiveLock() throws FileSystemInabilityToLockException { + boolean lockAcquired = false; - // read/write file channel is necessary for exclusive lock - try { - channel = new RandomAccessFile(file, "rw").getChannel(); - } - catch (Exception e) { - logger.warn(String.format("WARNING: Unable to lock file %s (could not open read/write file channel)",file.getAbsolutePath())); - // do we need to worry about deleting file here? Does RandomAccessFile will only create file if successful? - return false; - } - - // get exclusive lock (third argument is false) - try { - lock = channel.tryLock(0, Long.MAX_VALUE, false); - if (lock == null) { - logger.warn(String.format("WARNING: Unable to lock file %s because there is already a lock active.",file.getAbsolutePath())); - return false; + try { + // Non-blocking lock-acquisition call, should return right away. 
If it doesn't return immediately + // due to problems with the OS locking daemon, it will potentially be timed-out and interrupted. + lock = channel.tryLock(0, Long.MAX_VALUE, acquireSharedLock); + lockAcquired = lock != null; } - else return true; - } - catch (ClosedChannelException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because the file channel is closed.",file.getAbsolutePath())); - return false; - } - catch (OverlappingFileLockException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because you already have a lock on this file.",file.getAbsolutePath())); - return false; - } - catch (IOException e) { - logger.warn(String.format("WARNING: Unable to lock file %s: %s.",file.getAbsolutePath(),e.getMessage())); - if(throwExceptionOnUnknownFailure) - throw new FileSystemInabilityToLockException(e.getMessage(),e); - else - return false; + catch ( AsynchronousCloseException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because the file channel was closed by another thread", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( ClosedChannelException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because the file channel is closed.", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( OverlappingFileLockException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because you already have a lock on this file.", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( FileLockInterruptionException e ) { + logger.warn(String.format("WARNING: Interrupted while attempting to lock file %s", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( IOException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because an IOException occurred with message: %s.", file.getAbsolutePath(), e.getMessage())); + lockAcquired = false; + } + + return lockAcquired; } } - + /** - * unlock the file + * Unlock the file * * note: this allows 
unlocking a file that failed to lock (no required user checks on null locks). */ public void unlock() { + releaseLock(); + closeChannel(); + } + + private void releaseLock() { try { - if (lock != null) + if ( lock != null ) lock.release(); - if (channel != null) + } + catch ( ClosedChannelException e ) { + // if the channel was already closed we don't have to worry + } + catch ( IOException e ) { + throw new UserException(String.format("An error occurred while releasing the lock for file %s", file.getAbsolutePath()), e); + } + } + + private void closeChannel() { + try { + if ( channel != null ) channel.close(); } - catch (Exception e) { - throw new ReviewedStingException("An error occurred while unlocking file", e); + catch ( IOException e ) { + throw new UserException(String.format("An error occurred while closing channel for file %s", file.getAbsolutePath()), e); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 76ccede62..5d882ba8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -25,10 +25,13 @@ package org.broadinstitute.sting.utils.fragments; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -56,7 +59,8 @@ import java.util.*; * Date: 3/26/11 * Time: 10:09 PM */ -public class FragmentUtils { +public final class FragmentUtils { + protected final static byte MIN_QUAL_BAD_OVERLAP = 16; 
private FragmentUtils() {} // private constructor /** @@ -65,18 +69,28 @@ public class FragmentUtils { * Allows us to write a generic T -> Fragment algorithm that works with any object containing * a read. * - * @param + * @param The type of the object that contains a GATKSAMRecord */ public interface ReadGetter { + /** + * Get the GATKSAMRecord associated with object + * + * @param object the thing that contains the read + * @return a non-null GATKSAMRecord read + */ public GATKSAMRecord get(T object); } - /** Identify getter for SAMRecords themselves */ + /** + * Identify getter for SAMRecords themselves + */ private final static ReadGetter SamRecordGetter = new ReadGetter() { @Override public GATKSAMRecord get(final GATKSAMRecord object) { return object; } }; - /** Gets the SAMRecord in a PileupElement */ + /** + * Gets the SAMRecord in a PileupElement + */ private final static ReadGetter PileupElementGetter = new ReadGetter() { @Override public GATKSAMRecord get(final PileupElement object) { return object.getRead(); } }; @@ -87,13 +101,20 @@ public class FragmentUtils { * and returns a FragmentCollection that contains the T objects whose underlying reads either overlap (or * not) with their mate pairs. 
* - * @param readContainingObjects - * @param nElements - * @param getter + * @param readContainingObjects An iterator of objects that contain GATKSAMRecords + * @param nElements the number of elements to be provided by the iterator, which is usually known upfront and + * greatly improves the efficiency of the fragment calculation + * @param getter a helper function that takes an object of type T and returns is associated GATKSAMRecord * @param - * @return + * @return a fragment collection */ - private final static FragmentCollection create(Iterable readContainingObjects, int nElements, ReadGetter getter) { + @Requires({ + "readContainingObjects != null", + "nElements >= 0", + "getter != null" + }) + @Ensures("result != null") + private static FragmentCollection create(final Iterable readContainingObjects, final int nElements, final ReadGetter getter) { Collection singletons = null; Collection> overlapping = null; Map nameMap = null; @@ -145,33 +166,76 @@ public class FragmentUtils { return new FragmentCollection(singletons, overlapping); } - public final static FragmentCollection create(ReadBackedPileup rbp) { + /** + * Create a FragmentCollection containing PileupElements from the ReadBackedPileup rbp + * @param rbp a non-null read-backed pileup. 
The elements in this ReadBackedPileup must be ordered + * @return a non-null FragmentCollection + */ + @Ensures("result != null") + public static FragmentCollection create(final ReadBackedPileup rbp) { + if ( rbp == null ) throw new IllegalArgumentException("Pileup cannot be null"); return create(rbp, rbp.getNumberOfElements(), PileupElementGetter); } - public final static FragmentCollection create(List reads) { + /** + * Create a FragmentCollection containing GATKSAMRecords from a list of reads + * + * @param reads a non-null list of reads, ordered by their start location + * @return a non-null FragmentCollection + */ + @Ensures("result != null") + public static FragmentCollection create(final List reads) { + if ( reads == null ) throw new IllegalArgumentException("Pileup cannot be null"); return create(reads, reads.size(), SamRecordGetter); } - public final static List mergeOverlappingPairedFragments( final List overlappingPair ) { - final byte MIN_QUAL_BAD_OVERLAP = 16; + public static List mergeOverlappingPairedFragments( final List overlappingPair ) { if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } - GATKSAMRecord firstRead = overlappingPair.get(0); - GATKSAMRecord secondRead = overlappingPair.get(1); + final GATKSAMRecord firstRead = overlappingPair.get(0); + final GATKSAMRecord secondRead = overlappingPair.get(1); + final GATKSAMRecord merged; if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { - firstRead = overlappingPair.get(1); // swap them - secondRead = overlappingPair.get(0); - } - if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { - return overlappingPair; // can't merge them, yet: 
AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A - } - if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { - return overlappingPair; // fragments contain indels so don't merge them + merged = mergeOverlappingPairedFragments(secondRead, firstRead); + } else { + merged = mergeOverlappingPairedFragments(firstRead, secondRead); } - final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + return merged == null ? overlappingPair : Collections.singletonList(merged); + } + + /** + * Merge two overlapping reads from the same fragment into a single super read, if possible + * + * firstRead and secondRead must be part of the same fragment (though this isn't checked). Looks + * at the bases and alignment, and tries its best to create a meaningful synthetic single super read + * that represents the entire sequenced fragment. + * + * Assumes that firstRead starts before secondRead (according to their soft clipped starts) + * + * @param unclippedFirstRead the left most read + * @param unclippedSecondRead the right most read + * + * @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one + */ + public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord unclippedFirstRead, final GATKSAMRecord unclippedSecondRead) { + if ( unclippedFirstRead == null ) throw new IllegalArgumentException("unclippedFirstRead cannot be null"); + if ( unclippedSecondRead == null ) throw new IllegalArgumentException("unclippedSecondRead cannot be null"); + if ( ! 
unclippedFirstRead.getReadName().equals(unclippedSecondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + unclippedFirstRead + " and " + unclippedSecondRead); + + if( unclippedFirstRead.getCigarString().contains("I") || unclippedFirstRead.getCigarString().contains("D") || unclippedSecondRead.getCigarString().contains("I") || unclippedSecondRead.getCigarString().contains("D") ) { + return null; // fragments contain indels so don't merge them + } + + final GATKSAMRecord firstRead = ReadClipper.hardClipAdaptorSequence(ReadClipper.revertSoftClippedBases(unclippedFirstRead)); + final GATKSAMRecord secondRead = ReadClipper.hardClipAdaptorSequence(ReadClipper.revertSoftClippedBases(unclippedSecondRead)); + + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { + return null; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + } + + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getAlignmentStart()); final int firstReadStop = ( pair.getSecond() ? 
pair.getFirst() + 1 : pair.getFirst() ); final int numBases = firstReadStop + secondRead.getReadLength(); @@ -190,10 +254,10 @@ public class FragmentUtils { } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { - return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + return null; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them } if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) { - return overlappingPair; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on + return null; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on } bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? 
firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); @@ -204,7 +268,8 @@ public class FragmentUtils { } final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); - returnRead.setAlignmentStart( firstRead.getSoftStart() ); + returnRead.setIsStrandless(true); + returnRead.setAlignmentStart( firstRead.getAlignmentStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); @@ -237,8 +302,6 @@ public class FragmentUtils { returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION ); } - final ArrayList returnList = new ArrayList(); - returnList.add(returnRead); - return returnList; + return returnRead; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java new file mode 100644 index 000000000..03a2b8077 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -0,0 +1,134 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.Allele; + +/** + * Stores the most likely and second most likely alleles, along with a threshold + * for assuming computing that a read is informative. + * + * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD + * then the most likely allele is set to "no call", and isInformative will return false. This constant can be + * overridden simply by using one of the version of these calls that accepts informative threshold as an argument. + * + * For convenience, there are functions called getAlleleIfInformative that return either the most likely allele, or + * NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD of one another. + * + * By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the + * corresponding key + * + * User: depristo + * Date: 3/24/13 + * Time: 1:39 PM + */ +public final class MostLikelyAllele { + public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; + + final Allele mostLikely; + final Allele secondLikely; + final double log10LikelihoodOfMostLikely; + final double log10LikelihoodOfSecondBest; + + /** + * Create a new MostLikelyAllele + * + * If there's a meaningful most likely allele, allele should be a real allele. If none can be determined, + * mostLikely should be a NO_CALL allele. 
+ * + * @param mostLikely the most likely allele + * @param secondMostLikely the most likely allele after mostLikely + * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele + * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) + */ + public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); + if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) + throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); + if ( log10LikelihoodOfSecondBest != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfSecondBest) ) + throw new IllegalArgumentException("log10LikelihoodOfSecondBest must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfSecondBest); + if ( log10LikelihoodOfMostLikely < log10LikelihoodOfSecondBest ) + throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest); + + this.mostLikely = mostLikely; + this.secondLikely = secondMostLikely; + this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely; + this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest; + } + + public Allele getMostLikelyAllele() { + return mostLikely; + } + + public Allele getSecondMostLikelyAllele() { + return secondLikely; + } + + public double getLog10LikelihoodOfMostLikely() { + return log10LikelihoodOfMostLikely; + } + + public double getLog10LikelihoodOfSecondBest() { + return log10LikelihoodOfSecondBest; + } + + /** + * @see 
#isInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD + */ + public boolean isInformative() { + return isInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD); + } + + /** + * Was this allele selected from an object that was specifically informative about the allele? + * + * The calculation that implements this is whether the likelihood of the most likely allele is larger + * than the second most likely by at least the log10ThresholdForInformative + * + * @return true if so, false if not + */ + public boolean isInformative(final double log10ThresholdForInformative) { + return getLog10LikelihoodOfMostLikely() - getLog10LikelihoodOfSecondBest() > log10ThresholdForInformative; + } + + /** + * @see #getAlleleIfInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD + */ + public Allele getAlleleIfInformative() { + return getAlleleIfInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD); + } + + /** + * Get the most likely allele if isInformative(log10ThresholdForInformative) is true, or NO_CALL otherwise + * + * @param log10ThresholdForInformative a log10 threshold to determine if the most likely allele was informative + * @return a non-null allele + */ + public Allele getAlleleIfInformative(final double log10ThresholdForInformative) { + return isInformative(log10ThresholdForInformative) ? 
getMostLikelyAllele() : Allele.NO_CALL; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index cc4fc6129..150e24c51 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -28,12 +28,13 @@ package org.broadinstitute.sting.utils.genotyper; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import java.io.PrintStream; import java.util.*; /** @@ -41,12 +42,8 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. 
*/ public class PerReadAlleleLikelihoodMap { - - - public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; - - protected List alleles; - protected Map> likelihoodReadMap; + protected final List alleles; + protected final Map> likelihoodReadMap; public PerReadAlleleLikelihoodMap() { likelihoodReadMap = new LinkedHashMap>(); @@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap { } - public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { - return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log); + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); } /** * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. 
- * @param log - a PrintStream to log the removed reads to (passed through to the utility function) */ - public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) { + public void performPerAlleleDownsampling(final double downsamplingFraction) { // special case removal of all or no reads if ( downsamplingFraction <= 0.0 ) return; @@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap { final Map> alleleReadMap = getAlleleStratifiedReadMap(); // compute the reads to remove and actually remove them - final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log); + final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); for ( final GATKSAMRecord read : readsToRemove ) likelihoodReadMap.remove(read); } @@ -117,11 +113,12 @@ public class PerReadAlleleLikelihoodMap { alleleReadMap.put(allele, new ArrayList()); for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // do not remove reduced reads! + // TODO -- come up with a strategy for down-sampling reduced reads + // Currently we are unable to remove reduced reads because their representative base count differs throughout the read if ( !entry.getKey().isReducedRead() ) { - final Allele bestAllele = getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - alleleReadMap.get(bestAllele).add(entry.getKey()); + final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); + if ( bestAllele.isInformative() ) + alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); } } @@ -191,35 +188,102 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + /** + * Get the most likely alleles estimated across all reads in this object + * + * Takes the most likely two alleles according to their diploid genotype likelihoods. 
That is, for + * each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum + * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. + * + * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele + * doesn't have a meaningful get best likelihood. + * + * @return a MostLikelyAllele object, or null if this map is empty + */ + public MostLikelyAllele getMostLikelyDiploidAlleles() { + if ( isEmpty() ) return null; + + int hap1 = 0; + int hap2 = 0; + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < alleles.size(); iii++ ) { + final Allele iii_allele = alleles.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleles.get(jjj); + + double haplotypeLikelihood = 0.0; + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + final GATKSAMRecord read = entry.getKey(); + final int count = ReadUtils.getMeanRepresentativeReadCount(read); + final double likelihood_iii = entry.getValue().get(iii_allele); + final double likelihood_jjj = entry.getValue().get(jjj_allele); + haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF); + + // fast exit. 
If this diploid pair is already worse than the max, just stop and look at the next pair + if ( haplotypeLikelihood < maxElement ) break; + } + + // keep track of the max element and associated indices + if ( haplotypeLikelihood > maxElement ) { + hap1 = iii; + hap2 = jjj; + maxElement = haplotypeLikelihood; + } + } + } + + if ( maxElement == Double.NEGATIVE_INFINITY ) + throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); + + return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); + } + private static final double LOG_ONE_HALF = -Math.log10(2.0); /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD - * then the most likely allele is set to "no call" + * * @param alleleMap - a map from alleles to likelihoods - * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD - * of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the - * corresponding key + * @return - a MostLikelyAllele object */ @Ensures("result != null") - public static Allele getMostLikelyAllele( final Map alleleMap ) { + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { + return getMostLikelyAllele(alleleMap, null); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. + * + * @param alleleMap - a map from alleles to likelihoods + * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. + * this is useful for the case where you've selected a subset of the alleles that + * the reads have been computed for further analysis. 
If null totally ignored + * @return - a MostLikelyAllele object + */ + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; + Allele secondMostLikely = null; for (final Map.Entry el : alleleMap.entrySet()) { + if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) ) + continue; + if (el.getValue() > maxLike) { prevMaxLike = maxLike; maxLike = el.getValue(); + secondMostLikely = mostLikelyAllele; mostLikelyAllele = el.getKey(); } else if( el.getValue() > prevMaxLike ) { + secondMostLikely = el.getKey(); prevMaxLike = el.getValue(); } } - return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL ); - } + return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); + } /** * Debug method to dump contents of object into string for display @@ -242,4 +306,62 @@ public class PerReadAlleleLikelihoodMap { } return sb.toString(); } + + /** + * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods + * + * Goes through each read in this map, and if it is poorly modelled removes it from the map. + * + * @see #readIsPoorlyModelled(org.broadinstitute.sting.utils.sam.GATKSAMRecord, java.util.Collection, double) + * for more information about the poorly modelled test. 
+ * + * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled + * @return the list of reads removed from this map because they are poorly modelled + */ + public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { + final List removedReads = new LinkedList(); + final Iterator>> it = likelihoodReadMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> record = it.next(); + if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { + it.remove(); + removedReads.add(record.getKey()); + } + } + + return removedReads; + } + + /** + * Is this read poorly modelled by all of the alleles in this map? + * + * A read is poorly modeled when it's likelihood is below what would be expected for a read + * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. + * + * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood + * of the read. In other words, that the read would be fully explained by one of the alleles. This means + * that the allele should be something like the full haplotype from which the read might originate. + * + * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So + * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors + * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect + * a likelihood to be >= 10 * -3. + * + * @param read the read we want to evaluate + * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. + * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So + * 0.01 means a 1% error rate + * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes + */ + protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.ceil(read.getReadLength() * maxErrorRatePerBase); + final double log10QualPerBase = -3.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + for ( final double log10Likelihood : log10Likelihoods ) + if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) + return false; + + return true; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java new file mode 100644 index 000000000..752c880b9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.haplotype; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; + +import java.util.*; + +/** + * Extract simple VariantContext events from a single haplotype + * + * User: depristo + * Date: 3/27/13 + * Time: 8:35 AM + */ +public class EventMap extends TreeMap { + private final static Logger logger = Logger.getLogger(EventMap.class); + protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; + public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + + private final Haplotype haplotype; + private final byte[] ref; + private final GenomeLoc refLoc; + private final String sourceNameToAdd; + + public EventMap(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { + super(); + this.haplotype = haplotype; + this.ref = ref; + this.refLoc = refLoc; + this.sourceNameToAdd = sourceNameToAdd; + + processCigarForInitialEvents(); + } + + /** + * For testing. 
Let's you set up a explicit configuration without having to process a haplotype and reference + * @param stateForTesting + */ + protected EventMap(final Collection stateForTesting) { + haplotype = null; + ref = null; + refLoc = null; + sourceNameToAdd = null; + for ( final VariantContext vc : stateForTesting ) + addVC(vc); + } + + protected void processCigarForInitialEvents() { + final Cigar cigar = haplotype.getCigar(); + final byte[] alignment = haplotype.getBases(); + + int refPos = haplotype.getAlignmentStartHapwrtRef(); + if( refPos < 0 ) { + return; + } // Protection against SW failures + + int alignmentPos = 0; + + for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { + final CigarElement ce = cigar.getCigarElement(cigarIndex); + final int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case I: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final List insertionAlleles = new ArrayList(); + final int insertionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { + // if the insertion isn't completely resolved in the haplotype, skip it + // note this used to emit SYMBOLIC_UNASSEMBLED_EVENT_ALLELE but that seems dangerous + } else { + byte[] insertionBases = new byte[]{}; + insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base + insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength)); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + addVC(new VariantContextBuilder(sourceNameToAdd, 
refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); + } + } + alignmentPos += elementLength; + break; + } + case S: + { + alignmentPos += elementLength; + break; + } + case D: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base + final List deletionAlleles = new ArrayList(); + final int deletionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { + deletionAlleles.add( Allele.create(deletionBases, true) ); + deletionAlleles.add( Allele.create(refByte, false) ); + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } + } + refPos += elementLength; + break; + } + case M: + case EQ: + case X: + { + for( int iii = 0; iii < elementLength; iii++ ) { + final byte refByte = ref[refPos]; + final byte altByte = alignment[alignmentPos]; + if( refByte != altByte ) { // SNP! 
+ if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { + final List snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refByte, true ) ); + snpAlleles.add( Allele.create( altByte, false ) ); + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); + } + } + refPos++; + alignmentPos++; + } + break; + } + case N: + case H: + case P: + default: + throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); + } + } + } + + /** + * Add VariantContext vc to this map, merging events with the same start sites if necessary + * @param vc the variant context to add + */ + protected void addVC(final VariantContext vc) { + addVC(vc, true); + } + + /** + * Add VariantContext vc to this map + * @param vc the variant context to add + * @param merge should we attempt to merge it with an already existing element, or should we throw an error in that case? + */ + protected void addVC(final VariantContext vc, final boolean merge) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + + if ( containsKey(vc.getStart()) ) { + if ( merge ) { + final VariantContext prev = get(vc.getStart()); + put(vc.getStart(), makeBlock(prev, vc)); + } else { + throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc); + } + } else + put(vc.getStart(), vc); + } + + /** + * Create a block substitution out of two variant contexts that start at the same position + * + * vc1 can be SNP, and vc2 can then be either a insertion or deletion. 
+ * If vc1 is an indel, then vc2 must be the opposite type (vc1 deletion => vc2 must be an insertion) + * + * @param vc1 the first variant context we want to merge + * @param vc2 the second + * @return a block substitution that represents the composite substitution implied by vc1 and vc2 + */ + protected VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { + if ( vc1.getStart() != vc2.getStart() ) throw new IllegalArgumentException("vc1 and 2 must have the same start but got " + vc1 + " and " + vc2); + if ( ! vc1.isBiallelic() ) throw new IllegalArgumentException("vc1 must be biallelic"); + if ( ! vc1.isSNP() ) { + if ( ! ((vc1.isSimpleDeletion() && vc2.isSimpleInsertion()) || (vc1.isSimpleInsertion() && vc2.isSimpleDeletion()))) + throw new IllegalArgumentException("Can only merge single insertion with deletion (or vice versa) but got " + vc1 + " merging with " + vc2); + } else if ( vc2.isSNP() ) { + throw new IllegalArgumentException("vc1 is " + vc1 + " but vc2 is a SNP, which implies there's been some terrible bug in the cigar " + vc2); + } + + final Allele ref, alt; + final VariantContextBuilder b = new VariantContextBuilder(vc1); + if ( vc1.isSNP() ) { + // we have to repair the first base, so SNP case is special cased + if ( vc1.getReference().equals(vc2.getReference()) ) { + // we've got an insertion, so we just update the alt to have the prev alt + ref = vc1.getReference(); + alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); + } else { + // we're dealing with a deletion, so we patch the ref + ref = vc2.getReference(); + alt = vc1.getAlternateAllele(0); + b.stop(vc2.getEnd()); + } + } else { + final VariantContext insertion = vc1.isSimpleInsertion() ? vc1 : vc2; + final VariantContext deletion = vc1.isSimpleInsertion() ? 
vc2 : vc1; + ref = deletion.getReference(); + alt = insertion.getAlternateAllele(0); + b.stop(deletion.getEnd()); + } + + return b.alleles(Arrays.asList(ref, alt)).make(); + } + + // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it + @Requires("getNumberOfEvents() > 0") + protected void replaceClumpedEventsWithBlockSubstititions() { + if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { + int lastStart = -1; + for ( boolean foundOne = true; foundOne; ) { + foundOne = false; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() > lastStart ) { + lastStart = vc.getStart(); + final List neighborhood = getNeighborhood(vc, 10); + if ( updateToBlockSubstitutionIfBetter(neighborhood) ) { + foundOne = true; + break; + } + } + } + } + } + } + + protected boolean updateToBlockSubstitutionIfBetter(final List neighbors) { + if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) + return false; + // TODO -- need more tests to decide if this is really so good + + final VariantContext first = neighbors.get(0); + final int refStartOffset = first.getStart() - refLoc.getStart(); + final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart(); + + final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1); + final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar()); + + final VariantContextBuilder builder = new VariantContextBuilder(first); + builder.stop(first.getStart() + refBases.length - 1); + builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases))); + final VariantContext block = builder.make(); + + // remove all merged events + for ( final VariantContext merged : neighbors ) { + if ( remove(merged.getStart()) == null ) + throw new 
IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged); + } + + // note must be after we remove the previous events as the treeset only allows one key per start + logger.info("Transforming into block substitution at " + block); + addVC(block, false); + + return true; + } + + /** + * Get all of the variant contexts starting at leftMost that are within maxBP of each other + * + * @param leftMost the left most (smallest position) variant context that will start the neighborhood + * @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next + * to be included the the resulting list + * @return a list that contains at least one element (leftMost) + */ + @Requires({"leftMost != null", "maxBPBetweenEvents >= 0"}) + @Ensures({"result != null", "! result.isEmpty()"}) + protected List getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) { + final List neighbors = new LinkedList(); + + VariantContext left = leftMost; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() < leftMost.getStart() ) + continue; + + if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) { + // this vc is within max distance to the end of the left event, so accumulate it + neighbors.add(vc); + left = vc; + } + } + + return neighbors; + } + + /** + * Get the starting positions of events in this event map + * @return + */ + public Set getStartPositions() { + return keySet(); + } + + /** + * Get the variant contexts in order of start position in this event map + * @return + */ + public Collection getVariantContexts() { + return values(); + } + + /** + * How many events do we have? 
+ * @return + */ + public int getNumberOfEvents() { + return size(); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("EventMap{"); + for ( final VariantContext vc : getVariantContexts() ) + b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles())); + b.append("}"); + return b.toString(); + } + + /** + * Build event maps for each haplotype, returning the sorted set of all of the starting positions of all + * events across all haplotypes + * + * @param haplotypes a list of haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @param debug if true, we'll emit debugging information during this operation + * @return a sorted set of start positions of all events among all haplotypes + */ + public static TreeSet buildEventMapsForHaplotypes( final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final boolean debug) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + int hapNumber = 0; + + if( debug ) logger.info("=== Best Haplotypes ==="); + for( final Haplotype h : haplotypes ) { + // Walk along the alignment and turn any difference from the reference into an event + h.setEventMap( new EventMap( h, ref, refLoc, "HC" + hapNumber++ ) ); + startPosKeySet.addAll(h.getEventMap().getStartPositions()); + + if( debug ) { + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); + } + } + + return startPosKeySet; + } + + private static class VariantContextComparator implements Comparator { + @Override + public int compare(VariantContext vc1, VariantContext vc2) { + return vc1.getStart() - vc2.getStart(); + } + } + + /** + * Get all of the VariantContexts in the event maps for all haplotypes, sorted by their start position + * @param haplotypes the set of haplotypes to 
grab the VCs from + * @return a sorted set of variant contexts + */ + public static TreeSet getAllVariantContexts( final List haplotypes ) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet vcs = new TreeSet(new VariantContextComparator()); + + for( final Haplotype h : haplotypes ) { + vcs.addAll(h.getEventMap().getVariantContexts()); + } + + return vcs; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java similarity index 58% rename from public/java/src/org/broadinstitute/sting/utils/Haplotype.java rename to public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index cce6abbee..bacee7942 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -23,42 +23,67 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.haplotype; import com.google.java.contract.Requires; import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import java.io.Serializable; -import java.util.*; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; public class Haplotype extends Allele { - private GenomeLoc genomeLocation = null; - private Map eventMap = null; + private EventMap eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; + private double score = 0; /** * Main constructor * - * @param bases bases - * @param isRef is reference allele? + * @param bases a non-null array of bases + * @param isRef is this the reference haplotype? */ public Haplotype( final byte[] bases, final boolean isRef ) { super(bases.clone(), isRef); } + /** + * Create a new non-ref haplotype + * + * @param bases a non-null array of bases + */ public Haplotype( final byte[] bases ) { this(bases, false); } + /** + * Create a new haplotype with bases + * + * Requires bases.length == cigar.getReadLength() + * + * @param bases a non-null array of bases + * @param isRef is this the reference haplotype? + * @param alignmentStartHapwrtRef offset of this haplotype w.r.t. 
the reference + * @param cigar the cigar that maps this haplotype to the reference sequence + */ + public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { + this(bases, isRef); + this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; + setCigar(cigar); + } + /** * Copy constructor. Note the ref state of the provided allele is ignored! * @@ -78,6 +103,40 @@ public class Haplotype extends Allele { this.genomeLocation = loc; } + /** + * Create a new Haplotype derived from this one that exactly spans the provided location + * + * Note that this haplotype must have a contain a genome loc for this operation to be successful. If no + * GenomeLoc is contained than @throws an IllegalStateException + * + * Also loc must be fully contained within this Haplotype's genomeLoc. If not an IllegalArgumentException is + * thrown. + * + * @param loc a location completely contained within this Haplotype's location + * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if + */ + public Haplotype trim(final GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("Loc cannot be null"); + if ( genomeLocation == null ) throw new IllegalStateException("Cannot trim a Haplotype without containing GenomeLoc"); + if ( ! genomeLocation.containsP(loc) ) throw new IllegalArgumentException("Can only trim a Haplotype to a containing span. 
My loc is " + genomeLocation + " but wanted trim to " + loc); + if ( getCigar() == null ) throw new IllegalArgumentException("Cannot trim haplotype without a cigar " + this); + + final int newStart = loc.getStart() - this.genomeLocation.getStart(); + final int newStop = newStart + loc.size() - 1; + final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar()); + final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop); + + if ( newBases == null || AlignmentUtils.startsOrEndsWithInsertionOrDeletion(newCigar) ) + // we cannot meaningfully chop down the haplotype, so return null + return null; + + final Haplotype ret = new Haplotype(newBases, isReference()); + ret.setCigar(newCigar); + ret.setGenomeLocation(loc); + ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); + return ret; + } + @Override public boolean equals( Object h ) { return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); @@ -88,11 +147,11 @@ public class Haplotype extends Allele { return Arrays.hashCode(getBases()); } - public Map getEventMap() { + public EventMap getEventMap() { return eventMap; } - public void setEventMap( final Map eventMap ) { + public void setEventMap( final EventMap eventMap ) { this.eventMap = eventMap; } @@ -101,6 +160,18 @@ public class Haplotype extends Allele { return getDisplayString(); } + /** + * Get the span of this haplotype (may be null) + * @return a potentially null genome loc + */ + public GenomeLoc getGenomeLocation() { + return genomeLocation; + } + + public void setGenomeLocation(GenomeLoc genomeLocation) { + this.genomeLocation = genomeLocation; + } + public long getStartPosition() { return genomeLocation.getStart(); } @@ -117,12 +188,39 @@ public class Haplotype extends Allele { this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; } + /** + * Get the cigar for this haplotype. 
Note that cigar is guarenteed to be consolidated + * in that multiple adjacent equal operates will have been merged + * @return the cigar of this haplotype + */ public Cigar getCigar() { return cigar; } + /** + * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean cigar + * + * @param padSize how many additional Ms should be appended to the end of this cigar. Must be >= 0 + * @return a newly allocated Cigar that consolidate(getCigar + padSize + M) + */ + public Cigar getConsolidatedPaddedCigar(final int padSize) { + if ( padSize < 0 ) throw new IllegalArgumentException("padSize must be >= 0 but got " + padSize); + final Cigar extendedHaplotypeCigar = new Cigar(getCigar().getCigarElements()); + if ( padSize > 0 ) extendedHaplotypeCigar.add(new CigarElement(padSize, CigarOperator.M)); + return AlignmentUtils.consolidateCigar(extendedHaplotypeCigar); + } + + /** + * Set the cigar of this haplotype to cigar. + * + * Note that this function consolidates the cigar, so that 1M1M1I1M1M => 2M1I2M + * + * @param cigar a cigar whose readLength == length() + */ public void setCigar( final Cigar cigar ) { - this.cigar = cigar; + this.cigar = AlignmentUtils.consolidateCigar(cigar); + if ( this.cigar.getReadLength() != length() ) + throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength()); } public boolean isArtificialHaplotype() { @@ -165,25 +263,6 @@ public class Haplotype extends Allele { return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation)); } - public static class HaplotypeBaseComparator implements Comparator, Serializable { - @Override - public int compare( final Haplotype hap1, final Haplotype hap2 ) { - return compareHaplotypeBases(hap1, hap2); - } - - public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) { - final byte[] arr1 = hap1.getBases(); - final byte[] arr2 = hap2.getBases(); - // 
compares byte arrays using lexical ordering - final int len = Math.min(arr1.length, arr2.length); - for( int iii = 0; iii < len; iii++ ) { - final int cmp = arr1[iii] - arr2[iii]; - if (cmp != 0) { return cmp; } - } - return arr2.length - arr1.length; - } - } - public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, @@ -243,4 +322,23 @@ public class Haplotype extends Allele { this.pos = pos; } } + + /** + * Get the score (an estimate of the support) of this haplotype + * @return a double, where higher values are better + */ + public double getScore() { + return this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * Set the score (an estimate of the support) of this haplotype. + * + * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score + * + * @param score a double, where higher values are better + */ + public void setScore(double score) { + this.score = this.isReference() ? 
Double.MAX_VALUE : score; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java new file mode 100644 index 000000000..191442e3e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java @@ -0,0 +1,42 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.haplotype; + +import java.util.Comparator; + +/** + * Compares two haplotypes in the lexicographic order of their bases + * + * User: depristo + * Date: 3/29/13 + * Time: 11:09 AM + */ +public class HaplotypeBaseComparator implements Comparator { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + return hap1.getBaseString().compareTo(hap2.getBaseString()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java new file mode 100644 index 000000000..40146ba88 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java @@ -0,0 +1,39 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.haplotype; + +import java.util.Comparator; + +/** + * A comparator that sorts haplotypes in decreasing order of score, so that the best supported + * haplotypes are at the top + */ +public class HaplotypeScoreComparator implements Comparator { + @Override + public int compare(Haplotype o1, Haplotype o2) { + return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java index f99ff7538..2ed35d848 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java @@ -56,6 +56,7 @@ public class HelpConstants { public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; public final static String DOCS_CAT_TEST = "Testing Tools"; + public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; public static String forumPost(String post) { return GATK_FORUM_URL + post; diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 81606d2f3..9a23fd022 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -28,9 +28,15 @@ package org.broadinstitute.sting.utils.help; import com.sun.javadoc.FieldDoc; import com.sun.javadoc.PackageDoc; import com.sun.javadoc.ProgramElementDoc; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; import java.lang.reflect.Field; +import java.util.List; public class HelpUtils { @@ -70,4 +76,27 @@ public class HelpUtils { String.format("%s", doc.name()); } + /** + * Simple method to print a list of available annotations. + */ + public static void listAnnotations() { + System.out.println("\nThis is a list of available Variant Annotations for use with tools such as UnifiedGenotyper, HaplotypeCaller and VariantAnnotator. Please see the Technical Documentation for more details about these annotations:"); + System.out.println("http://www.broadinstitute.org/gatk/gatkdocs/"); + System.out.println("\nStandard annotations in the list below are marked with a '*'."); + List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins(); + System.out.println("\nAvailable annotations for the VCF INFO field:"); + for (int i = 0; i < infoAnnotationClasses.size(); i++) + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? "*" : "") + infoAnnotationClasses.get(i).getSimpleName()); + System.out.println(); + List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins(); + System.out.println("\nAvailable annotations for the VCF FORMAT field:"); + for (int i = 0; i < genotypeAnnotationClasses.size(); i++) + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? 
"*" : "") + genotypeAnnotationClasses.get(i).getSimpleName()); + System.out.println(); + System.out.println("\nAvailable classes/groups of annotations:"); + for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() ) + System.out.println("\t" + c.getSimpleName()); + System.out.println(); + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 8fbd302a8..9728bdb1c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; -import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index c9d364aac..ab6c321e8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.pairhmm; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; @@ -34,15 +35,22 @@ import java.util.Arrays; /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
* - * User: rpoplin + * User: rpoplin, carneiro * Date: 3/1/12 */ -public class Log10PairHMM extends PairHMM { +public final class Log10PairHMM extends PairHMM { /** * Should we use exact log10 calculation (true), or an approximation (false)? */ private final boolean doExactLog10; + private static final int matchToMatch = 0; + private static final int indelToMatch = 1; + private static final int matchToInsertion = 2; + private static final int insertionToInsertion = 3; + private static final int matchToDeletion = 4; + private static final int deletionToDeletion = 5; + /** * Create an uninitialized PairHMM * @@ -64,14 +72,17 @@ public class Log10PairHMM extends PairHMM { * {@inheritDoc} */ @Override - public void initialize( final int readMaxLength, final int haplotypeMaxLength) { + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { super.initialize(readMaxLength, haplotypeMaxLength); - for( int iii=0; iii < X_METRIC_MAX_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + for( int iii=0; iii < paddedMaxReadLength; iii++ ) { + Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** @@ -86,34 +97,91 @@ public class Log10PairHMM extends PairHMM { final byte[] overallGCP, final int hapStartIndex, final boolean recacheReadValues ) { - // the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual - // read and haplotypes, not the maximum - matchMetricArray[1][1] = getNPotentialXStartsLikelihoodPenaltyLog10(haplotypeBases.length, readBases.length); - // M, X, and Y arrays are of size read and 
haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + final double initialValue = Math.log10(1.0 / haplotypeBases.length); + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; } } - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return myLog10SumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + if ( ! 
constantsAreInitialized || recacheReadValues ) + initializeProbabilities(insertionGOP, deletionGOP, overallGCP); + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + for (int i = 1; i < paddedReadLength; i++) { + // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. + final int endI = paddedReadLength - 1; + double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); + for (int j = 2; j < paddedHaplotypeLength; j++) + finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); + + return finalSumProbabilities; } + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. 
+ + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + for (int i = 0; i < insertionGOP.length; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + transition[i+1][indelToMatch] = QualityUtils.qualToProbLog10(overallGCP[i]); + transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + + // note that we initialized the constants + constantsAreInitialized = true; + } + + /** * Compute the log10SumLog10 of the values * @@ -132,37 +200,24 @@ public class Log10PairHMM extends PairHMM { return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); } - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ - 1] + d0, XMetricArray[indI - 1][indJ - 1] + e0, YMetricArray[indI - 1][indJ - 1] + e0}); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ] + d1, XMetricArray[indI - 1][indJ] + e1}); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI][indJ - 1] + d2, YMetricArray[indI][indJ - 1] + e2}); + matchMatrix[indI][indJ] = prior + + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], + insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch], + deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]}); + insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]}); + deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]}); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index 4035150d8..6b57a1354 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -25,11 +25,12 @@ package org.broadinstitute.sting.utils.pairhmm; -import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import java.util.Arrays; + /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
* @@ -39,9 +40,11 @@ import org.broadinstitute.sting.utils.MathUtils; public abstract class PairHMM { protected final static Logger logger = Logger.getLogger(PairHMM.class); - protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - protected static final byte DEFAULT_GOP = (byte) 45; - protected static final byte DEFAULT_GCP = (byte) 10; + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + protected boolean constantsAreInitialized = false; + + protected byte[] previousHaplotypeBases; public enum HMM_IMPLEMENTATION { /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ @@ -52,17 +55,21 @@ public abstract class PairHMM { LOGLESS_CACHING } - protected double[][] matchMetricArray = null; - protected double[][] XMetricArray = null; - protected double[][] YMetricArray = null; + protected double[][] matchMatrix = null; + protected double[][] insertionMatrix = null; + protected double[][] deletionMatrix = null; protected int maxHaplotypeLength, maxReadLength; - protected int X_METRIC_MAX_LENGTH, Y_METRIC_MAX_LENGTH; + protected int paddedMaxReadLength, paddedMaxHaplotypeLength; + protected int paddedReadLength, paddedHaplotypeLength; private boolean initialized = false; /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * @param readMaxLength the max length of reads we want to use with this PairHMM + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
+ * * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM */ public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); @@ -72,15 +79,21 @@ public abstract class PairHMM { maxReadLength = readMaxLength; // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - X_METRIC_MAX_LENGTH = readMaxLength + 2; - Y_METRIC_MAX_LENGTH = haplotypeMaxLength + 2; + paddedMaxReadLength = readMaxLength + 1; + paddedMaxHaplotypeLength = haplotypeMaxLength + 1; - matchMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; - XMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; - YMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; + matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + previousHaplotypeBases = null; + + constantsAreInitialized = false; initialized = true; } + + /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. @@ -97,8 +110,6 @@ public abstract class PairHMM { * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases * @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases - * @param hapStartIndex start the hmm calculation at this offset in haplotype bases. 
Used in the caching calculation - * where multiple haplotypes are used, and they only diff starting at hapStartIndex * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated * parameters are the same, and only the haplotype bases are changing underneath us * @return the log10 probability of read coming from the haplotype under the provided error model @@ -109,7 +120,6 @@ public abstract class PairHMM { final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final int hapStartIndex, final boolean recacheReadValues ) { if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); @@ -120,25 +130,29 @@ public abstract class PairHMM { if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); - if ( hapStartIndex < 0 || hapStartIndex > haplotypeBases.length ) throw new IllegalArgumentException("hapStartIndex is bad, must be between 0 and haplotype length " + haplotypeBases.length + " but got " + hapStartIndex); + + paddedReadLength = readBases.length + 1; + paddedHaplotypeLength = haplotypeBases.length + 1; + + final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length || recacheReadValues) ? 
0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases); double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); - // TODO -- remove max when PairHMM no longer returns likelihoods >= 0 - result = Math.min(result, 0.0); + if ( ! MathUtils.goodLog10Probability(result) ) + throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); - if ( MathUtils.goodLog10Probability(result) ) - return result; - else - throw new IllegalStateException("Bad likelihoods detected: " + result); -// return result; + // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). + // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. + previousHaplotypeBases = haplotypeBases; + + return result; } /** * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 */ @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", - "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, @@ -148,41 +162,13 @@ public abstract class PairHMM { final int hapStartIndex, final boolean recacheReadValues ); - /** - * How many potential starting locations are a read with readSize bases against a haplotype with haplotypeSize bases? 
- * - * for example, a 3 bp read against a 5 bp haplotype could potentially start at 1, 2, 3 = 5 - 3 + 1 = 3 - * the max value is necessary in the case where the read is longer than the haplotype, in which case - * there's a single unique start site by assumption - * - * @param haplotypeSize the number of bases in the haplotype we are testing - * @param readSize the number of bases in the read we are testing - * @return a positive integer >= 1 - */ - @Ensures("result >= 1") - protected int getNPotentialXStarts(final int haplotypeSize, final int readSize) { - return Math.max(haplotypeSize - readSize + 1, 1); - } - - /** - * The the log10 probability penalty for the number of potential start sites of the read aginst the haplotype - * - * @param haplotypeSize the number of bases in the haplotype we are testing - * @param readSize the number of bases in the read we are testing - * @return a log10 probability - */ - @Ensures("MathUtils.goodLog10Probability(result)") - protected double getNPotentialXStartsLikelihoodPenaltyLog10(final int haplotypeSize, final int readSize) { - return - Math.log10(getNPotentialXStarts(haplotypeSize, readSize)); - } - /** * Print out the core hmm matrices for debugging */ protected void dumpMatrices() { - dumpMatrix("matchMetricArray", matchMetricArray); - dumpMatrix("XMetricArray", XMetricArray); - dumpMatrix("YMetricArray", YMetricArray); + dumpMatrix("matchMetricArray", matchMatrix); + dumpMatrix("insertionMatrix", insertionMatrix); + dumpMatrix("deletionMatrix", deletionMatrix); } /** @@ -215,8 +201,8 @@ public abstract class PairHMM { * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same */ public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { - if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + haplotype1); - if ( haplotype2 == null || haplotype2.length == 0 ) throw new 
IllegalArgumentException("Haplotype2 is bad " + haplotype2); + if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); + if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { if( haplotype1[iii] != haplotype2[iii] ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 51753ca5e..f4c673e61 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -303,7 +303,7 @@ public class PileupElement implements Comparable { * this being a reduced read and a deletion, we return the average number of elements between the left * and right elements to the deletion. We assume the deletion to be left aligned. * - * @return + * @return the representative count */ public int getRepresentativeCount() { if (read.isReducedRead()) { @@ -318,6 +318,21 @@ public class PileupElement implements Comparable { } } + /** + * Adjusts the representative count of this pileup element. + * Throws an exception if this element does not represent a reduced read. + * + * See GATKSAMRecord.adjustReducedCount() for warnings on the permanency of this operation. 
+ * + * @param adjustmentFactor how much to adjust the representative count (can be positive or negative) + */ + public void adjustRepresentativeCount(final int adjustmentFactor) { + if ( read.isReducedRead() ) + read.adjustReducedCount(offset, adjustmentFactor); + else + throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced"); + } + /** * Get the cigar element aligning this element to the genome * @return a non-null CigarElement diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index d34e2996c..e48d1ca4c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,24 +31,151 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.EventType; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; +import java.util.*; public final class AlignmentUtils { + private final static Logger logger = Logger.getLogger(AlignmentUtils.class); private final static EnumSet ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); private final static EnumSet ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S); // cannot be instantiated private AlignmentUtils() { } + /** + * Does cigar start or end with a deletion operation? 
+ * + * @param cigar a non-null cigar to test + * @return true if the first or last operator of cigar is a D or an I + */ + public static boolean startsOrEndsWithInsertionOrDeletion(final Cigar cigar) { + if ( cigar == null ) throw new IllegalArgumentException("Cigar cannot be null"); + + if ( cigar.isEmpty() ) + return false; + + final CigarOperator first = cigar.getCigarElement(0).getOperator(); + final CigarOperator last = cigar.getCigarElement(cigar.numCigarElements()-1).getOperator(); + return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I; + } + + + /** + * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the + * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference + * + * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference + * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X. + * + * If refStart or refEnd would start or end the new bases within a deletion, this function will return null + * + * @param bases + * @param refStart + * @param refEnd + * @param basesStartOnRef where does the bases array start w.r.t. the reference start? For example, bases[0] of + * could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at + * 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10.
+ * It's not trivial to eliminate this parameter because it's tied up with the cigar + * @param basesToRefCigar the cigar that maps the bases to the reference genome + * @return a byte[] containing the bases covering this interval, or null if we would start or end within a deletion + */ + public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) { + if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd); + if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef); + if ( bases == null ) throw new IllegalArgumentException("Bases cannot be null"); + if ( basesToRefCigar == null ) throw new IllegalArgumentException("basesToRefCigar cannot be null"); + if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar); + + int refPos = basesStartOnRef; + int basesPos = 0; + int basesStart = -1; + int basesStop = -1; + boolean done = false; + + for ( int iii = 0; ! 
done && iii < basesToRefCigar.numCigarElements(); iii++ ) { + final CigarElement ce = basesToRefCigar.getCigarElement(iii); + switch ( ce.getOperator() ) { + case I: + basesPos += ce.getLength(); + break; + case M: case X: case EQ: + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refStart ) + basesStart = basesPos; + if ( refPos == refEnd ) { + basesStop = basesPos; + done = true; + break; + } + refPos++; + basesPos++; + } + break; + case D: + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refEnd || refPos == refStart ) { + // if we ever reach a ref position that is either a start or an end, we fail + return null; + } + refPos++; + } + break; + default: + throw new IllegalStateException("Unsupported operator " + ce); + } + } + + if ( basesStart == -1 || basesStop == -1 ) + throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar); + + return Arrays.copyOfRange(bases, basesStart, basesStop + 1); + } + + /** + * Get the number of bases at which refSeq and readSeq differ, given their alignment + * + * @param cigar the alignment of readSeq to refSeq + * @param refSeq the bases of the reference sequence + * @param readSeq the bases of the read sequence + * @return the number of bases that differ between refSeq and readSeq + */ + public static int calcNumDifferentBases(final Cigar cigar, final byte[] refSeq, final byte[] readSeq) { + int refIndex = 0, readIdx = 0, delta = 0; + + for (final CigarElement ce : cigar.getCigarElements()) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case X:case EQ:case M: + for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) + delta += refSeq[refIndex] != readSeq[readIdx] ? 
1 : 0; + break; + case I: + delta += elementLength; + case S: + readIdx += elementLength; + break; + case D: + delta += elementLength; + case N: + refIndex += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + } + + return delta; + } + public static class MismatchCount { public int numMismatches = 0; public long mismatchQualities = 0; @@ -58,6 +185,9 @@ public final class AlignmentUtils { return getMismatchCount(r, refSeq, refIndex).mismatchQualities; } + /** + * @see #getMismatchCount(GATKSAMRecord, byte[], int, int, int) with startOnRead == 0 and nReadBases == read.getReadLength() + */ public static MismatchCount getMismatchCount(GATKSAMRecord r, byte[] refSeq, int refIndex) { return getMismatchCount(r, refSeq, refIndex, 0, r.getReadLength()); } @@ -70,7 +200,10 @@ public final class AlignmentUtils { * * @param r the sam record to check against * @param refSeq the byte array representing the reference sequence - * @param refIndex the index in the reference byte array of the read's first base (the reference index is matching the alignment start, there may be tons of soft-clipped bases before/after that so it's wrong to compare with getReadLength() here.) + * @param refIndex the index in the reference byte array of the read's first base (the reference index + * is matching the alignment start, there may be tons of soft-clipped bases before/after + * that so it's wrong to compare with getReadLength() here.). Note that refIndex is + * zero based, not 1 based * @param startOnRead the index in the read's bases from which we start counting * @param nReadBases the number of bases after (but including) startOnRead that we check * @return non-null object representing the mismatch count @@ -440,26 +573,64 @@ public final class AlignmentUtils { * Need a well-formed, consolidated Cigar string so that the left aligning code works properly. 
* For example, 1M1M1M1D2M1M --> 3M1D3M * If the given cigar is empty then the returned cigar will also be empty + * + * Note that this routine collapses cigar elements of size 0, so 2M0M => 2M + * * @param c the cigar to consolidate * @return a non-null cigar with consecutive matching operators merged into single operators. */ @Ensures({"result != null"}) public static Cigar consolidateCigar( final Cigar c ) { - if( c == null ) { throw new IllegalArgumentException("Cigar cannot be null"); } - if( c.isEmpty() ) { return c; } + if ( c == null ) { throw new IllegalArgumentException("Cigar cannot be null"); } + + // fast check to determine if there's anything worth doing before we create new Cigar and actually do some work + if ( ! needsConsolidation(c) ) + return c; final Cigar returnCigar = new Cigar(); int sumLength = 0; - for( int iii = 0; iii < c.numCigarElements(); iii++ ) { - sumLength += c.getCigarElement(iii).getLength(); - if( iii == c.numCigarElements() - 1 || !c.getCigarElement(iii).getOperator().equals(c.getCigarElement(iii+1).getOperator())) { // at the end so finish the current element - returnCigar.add(new CigarElement(sumLength, c.getCigarElement(iii).getOperator())); + CigarElement lastElement = null; + + for( final CigarElement cur : c.getCigarElements() ) { + if ( cur.getLength() == 0 ) + continue; // don't add elements of 0 length + + if ( lastElement != null && lastElement.getOperator() != cur.getOperator() ) { + returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); sumLength = 0; } + + sumLength += cur.getLength(); + lastElement = cur; } + + if ( sumLength > 0 ) { + returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); + } + return returnCigar; } + /** + * Does the cigar C need to be consolidated? 
+ * + * @param c a non-null cigar + * @return true if so + */ + private static boolean needsConsolidation(final Cigar c) { + if ( c.numCigarElements() <= 1 ) + return false; // fast path for empty or single cigar + + CigarOperator lastOp = null; + for( final CigarElement cur : c.getCigarElements() ) { + if ( cur.getLength() == 0 || lastOp == cur.getOperator() ) + return true; + lastOp = cur.getOperator(); + } + + return false; + } + /** * Takes the alignment of the read sequence readSeq to the reference sequence refSeq * starting at 0-based position refIndex on the refSeq and specified by its cigar. @@ -616,7 +787,7 @@ public final class AlignmentUtils { */ @Requires("c != null") @Ensures("result != null") - private static Cigar cleanUpCigar(final Cigar c) { + public static Cigar cleanUpCigar(final Cigar c) { final List elements = new ArrayList(c.numCigarElements() - 1); for (final CigarElement ce : c.getCigarElements()) { @@ -730,4 +901,355 @@ public final class AlignmentUtils { return alt; } + + + /** + * Trim cigar down to one that starts at start reference on the left and extends to end on the reference + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases on the reference? The first position is 0 + * @param end Where should we stop keeping bases on the reference? 
The maximum value is cigar.getReferenceLength() + * @return a new Cigar with reference length == end - start + 1 + */ + public static Cigar trimCigarByReference(final Cigar cigar, final int start, final int end) { + if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start); + if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start); + if ( end > cigar.getReferenceLength() ) throw new IllegalArgumentException("End is beyond the cigar's reference length " + end + " for cigar " + cigar ); + + final Cigar result = trimCigar(cigar, start, end, true); + + if ( result.getReferenceLength() != end - start + 1) + throw new IllegalStateException("trimCigarByReference failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result); + return result; + } + + /** + * Trim cigar down to one that starts at start base in the cigar and extends to (inclusive) end base + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases in the cigar? The first position is 0 + * @param end Where should we stop keeping bases in the cigar?
The maximum value is cigar.getReadLength() + * @return a new Cigar containing == end - start + 1 bases + */ + public static Cigar trimCigarByBases(final Cigar cigar, final int start, final int end) { + if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start); + if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start); + if ( end > cigar.getReadLength() ) throw new IllegalArgumentException("End is beyond the cigar's read length " + end + " for cigar " + cigar ); + + final Cigar result = trimCigar(cigar, start, end, false); + + final int expectedSize = end - start + 1; + if ( result.getReadLength() != expectedSize) + throw new IllegalStateException("trimCigarByBases failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result + " with size " + result.getReadLength() + " expected " + expectedSize + " for input cigar " + cigar); + return result; + } + + + /** + * Workhorse for trimCigarByBases and trimCigarByReference + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases in the cigar? The first position is 0 + * @param end Where should we stop keeping bases in the cigar? The maximum value is cigar.getReadLength() + * @param byReference should start and end be interpreted as position in the reference or the read to trim to/from? + * @return a non-null cigar + */ + @Requires({"cigar != null", "start >= 0", "start <= end"}) + @Ensures("result != null") + private static Cigar trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference) { + final List newElements = new LinkedList(); + + int pos = 0; + for ( final CigarElement elt : cigar.getCigarElements() ) { + if ( pos > end ) break; + + switch ( elt.getOperator() ) { + case D: + if ( !
byReference ) { + if ( pos >= start ) + newElements.add(elt); + break; + } + // otherwise fall through to the next case + case EQ: case M: case X: + pos = addCigarElements(newElements, pos, start, end, elt); + break; + case S: case I: + if ( byReference ) { + if ( pos >= start ) + newElements.add(elt); + } else { + pos = addCigarElements(newElements, pos, start, end, elt); + } + break; + default: + throw new IllegalStateException("Cannot handle " + elt); + } + } + + return AlignmentUtils.consolidateCigar(new Cigar(newElements)); + } + + /** + * Helper function for trimCigar that adds cigar elements (of total length X) of elt.op to dest for + * X bases that fall between start and end, where the last position of the base is pos. + * + * The primary use of this function is to create a new cigar element list that contains only + * elements that occur between start and end bases in an initial cigar. + * + * Note that this function may return multiple cigar elements (1M1M etc) that are best consolidated + * after the fact into a single simpler representation. 
+ * + * @param dest we will append our cigar elements to this list + * @param pos the position (0 indexed) where elt started + * @param start only include bases that occur >= this position + * @param end only include bases that occur <= this position + * @param elt the element we are slicing down + * @return the position after we've traversed all elt.length bases of elt + */ + protected static int addCigarElements(final List dest, int pos, final int start, final int end, final CigarElement elt) { + final int length = Math.min(pos + elt.getLength() - 1, end) - Math.max(pos, start) + 1; + if ( length > 0 ) + dest.add(new CigarElement(length, elt.getOperator())); + return pos + elt.getLength(); + } + + /** + * Get the offset (base 0) of the first reference aligned base in Cigar that occurs after readStartByBaseOfCigar base of the cigar + * + * The main purpose of this routine is to find a good start position for a read given it's cigar. The real + * challenge is that the starting base might be inside an insertion, in which case the read actually starts + * at the next M/EQ/X operator. 
+ * + * @param cigar a non-null cigar + * @param readStartByBaseOfCigar finds the first base after this (0 indexed) that aligns to the reference genome (M, EQ, X) + * @throws IllegalStateException if no such base can be found + * @return an offset into cigar + */ + public static int calcFirstBaseMatchingReferenceInCigar(final Cigar cigar, int readStartByBaseOfCigar) { + if ( cigar == null ) throw new IllegalArgumentException("cigar cannot be null"); + if ( readStartByBaseOfCigar >= cigar.getReadLength() ) throw new IllegalArgumentException("readStartByBaseOfCigar " + readStartByBaseOfCigar + " must be <= readLength " + cigar.getReadLength()); + + int hapOffset = 0, refOffset = 0; + for ( final CigarElement ce : cigar.getCigarElements() ) { + for ( int i = 0; i < ce.getLength(); i++ ) { + switch ( ce.getOperator() ) { + case M:case EQ:case X: + if ( hapOffset >= readStartByBaseOfCigar ) + return refOffset; + hapOffset++; + refOffset++; + break; + case I: case S: + hapOffset++; + break; + case D: + refOffset++; + break; + default: + throw new IllegalStateException("calcFirstBaseMatchingReferenceInCigar does not support cigar " + ce.getOperator() + " in cigar " + cigar); + } + } + } + + throw new IllegalStateException("Never found appropriate matching state for cigar " + cigar + " given start of " + readStartByBaseOfCigar); + } + + /** + * Generate a new Cigar that maps the operations of the first cigar through those in a second + * + * For example, if first is 5M and the second is 2M1I2M then the result is 2M1I2M. 
+ * However, if first is 1M2D3M and second is 2M1I3M this results in a cigar X + * + * ref : AC-GTA + * hap : ACxGTA - 2M1I3M + * read : A--GTA - 1M2D3M + * result: A--GTA => 1M1D3M + * + * ref : ACxG-TA + * hap : AC-G-TA - 2M1D3M + * read : AC-GxTA - 3M1I2M + * result: AC-GxTA => 2M1D1M1I2M + * + * ref : ACGTA + * hap : ACGTA - 5M + * read : A-GTA - 1M1I3M + * result: A-GTA => 1M1I3M + * + * ref : ACGTAC + * hap : AC---C - 2M3D1M + * read : AC---C - 3M + * result: AG---C => 2M3D + * + * The constraint here is that both cigars should imply that the result have the same number of + * reference bases (i.e.g, cigar.getReferenceLength() are equals). + * + * @param firstToSecond the cigar mapping hap1 -> hap2 + * @param secondToThird the cigar mapping hap2 -> hap3 + * @return A cigar mapping hap1 -> hap3 + */ + public static Cigar applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird) { + final boolean DEBUG = false; + + final List newElements = new LinkedList(); + final int nElements12 = firstToSecond.getCigarElements().size(); + final int nElements23 = secondToThird.getCigarElements().size(); + + int cigar12I = 0, cigar23I = 0; + int elt12I = 0, elt23I = 0; + + while ( cigar12I < nElements12 && cigar23I < nElements23 ) { + final CigarElement elt12 = firstToSecond.getCigarElement(cigar12I); + final CigarElement elt23 = secondToThird.getCigarElement(cigar23I); + + final CigarPairTransform transform = getTransformer(elt12.getOperator(), elt23.getOperator()); + + if ( DEBUG ) + System.out.printf("Transform %s => %s with elt1 = %d %s @ %d elt2 = %d %s @ %d with transform %s%n", + firstToSecond, secondToThird, cigar12I, elt12.getOperator(), elt12I, cigar23I, elt23.getOperator(), elt23I, transform); + + if ( transform.op13 != null ) // skip no ops + newElements.add(new CigarElement(1, transform.op13)); + + elt12I += transform.advance12; + elt23I += transform.advance23; + + // if have exhausted our current element, advance to the next one + if ( elt12I == 
elt12.getLength() ) { cigar12I++; elt12I = 0; } + if ( elt23I == elt23.getLength() ) { cigar23I++; elt23I = 0; } + } + + return AlignmentUtils.consolidateCigar(new Cigar(newElements)); + } + + private static CigarPairTransform getTransformer(final CigarOperator op12, final CigarOperator op23) { + for ( final CigarPairTransform transform : cigarPairTransformers) { + if ( transform.op12.contains(op12) && transform.op23.contains(op23) ) + return transform; + } + + throw new IllegalStateException("No transformer for operators " + op12 + " and " + op23); + } + + /** + * transformations that project one alignment state through another + * + * Think about this as a state machine, where we have: + * + * bases3 : xxx A zzz + * bases2 : xxx B zzz + * bases1 : xxx C zzz + * + * where A, B and C are alignment states of a three way alignment. We want to capture + * the transition from operation mapping 1 -> 2 and an operation mapping 2 -> 3 and its + * associated mapping from 1 -> 3 and the advancement of the cigar states of 1->2 and 2->3. + * + * Imagine that A, B, and C are all equivalent (so that op12 = M and op23 = M). This implies + * a mapping of 1->3 of M, and in this case the next states to consider in the 3 way alignment + * are the subsequent states in 1 and 2 (so that advance12 and advance23 are both 1). + * + * Obviously not all of the states and their associated transitions are so simple. Suppose instead + * that op12 = I, and op23 = M. What does this look like: + * + * bases3 : xxx - A zzz + * bases2 : xxx - B zzz + * bases1 : xxx I C zzz + * + * It means that op13 must be an insertion (as we have an extra base in 1 thats not present in 2 and + * so not present in 3). We advance the cigar in 1 by 1 (as we've consumed one base in 1 for the I) + * but we haven't yet found the base corresponding to the M of op23. So we don't advance23. 
+ */ + private static class CigarPairTransform { + private final EnumSet op12, op23; + private final CigarOperator op13; + private final int advance12, advance23; + + private CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23) { + this.op12 = getCigarSet(op12); + this.op23 = getCigarSet(op23); + this.op13 = op13; + this.advance12 = advance12; + this.advance23 = advance23; + } + + private static EnumSet getCigarSet(final CigarOperator masterOp) { + switch ( masterOp ) { + case M: return EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); + case I: return EnumSet.of(CigarOperator.I, CigarOperator.S); + case D: return EnumSet.of(CigarOperator.D); + default: throw new IllegalStateException("Unexpected state " + masterOp); + } + } + + @Override + public String toString() { + return "CigarPairTransform{" + + "op12=" + op12 + + ", op23=" + op23 + + ", op13=" + op13 + + ", advance12=" + advance12 + + ", advance23=" + advance23 + + '}'; + } + } + + + private final static List cigarPairTransformers = Arrays.asList( + // + // op12 is a match + // + // 3: xxx B yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.M, CigarOperator.M, 1, 1), + // 3: xxx I yyy + // ^^^^^^^^^^^^ + // 2: xxx I yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.I, CigarOperator.I, 1, 1), + // 3: xxx D yyy + // ^^^^^^^^^^^^ + // 2: xxx D yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.D, CigarOperator.D, 0, 1), + + // + // op12 is a deletion + // + // 3: xxx D M yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx D yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.M, CigarOperator.D, 1, 1), + // 3: xxx D1 D2 yyy + // ^^^^^^^^^^^^ + // 2: xxx D2 yyy + // 1: xxx D1 yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.D, CigarOperator.D, 1, 0), + // 3: xxx X yyy => no-op, we skip emitting anything here 
+ // ^^^^^^^^^^^^ + // 2: xxx I yyy + // 1: xxx D yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.I, null, 1, 1), + + // + // op12 is a insertion + // + // 3: xxx I M yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx I yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.M, CigarOperator.I, 1, 0), + // 3: xxx I D yyy + // ^^^^^^^^^^^^ + // 2: xxx D yyy + // 1: xxx I yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.D, CigarOperator.I, 1, 0), + // 3: xxx I1 I2 yyy + // ^^^^^^^^^^^^ + // 2: xxx I2 yyy + // 1: xxx I1 yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0) + ); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index fcebbec9b..ec9d7d219 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -72,6 +72,11 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { return mNGSPlatform; } + @Override + public String toString() { + return "GATKSAMReadGroupRecord @RG:" + getReadGroupId(); + } + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 01a8c1996..c39245730 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.MathUtils; 
import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; @@ -52,6 +53,7 @@ import java.util.*; public class GATKSAMRecord extends BAMRecord { // ReduceReads specific attribute tags public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool + public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end @@ -68,12 +70,14 @@ public class GATKSAMRecord extends BAMRecord { // the SAMRecord data we're caching private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; - private byte[] reducedReadCounts = null; + private int[] reducedReadCounts = null; private final static int UNINITIALIZED = -1; private int softStart = UNINITIALIZED; private int softEnd = UNINITIALIZED; private Integer adapterBoundary = null; + private Boolean isStrandlessRead = null; + // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; private boolean retrievedReduceReadCounts = false; @@ -141,6 +145,48 @@ public class GATKSAMRecord extends BAMRecord { return ArtificialSAMUtils.createArtificialRead(cigar); } + /////////////////////////////////////////////////////////////////////////////// + // *** support for reads without meaningful strand information ***// + /////////////////////////////////////////////////////////////////////////////// + + /** + * Does this read have a meaningful strandedness value? 
+ * + * Some advanced types of reads, such as reads coming from merged fragments, + * don't have meaningful strandedness values, as they are composites of multiple + * other reads. Strandless reads need to be handled specially by code that cares about + * stranded information, such as FS. + * + * @return true if this read doesn't have meaningful strand information + */ + public boolean isStrandless() { + if ( isStrandlessRead == null ) { + isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null; + } + return isStrandlessRead; + } + + /** + * Set the strandless state of this read to isStrandless + * @param isStrandless true if this read doesn't have a meaningful strandedness value + */ + public void setIsStrandless(final boolean isStrandless) { + this.isStrandlessRead = isStrandless; + } + + @Override + public boolean getReadNegativeStrandFlag() { + return ! isStrandless() && super.getReadNegativeStrandFlag(); + } + + @Override + public void setReadNegativeStrandFlag(final boolean flag) { + if ( isStrandless() ) + throw new IllegalStateException("Cannot set the strand of a strandless read"); + super.setReadNegativeStrandFlag(flag); + } + + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// @@ -300,29 +346,179 @@ public class GATKSAMRecord extends BAMRecord { // *** ReduceReads functions ***// /////////////////////////////////////////////////////////////////////////////// - public byte[] getReducedReadCounts() { + /** + * Get the counts of the bases in this reduced read + * + * NOTE that this is not the value of the REDUCED_READ_CONSENSUS_TAG, which + * is encoded in a special way. This is the actual positive counts of the + * depth at each bases. 
So for a RR with a tag of: + * + * [10, 5, -1, -5] + * + * this function returns + * + * [10, 15, 9, 5] + * + * as one might expect. + * + * @return a int[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read + */ + public int[] getReducedReadCounts() { if ( ! retrievedReduceReadCounts ) { - reducedReadCounts = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); + final byte[] tag = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); + if ( tag != null ) reducedReadCounts = decodeReduceReadCounts(tag); retrievedReduceReadCounts = true; } return reducedReadCounts; } - public boolean isReducedRead() { - return getReducedReadCounts() != null; - } - /** * The number of bases corresponding the i'th base of the reduced read. * * @param i the read based coordinate inside the read * @return the number of bases corresponding to the i'th base of the reduced read */ - public final byte getReducedCount(final int i) { - byte firstCount = getReducedReadCounts()[0]; - byte offsetCount = getReducedReadCounts()[i]; - return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); + public final int getReducedCount(final int i) { + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); + if ( i < 0 || i >= getReadBases().length ) + throw new IllegalArgumentException("illegal offset used when retrieving reduced counts: " + i); + + final int[] reducedCounts = getReducedReadCounts(); + return reducedCounts[i]; + } + + /** + * Is this read a reduced read? + * @return true if yes + */ + public boolean isReducedRead() { + return getReducedReadCounts() != null; + } + + /** + * Set the reduced read counts tag for this record. + * Note that this method is slightly expensive as it converts to the correct reduced counts representation and sets the + * appropriate binary tag. 
If you want to modify the reduced count in place without triggering the permanent conversion + internally, use the #setReducedCount() method. + * + * @param counts the count array + */ + public void setReducedReadCountsTag(final int[] counts) { + setAttribute(REDUCED_READ_CONSENSUS_TAG, encodeReduceReadCounts(counts)); + retrievedReduceReadCounts = false; // need to force new decode in case we had to handle precision problems with the counts + } + + /** + * Same as #setReducedReadCountsTag(int[]), but uses the currently stored values of the internal array. + * Useful if you've been using #setReducedCount() to modify the reduced count and now want to trigger the expensive conversion. + */ + public void setReducedReadCountsTag() { + if ( !retrievedReduceReadCounts ) + throw new IllegalStateException("Trying to write the reduced reads counts using an uninitialized internal array of counts"); + setReducedReadCountsTag(reducedReadCounts); + } + + /** + * Sets the reduced read count corresponding to the i'th base of the reduced read. + * + * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion + and push that value into the read's binary tags, use #setReducedReadCountsTag().
+ * + * @param i the read based coordinate inside the read + * @param count the new count + */ + public final void setReducedCount(final int i, final int count) { + if ( count < 0 ) + throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + if ( i < 0 || i >= getReadBases().length ) + throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); + + // force the initialization of the counts array if it hasn't happened yet + getReducedReadCounts()[i] = count; + } + + /** + * Set the reduced read counts tag for this record to counts + * + * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion + * and push that value into the read's binary tags, use #setReducedReadCountsTag(). + * + * @param counts the count array + */ + public void setReducedReadCounts(final int[] counts) { + if ( counts.length != getReadBases().length ) + throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length); + retrievedReduceReadCounts = true; + reducedReadCounts = counts; + } + + /** + * Sets the number of bases corresponding the i'th base of the reduced read. + * + * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion + * and push that value into the read's binary tags, use #setReducedReadCountsTag(). 
+ * + * @param i the read based coordinate inside the read + * @param adjustmentFactor how much to add/subtract to the current count + */ + public final void adjustReducedCount(final int i, final int adjustmentFactor) { + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + if ( i < 0 || i >= getReadBases().length ) + throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); + + setReducedCount(i, getReducedReadCounts()[i] + adjustmentFactor); + } + + /** + * Actually decode the consensus tag of a reduced read, returning a newly allocated + * int[] whose values are the real depth of coverage at each base of the reduced read. + * + * for example, if the tag contains [10, 5, -1, -5], after running this function the + * returned int[] will contain the true counts [10, 15, 9, 5], as one might expect. + * + * @param countsFromTag a non-null byte[] containing the tag-encoded reduced read counts + * @return a non-null int[] containing the true depth values for the vector + */ + protected static int[] decodeReduceReadCounts(final byte[] countsFromTag) { + final int n = countsFromTag.length; + final int[] result = new int[n]; + final int firstCount = countsFromTag[0] & 0xff; // unsigned byte + result[0] = firstCount; + for ( int i = 1; i < n; i++ ) { + final int offsetCount = countsFromTag[i] & 0xff; // unsigned byte + result[i] = (firstCount + offsetCount) % 256; + } + + return result; + } + + /** + * Converts int array from straight counts to the appropriate reduce reads representation in BAM (offset from first value) + * + * @param counts the counts array + * @return non-null converted byte array + */ + protected static byte[] encodeReduceReadCounts(final int[] counts) { + if ( counts.length == 0 ) + throw new IllegalArgumentException("Trying to write a reduced read with a counts array of length 0"); + + final byte[] compressedCountsArray = new
byte[counts.length]; + final int firstCount = (int) MathUtils.bound(counts[0], 0, 255); // we want an unsigned byte capped at max byte representation + compressedCountsArray[0] = (byte)firstCount; + for ( int i = 1; i < counts.length; i++ ) { + final int count = (int) MathUtils.bound(counts[i], 0, 255); + final byte offset = (byte)(count - firstCount + (count >= firstCount ? 0 : 256)); // unsigned byte + compressedCountsArray[i] = offset; + } + + return compressedCountsArray; } /////////////////////////////////////////////////////////////////////////////// @@ -542,6 +738,7 @@ public class GATKSAMRecord extends BAMRecord { emptyRead.setCigarString(""); emptyRead.setReadBases(new byte[0]); emptyRead.setBaseQualities(new byte[0]); + if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new int[0]); SAMReadGroupRecord samRG = read.getReadGroup(); emptyRead.clearAttributes(); @@ -575,7 +772,7 @@ public class GATKSAMRecord extends BAMRecord { /** * A caching version of ReadUtils.getAdaptorBoundary() * - * @see ReadUtils.getAdaptorBoundary(SAMRecord) for more information about the meaning of this function + * see #ReadUtils.getAdaptorBoundary(SAMRecord) for more information about the meaning of this function * * WARNING -- this function caches a value depending on the inferred insert size and alignment starts * and stops of this read and its mate. Changing these values in any way will invalidate the cached value. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index d26a1f807..4cd361ba1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -141,7 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); - SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord); + SAMFileHeader header = Utils.setupWriter(toolkit.getSAMFileHeader(id), programRecord); SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 709afeef5..0eed80f3a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -29,12 +29,12 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.BaseUtils; import java.io.File; import java.util.*; @@ -62,7 +62,7 @@ public class ReadUtils { return 1; // compute mean 
representative read counts - final byte[] counts = read.getReducedReadCounts(); + final int[] counts = read.getReducedReadCounts(); return (int)Math.round((double)MathUtils.sum(counts)/counts.length); } @@ -485,7 +485,7 @@ public class ReadUtils { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); } else { - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); } } @@ -506,7 +506,7 @@ public class ReadUtils { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); } else { - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. 
This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); } } @@ -524,7 +524,7 @@ public class ReadUtils { // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need // to add the shift of the current cigar element but go back to it's last element to return the last // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar) + else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases()) readBases += shift - 1; // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java new file mode 100644 index 000000000..d4364afdf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
/**
 * Immutable holder for the core Smith-Waterman alignment parameters:
 * the match value plus the mismatch, gap open, and gap extension penalties.
 *
 * All three penalties (w_mismatch, w_open, w_extend) must be non-positive;
 * this is enforced at construction time with an IllegalArgumentException.
 *
 * User: depristo
 * Date: 4/11/13
 * Time: 12:03 PM
 */
public final class Parameters {
    public final double w_match;
    public final double w_mismatch;
    public final double w_open;
    public final double w_extend;

    /**
     * Create a new set of SW parameters
     *
     * @param w_match    the match score
     * @param w_mismatch the mismatch penalty (must be <= 0)
     * @param w_open     the gap open penalty (must be <= 0)
     * @param w_extend   the gap extension penalty (must be <= 0)
     * @throws IllegalArgumentException if any penalty is positive
     */
    public Parameters(final double w_match, final double w_mismatch, final double w_open, final double w_extend) {
        // validate each penalty up front; the helper reproduces the original error text exactly
        requireNonPositive("w_mismatch", w_mismatch);
        requireNonPositive("w_open", w_open);
        requireNonPositive("w_extend", w_extend);

        this.w_match = w_match;
        this.w_mismatch = w_mismatch;
        this.w_open = w_open;
        this.w_extend = w_extend;
    }

    // Enforce that a penalty value is <= 0, throwing with the same message the
    // original inline checks produced (name + " must be <= 0 but got " + value).
    private static void requireNonPositive(final String name, final double value) {
        if ( value > 0 )
            throw new IllegalArgumentException(name + " must be <= 0 but got " + value);
    }
}
the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.smithwaterman; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; + +import java.util.*; + +/** + * Pairwise discrete smith-waterman alignment + * + * ************************************************************************ + * **** IMPORTANT NOTE: **** + * **** This class assumes that all bytes come from UPPERCASED chars! 
**** + * ************************************************************************ + * + * User: asivache + * Date: Mar 23, 2009 + * Time: 1:54:54 PM + */ +public final class SWPairwiseAlignment { + private int alignment_offset; // offset of s2 w/respect to s1 + private Cigar alignmentCigar; + + private final Parameters parameters; + + private static final int MSTATE = 0; + private static final int ISTATE = 1; + private static final int DSTATE = 2; + private static final int CLIP = 3; + + protected static boolean cutoff = false; + private static boolean DO_SOFTCLIP = true; + + /** + * The SW scoring matrix, stored for debugging purposes if keepScoringMatrix is true + */ + protected double[] SW = null; + + /** + * Only for testing purposes in the SWPairwiseAlignmentMain function + * set to true to keep SW scoring matrix after align call + */ + protected static boolean keepScoringMatrix = false; + + /** + * Create a new SW pairwise aligner. + * + * @deprecated in favor of constructors using the Parameter or ParameterSet class + */ + @Deprecated + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { + this(seq1, seq2, new Parameters(match, mismatch, open, extend)); + } + + /** + * Create a new SW pairwise aligner + * + * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2) + * + * @param seq1 the first sequence we want to align + * @param seq2 the second sequence we want to align + * @param parameters the SW parameters to use + */ + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, Parameters parameters) { + this.parameters = parameters; + align(seq1,seq2); + } + + /** + * Create a new SW pairwise aligner + * + * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2) + * + * @param seq1 the first sequence we want to align + * @param seq2 the second sequence we want to align + * @param namedParameters the named 
parameter set to get our parameters from + */ + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, SWParameterSet namedParameters) { + this(seq1, seq2, namedParameters.parameters); + } + + public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { + this(seq1,seq2,SWParameterSet.ORIGINAL_DEFAULT); + } + + public Cigar getCigar() { return alignmentCigar ; } + + public int getAlignmentStart2wrt1() { return alignment_offset; } + + public void align(final byte[] a, final byte[] b) { + final int n = a.length; + final int m = b.length; + double [] sw = new double[(n+1)*(m+1)]; + if ( keepScoringMatrix ) SW = sw; + int [] btrack = new int[(n+1)*(m+1)]; + + calculateMatrix(a, b, sw, btrack); + calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) + } + + + private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { + final int n = a.length+1; + final int m = b.length+1; + + //final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff + final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff + if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0; + else MATRIX_MIN_CUTOFF = -1e100; + + double [] best_gap_v = new double[m+1]; + Arrays.fill(best_gap_v,-1.0e40); + int [] gap_size_v = new int[m+1]; + double [] best_gap_h = new double[n+1]; + Arrays.fill(best_gap_h,-1.0e40); + int [] gap_size_h = new int[n+1]; + + // build smith-waterman matrix and keep backtrack info: + for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop + byte a_base = a[i-1]; // letter in a at the current pos + + final int row_offset = row_offset_1 + m; + + // On the entrance into the loop, row_offset_1 is the (linear) offset + // of the first element of row (i-1) and row_offset is the linear offset of the + // start of row i + + for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ 
) { + + // data_offset_1 is linearized offset of element [i-1][j-1] + + final byte b_base = b[j-1]; // letter in b at the current pos + + // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); + double step_diag = sw[data_offset_1] + wd(a_base,b_base); + + // optimized "traversal" of all the matrix cells above the current one (i.e. traversing + // all 'step down' events that would end in the current cell. The optimized code + // does exactly the same thing as the commented out loop below. IMPORTANT: + // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! + + // if a gap (length 1) was just opened above, this is the cost of arriving to the current cell: + double prev_gap = sw[data_offset_1+1]+parameters.w_open; + + best_gap_v[j] += parameters.w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend + + if ( prev_gap > best_gap_v[j] ) { + // opening a gap just before the current cell results in better score than extending by one + // the best previously opened gap. This will hold for ALL cells below: since any gap + // once opened always costs w_extend to extend by another base, we will always get a better score + // by arriving to any cell below from the gap we just opened (prev_gap) rather than from the previous best gap + best_gap_v[j] = prev_gap; + gap_size_v[j] = 1; // remember that the best step-down gap from above has length 1 (we just opened it) + } else { + // previous best gap is still the best, even after extension by another base, so we just record that extension: + gap_size_v[j]++; + } + + final double step_down = best_gap_v[j] ; + final int kd = gap_size_v[j]; + + // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing + // all 'step right' events that would end in the current cell. The optimized code + // does exactly the same thing as the commented out loop below. IMPORTANT: + // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
+ + final int data_offset = row_offset + j; // linearized offset of element [i][j] + prev_gap = sw[data_offset-1]+parameters.w_open; // what would it cost us to open length 1 gap just to the left from current cell + best_gap_h[i] += parameters.w_extend; // previous best gap would cost us that much if extended by another base + + if ( prev_gap > best_gap_h[i] ) { + // newly opened gap is better (score-wise) than any previous gap with the same row index i; since + // gap penalty is linear with k, this new gap location is going to remain better than any previous ones + best_gap_h[i] = prev_gap; + gap_size_h[i] = 1; + } else { + gap_size_h[i]++; + } + + final double step_right = best_gap_h[i]; + final int ki = gap_size_h[i]; + + if ( step_down > step_right ) { + if ( step_down > step_diag ) { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down); + btrack[data_offset] = kd ; // positive=vertical + } else { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); + btrack[data_offset] = 0; // 0 = diagonal + } + } else { + // step_down <= step_right + if ( step_right > step_diag ) { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_right); + btrack[data_offset] = -ki; // negative = horizontal + } else { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); + btrack[data_offset] = 0; // 0 = diagonal + } + } + } + + // IMPORTANT, IMPORTANT, IMPORTANT: + // note that we update this (secondary) outer loop variable here, + // so that we DO NOT need to update it + // in the for() statement itself. + row_offset_1 = row_offset; + } + } + + + private void calculateCigar(int n, int m, double [] sw, int [] btrack) { + // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order + int p1 = 0, p2 = 0; + + double maxscore = Double.NEGATIVE_INFINITY; // sw scores are allowed to be negative + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + + // look for largest score. 
we use >= combined with the traversal direction + // to ensure that if two scores are equal, the one closer to diagonal gets picked + for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) { + // data_offset is the offset of [i][m] + if ( sw[data_offset] >= maxscore ) { + p1 = i; p2 = m ; maxscore = sw[data_offset]; + } + } + + for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) { + // data_offset is the offset of [n][j] + if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { + p1 = n; + p2 = j ; + maxscore = sw[data_offset]; + segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment + } + } + + List lce = new ArrayList(5); + + if ( segment_length > 0 && DO_SOFTCLIP ) { + lce.add(makeElement(CLIP, segment_length)); + segment_length = 0; + } + + // we will be placing all insertions and deletions into sequence b, so the states are named w/regard + // to that sequence + + int state = MSTATE; + + int data_offset = p1*(m+1)+p2; // offset of element [p1][p2] + do { + int btr = btrack[data_offset]; + + int new_state; + int step_length = 1; + + if ( btr > 0 ) { + new_state = DSTATE; + step_length = btr; + } else if ( btr < 0 ) { + new_state = ISTATE; + step_length = (-btr); + } else new_state = MSTATE; // and step_length =1, already set above + + // move to next best location in the sw matrix: + switch( new_state ) { + case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in the sw matrix + case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left + case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up + } + + // now let's see if the state actually changed: + if ( new_state == state ) segment_length+=step_length; + else { + // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). 
+ lce.add(makeElement(state, segment_length)); + segment_length = step_length; + state = new_state; + } +// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2: + } while ( p1 > 0 && p2 > 0 ); + + // post-process the last segment we are still keeping; + // NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting + // those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false; + // otherwise they will be softclipped. For instance, + // if read length is 5 and alignment starts at offset -2 (i.e. read starts before the ref, and only + // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if + // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true. + // The consumers need to check for the alignment offset and deal with it properly. + if (DO_SOFTCLIP ) { + lce.add(makeElement(state, segment_length)); + if ( p2> 0 ) lce.add(makeElement(CLIP, p2)); + alignment_offset = p1 ; + } else { + lce.add(makeElement(state, segment_length + p2)); + alignment_offset = p1 - p2; + } + + Collections.reverse(lce); + alignmentCigar = AlignmentUtils.consolidateCigar(new Cigar(lce)); + } + + private CigarElement makeElement(int state, int segment_length) { + CigarOperator o = null; + switch(state) { + case MSTATE: o = CigarOperator.M; break; + case ISTATE: o = CigarOperator.I; break; + case DSTATE: o = CigarOperator.D; break; + case CLIP: o = CigarOperator.S; break; + } + return new CigarElement(segment_length,o); + } + + private double wd(byte x, byte y) { + return (x == y ? 
parameters.w_match : parameters.w_mismatch); + } + + public void printAlignment(byte[] ref, byte[] read) { + printAlignment(ref,read,100); + } + + public void printAlignment(byte[] ref, byte[] read, int width) { + StringBuilder bread = new StringBuilder(); + StringBuilder bref = new StringBuilder(); + StringBuilder match = new StringBuilder(); + + int i = 0; + int j = 0; + + final int offset = getAlignmentStart2wrt1(); + + Cigar cigar = getCigar(); + + if ( ! DO_SOFTCLIP ) { + + // we need to go through all the hassle below only if we do not do softclipping; + // otherwise offset is never negative + if ( offset < 0 ) { + for ( ; j < (-offset) ; j++ ) { + bread.append((char)read[j]); + bref.append(' '); + match.append(' '); + } + // at negative offsets, our cigar's first element carries overhanging bases + // that we have just printed above. Tweak the first element to + // exclude those bases. Here we create a new list of cigar elements, so the original + // list/original cigar are unchanged (they are unmodifiable anyway!) + + List tweaked = new ArrayList(); + tweaked.addAll(cigar.getCigarElements()); + tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset, + cigar.getCigarElement(0).getOperator())); + cigar = new Cigar(tweaked); + } + } + + if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. 
offset = 0 + for ( ; i < getAlignmentStart2wrt1() ; i++ ) { + bref.append((char)ref[i]); + bread.append(' '); + match.append(' '); + } + } + + for ( CigarElement e : cigar.getCigarElements() ) { + switch (e.getOperator()) { + case M : + for ( int z = 0 ; z < e.getLength() ; z++, i++, j++ ) { + bref.append((i= s.length() ) { + System.out.println(); + return; + } + int end = Math.min(start+width,s.length()); + System.out.println(s.substring(start,end)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java new file mode 100644 index 000000000..8c832fa75 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.smithwaterman; + +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Simple program to run SW performance test. + * + * // TODO -- should be replaced with Caliper before using again + * + * User: depristo + * Date: 2/28/13 + * Time: 4:54 PM + * To change this template use File | Settings | File Templates. + */ +public class SWPairwiseAlignmentMain { + // BELOW: main() method for testing; old implementations of the core methods are commented out below; +// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed. + + public static void main(String argv[]) { +// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT"; +// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG"; + + String ref = null; + String read = null; + + Map> args = processArgs(argv); + + List l = args.get("SEQ"); + args.remove("SEQ"); + if ( l == null ) { + System.err.println("SEQ argument is missing. Two input sequences must be provided"); + System.exit(1); + } + if ( l.size() != 2 ) { + System.err.println("Two input sequences (SEQ arguments) must be provided. 
Found "+l.size()+" instead"); + System.exit(1); + } + + ref = l.get(0); + read = l.get(1); + + Double m = extractSingleDoubleArg("MATCH",args); + Double mm = extractSingleDoubleArg("MISMATCH",args); + Double open = extractSingleDoubleArg("OPEN",args); + Double ext = extractSingleDoubleArg("EXTEND",args); + + Boolean reverse = extractSingleBooleanArg("REVERSE",args); + if ( reverse != null && reverse.booleanValue() == true ) { + ref = Utils.reverse(ref); + read = Utils.reverse(read); + } + + Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args); + Boolean cut = extractSingleBooleanArg("CUTOFF",args); + if ( cut != null ) SWPairwiseAlignment.cutoff = cut; + + if ( args.size() != 0 ) { + System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next()); + System.exit(1); + } + + double w_match; + double w_mismatch; + double w_open; + double w_extend; + + w_match = (m == null ? 30.0 : m.doubleValue()); + w_mismatch = (mm == null ? -10.0 : mm.doubleValue()); + w_open = (open == null ? -10.0 : open.doubleValue()); + w_extend = (ext == null ? 
-2.0 : ext.doubleValue()); + + + SWPairwiseAlignment.keepScoringMatrix = true; + SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); + + System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ + " length1="+ref.length()+" length2="+read.length()); + + + System.out.println(); + a.printAlignment(ref.getBytes(),read.getBytes()); + + System.out.println(); + if ( print_mat != null && print_mat == true ) { + print(a.SW,ref.getBytes(),read.getBytes()); + } + } + + private static void print(double[] s, byte[] a, byte[] b) { + int n = a.length+1; + int m = b.length+1; + System.out.print(" "); + for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ; + System.out.println(); + + for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) { + if ( i > 0 ) System.out.print((char)a[i-1]); + else System.out.print(' '); + System.out.print(" "); + for ( int j = 0; j < m ; j++ ) { + System.out.printf(" %5.1f",s[row_offset+j]); + } + System.out.println(); + } + } + + + static Pair getArg(String prefix, String argv[], int i) { + String arg = null; + if ( argv[i].startsWith(prefix) ) { + arg = argv[i].substring(prefix.length()); + if( arg.length() == 0 ) { + i++; + if ( i < argv.length ) arg = argv[i]; + else { + System.err.println("No value found after " + prefix + " argument tag"); + System.exit(1); + } + } + i++; + } + return new Pair(arg,i); + } + + static Map> processArgs(String argv[]) { + Map> args = new HashMap>(); + + for ( int i = 0; i < argv.length ; i++ ) { + String arg = argv[i]; + int pos = arg.indexOf('='); + if ( pos < 0 ) { + System.err.println("Argument "+arg+" is not of the form ="); + System.exit(1); + } + String val = arg.substring(pos+1); + if ( val.length() == 0 ) { + // there was a space between '=' and the value + i++; + if ( i < argv.length ) val = argv[i]; + else { + System.err.println("No value found after " + arg + " argument tag"); + System.exit(1); + 
} + } + arg = arg.substring(0,pos); + + List l = args.get(arg); + if ( l == null ) { + l = new ArrayList(); + args.put(arg,l); + } + l.add(val); + } + return args; + } + + static Double extractSingleDoubleArg(String argname, Map> args) { + List l = args.get(argname); + args.remove(argname); + if ( l == null ) return null; + + if ( l.size() > 1 ) { + System.err.println("Only one "+argname+" argument is allowed"); + System.exit(1); + } + double d=0; + try { + d = Double.parseDouble(l.get(0)); + } catch ( NumberFormatException e) { + System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+")"); + System.exit(1); + } + System.out.println("Argument "+argname+" set to "+d); + return new Double(d); + } + + + static Boolean extractSingleBooleanArg(String argname, Map> args) { + List l = args.get(argname); + args.remove(argname); + if ( l == null ) return null; + + if ( l.size() > 1 ) { + System.err.println("Only one "+argname+" argument is allowed"); + System.exit(1); + } + if ( l.get(0).equals("true") ) return Boolean.valueOf(true); + if ( l.get(0).equals("false") ) return Boolean.valueOf(false); + System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed"); + System.exit(1); + return Boolean.valueOf(false); // This value isn't used because it is preceded by System.exit(1) + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java similarity index 60% rename from public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java rename to public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java index a17dc612b..100780023 100644 --- a/public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java @@ -23,25 +23,29 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.file; +package org.broadinstitute.sting.utils.smithwaterman; /** - * A special checked exception that happens only in the case where - * the filesystem, by design or configuration, is completely unable - * to handle locking. This exception will specifically NOT be thrown - * in the case where the filesystem handles locking but is unable to - * acquire a lock due to concurrency. + * Handy named collection of common Smith-Waterman parameter sets * - * @author hanna - * @version 0.1 + * User: depristo + * Date: 4/11/13 + * Time: 12:02 PM */ -public class FileSystemInabilityToLockException extends Exception { +public enum SWParameterSet { + // match=1, mismatch = -1/3, gap=-(1+k/3) + ORIGINAL_DEFAULT(new Parameters(1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0)), + /** - * Force user to create this exception with a nested inner stack trace. - * @param message Exception message. - * @param innerException Caused-by exception.
+ * A standard set of values for NGS alignments */ - public FileSystemInabilityToLockException(String message,Exception innerException) { - super(message,innerException); + STANDARD_NGS(new Parameters(5.0, -10.0, -22.0, -1.2)); + + protected Parameters parameters; + + SWParameterSet(final Parameters parameters) { + if ( parameters == null ) throw new IllegalArgumentException("parameters cannot be null"); + + this.parameters = parameters; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 3a5ddb7a0..4565402b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -51,7 +51,6 @@ public class GATKVariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_INTERSECTION = "Intersection"; - public enum GenotypeMergeType { /** * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. @@ -97,6 +96,46 @@ public class GATKVariantContextUtils { MIX_TYPES } + /** + * Refactored out of the AverageAltAlleleLength annotation class + * @param vc the variant context + * @return the average length of the alt allele (a double) + */ + public static double getMeanAltAlleleLength(VariantContext vc) { + double averageLength = 1.0; + if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + averageLength = ( (double) averageLengthNum )/averageLengthDenom; + } + + return averageLength; + } + /** * create a genome location, given a variant context * @param genomeLocParser parser @@ -114,14 +153,14 @@ public class GATKVariantContextUtils { } /** - * If this is a BiAlleic SNP, is it a transition? + * If this is a BiAllelic SNP, is it a transition? */ public static boolean isTransition(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; } /** - * If this is a BiAlleic SNP, is it a transversion? + * If this is a BiAllelic SNP, is it a transversion? 
*/ public static boolean isTransversion(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; @@ -397,14 +436,19 @@ public class GATKVariantContextUtils { // the genotypes with PLs final GenotypesContext oldGTs = vc.getGenotypes(); + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + // optimization: if no input genotypes, just exit + if (oldGTs.isEmpty()) + return newGTs; + // samples final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); final int numNewAltAlleles = allelesToUse.size() - 1; // which PLs should be carried forward? 
@@ -444,6 +488,9 @@ public class GATKVariantContextUtils { double[] newLikelihoods; if ( likelihoodIndexesToUse == null ) { newLikelihoods = originalLikelihoods; + } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { + logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + newLikelihoods = null; } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; int newIndex = 0; @@ -455,13 +502,13 @@ public class GATKVariantContextUtils { } // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + if ( newLikelihoods != null && MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); } else { final GenotypeBuilder gb = new GenotypeBuilder(g); - if ( numNewAltAlleles == 0 ) + if ( newLikelihoods == null || numNewAltAlleles == 0 ) gb.noPL(); else gb.PL(newLikelihoods); @@ -957,14 +1004,15 @@ public class GATKVariantContextUtils { public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); - if ( inputVC.getNAlleles() <= 1 ) + if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) return inputVC; // see whether we need to trim common reference base from all alleles final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; - return trimAlleles(revTrimVC, fwdTrim, 0); + final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); + return vc; } /** @@ -985,7 +1033,6 @@ public class GATKVariantContextUtils { return inputVC; final List alleles = new LinkedList(); - final GenotypesContext genotypes = GenotypesContext.create(); final Map originalToTrimmedAlleleMap = new HashMap(); for (final Allele a : inputVC.getAlleles()) { @@ -1002,17 +1049,8 @@ public class GATKVariantContextUtils { } // now we can recreate new genotypes with trimmed alleles - for ( final Genotype genotype : inputVC.getGenotypes() ) { - final List originalAlleles = genotype.getAlleles(); - final List trimmedAlleles = new ArrayList(); - for ( final Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); - } + final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); + final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); final int start = inputVC.getStart() + (fwdTrimEnd + 1); final VariantContextBuilder builder = new VariantContextBuilder(inputVC); @@ -1023,6 +1061,18 @@ public class GATKVariantContextUtils { return builder.make(); } + @Requires("originalGenotypes != null && alleleMapper != null") + protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { + final GenotypesContext updatedGenotypes = GenotypesContext.create(); + + for ( final Genotype genotype : originalGenotypes ) { + final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); + updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); + } + + return updatedGenotypes; + } + public static int computeReverseClipping(final List 
unclippedAlleles, final byte[] ref) { int clipping = 0; boolean stillClipping = true; @@ -1259,7 +1309,7 @@ public class GATKVariantContextUtils { } - private static class AlleleMapper { + protected static class AlleleMapper { private VariantContext vc = null; private Map map = null; public AlleleMapper(VariantContext vc) { this.vc = vc; } @@ -1319,4 +1369,76 @@ public class GATKVariantContextUtils { } return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); } + + /** + * Splits the alleles for the provided variant context into its primitive parts. + * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. + * Currently works only for MNPs. + * + * @param vc the non-null VC to split + * @return a non-empty list of VCs split into primitive parts or the original VC otherwise + */ + public static List splitIntoPrimitiveAlleles(final VariantContext vc) { + if ( vc == null ) + throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); + + if ( !vc.isBiallelic() ) + throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); + + // currently only works for MNPs + if ( !vc.isMNP() ) + return Arrays.asList(vc); + + final byte[] ref = vc.getReference().getBases(); + final byte[] alt = vc.getAlternateAllele(0).getBases(); + + if ( ref.length != alt.length ) + throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); + + final List result = new ArrayList(ref.length); + + for ( int i = 0; i < ref.length; i++ ) { + + // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) + if ( ref[i] != alt[i] ) { + + // create the ref and alt SNP alleles + final Allele newRefAllele = Allele.create(ref[i], true); + final Allele newAltAllele = Allele.create(alt[i], false); + + // create a new VariantContext with the new SNP 
alleles + final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); + + // create new genotypes with updated alleles + final Map alleleMap = new HashMap(); + alleleMap.put(vc.getReference(), newRefAllele); + alleleMap.put(vc.getAlternateAllele(0), newAltAllele); + final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); + + result.add(newVC.genotypes(newGenotypes).make()); + } + } + + if ( result.isEmpty() ) + result.add(vc); + + return result; + } + + /** + * Are vc1 and 2 equal including their position and alleles? + * @param vc1 non-null VariantContext + * @param vc2 non-null VariantContext + * @return true if vc1 and vc2 are equal, false otherwise + */ + public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { + if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); + if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); + + if ( vc1.getStart() != vc2.getStart() ) return false; + if ( vc1.getEnd() != vc2.getEnd() ) return false; + if ( ! vc1.getChr().equals(vc2.getChr())) return false; + if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; + return true; + } } diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java index 362d409cb..772c86563 100644 --- a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java +++ b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java @@ -35,7 +35,7 @@ import java.lang.reflect.Method; /** * Provide default @Test values for GATK testng tests. * - * Currently only sets the maximum runtime to 10 minutes, if it's not been specified. + * Currently only sets the maximum runtime to 40 minutes, if it's not been specified. 
* * See http://beust.com/weblog/2006/10/18/annotation-transformers-in-java/ * @@ -44,7 +44,7 @@ import java.lang.reflect.Method; * @version 0.1 */ public class TestNGTestTransformer implements IAnnotationTransformer { - public static final long DEFAULT_TIMEOUT = 1000 * 60 * 20; // 20 minutes max per test + public static final long DEFAULT_TIMEOUT = 1000 * 60 * 40; // 40 minutes max per test final static Logger logger = Logger.getLogger(TestNGTestTransformer.class); diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 155d44ecd..dd5a2b0a7 100644 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -53,6 +53,7 @@ public class WalkerTest extends BaseTest { private static final boolean GENERATE_SHADOW_BCF = true; private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false; + private static final boolean ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS = false; private static MD5DB md5DB = new MD5DB(); @@ -209,6 +210,8 @@ public class WalkerTest extends BaseTest { String.format(" -et %s -K %s ", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile)); if ( includeShadowBCF && GENERATE_SHADOW_BCF ) args = args + " --generateShadowBCF "; + if ( ! 
ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS ) + args = args + " --disable_auto_index_creation_and_locking_when_reading_rods "; } return args; diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java new file mode 100644 index 000000000..85ad5d575 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java @@ -0,0 +1,183 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.io.stubs.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collection; + + +public class ArgumentTypeDescriptorUnitTest extends BaseTest { + + //////////////////////////////////////////////////////////////////// + // This section tests the functionality of the @Output annotation // + //////////////////////////////////////////////////////////////////// + + private class ATDTestCommandLineProgram extends CommandLineProgram { + public int execute() { return 0; } + + @Override + public Collection getArgumentTypeDescriptors() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + return Arrays.asList( new SAMFileWriterArgumentTypeDescriptor(engine, System.out), + new OutputStreamArgumentTypeDescriptor(engine, System.out), + new VCFWriterArgumentTypeDescriptor(engine, System.out, null)); + } + + protected abstract class ATDTestOutputArgumentSource { + public abstract Object getOut(); + } + + protected class OutputRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredStreamArgumentSource 
extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public PrintStream out; + public Object getOut() { return out; } + } + } + + @DataProvider(name = "OutputProvider") + public Object[][] OutputProvider() { + + ObjectArrayList tests = new ObjectArrayList(); + + final ATDTestCommandLineProgram clp = new ATDTestCommandLineProgram(); + + for ( final Object obj : Arrays.asList(clp.new OutputRequiredSamArgumentSource(), clp.new OutputRequiredVcfArgumentSource(), 
clp.new OutputRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, true, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredSamArgumentSource(), clp.new OutputNotRequiredVcfArgumentSource(), clp.new OutputNotRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredNoDefaultSamArgumentSource(), clp.new OutputNotRequiredNoDefaultVcfArgumentSource(), clp.new OutputNotRequiredNoDefaultStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, false, provided}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OutputProvider") + public void testOutput(final ATDTestCommandLineProgram.ATDTestOutputArgumentSource argumentSource, final boolean required, final boolean hasDefault, final boolean provided) { + + final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); + parser.addArgumentSource(argumentSource.getClass()); + parser.parse(provided ? 
new String[] {"out", "foo"} : new String[] {}); + + try { + parser.loadArgumentsIntoObject(argumentSource); + + if ( !provided && (required || !hasDefault) ) + Assert.assertEquals(argumentSource.getOut(), null); + else if ( !provided ) + Assert.assertNotEquals(argumentSource.getOut(), null); + else if ( argumentSource.getOut() == null || !(argumentSource.getOut() instanceof SAMFileWriterStub) ) // can't test this one case + Assert.assertEquals(!provided, outputIsStdout(argumentSource.getOut())); + + } catch (Exception e) { + throw new ReviewedStingException(e.getMessage()); + } + } + + private static boolean outputIsStdout(final Object out) { + if ( out == null ) { + return false; + } else if ( out instanceof SAMFileWriterStub ) { + return ((SAMFileWriterStub)out).getOutputStream() != System.out; + } else if ( out instanceof VariantContextWriterStub ) { + return ((VariantContextWriterStub)out).getOutputStream() == System.out; + } else if ( out instanceof OutputStreamStub ) { + return ((OutputStreamStub)out).getOutputStream() == System.out; + } + return false; + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 0b6e08fa7..3f74e0eae 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -29,15 +29,23 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; import org.broadinstitute.sting.utils.GenomeLocParser; import 
org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.List; /** * Tests selected functionality in the GenomeAnalysisEngine class @@ -81,4 +89,89 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } + + + /////////////////////////////////////////////////// + // Test the ReadTransformer ordering enforcement // + /////////////////////////////////////////////////// + + public static class TestReadTransformer extends ReadTransformer { + + private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; + private boolean enabled; + + protected TestReadTransformer(final OrderingConstraint orderingConstraint) { + this.orderingConstraint = orderingConstraint; + enabled = true; + } + + // need this because PackageUtils will pick up this class as a possible ReadTransformer + protected TestReadTransformer() { + enabled = false; + } + + @Override + public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } + + @Override + public boolean enabled() { return enabled; } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } + + } + + @DataProvider(name = "ReadTransformerData") + public Object[][] makeReadTransformerData() { + List tests = new ArrayList(); + + for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { + for ( final 
ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { + tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadTransformerData") + public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { + + final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + final List readTransformers = new ArrayList(3); + readTransformers.add(new TestReadTransformer(oc1)); + readTransformers.add(new TestReadTransformer(oc2)); + readTransformers.add(new TestReadTransformer(oc3)); + + final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || + numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; + + try { + testEngine.setReadTransformers(readTransformers); + + Assert.assertFalse(shouldThrowException); + Assert.assertEquals(testEngine.getReadTransformers().size(), 3); + + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + } catch (UserException.IncompatibleReadFiltersException e) { + Assert.assertTrue(shouldThrowException); + } + } + + private int numWithConstraint(final 
ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... constraints ) { + int count = 0; + for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { + if ( constraint == target ) + count++; + } + return count; + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java index 55f9e1f7d..e6176dbe8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java @@ -39,7 +39,7 @@ import java.util.concurrent.TimeUnit; * */ public class MaxRuntimeIntegrationTest extends WalkerTest { - private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(20, TimeUnit.SECONDS); + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); private class MaxRuntimeTestProvider extends TestDataProvider { final long maxRuntime; @@ -68,7 +68,7 @@ public class MaxRuntimeIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 60 * 1000) + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 120 * 1000) public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T PrintReads -R " + hg18Reference diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index 3cd059333..fad632cfd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -84,7 +84,8 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { // sequence seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true); } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 23720e60d..8d33aa8b6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -111,7 +111,7 @@ public class SAMDataSourceUnitTest extends BaseTest { new ArrayList(), false); - Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); int count = 0; try { diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java new file mode 100644 index 000000000..00d0dd051 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java @@ -0,0 +1,75 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including 
without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.datasources.reference; + +import junit.framework.Assert; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +public class ReferenceDataSourceIntegrationTest extends WalkerTest { + + @Test + public void testReferenceWithMissingFaiFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File dictFile = new File(dummyReference.getAbsolutePath().replace(".fasta", ".dict")); + dictFile.deleteOnExit(); + Assert.assertTrue(dictFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T PrintReads" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceFaiFile.class + ); + + executeTest("testReferenceWithMissingFaiFile", spec); + } + + @Test + public void testReferenceWithMissingDictFile() throws IOException { + final File dummyReference = createTempFile("dummy", 
".fasta"); + final File faiFile = new File(dummyReference.getAbsolutePath() + ".fai"); + faiFile.deleteOnExit(); + Assert.assertTrue(faiFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T PrintReads" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceDictFile.class + ); + + executeTest("testReferenceWithMissingDictFile", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index 2144cd09b..4a6d14d32 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -95,12 +95,9 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { public void setUp() { String fileName = privateTestDir + "TabularDataTest.dat"; - // check to see if we have an index, if so, delete it - File indexFileName = new File(privateTestDir + "TabularDataTest.dat.idx"); - if (indexFileName.exists()) indexFileName.delete(); - triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100644 index 000000000..6e908a3bf --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,235 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het 
sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); + Assert.assertTrue(countsAreEqual(results, targetCounts)); + } + + private 
static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } + + @DataProvider(name = "BiasedDownsamplingTest") + public Object[][] makeBiasedDownsamplingTest() { + final List tests = new LinkedList(); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) { + for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) { + for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) { + if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount ) + continue; + for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { + if ( toRemove <= originalNormalCount + originalReducedCount ) + tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BiasedDownsamplingTest") + public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) { + + final LinkedList elements = new LinkedList(); + for ( int i = 0; i < originalNormalCount; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + if ( originalReducedCount > 0 ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + read.setReducedReadCountsTag(new int[]{originalReducedCount}); + elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + + final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + 
originalReducedCount, toRemove); + int pileupCount = 0; + for ( final PileupElement pe : elements ) // reduced reads may have gotten modified + pileupCount += pe.getRepresentativeCount(); + for ( final PileupElement pe : result ) + pileupCount -= pe.getRepresentativeCount(); + + Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove); + } + + @Test + public void testLoadContaminationFileDetails(){ + Logger logger=org.apache.log4j.Logger.getRootLogger(); + + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); + + Map Contam1=new HashMap(); + Set Samples1=new HashSet(); + + Contam1.put("NA11918",0.15); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Contam1.put("NA12842",0.13); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Samples1.add("DUMMY"); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + } + + private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ + Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); + Assert.assertTrue(loadedMap.equals(map)); + } + + @DataProvider(name = "goodContaminationFiles") + public Integer[][] goodContaminationFiles() { + return new Integer[][]{ + {1, 2}, + {2, 3}, + {3, 2}, + {4, 2}, + {5, 3}, + {6, 2}, + {7, 2}, + {8, 2} + }; + } + + @Test(dataProvider = "goodContaminationFiles") + public void testLoadContaminationFile(final Integer ArtificalBAMnumber, final Integer numberOfSamples) { + final String ArtificialBAM = String.format("ArtificallyContaminatedBams/contamination.case.%d.txt", ArtificalBAMnumber); + Logger logger = org.apache.log4j.Logger.getRootLogger(); + + File ContamFile = new File(privateTestDir, ArtificialBAM); + 
Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile, 0.0, null, logger).size() == numberOfSamples); + + } + + + @DataProvider(name = "badContaminationFiles") + public Integer[][] badContaminationFiles() { + return new Integer[][]{{1}, {2}, {3}, {4}, {5}}; + } + + @Test(dataProvider = "badContaminationFiles", expectedExceptions = UserException.MalformedFile.class) + public void testLoadBrokenContaminationFile(final int i) { + Logger logger = org.apache.log4j.Logger.getRootLogger(); + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + + File ContaminationFile = new File(ArtificalBAMLocation + String.format("contamination.case.broken.%d.txt", i)); + AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile, 0.0, null, logger); + + } + + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java index 3a12c7ce7..972e51dcd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -139,6 +139,7 @@ public class LevelingDownsamplerUnitTest extends BaseTest { Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); } + final int sizeFromDownsampler = downsampler.size(); List> downsampledStacks = downsampler.consumeFinalizedItems(); Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); @@ -151,6 +152,7 @@ public class LevelingDownsamplerUnitTest extends BaseTest { totalRemainingItems += stack.size(); } + Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); int 
numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java index 74a17189e..022eb02d2 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -115,6 +115,7 @@ public class ReservoirDownsamplerUnitTest extends BaseTest { Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); } + Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); List downsampledReads = downsampler.consumeFinalizedItems(); Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java new file mode 100644 index 000000000..12d875a4d --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + + +public class BadReadGroupsIntegrationTest extends WalkerTest { + + @Test + public void testMissingReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T PrintReads -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", + 0, + UserException.ReadMissingReadGroup.class); + executeTest("test Missing Read Group", spec); + } + + @Test + public void testUndefinedReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T PrintReads -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", + 0, + UserException.ReadHasUndefinedReadGroup.class); + executeTest("test Undefined Read Group", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java new file mode 100644 index 000000000..981d54d54 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java @@ -0,0 +1,62 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* 
files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * Tests for the MalformedReadFilter + * + * @author Eric Banks + * @since 3/14/13 + */ +public class MalformedReadFilterUnitTest { + + ////////////////////////////////////// + // Test the checkSeqStored() method // + ////////////////////////////////////// + + @Test(enabled = true) + public void testcheckSeqStored () { + + final GATKSAMRecord goodRead = ArtificialSAMUtils.createArtificialRead(new byte[]{(byte)'A'}, new byte[]{(byte)'A'}, "1M"); + final GATKSAMRecord badRead = ArtificialSAMUtils.createArtificialRead(new byte[]{}, new byte[]{}, "1M"); + badRead.setReadString("*"); + + Assert.assertTrue(MalformedReadFilter.checkSeqStored(goodRead, true)); + Assert.assertFalse(MalformedReadFilter.checkSeqStored(badRead, true)); 
+ + try { + MalformedReadFilter.checkSeqStored(badRead, false); + Assert.assertTrue(false, "We should have exceptioned out in the previous line"); + } catch (UserException e) { } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index e30ab6e5d..4904428d0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -27,17 +27,15 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; -import org.broadinstitute.variant.vcf.VCF3Codec; +import org.broad.tribble.util.LittleEndianOutputStream; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.testng.annotations.BeforeMethod; @@ -61,7 +59,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @BeforeMethod public void setup() { - File referenceFile = new File(b36KGReference); + File referenceFile = new File(b37KGReference); try { seq = new CachingIndexedFastaSequenceFile(referenceFile); } @@ -69,7 +67,11 @@ public class RMDTrackBuilderUnitTest extends BaseTest { throw new UserException.CouldNotReadInputFile(referenceFile,ex); } genomeLocParser = new GenomeLocParser(seq); - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); + + // We have to disable auto-index creation/locking in the 
RMDTrackBuilder for tests, + // as the lock acquisition calls were intermittently hanging on our farm. This unfortunately + // means that we can't include tests for the auto-index creation feature. + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true); } @Test @@ -78,134 +80,83 @@ public class RMDTrackBuilderUnitTest extends BaseTest { } @Test - // in this test, the index exists, but is out of date. - public void testBuilderIndexUnwriteable() { - File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/relic.vcf"); - try { - builder.loadIndex(vcfFile, new VCF3Codec()); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - // make sure we didn't write the file (check that it's timestamp is within bounds) - //System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()); - Assert.assertTrue(Math.abs(1279591752000l - Tribble.indexFile(vcfFile).lastModified()) < 100); + public void testDisableAutoIndexGeneration() throws IOException { + final File unindexedVCF = new File(privateTestDir + "unindexed.vcf"); + final File unindexedVCFIndex = Tribble.indexFile(unindexedVCF); + Index index = builder.loadIndex(unindexedVCF, new VCFCodec()); + + Assert.assertFalse(unindexedVCFIndex.exists()); + Assert.assertNotNull(index); } - // we have a good index file, in a read-only dir. This would cause the previous version to remake the index; make - // sure we don't do this @Test - public void testDirIsLockedIndexFromDisk() { - File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf"); - File vcfFileIndex = Tribble.indexFile(vcfFile); - Index ind = null; - try { - ind = builder.attemptIndexFromDisk(vcfFile,new VCFCodec(),vcfFileIndex,new FSLockWithShared(vcfFile)); - } catch (IOException e) { - Assert.fail("We weren't expecting an exception -> " + e.getMessage()); - } - // make sure we get back a null index; i.e. 
we can't load the index from disk - Assert.assertTrue(ind == null); + public void testLoadOnDiskIndex() { + final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); + final File tempVCFWithCorrectIndex = createTempVCFFileAndIndex(originalVCF, false); + final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithCorrectIndex); + + final Index index = builder.loadFromDisk(tempVCFWithCorrectIndex, tempVCFIndexFile); + + Assert.assertNotNull(index); + Assert.assertTrue(tempVCFIndexFile.exists()); + + final Index inMemoryIndex = builder.createIndexInMemory(tempVCFWithCorrectIndex, new VCFCodec()); + Assert.assertTrue(index.equalsIgnoreProperties(inMemoryIndex)); } - - @Test - public void testBuilderIndexDirectoryUnwritable() { - File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf"); - File vcfFileIndex = Tribble.indexFile(vcfFile); + public void testLoadOnDiskOutdatedIndex() { + final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); + final File tempVCFWithOutdatedIndex = createTempVCFFileAndIndex(originalVCF, true); + final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithOutdatedIndex); - Index ind = null; - try { - ind = builder.loadIndex(vcfFile, new VCF3Codec()); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - // make sure we didn't write the file (check that it's timestamp is within bounds) - Assert.assertTrue(!vcfFileIndex.exists()); - Assert.assertTrue(ind != null); + final Index index = builder.loadFromDisk(tempVCFWithOutdatedIndex, tempVCFIndexFile); - } - - - @Test - public void testGenerateIndexForUnindexedFile() { - File vcfFile = new File(privateTestDir + "always_reindex.vcf"); - File vcfFileIndex = Tribble.indexFile(vcfFile); - - // if we can't write to the directory, don't fault the tester, just pass - if (!vcfFileIndex.getParentFile().canWrite()) { - logger.warn("Unable to run test 
testGenerateIndexForUnindexedFile: unable to write to dir " + vcfFileIndex.getParentFile()); - return; - } - // clean-up our test, and previous tests that may have written the file - vcfFileIndex.deleteOnExit(); - if (vcfFileIndex.exists()) - vcfFileIndex.delete(); - - try { - builder.loadIndex(vcfFile, new VCFCodec()); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - // make sure we wrote the file - Assert.assertTrue(vcfFileIndex.exists()); - } - - - // test to make sure we get a full sequence dictionary from the VCF (when we set the dictionary in the builder) - @Test - public void testBuilderIndexSequenceDictionary() { - File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); - Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified(); - try { - Index idx = builder.loadIndex(vcfFile, new VCFCodec()); - // catch any exception; this call should pass correctly - SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - - // make sure that we removed and updated the index - Assert.assertTrue(Tribble.indexFile(vcfFile).lastModified() >= indexTimeStamp,"Fail: index file was modified"); + // loadFromDisk() should return null to indicate that the index is outdated and should not be used, + // but should not delete the index since our builder has disableAutoIndexCreation set to true + Assert.assertNull(index); + Assert.assertTrue(tempVCFIndexFile.exists()); } /** - * create a temporary file and an associated out of date index file + * Create a temporary vcf file and an associated index file, which may be set to be out-of-date + * relative to the vcf * - * @param tribbleFile the tribble file - * @return a file pointing to the new tmp location, with out of date index + * @param vcfFile the vcf 
file + * @param createOutOfDateIndex if true, ensure that the temporary vcf file is modified after the index + * @return a file pointing to the new tmp location, with accompanying index */ - private File createCorrectDateIndexFile(File tribbleFile) { + private File createTempVCFFileAndIndex( final File vcfFile, final boolean createOutOfDateIndex ) { try { - // first copy the tribble file to a temperary file - File tmpFile = File.createTempFile("TribbleUnitTestFile", ""); + final File tmpFile = File.createTempFile("RMDTrackBuilderUnitTest", ""); + final File tmpIndex = Tribble.indexFile(tmpFile); tmpFile.deleteOnExit(); - logger.info("creating temp file " + tmpFile); - - // copy the vcf (tribble) file to the tmp file location - copyFile(tribbleFile, tmpFile); - - // sleep again, to make sure the timestamps are different (vcf vrs updated index file) - Thread.sleep(2000); - - // create a fake index, before we copy so it's out of date - File tmpIndex = Tribble.indexFile(tmpFile); tmpIndex.deleteOnExit(); - // copy the vcf (tribble) file to the tmp file location - copyFile(Tribble.indexFile(tribbleFile), tmpIndex); + copyFile(vcfFile, tmpFile); + final Index inMemoryIndex = builder.createIndexInMemory(tmpFile, new VCFCodec()); + final LittleEndianOutputStream indexOutputStream = new LittleEndianOutputStream(new FileOutputStream(tmpIndex)); + + // If requested, modify the tribble file after the index. Otherwise, modify the index last. 
+ if ( createOutOfDateIndex ) { + inMemoryIndex.write(indexOutputStream); + indexOutputStream.close(); + Thread.sleep(2000); + copyFile(vcfFile, tmpFile); + } + else { + copyFile(vcfFile, tmpFile); + Thread.sleep(2000); + inMemoryIndex.write(indexOutputStream); + indexOutputStream.close(); + } return tmpFile; - } catch (IOException e) { Assert.fail("Unable to create temperary file"); } catch (InterruptedException e) { - Assert.fail("Somehow our thread got interupted"); + Assert.fail("Somehow our thread got interrupted"); } return null; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java index a993d1783..48e3bbd8c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java @@ -45,7 +45,8 @@ public class TestRMDTrackBuilder extends RMDTrackBuilder { private GenomeLocParser genomeLocParser; public TestRMDTrackBuilder(SAMSequenceDictionary dict, GenomeLocParser genomeLocParser) { - super(dict, genomeLocParser, null); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + super(dict, genomeLocParser, null, true); this.genomeLocParser = genomeLocParser; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java new file mode 100644 index 000000000..f3e1ce44b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including 
without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class TAROrderedReadCacheUnitTest extends BaseTest { + // example fasta index file, can be deleted if you don't use the reference + private IndexedFastaSequenceFile seq; + + @BeforeClass + public void setup() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + } + + @DataProvider(name = "ReadCacheTestData") + public Object[][] makeReadCacheTestData() { + List tests = new 
ArrayList(); + + for ( final int nReadsPerLocus : Arrays.asList(0, 1, 10, 100) ) { + for ( final int nLoci : Arrays.asList(1, 10, 100) ) { + for ( final int max : Arrays.asList(10, 50, 1000) ) { + for ( final boolean addAllAtOnce : Arrays.asList(true, false) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, max, addAllAtOnce}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadCacheTestData") + public void testReadCache(final int nReadsPerLocus, final int nLoci, final int max, final boolean addAllAtOnce) { + final TAROrderedReadCache cache = new TAROrderedReadCache(max); + + Assert.assertEquals(cache.getMaxCapacity(), max); + Assert.assertEquals(cache.getNumDiscarded(), 0); + Assert.assertEquals(cache.size(), 0); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(seq, nReadsPerLocus, nLoci); + final List reads = bamBuilder.makeReads(); + + if ( addAllAtOnce ) { + cache.addAll(reads); + } else { + for ( final GATKSAMRecord read : reads ) { + cache.add(read); + } + } + + final int nTotalReads = reads.size(); + final int nExpectedToKeep = Math.min(nTotalReads, max); + final int nExpectedToDiscard = nTotalReads - nExpectedToKeep; + Assert.assertEquals(cache.getNumDiscarded(), nExpectedToDiscard, "wrong number of reads discarded"); + Assert.assertEquals(cache.size(), nExpectedToKeep, "wrong number of reads kept"); + + final List cacheReads = cache.popCurrentReads(); + Assert.assertEquals(cache.size(), 0, "Should be no reads left"); + Assert.assertEquals(cache.getNumDiscarded(), 0, "should have reset stats"); + Assert.assertEquals(cacheReads.size(), nExpectedToKeep, "should have 1 read for every read we expected to keep"); + + int lastStart = -1; + for ( final GATKSAMRecord read : cacheReads ) { + Assert.assertTrue(lastStart <= read.getAlignmentStart(), "Reads should be sorted but weren't. 
Found read with start " + read.getAlignmentStart() + " while last was " + lastStart); + lastStart = read.getAlignmentStart(); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index a574932a7..b6106d4bc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -58,6 +58,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.*; /** @@ -86,11 +87,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List intervals; - private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; - private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; + private File testBAM; @BeforeClass - private void init() throws FileNotFoundException { + private void init() throws IOException { //reference = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); // hg19Reference)); reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); dictionary = reference.getSequenceDictionary(); @@ -133,17 +133,18 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { createBAM(reads); } - private void createBAM(List reads) { - File outFile = new File(testBAM); - outFile.deleteOnExit(); - File indexFile = new File(testBAI); - indexFile.deleteOnExit(); + private void createBAM(List reads) throws IOException { + testBAM = File.createTempFile("TraverseActiveRegionsUnitTest", ".bam"); + testBAM.deleteOnExit(); - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); + SAMFileWriter out = new 
SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { out.addAlignment(read); } out.close(); + + new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); } @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") @@ -400,7 +401,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return getActiveRegions(t, walker, intervals, testBAM); } - private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final String bam) { + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final File bam) { for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); @@ -466,13 +467,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return record; } - private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, File bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); + SAMReaderID readerID = new SAMReaderID(bamFile, new Tags()); samFiles.add(readerID); SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, @@ -485,8 +485,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ArrayList(), false, (byte)30, false, true); + engine.setReadsDataSource(dataSource); final Set samples = 
SampleUtils.getSAMFileSamples(dataSource.getHeader()); + traverseActiveRegions.initialize(engine, walker); List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { @@ -594,7 +596,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { walker.setStates(readStates); final TraverseActiveRegions traversal = new TraverseActiveRegions(); - final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary for ( final ActiveRegion region : activeRegionsMap.values() ) { @@ -666,7 +668,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); final TraverseActiveRegions traversal = new TraverseActiveRegions(); - final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); final ActiveRegion region = activeRegionsMap.values().iterator().next(); int nReadsExpectedInRegion = 0; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java similarity index 77% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java rename to 
public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java index 859f6c4c7..c07bf171a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java @@ -30,8 +30,9 @@ import org.testng.annotations.Test; import java.util.Arrays; -public class CallableLociWalkerIntegrationTest extends WalkerTest { - final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; +public class CallableLociIntegrationTest extends WalkerTest { + final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; + final static String reduceReadArgs = "-R " + b37KGReference + " -T CallableLoci -I " + " private/testdata/NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s"; final static String SUMMARY_MD5 = "ffdbd9cdcb4169ebed5ae4bec797260f"; @@ -66,4 +67,13 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { Arrays.asList("46a53379aaaf9803276a0a34b234f6ab", "da431d393f7c2b2b3e27556b86c1dbc7")); executeTest("formatBed lots of arguments", spec); } + + @Test(enabled=true) + public void testWithReducedRead() { + String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, + Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0")); + executeTest("CallableLoci with ReducedRead", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java new file mode 100644 
index 000000000..3b184ae3b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +/** + * Created with IntelliJ IDEA. + * User: delangel + * Date: 4/13/13 + * Time: 7:28 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ReadAdaptorTrimmerIntegrationTest extends WalkerTest { + private String getBaseCommand(final String BAM) { + return "-T ReadAdaptorTrimmer -R " + b37KGReference + + " -I " + privateTestDir + BAM + + " -o %s"; + } + + @Test + public void testBasicTrimmer() { + WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest.bam"), 1, Arrays.asList("1d42414e12b45d44e6f396d97d0f60fe")); + executeTest(String.format("testBasicTrimmer"), spec); + } + + @Test + public void testSkippingBadPairs() { + WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest2.bam")+" -removeUnpairedReads", 1, Arrays.asList("5e796345502fbfc31134f7736ce68868")); + executeTest(String.format("testSkippingBadPairs"), spec); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java new file mode 100644 index 000000000..ca60c6cfe --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java @@ -0,0 +1,88 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class SelectVariantsUnitTest extends BaseTest { + + ////////////////////////////////////////// + // Tests for maxIndelSize functionality // + ////////////////////////////////////////// + + @DataProvider(name = "MaxIndelSize") + public Object[][] MaxIndelSizeTestData() { + + List tests = new ArrayList(); + + for ( final int size : Arrays.asList(1, 3, 10, 100) ) { + for ( final int otherSize : Arrays.asList(0, 1) ) { + for ( final int max : Arrays.asList(0, 1, 5, 50, 100000) ) { + for ( final String op : Arrays.asList("D", "I") ) { + tests.add(new Object[]{size, otherSize, max, op}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MaxIndelSize") + public void maxIndelSizeTest(final int size, final int otherSize, final int max, final String op) { + + final byte[] largerAllele = Utils.dupBytes((byte) 'A', size+1); + final byte[] smallerAllele = Utils.dupBytes((byte) 'A', 1); + + final List alleles = new 
ArrayList(2); + final Allele ref = Allele.create(op.equals("I") ? smallerAllele : largerAllele, true); + final Allele alt = Allele.create(op.equals("D") ? smallerAllele : largerAllele, false); + alleles.add(ref); + alleles.add(alt); + if ( otherSize > 0 && otherSize != size ) { + final Allele otherAlt = Allele.create(op.equals("D") ? Utils.dupBytes((byte) 'A', size-otherSize+1) : Utils.dupBytes((byte) 'A', otherSize+1), false); + alleles.add(otherAlt); + } + + final VariantContext vc = new VariantContextBuilder("test", "1", 10, 10 + ref.length() - 1, alleles).make(); + + boolean hasTooLargeIndel = SelectVariants.containsIndelLargerThan(vc, max); + Assert.assertEquals(hasTooLargeIndel, size > max); + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java index 677f87cac..d2da0e228 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java @@ -34,7 +34,7 @@ import org.testng.annotations.Test; import java.io.File; import java.util.*; -public class JnaSessionIntegrationTest extends BaseTest { +public class JnaSessionPipelineTest extends BaseTest { private String implementation = null; private static final SessionFactory factory = new JnaSessionFactory(); diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java rename to 
public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java index 038bfd85d..efeeb3640 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java @@ -40,7 +40,7 @@ import java.io.File; import java.util.Arrays; import java.util.List; -public class LibDrmaaIntegrationTest extends BaseTest { +public class LibDrmaaPipelineTest extends BaseTest { private String implementation = null; @Test diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java index 4898f17c3..af8d0e7b1 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java @@ -40,7 +40,7 @@ import java.io.File; /** * Really unit tests, but these test will only run on systems with LSF setup. 
*/ -public class LibBatIntegrationTest extends BaseTest { +public class LibBatPipelineTest extends BaseTest { @BeforeClass public void initLibBat() { Assert.assertFalse(LibBat.lsb_init("LibBatIntegrationTest") < 0, LibBat.lsb_sperror("lsb_init() failed")); diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index df41dc642..443cf2771 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -117,11 +117,31 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); mSortedSet.addRegion(f); assertTrue(mSortedSet.size() == 1); - } + @Test + public void addRegionsOutOfOrder() { + final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName(); + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); + mSortedSet.addRegion(f); + assertTrue(mSortedSet.size() == 2); + assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName)); + assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName)); + } - @Test(expectedExceptions=ReviewedStingException.class) + @Test(expectedExceptions = IllegalArgumentException.class) + public void addThrowsException() { + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); + mSortedSet.add(f); + } + + @Test(expectedExceptions=IllegalArgumentException.class) public void testAddDuplicate() { assertTrue(mSortedSet.size() == 0); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0); 
@@ -141,9 +161,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.size() == 1); Iterator iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); - assertTrue(loc.getStart() == 0); - assertTrue(loc.getStop() == 100); - assertTrue(loc.getContigIndex() == 1); + assertEquals(loc.getStart(), 0); + assertEquals(loc.getStop(), 100); + assertEquals(loc.getContigIndex(), 1); } @Test @@ -192,9 +212,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.size() == 1); Iterator iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); - assertTrue(loc.getStart() == 0); - assertTrue(loc.getStop() == 100); - assertTrue(loc.getContigIndex() == 1); + assertEquals(loc.getStart(), 0); + assertEquals(loc.getStop(), 100); + assertEquals(loc.getContigIndex(), 1); } @Test diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 2c57e8b33..27af8ec68 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -56,6 +56,22 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.binomialProbability(300, 112, 0.98), 2.34763e-236, 1e-237); } + /** + * Tests that we get the right values from the binomial distribution + */ + @Test + public void testCumulativeBinomialProbability() { + logger.warn("Executing testCumulativeBinomialProbability"); + + final int numTrials = 10; + for ( int i = 0; i < numTrials; i++ ) + Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); + + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); + 
Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); + } + /** * Tests that we get the right values from the multinomial distribution */ @@ -150,6 +166,21 @@ public class MathUtilsUnitTest extends BaseTest { @Test public void testLog10BinomialCoefficient() { logger.warn("Executing testLog10BinomialCoefficient"); + // note that we can test the binomial coefficient calculation indirectly via Newton's identity + // (1+z)^m = sum (m choose k)z^k + double[] z_vals = new double[]{0.999,0.9,0.8,0.5,0.2,0.01,0.0001}; + int[] exponent = new int[]{5,15,25,50,100}; + for ( double z : z_vals ) { + double logz = Math.log10(z); + for ( int exp : exponent ) { + double expected_log = exp*Math.log10(1+z); + double[] newtonArray_log = new double[1+exp]; + for ( int k = 0 ; k <= exp; k++ ) { + newtonArray_log[k] = MathUtils.log10BinomialCoefficient(exp,k)+k*logz; + } + Assert.assertEquals(MathUtils.log10sumLog10(newtonArray_log),expected_log,1e-6); + } + } Assert.assertEquals(MathUtils.log10BinomialCoefficient(4, 2), 0.7781513, 1e-6); Assert.assertEquals(MathUtils.log10BinomialCoefficient(10, 3), 2.079181, 1e-6); @@ -172,36 +203,19 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.log10Factorial(12), 8.680337, 1e-6); Assert.assertEquals(MathUtils.log10Factorial(200), 374.8969, 1e-3); Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); - } - - @Test(enabled = true) - public void testRandomSubset() { - Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0); - Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1); - Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2); - Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3); - Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4); - Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5); - Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6); - 
Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7); - Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8); - Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9); - Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10); - Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10); - - for (int i = 0; i < 25; i++) - Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5))); - - } - - @Test(enabled = true) - public void testArrayShuffle() { - Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - for (int i = 0; i < 25; i++) { - Object[] t = MathUtils.arrayShuffle(x); - Assert.assertTrue(hasUniqueElements(t)); - Assert.assertTrue(hasAllElements(x, t)); + double log10factorial_small = 0; + double log10factorial_middle = 374.8969; + double log10factorial_large = 45138.26; + int small_start = 1; + int med_start = 200; + int large_start = 12342; + for ( int i = 1; i < 1000; i++ ) { + log10factorial_small += Math.log10(i+small_start); + log10factorial_middle += Math.log10(i+med_start); + log10factorial_large += Math.log10(i+large_start); + Assert.assertEquals(MathUtils.log10Factorial(small_start+i),log10factorial_small,1e-6); + Assert.assertEquals(MathUtils.log10Factorial(med_start+i),log10factorial_middle,1e-3); + Assert.assertEquals(MathUtils.log10Factorial(large_start+i),log10factorial_large,1e-1); } } @@ -286,17 +300,29 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 
requiredPrecision); - } - @Test - public void testNormalizeFromLog10() { - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, -1.0, -1.1, -7.8})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, 0.0, -0.1, -6.8})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[] {-8.9, -6.7, -9.4, 0.0, -8.9})); - - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.0}), new double[] {0.25, 0.25, 0.25, 0.25})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -1.0}), new double[] {0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + 
} + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.approximateLog10SumLog10(equal),0.0,requiredPrecision,String.format("Did not sum to one: k=%d equal partitions.",npart)); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(multiplicative),0.0,requiredPrecision, String.format("Did not sum to one: k=%d multiplicative partitions with alpha=%f",npart,alpha)); + } + } } @Test @@ -342,12 +368,29 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - } - @Test - public void testDotProduct() { - Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3); - Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0}, new Double[]{6.0}),-30.0,1e-3); + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = 
-Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.log10sumLog10(equal),0.0,requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(multiplicative),0.0,requiredPrecision,String.format("Did not sum to one: nPartitions=%d, alpha=%f",npart,alpha)); + } + } } @Test @@ -355,19 +398,4 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); } - - /** - * Private function used by testNormalizeFromLog10() - */ - private boolean compareDoubleArrays(double[] b1, double[] b2) { - if (b1.length != b2.length) { - return false; // sanity check - } - - for (int i = 0; i < b1.length; i++) { - if (MathUtils.compareDoubles(b1[i], b2[i]) != 0) - return false; - } - return true; - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 5a6db4d9c..f92cd4bcf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -118,8 +118,8 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); - final long maxTimeInMicro = 100; - final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(100); + final long maxTimeInMicro = 10000; + final long maxTimeInNano = 
TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro); Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); } diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index 29c643153..154b000ce 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -75,6 +75,14 @@ public class UtilsUnitTest extends BaseTest { Assert.assertEquals(duped.charAt(0), 'b', "dupString character was incorrect"); } + @Test + public void testXor() { + Assert.assertEquals(Utils.xor(false, false), false, "xor F F failed"); + Assert.assertEquals(Utils.xor(false, true), true, "xor F T failed"); + Assert.assertEquals(Utils.xor(true, false), true, "xor T F failed"); + Assert.assertEquals(Utils.xor(true, true), false, "xor T T failed"); + } + @Test public void testDupStringMultiChar() { String duped = Utils.dupString('c',5); @@ -104,6 +112,19 @@ public class UtilsUnitTest extends BaseTest { Assert.assertTrue("one-1;two-2;three-1;four-2;five-1;six-2".equals(joined)); } + @Test + public void testConcat() { + final String s1 = "A"; + final String s2 = "CC"; + final String s3 = "TTT"; + final String s4 = "GGGG"; + Assert.assertEquals(new String(Utils.concat()), ""); + Assert.assertEquals(new String(Utils.concat(s1.getBytes())), s1); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes())), s1 + s2); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes())), s1 + s2 + s3); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes(), s4.getBytes())), s1 + s2 + s3 + s4); + } + @Test public void testEscapeExpressions() { String[] expected, actual; diff --git 
a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java index 7f0f93704..ad5fd3642 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -49,7 +49,7 @@ import java.util.*; public class ActiveRegionUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; private IndexedFastaSequenceFile seq; private String contig; @@ -309,4 +309,75 @@ public class ActiveRegionUnitTest extends BaseTest { } } } + + // ----------------------------------------------------------------------------------------------- + // + // Make sure we can properly cut up an active region based on engine intervals + // + // ----------------------------------------------------------------------------------------------- + + @DataProvider(name = "TrimActiveRegionData") + public Object[][] makeTrimActiveRegionData() { + List tests = new ArrayList(); + + // fully enclosed within active region + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 16), + genomeLocParser.createGenomeLoc("20", 15, 16), 0}); + + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 10, 15), + genomeLocParser.createGenomeLoc("20", 10, 15), 0}); + + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 20), + genomeLocParser.createGenomeLoc("20", 15, 20), 0}); + + // needs extra padding on the right + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 25), + genomeLocParser.createGenomeLoc("20", 15, 20), 
5}); + + // needs extra padding on the left + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 5, 15), + genomeLocParser.createGenomeLoc("20", 10, 15), 5}); + + // needs extra padding on both + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 7, 21), + genomeLocParser.createGenomeLoc("20", 10, 20), 3}); + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 9, 23), + genomeLocParser.createGenomeLoc("20", 10, 20), 3}); + + // desired span captures everything, so we're returning everything. Tests that extension is set correctly + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 1, 50), + genomeLocParser.createGenomeLoc("20", 10, 20), 10}); + + // At the start of the chromosome, potentially a bit weird + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 1, 10), 10, + genomeLocParser.createGenomeLoc("20", 1, 50), + genomeLocParser.createGenomeLoc("20", 1, 10), 10}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimActiveRegionData") + public void testTrimActiveRegion(final GenomeLoc regionLoc, final int extension, final GenomeLoc desiredSpan, final GenomeLoc expectedActiveRegion, final int expectedExtension) { + final ActiveRegion region = new ActiveRegion(regionLoc, Collections.emptyList(), true, genomeLocParser, extension); + final ActiveRegion trimmed = region.trim(desiredSpan); + Assert.assertEquals(trimmed.getLocation(), expectedActiveRegion, "Incorrect region"); + Assert.assertEquals(trimmed.getExtension(), expectedExtension, "Incorrect region"); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index d5231c30b..2470364c4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -120,12 +120,21 @@ public class BandPassActivityProfileUnitTest extends BaseTest { for( int iii = 0; iii < activeProbArray.length; iii++ ) { final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii)); final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1)); - bandPassProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + bandPassProbArray[iii] = dotProduct(activeProbSubArray, kernel); } return bandPassProbArray; } + public static double dotProduct(double[] v1, double[] v2) { + Assert.assertEquals(v1.length,v2.length,"Array lengths do not mach in dotProduct"); + double result = 0.0; + for (int k = 0; k < v1.length; k++) + result += v1[k] * v2[k]; + + return result; + } + @DataProvider(name = "BandPassComposition") public Object[][] makeBandPassComposition() { final List tests = new LinkedList(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index a85ed2ce0..ae7c1e01c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -35,6 +35,7 @@ import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.Arrays; import java.util.HashMap; 
import java.util.List; @@ -348,4 +349,20 @@ public class ReadClipperUnitTest extends BaseTest { } + @Test(enabled = true) + public void testHardClipReducedRead() { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M"); + final int[] counts = new int[read.getReadLength()]; + for ( int i = 0; i < counts.length; i++ ) counts[i] = i; + read.setReducedReadCounts(counts); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + int readLength = read.getReadLength(); + for (int i = 0; i < readLength / 2; i++) { + GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); + final int[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1); + Assert.assertEquals(clippedRead.getReducedReadCounts(), expectedReducedCounts); + } + } + } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java new file mode 100644 index 000000000..5c0eec252 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.file; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; + +public class FSLockWithSharedUnitTest extends BaseTest { + + private static final int MAX_EXPECTED_LOCK_ACQUISITION_TIME = FSLockWithShared.DEFAULT_LOCK_ACQUISITION_TIMEOUT_IN_MILLISECONDS + + FSLockWithShared.THREAD_TERMINATION_TIMEOUT_IN_MILLISECONDS; + + /** + * Test to ensure that we're never spending more than the maximum configured amount of time in lock acquisition calls. + */ + @Test( timeOut = MAX_EXPECTED_LOCK_ACQUISITION_TIME + 10 * 1000 ) + public void testLockAcquisitionTimeout() { + final File lockFile = createTempFile("FSLockWithSharedUnitTest", ".lock"); + final FSLockWithShared lock = new FSLockWithShared(lockFile); + boolean lockAcquisitionSucceeded = false; + + try { + lockAcquisitionSucceeded = lock.sharedLock(); + } + catch ( UserException e ) { + logger.info("Caught UserException from lock acquisition call: lock acquisition must have timed out. 
Message: " + e.getMessage()); + } + finally { + if ( lockAcquisitionSucceeded ) { + lock.unlock(); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 15d69c400..e9600480a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -27,23 +27,30 @@ package org.broadinstitute.sting.utils.fragments; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; /** * Test routines for read-backed pileup. 
*/ public class FragmentUtilsUnitTest extends BaseTest { private static SAMFileHeader header; + private static GATKSAMReadGroupRecord rgForMerged; + private final static boolean DEBUG = false; private class FragmentUtilsTest extends TestDataProvider { List statesForPileup = new ArrayList(); @@ -119,7 +126,7 @@ public class FragmentUtilsUnitTest extends BaseTest { return FragmentUtilsTest.getTests(FragmentUtilsTest.class); } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsPileup(FragmentUtilsTest test) { for ( TestState testState : test.statesForPileup ) { ReadBackedPileup rbp = testState.pileup; @@ -129,7 +136,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsListOfReadsFromPileup(FragmentUtilsTest test) { for ( TestState testState : test.statesForPileup ) { FragmentCollection fp = FragmentUtils.create(testState.pileup.getReads()); @@ -138,7 +145,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsListOfReads(FragmentUtilsTest test) { for ( TestState testState : test.statesForReads ) { FragmentCollection fp = FragmentUtils.create(testState.rawReads); @@ -147,7 +154,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, expectedExceptions = IllegalArgumentException.class) + @Test(enabled = !DEBUG, expectedExceptions = IllegalArgumentException.class) public void testOutOfOrder() { final List pair = ArtificialSAMUtils.createPair(header, "readpair", 100, 1, 50, true, true); final GATKSAMRecord left = pair.get(0); @@ -161,5 +168,132 @@ public class FragmentUtilsUnitTest extends BaseTest { @BeforeTest public void setup() { header = 
ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); + rgForMerged = new GATKSAMReadGroupRecord("RG1"); + } + + @DataProvider(name = "MergeFragmentsTest") + public Object[][] createMergeFragmentsTest() throws Exception { + List tests = new ArrayList(); + + final String leftFlank = "CCC"; + final String rightFlank = "AAA"; + final String allOverlappingBases = "ACGTACGTGGAACCTTAG"; + for ( int overlapSize = 1; overlapSize < allOverlappingBases.length(); overlapSize++ ) { + final String overlappingBases = allOverlappingBases.substring(0, overlapSize); + final byte[] overlappingBaseQuals = new byte[overlapSize]; + for ( int i = 0; i < overlapSize; i++ ) overlappingBaseQuals[i] = (byte)(i + 30); + final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, "", 30, 1); + final GATKSAMRecord read2 = makeOverlappingRead("", 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, leftFlank.length() + 1); + final GATKSAMRecord merged = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, 1); + tests.add(new Object[]{"equalQuals", read1, read2, merged}); + + // test that the merged read base quality is the higher of the two reads' qualities in the overlap + tests.add(new Object[]{"lowQualLeft", modifyBaseQualities(read1, leftFlank.length(), overlapSize), read2, merged}); + tests.add(new Object[]{"lowQualRight", read1, modifyBaseQualities(read2, 0, overlapSize), merged}); + } + + return tests.toArray(new Object[][]{}); + } + + private GATKSAMRecord modifyBaseQualities(final GATKSAMRecord read, final int startOffset, final int length) throws Exception { + final GATKSAMRecord readWithLowQuals = (GATKSAMRecord)read.clone(); + final byte[] withLowQuals = Arrays.copyOf(read.getBaseQualities(), read.getBaseQualities().length); + for ( int i = startOffset; i < startOffset + length; i++ ) + withLowQuals[i] = (byte)(read.getBaseQualities()[i] + (i % 2 == 0 ? 
-1 : 0)); + readWithLowQuals.setBaseQualities(withLowQuals); + return readWithLowQuals; + } + + private GATKSAMRecord makeOverlappingRead(final String leftFlank, final int leftQual, final String overlapBases, + final byte[] overlapQuals, final String rightFlank, final int rightQual, + final int alignmentStart) { + final String bases = leftFlank + overlapBases + rightFlank; + final int readLength = bases.length(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, alignmentStart, readLength); + final byte[] leftQuals = Utils.dupBytes((byte) leftQual, leftFlank.length()); + final byte[] rightQuals = Utils.dupBytes((byte) rightQual, rightFlank.length()); + final byte[] quals = Utils.concat(leftQuals, overlapQuals, rightQuals); + read.setCigarString(readLength + "M"); + read.setReadBases(bases.getBytes()); + for ( final EventType type : EventType.values() ) + read.setBaseQualities(quals, type); + read.setReadGroup(rgForMerged); + read.setMappingQuality(60); + return read; + } + + @Test(enabled = !DEBUG, dataProvider = "MergeFragmentsTest") + public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + + if ( expectedMerged == null ) { + Assert.assertNull(actual, "Expected reads not to merge, but got non-null result from merging"); + } else { + Assert.assertTrue(actual.isStrandless(), "Merged reads should be strandless"); + Assert.assertNotNull(actual, "Expected reads to merge, but got null result from merging"); + // I really care about the bases, the quals, the CIGAR, and the read group tag + Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); + Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); + Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); + 
Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality()); + for ( final EventType type : EventType.values() ) + Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); + } + } + + @Test(enabled = !DEBUG) + public void testHardClippingBeforeMerge() { + final String common = Utils.dupString("A", 10); + final byte[] commonQuals = Utils.dupBytes((byte)30, common.length()); + final String adapter = "NNNN"; + + final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, common, commonQuals, "", 30, 10); + final GATKSAMRecord read2 = makeOverlappingRead("", 30, common, commonQuals, adapter, 30, 10); + final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10); + read1.setCigarString("4S" + common.length() + "M"); + read1.setProperPairFlag(true); + read1.setFirstOfPairFlag(true); + read1.setReadNegativeStrandFlag(true); + read1.setMateAlignmentStart(10); + read2.setCigarString(common.length() + "M4S"); + read2.setProperPairFlag(true); + read2.setFirstOfPairFlag(false); + read2.setReadNegativeStrandFlag(false); + + final int insertSize = common.length() - 1; + read1.setInferredInsertSize(insertSize); + read2.setInferredInsertSize(-insertSize); + + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); + Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); + Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); + Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality()); + for ( final EventType type : EventType.values() ) + Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); + } + + @Test(enabled = true) + public void 
testHardClippingBeforeMergeResultingInCompletelyContainedSecondRead() { + final String adapter = "NNNN"; + + final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, Utils.dupString("A", 10), Utils.dupBytes((byte)30, 10), "", 30, 10); + final GATKSAMRecord read2 = makeOverlappingRead("", 30, Utils.dupString("A", 7), Utils.dupBytes((byte)30, 7), adapter, 30, 10); + read1.setCigarString("4S10M"); + read1.setProperPairFlag(true); + read1.setFirstOfPairFlag(true); + read1.setReadNegativeStrandFlag(true); + read1.setMateAlignmentStart(10); + read2.setCigarString("7M4S"); + read2.setProperPairFlag(true); + read2.setFirstOfPairFlag(false); + read2.setReadNegativeStrandFlag(false); + + final int insertSize = 7 - 1; + read1.setInferredInsertSize(insertSize); + read2.setInferredInsertSize(-insertSize); + + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + Assert.assertNull(actual); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java new file mode 100644 index 000000000..d0b418b96 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java @@ -0,0 +1,203 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class EventMapUnitTest extends BaseTest { + private final static String CHR = "20"; + private final static String NAME = "foo"; + + @DataProvider(name = "MyDataProvider") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList(); + + final List SNP_ALLELES = Arrays.asList("A", "C"); + final List INS_ALLELES = Arrays.asList("A", "ACGTGA"); + final List DEL_ALLELES = Arrays.asList("ACGTA", "C"); + final List> allAlleles = Arrays.asList(SNP_ALLELES, INS_ALLELES, DEL_ALLELES); + for ( final int leftNotClump : Arrays.asList(-1, 3) ) { + for ( final int middleNotClump : Arrays.asList(-1, 10, 500) ) { + for ( final int rightNotClump : Arrays.asList(-1, 1000) ) { + for ( final int nClumped : Arrays.asList(3, 4) ) { + for ( final List> alleles : Utils.makePermutations(allAlleles, nClumped, true)) { + final List allVCS = new LinkedList(); + + if ( leftNotClump != -1 ) 
allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, leftNotClump, SNP_ALLELES)); + if ( middleNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, middleNotClump, SNP_ALLELES)); + if ( rightNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, rightNotClump, SNP_ALLELES)); + + int clumpStart = 50; + final List vcs = new LinkedList(); + for ( final List myAlleles : alleles ) { + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(NAME, CHR, clumpStart, myAlleles); + clumpStart = vc.getEnd() + 3; + vcs.add(vc); + } + + tests.add(new Object[]{new EventMap(new LinkedList(allVCS)), Collections.emptyList()}); + allVCS.addAll(vcs); + tests.add(new Object[]{new EventMap(allVCS), vcs}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MyDataProvider", enabled = true) + public void testGetNeighborhood(final EventMap eventMap, final List expectedNeighbors) { + final VariantContext leftOfNeighors = expectedNeighbors.isEmpty() ? null : expectedNeighbors.get(0); + + for ( final VariantContext vc : eventMap.getVariantContexts() ) { + final List n = eventMap.getNeighborhood(vc, 5); + if ( leftOfNeighors == vc ) + Assert.assertEquals(n, expectedNeighbors); + else if ( ! 
expectedNeighbors.contains(vc) ) + Assert.assertEquals(n, Collections.singletonList(vc), "Should only contain the original vc but " + n); + } + } + + @DataProvider(name = "BlockSubstitutionsData") + public Object[][] makeBlockSubstitutionsData() { + List tests = new ArrayList(); + + for ( int size = EventMap.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) { + final String ref = Utils.dupString("A", size); + final String alt = Utils.dupString("C", size); + tests.add(new Object[]{ref, alt, size + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList(ref, alt))}); + } + + tests.add(new Object[]{"AAAAAA", "GAGAGA", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAA", "GAGAG"))}); + tests.add(new Object[]{"AAAAAA", "GAGAGG", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAAA", "GAGAGG"))}); + + for ( int len = 0; len < 10; len++ ) { + final String s = len == 0 ? "" : Utils.dupString("A", len); + tests.add(new Object[]{s + "AACCCCAA", s + "GAAG", len + 2 + "M4D2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AACCCCAA", "GAAG"))}); + tests.add(new Object[]{s + "AAAA", s + "GACCCCAG", len + 2 + "M4I2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AAAA", "GACCCCAG"))}); + + tests.add(new Object[]{"AACCCCAA" + s, "GAAG" + s, "2M4D" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AACCCCAA", "GAAG"))}); + tests.add(new Object[]{"AAAA" + s, "GACCCCAG" + s, "2M4I" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAA", "GACCCCAG"))}); + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "BlockSubstitutionsData") + public void testBlockSubstitutionsData(final String refBases, final String haplotypeBases, final String cigar, final VariantContext 
expectedBlock) { + final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); + final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); + ee.replaceClumpedEventsWithBlockSubstititions(); + Assert.assertEquals(ee.getNumberOfEvents(), 1); + final VariantContext actual = ee.getVariantContexts().iterator().next(); + Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual); + } + + @DataProvider(name = "AdjacentSNPIndelTest") + public Object[][] makeAdjacentSNPIndelTest() { + List tests = new ArrayList(); + + tests.add(new Object[]{"TT", "GCT", "1M1I1M", Arrays.asList(Arrays.asList("T", "GC"))}); + tests.add(new Object[]{"GCT", "TT", "1M1D1M", Arrays.asList(Arrays.asList("GC", "T"))}); + tests.add(new Object[]{"TT", "GCCT", "1M2I1M", Arrays.asList(Arrays.asList("T", "GCC"))}); + tests.add(new Object[]{"GCCT", "TT", "1M2D1M", Arrays.asList(Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAGCCT", "AATT", "3M2D1M", Arrays.asList(Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAGCCT", "GATT", "3M2D1M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAAAA", "AGACA", "5M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("A", "C"))}); + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "AdjacentSNPIndelTest") + public void testAdjacentSNPIndelTest(final String refBases, final String haplotypeBases, final String cigar, final List> expectedAlleles) { + final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); + final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); + 
ee.replaceClumpedEventsWithBlockSubstititions(); + Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size()); + final List actuals = new ArrayList(ee.getVariantContexts()); + for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) { + final VariantContext actual = actuals.get(i); + Assert.assertEquals(actual.getReference().getDisplayString(), expectedAlleles.get(i).get(0)); + Assert.assertEquals(actual.getAlternateAllele(0).getDisplayString(), expectedAlleles.get(i).get(1)); + } + } + + @DataProvider(name = "MakeBlockData") + public Object[][] makeMakeBlockData() { + List tests = new ArrayList(); + + tests.add(new Object[]{Arrays.asList("A", "G"), Arrays.asList("AGT", "A"), Arrays.asList("AGT", "G")}); + tests.add(new Object[]{Arrays.asList("A", "G"), Arrays.asList("A", "AGT"), Arrays.asList("A", "GGT")}); + + tests.add(new Object[]{Arrays.asList("AC", "A"), Arrays.asList("A", "AGT"), Arrays.asList("AC", "AGT")}); + tests.add(new Object[]{Arrays.asList("ACGTA", "A"), Arrays.asList("A", "AG"), Arrays.asList("ACGTA", "AG")}); + tests.add(new Object[]{Arrays.asList("AC", "A"), Arrays.asList("A", "AGCGT"), Arrays.asList("AC", "AGCGT")}); + tests.add(new Object[]{Arrays.asList("A", "ACGTA"), Arrays.asList("AG", "A"), Arrays.asList("AG", "ACGTA")}); + tests.add(new Object[]{Arrays.asList("A", "AC"), Arrays.asList("AGCGT", "A"), Arrays.asList("AGCGT", "AC")}); + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MakeBlockData", enabled = true) + public void testGetNeighborhood(final List firstAlleles, final List secondAlleles, final List expectedAlleles) { + final VariantContext vc1 = GATKVariantContextUtils.makeFromAlleles("x", "20", 10, firstAlleles); + final VariantContext vc2 = GATKVariantContextUtils.makeFromAlleles("x", "20", 10, secondAlleles); + final VariantContext expected = GATKVariantContextUtils.makeFromAlleles("x", "20", 10, expectedAlleles); + + final EventMap eventMap = new 
EventMap(Collections.emptyList()); + final VariantContext block = eventMap.makeBlock(vc1, vc2); + + Assert.assertEquals(block.getStart(), expected.getStart()); + Assert.assertEquals(block.getAlleles(), expected.getAlleles()); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java similarity index 61% rename from public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java index 1b16266a9..cfbc4a3e0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java @@ -23,18 +23,23 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.haplotype; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -43,10 +48,6 @@ import java.util.*; * Basic unit test for Haplotype Class */ public class HaplotypeUnitTest extends BaseTest { - @BeforeClass - public void init() { - } - @Test public void testSimpleInsertionAllele() { final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; @@ -163,4 +164,86 @@ public class 
HaplotypeUnitTest extends BaseTest { final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); } + + private Haplotype makeHCForCigar(final String bases, final String cigar) { + final Haplotype h = new Haplotype(bases.getBytes()); + h.setCigar(TextCigarCodec.getSingleton().decode(cigar)); + return h; + } + + @Test + public void testConsolidateCigar() throws Exception { + Assert.assertEquals(makeHCForCigar("AGCT", "4M").getConsolidatedPaddedCigar(0).toString(), "4M"); + Assert.assertEquals(makeHCForCigar("AGCT", "4M").getConsolidatedPaddedCigar(1).toString(), "5M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(0).toString(), "1M2I1M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(1).toString(), "1M2I2M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(2).toString(), "1M2I3M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(0).toString(), "1M3I"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(1).toString(), "1M3I1M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(2).toString(), "1M3I2M"); + } + + @DataProvider(name = "TrimmingData") + public Object[][] makeTrimmingData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, 10, 20); + final String fullBases = "ACGTAACCGGT"; + for ( int trimStart = loc.getStart(); trimStart < loc.getStop(); trimStart++ ) { + for ( int trimStop = trimStart; trimStop <= loc.getStop(); trimStop++ ) { + final int start = trimStart - loc.getStart(); + final int stop = start + (trimStop - trimStart) + 1; + final GenomeLoc trimmedLoc = new UnvalidatingGenomeLoc("20", 0, start + loc.getStart(), stop + loc.getStart() - 1); + final String 
expectedBases = fullBases.substring(start, stop); + final Haplotype full = new Haplotype(fullBases.getBytes(), loc); + final Haplotype trimmed = new Haplotype(expectedBases.getBytes(), trimmedLoc); + + final int hapStart = 10; + full.setAlignmentStartHapwrtRef(hapStart); + full.setCigar(TextCigarCodec.getSingleton().decode(full.length() + "M")); + + trimmed.setAlignmentStartHapwrtRef(hapStart + start); + trimmed.setCigar(TextCigarCodec.getSingleton().decode(trimmed.length() + "M")); + + tests.add(new Object[]{full, trimmedLoc, trimmed}); + } + } + + final Haplotype full = new Haplotype("ACT".getBytes(), new UnvalidatingGenomeLoc("20", 0, 10, 14)); + full.setAlignmentStartHapwrtRef(10); + full.setCigar(TextCigarCodec.getSingleton().decode("1M2D2M")); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 11, 12), null}); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 10, 12), null}); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 11, 13), null}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimmingData") + public void testTrim(final Haplotype full, final GenomeLoc trimTo, final Haplotype expected) { + final Haplotype actual = full.trim(trimTo); + if ( expected != null ) { + Assert.assertEquals(actual.getBases(), expected.getBases()); + Assert.assertEquals(actual.getStartPosition(), trimTo.getStart()); + Assert.assertEquals(actual.getStopPosition(), trimTo.getStop()); + Assert.assertEquals(actual.getCigar(), expected.getCigar()); + Assert.assertEquals(actual.getAlignmentStartHapwrtRef(), expected.getAlignmentStartHapwrtRef()); + } else { + Assert.assertNull(actual); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadTrimLoc() { + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, 10, 20); + final Haplotype hap = new Haplotype("ACGTAACCGGT".getBytes(), loc); + hap.trim(new UnvalidatingGenomeLoc("20", 0, 1, 20)); + } + + @Test(expectedExceptions 
= IllegalStateException.class) + public void testBadTrimNoLoc() { + final Haplotype hap = new Haplotype("ACGTAACCGGT".getBytes()); + hap.trim(new UnvalidatingGenomeLoc("20", 0, 1, 20)); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 98ecd0f43..69466d163 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -285,4 +285,20 @@ public class IntervalIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("testSymbolicAlleles", spec); } + + @Test + public void testIntersectionOfLexicographicallySortedIntervals() { + final String md5 = "18be9375e5a753f766616a51eb6131f0"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T CountLoci" + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -R " + b37KGReference + + " -L " + privateTestDir + "lexicographicallySortedIntervals.bed" + + " -L 4" + + " -isr INTERSECTION" + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntersectionOfLexicographicallySortedIntervals", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index fd87c1c12..d2f29ee7a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -51,7 +51,7 @@ import java.util.*; * testing of the new (non-legacy) version of LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { - private static final boolean DEBUG = true; + private static final 
boolean DEBUG = false; protected LocusIteratorByState li; @Test(enabled = true) @@ -361,7 +361,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { // comprehensive LIBS/PileupElement tests // //////////////////////////////////////////// - @DataProvider(name = "LIBSTest") + @DataProvider(name = "MyLIBSTest") public Object[][] makeLIBSTest() { final List tests = new LinkedList(); @@ -377,7 +377,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { // Arrays.asList(3)); } - @Test(enabled = true && ! DEBUG, dataProvider = "LIBSTest") + @Test(enabled = ! DEBUG, dataProvider = "MyLIBSTest") public void testLIBS(LIBSTest params) { // create the iterator by state with the fake reads and fake records final GATKSAMRecord read = params.makeRead(); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 5587d32f8..b734ecc96 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -216,7 +216,7 @@ public class NanoSchedulerUnitTest extends BaseTest { nanoScheduler.shutdown(); } - @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! 
DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = 2 * NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { if ( test.bufferSize > 1) { logger.warn("Running " + test); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index f845e6670..2a2d80206 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,6 +37,7 @@ import org.testng.annotations.Test; import java.util.*; public class AlignmentUtilsUnitTest { + private final static boolean DEBUG = false; private SAMFileHeader header; /** Basic aligned and mapped read. */ @@ -85,7 +86,7 @@ public class AlignmentUtilsUnitTest { new Object[] {readUnknownStart, false} }; } - @Test(dataProvider = "genomeLocUnmappedReadTests") + @Test(enabled = !DEBUG, dataProvider = "genomeLocUnmappedReadTests") public void testIsReadGenomeLocUnmapped(SAMRecord read, boolean expected) { Assert.assertEquals(AlignmentUtils.isReadGenomeLocUnmapped(read), expected); } @@ -103,7 +104,7 @@ public class AlignmentUtilsUnitTest { new Object[] {readUnknownStart, true} }; } - @Test(dataProvider = "unmappedReadTests") + @Test(enabled = !DEBUG, dataProvider = "unmappedReadTests") public void testIsReadUnmapped(SAMRecord read, boolean expected) { Assert.assertEquals(AlignmentUtils.isReadUnmapped(read), expected); } @@ -144,6 +145,34 @@ public class AlignmentUtilsUnitTest { } + @DataProvider(name = "CalcNumDifferentBasesData") + public Object[][] makeCalcNumDifferentBasesData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"5M", "ACGTA", "ACGTA", 0}); + tests.add(new Object[]{"5M", "ACGTA", "ACGTT", 1}); + tests.add(new Object[]{"5M", "ACGTA", "TCGTT", 2}); + 
tests.add(new Object[]{"5M", "ACGTA", "TTGTT", 3}); + tests.add(new Object[]{"5M", "ACGTA", "TTTTT", 4}); + tests.add(new Object[]{"5M", "ACGTA", "TTTCT", 5}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "ACNNNGTA", 3}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "ACNNNGTT", 4}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "TCNNNGTT", 5}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "ACA", 2}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "ACT", 3}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "TCT", 4}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "TGT", 5}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "CalcNumDifferentBasesData") + public void testCalcNumDifferentBases(final String cigarString, final String ref, final String read, final int expectedDifferences) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + Assert.assertEquals(AlignmentUtils.calcNumDifferentBases(cigar, ref.getBytes(), read.getBytes()), expectedDifferences); + } + + @DataProvider(name = "NumAlignedBasesCountingSoftClips") public Object[][] makeNumAlignedBasesCountingSoftClips() { List tests = new ArrayList(); @@ -160,7 +189,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumAlignedBasesCountingSoftClips") + @Test(enabled = !DEBUG, dataProvider = "NumAlignedBasesCountingSoftClips") public void testNumAlignedBasesCountingSoftClips(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 
10 : cigar.getReadLength()); read.setCigar(cigar); @@ -180,7 +209,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "CigarHasZeroElement") + @Test(enabled = !DEBUG, dataProvider = "CigarHasZeroElement") public void testCigarHasZeroSize(final Cigar cigar, final boolean hasZero) { Assert.assertEquals(AlignmentUtils.cigarHasZeroSizeElement(cigar), hasZero, "Cigar " + cigar.toString() + " failed cigarHasZeroSizeElement"); } @@ -200,7 +229,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumHardClipped") + @Test(enabled = !DEBUG, dataProvider = "NumHardClipped") public void testNumHardClipped(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength()); read.setCigar(cigar); @@ -227,49 +256,54 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumAlignedBlocks") + @Test(enabled = !DEBUG, dataProvider = "NumAlignedBlocks") public void testNumAlignedBlocks(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 
10 : cigar.getReadLength()); read.setCigar(cigar); Assert.assertEquals(AlignmentUtils.getNumAlignmentBlocks(read), expected, "Cigar " + cigar + " failed NumAlignedBlocks"); } - @Test - public void testConsolidateCigar() { - { - //1M1M1M1D2M1M --> 3M1D3M - List list = new ArrayList(); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.D)); - list.add( new CigarElement(2, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - Cigar unconsolidatedCigar = new Cigar(list); + @DataProvider(name = "ConsolidateCigarData") + public Object[][] makeConsolidateCigarData() { + List tests = new ArrayList(); - list.clear(); - list.add( new CigarElement(3, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.D)); - list.add( new CigarElement(3, CigarOperator.M)); - Cigar consolidatedCigar = new Cigar(list); + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{"1M1M", "2M"}); + tests.add(new Object[]{"2M", "2M"}); + tests.add(new Object[]{"2M0M", "2M"}); + tests.add(new Object[]{"0M2M", "2M"}); + tests.add(new Object[]{"0M2M0M0I0M1M", "3M"}); + tests.add(new Object[]{"2M0M1M", "3M"}); + tests.add(new Object[]{"1M1M1M1D2M1M", "3M1D3M"}); + tests.add(new Object[]{"6M6M6M", "18M"}); - Assert.assertEquals(consolidatedCigar.toString(), AlignmentUtils.consolidateCigar(unconsolidatedCigar).toString()); + final List elements = new LinkedList(); + int i = 1; + for ( final CigarOperator op : CigarOperator.values() ) { + elements.add(new CigarElement(i++, op)); + } + for ( final List ops : Utils.makePermutations(elements, 3, false) ) { + final String expected = new Cigar(ops).toString(); + final List cutElements = new LinkedList(); + for ( final CigarElement elt : ops ) { + for ( int j = 0; j < elt.getLength(); j++ ) { + cutElements.add(new 
CigarElement(1, elt.getOperator())); + } + } + + final String actual = new Cigar(cutElements).toString(); + tests.add(new Object[]{actual, expected}); } - { - //6M6M6M --> 18M - List list = new ArrayList(); - list.add( new CigarElement(6, CigarOperator.M)); - list.add( new CigarElement(6, CigarOperator.M)); - list.add( new CigarElement(6, CigarOperator.M)); - Cigar unconsolidatedCigar = new Cigar(list); + return tests.toArray(new Object[][]{}); + } - list.clear(); - list.add( new CigarElement(18, CigarOperator.M)); - Cigar consolidatedCigar = new Cigar(list); - - Assert.assertEquals(consolidatedCigar.toString(), AlignmentUtils.consolidateCigar(unconsolidatedCigar).toString()); - } + @Test(enabled = !DEBUG, dataProvider = "ConsolidateCigarData") + public void testConsolidateCigarWithData(final String testCigarString, final String expectedCigarString) { + final Cigar testCigar = TextCigarCodec.getSingleton().decode(testCigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.consolidateCigar(testCigar); + Assert.assertEquals(actualCigar, expectedCigar); } @DataProvider(name = "SoftClipsDataProvider") @@ -304,7 +338,7 @@ public class AlignmentUtilsUnitTest { return array; } - @Test(dataProvider = "SoftClipsDataProvider") + @Test(enabled = !DEBUG, dataProvider = "SoftClipsDataProvider") public void testSoftClipsData(final byte[] qualsOfSoftClipsOnLeft, final int middleSize, final String middleOp, final byte[] qualOfSoftClipsOnRight, final int qualThreshold, final int numExpected) { final int readLength = (middleOp.equals("D") ? 
0 : middleSize) + qualOfSoftClipsOnRight.length + qualsOfSoftClipsOnLeft.length; final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); @@ -391,7 +425,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "MismatchCountDataProvider") + @Test(enabled = !DEBUG, dataProvider = "MismatchCountDataProvider") public void testMismatchCountData(final GATKSAMRecord read, final int refIndex, final int startOnRead, final int basesToRead, final boolean isMismatch) { final byte[] reference = Utils.dupBytes((byte)'A', 100); final int actual = AlignmentUtils.getMismatchCount(read, reference, refIndex, startOnRead, basesToRead).numMismatches; @@ -476,7 +510,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "AlignmentByteArrayOffsetDataProvider") + @Test(enabled = !DEBUG, dataProvider = "AlignmentByteArrayOffsetDataProvider") public void testAlignmentByteArrayOffsetData(final Cigar cigar, final int offset, final int expectedResult, final boolean isDeletion, final int lengthOfSoftClip) { final int actual = AlignmentUtils.calcAlignmentByteArrayOffset(cigar, isDeletion ? 
-1 : offset, isDeletion, 20, 20 + offset - lengthOfSoftClip); Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString()); @@ -514,7 +548,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ReadToAlignmentByteArrayDataProvider") + @Test(enabled = !DEBUG, dataProvider = "ReadToAlignmentByteArrayDataProvider") public void testReadToAlignmentByteArrayData(final Cigar cigar, final int expectedLength, final char middleOp, final int startOfIndelBases, final int lengthOfDeletion) { final byte[] read = Utils.dupBytes((byte)'A', cigar.getReadLength()); final byte[] alignment = AlignmentUtils.readToAlignmentByteArray(cigar, read); @@ -645,9 +679,358 @@ public class AlignmentUtilsUnitTest { return readString; } - @Test(dataProvider = "LeftAlignIndelDataProvider", enabled = true) + @Test(enabled = !DEBUG, dataProvider = "LeftAlignIndelDataProvider") public void testLeftAlignIndelData(final Cigar originalCigar, final Cigar expectedCigar, final byte[] reference, final byte[] read, final int repeatLength) { final Cigar actualCigar = AlignmentUtils.leftAlignIndel(originalCigar, reference, read, 0, 0, true); Assert.assertTrue(expectedCigar.equals(actualCigar), "Wrong left alignment detected for cigar " + originalCigar.toString() + " to " + actualCigar.toString() + " but expected " + expectedCigar.toString() + " with repeat length " + repeatLength); } + + ////////////////////////////////////////// + // Test AlignmentUtils.trimCigarByReference() // + ////////////////////////////////////////// + + @DataProvider(name = "TrimCigarData") + public Object[][] makeTrimCigarData() { + List tests = new ArrayList(); + + for ( final CigarOperator op : Arrays.asList(CigarOperator.D, CigarOperator.EQ, CigarOperator.X, CigarOperator.M) ) { + for ( int myLength = 1; myLength < 6; myLength++ ) { + for ( int start = 0; start < myLength - 1; start++ ) { + for ( int end = start; end < myLength; 
end++ ) { + final int length = end - start + 1; + + final List padOps = Arrays.asList(CigarOperator.D, CigarOperator.M); + for ( final CigarOperator padOp: padOps) { + for ( int leftPad = 0; leftPad < 2; leftPad++ ) { + for ( int rightPad = 0; rightPad < 2; rightPad++ ) { + tests.add(new Object[]{ + (leftPad > 0 ? leftPad + padOp.toString() : "") + myLength + op.toString() + (rightPad > 0 ? rightPad + padOp.toString() : ""), + start + leftPad, + end + leftPad, + length + op.toString()}); + } + } + } + } + } + } + } + + for ( final int leftPad : Arrays.asList(0, 1, 2, 5) ) { + for ( final int rightPad : Arrays.asList(0, 1, 2, 5) ) { + final int length = leftPad + rightPad; + if ( length > 0 ) { + for ( final int insSize : Arrays.asList(1, 10) ) { + for ( int start = 0; start <= leftPad; start++ ) { + for ( int stop = leftPad; stop < length; stop++ ) { + final int leftPadRemaining = leftPad - start; + final int rightPadRemaining = stop - leftPad + 1; + final String insC = insSize + "I"; + tests.add(new Object[]{ + leftPad + "M" + insC + rightPad + "M", + start, + stop, + (leftPadRemaining > 0 ? leftPadRemaining + "M" : "") + insC + (rightPadRemaining > 0 ? 
rightPadRemaining + "M" : "") + }); + } + } + } + } + } + } + + tests.add(new Object[]{"3M2D4M", 0, 8, "3M2D4M"}); + tests.add(new Object[]{"3M2D4M", 2, 8, "1M2D4M"}); + tests.add(new Object[]{"3M2D4M", 2, 6, "1M2D2M"}); + tests.add(new Object[]{"3M2D4M", 3, 6, "2D2M"}); + tests.add(new Object[]{"3M2D4M", 4, 6, "1D2M"}); + tests.add(new Object[]{"3M2D4M", 5, 6, "2M"}); + tests.add(new Object[]{"3M2D4M", 6, 6, "1M"}); + + tests.add(new Object[]{"2M3I4M", 0, 5, "2M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 5, "1M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 4, "1M3I3M"}); + tests.add(new Object[]{"2M3I4M", 2, 4, "3I3M"}); + tests.add(new Object[]{"2M3I4M", 2, 3, "3I2M"}); + tests.add(new Object[]{"2M3I4M", 2, 2, "3I1M"}); + tests.add(new Object[]{"2M3I4M", 3, 4, "2M"}); + tests.add(new Object[]{"2M3I4M", 3, 3, "1M"}); + tests.add(new Object[]{"2M3I4M", 4, 4, "1M"}); + + // this doesn't work -- but I'm not sure it should + // tests.add(new Object[]{"2M3I4M", 2, 1, "3I"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimCigarData", enabled = ! 
DEBUG) + public void testTrimCigar(final String cigarString, final int start, final int length, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.trimCigarByReference(cigar, start, length); + Assert.assertEquals(actualCigar, expectedCigar); + } + + @DataProvider(name = "TrimCigarByBasesData") + public Object[][] makeTrimCigarByBasesData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"2M3I4M", 0, 8, "2M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 8, "1M3I4M"}); + tests.add(new Object[]{"2M3I4M", 2, 8, "3I4M"}); + tests.add(new Object[]{"2M3I4M", 3, 8, "2I4M"}); + tests.add(new Object[]{"2M3I4M", 4, 8, "1I4M"}); + tests.add(new Object[]{"2M3I4M", 4, 7, "1I3M"}); + tests.add(new Object[]{"2M3I4M", 4, 6, "1I2M"}); + tests.add(new Object[]{"2M3I4M", 4, 5, "1I1M"}); + tests.add(new Object[]{"2M3I4M", 4, 4, "1I"}); + tests.add(new Object[]{"2M3I4M", 5, 5, "1M"}); + + tests.add(new Object[]{"2M2D2I", 0, 3, "2M2D2I"}); + tests.add(new Object[]{"2M2D2I", 1, 3, "1M2D2I"}); + tests.add(new Object[]{"2M2D2I", 2, 3, "2D2I"}); + tests.add(new Object[]{"2M2D2I", 3, 3, "1I"}); + tests.add(new Object[]{"2M2D2I", 2, 2, "2D1I"}); + tests.add(new Object[]{"2M2D2I", 1, 2, "1M2D1I"}); + tests.add(new Object[]{"2M2D2I", 1, 1, "1M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimCigarByBasesData", enabled = !DEBUG) + public void testTrimCigarByBase(final String cigarString, final int start, final int length, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.trimCigarByBases(cigar, start, length); + Assert.assertEquals(actualCigar, expectedCigar); + } + + 
////////////////////////////////////////// + // Test AlignmentUtils.applyCigarToCigar() // + ////////////////////////////////////////// + + @DataProvider(name = "ApplyCigarToCigarData") + public Object[][] makeApplyCigarToCigarData() { + List tests = new ArrayList(); + + for ( int i = 1; i < 5; i++ ) + tests.add(new Object[]{i + "M", i + "M", i + "M"}); + +// * ref : ACGTAC +// * hap : AC---C - 2M3D1M +// * read : AC---C - 3M +// * result: AG---C => 2M3D + tests.add(new Object[]{"3M", "2M3D1M", "2M3D1M"}); + +// * ref : ACxG-TA +// * hap : AC-G-TA - 2M1D3M +// * read : AC-GxTA - 3M1I2M +// * result: AC-GxTA => 2M1D1M1I2M + tests.add(new Object[]{"3M1I2M", "2M1D3M", "2M1D1M1I2M"}); + +// * ref : A-CGTA +// * hap : A-CGTA - 5M +// * read : AxCGTA - 1M1I4M +// * result: AxCGTA => 1M1I4M + tests.add(new Object[]{"1M1I4M", "5M", "1M1I4M"}); + +// * ref : ACGTA +// * hap : ACGTA - 5M +// * read : A--TA - 1M2D2M +// * result: A--TA => 1M2D2M + tests.add(new Object[]{"1M2D2M", "5M", "1M2D2M"}); + +// * ref : AC-GTA +// * hap : ACxGTA - 2M1I3M +// * read : A--GTA - 1M2D3M +// * result: A--GTA => 1M1D3M + tests.add(new Object[]{"108M14D24M2M18I29M92M1000M", "2M1I3M", "2M1I3M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ApplyCigarToCigarData", enabled = !DEBUG) + public void testApplyCigarToCigar(final String firstToSecondString, final String secondToThirdString, final String expectedCigarString) { + final Cigar firstToSecond = TextCigarCodec.getSingleton().decode(firstToSecondString); + final Cigar secondToThird = TextCigarCodec.getSingleton().decode(secondToThirdString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.applyCigarToCigar(firstToSecond, secondToThird); + Assert.assertEquals(actualCigar, expectedCigar); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.applyCigarToCigar() // + ////////////////////////////////////////// + 
+ @DataProvider(name = "ReadOffsetFromCigarData") + public Object[][] makeReadOffsetFromCigarData() { + List tests = new ArrayList(); + + final int SIZE = 10; + for ( int i = 0; i < SIZE; i++ ) { + tests.add(new Object[]{SIZE + "M", i, i}); + } + + // 0123ii45 + // ref : ACGT--AC + // hap : AC--xxAC (2M2D2I2M) + // ref.pos: 01 45 + tests.add(new Object[]{"2M2D2I2M", 0, 0}); + tests.add(new Object[]{"2M2D2I2M", 1, 1}); + tests.add(new Object[]{"2M2D2I2M", 2, 4}); + tests.add(new Object[]{"2M2D2I2M", 3, 4}); + tests.add(new Object[]{"2M2D2I2M", 4, 4}); + tests.add(new Object[]{"2M2D2I2M", 5, 5}); + + // 10132723 - 10132075 - 500 = 148 + // what's the offset of the first match after the I? + // 108M + 14D + 24M + 2M = 148 + // What's the offset of the first base that is after the I? + // 108M + 24M + 2M + 18I = 134M + 18I = 152 - 1 = 151 + tests.add(new Object[]{"108M14D24M2M18I29M92M", 0, 0}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 107, 107}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 108, 108 + 14}); // first base after the deletion + + tests.add(new Object[]{"108M14D24M2M18I29M92M", 132, 132+14}); // 2 before insertion + tests.add(new Object[]{"108M14D24M2M18I29M92M", 133, 133+14}); // last base before insertion + + // entering into the insertion + for ( int i = 0; i < 18; i++ ) { + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+i, 148}); // inside insertion + } + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18, 148}); // first base after insertion matches at same as insertion + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18+1, 149}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18+2, 150}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadOffsetFromCigarData", enabled = !DEBUG) + public void testReadOffsetFromCigar(final String cigarString, final int startOnCigar, final int expectedOffset) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final int actualOffset 
= AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(cigar, startOnCigar); + Assert.assertEquals(actualOffset, expectedOffset); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.addCigarElements() // + ////////////////////////////////////////// + + @DataProvider(name = "AddCigarElementsData") + public Object[][] makeAddCigarElementsData() { + List tests = new ArrayList(); + + final int SIZE = 10; + for ( final CigarOperator op : Arrays.asList(CigarOperator.I, CigarOperator.M, CigarOperator.S, CigarOperator.EQ, CigarOperator.X)) { + for ( int start = 0; start < SIZE; start++ ) { + for ( int end = start; end < SIZE * 2; end ++ ) { + for ( int pos = 0; pos < SIZE * 3; pos++ ) { + int length = 0; + for ( int i = 0; i < SIZE; i++ ) length += (i+pos) >= start && (i+pos) <= end ? 1 : 0; + tests.add(new Object[]{SIZE + op.toString(), pos, start, end, length > 0 ? length + op.toString() : "*"}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AddCigarElementsData", enabled = !DEBUG) + public void testAddCigarElements(final String cigarString, final int pos, final int start, final int end, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final CigarElement elt = cigar.getCigarElement(0); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + + final List elts = new LinkedList(); + final int actualEndPos = AlignmentUtils.addCigarElements(elts, pos, start, end, elt); + + Assert.assertEquals(actualEndPos, pos + elt.getLength()); + Assert.assertEquals(AlignmentUtils.consolidateCigar(new Cigar(elts)), expectedCigar); + } + + @DataProvider(name = "GetBasesCoveringRefIntervalData") + public Object[][] makeGetBasesCoveringRefIntervalData() { + List tests = new ArrayList(); + + // matches + // 0123 + // ACGT + tests.add(new Object[]{"ACGT", 0, 3, "4M", "ACGT"}); + tests.add(new Object[]{"ACGT", 1, 3, "4M", "CGT"}); + 
tests.add(new Object[]{"ACGT", 1, 2, "4M", "CG"}); + tests.add(new Object[]{"ACGT", 1, 1, "4M", "C"}); + + // deletions + // 012345 + // AC--GT + tests.add(new Object[]{"ACGT", 0, 5, "2M2D2M", "ACGT"}); + tests.add(new Object[]{"ACGT", 1, 5, "2M2D2M", "CGT"}); + tests.add(new Object[]{"ACGT", 2, 5, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 3, 5, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 4, 5, "2M2D2M", "GT"}); + tests.add(new Object[]{"ACGT", 5, 5, "2M2D2M", "T"}); + tests.add(new Object[]{"ACGT", 0, 4, "2M2D2M", "ACG"}); + tests.add(new Object[]{"ACGT", 0, 3, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 0, 2, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 0, 1, "2M2D2M", "AC"}); + tests.add(new Object[]{"ACGT", 0, 0, "2M2D2M", "A"}); + + // insertions + // 01--23 + // ACTTGT + tests.add(new Object[]{"ACTTGT", 0, 3, "2M2I2M", "ACTTGT"}); + tests.add(new Object[]{"ACTTGT", 1, 3, "2M2I2M", "CTTGT"}); + tests.add(new Object[]{"ACTTGT", 2, 3, "2M2I2M", "GT"}); + tests.add(new Object[]{"ACTTGT", 3, 3, "2M2I2M", "T"}); + tests.add(new Object[]{"ACTTGT", 0, 2, "2M2I2M", "ACTTG"}); + tests.add(new Object[]{"ACTTGT", 0, 1, "2M2I2M", "AC"}); + tests.add(new Object[]{"ACTTGT", 1, 2, "2M2I2M", "CTTG"}); + tests.add(new Object[]{"ACTTGT", 2, 2, "2M2I2M", "G"}); + tests.add(new Object[]{"ACTTGT", 1, 1, "2M2I2M", "C"}); + + tests.add(new Object[]{"ACGT", 0, 1, "2M2I", "AC"}); + tests.add(new Object[]{"ACGT", 1, 1, "2M2I", "C"}); + tests.add(new Object[]{"ACGT", 0, 0, "2M2I", "A"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetBasesCoveringRefIntervalData", enabled = true) + public void testGetBasesCoveringRefInterval(final String basesString, final int refStart, final int refEnd, final String cigarString, final String expected) { + final byte[] actualBytes = AlignmentUtils.getBasesCoveringRefInterval(refStart, refEnd, basesString.getBytes(), 0, TextCigarCodec.getSingleton().decode(cigarString)); + if ( expected == null ) 
+ Assert.assertNull(actualBytes); + else + Assert.assertEquals(new String(actualBytes), expected); + } + + @DataProvider(name = "StartsOrEndsWithInsertionOrDeletionData") + public Object[][] makeStartsOrEndsWithInsertionOrDeletionData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"2M", false}); + tests.add(new Object[]{"1D2M", true}); + tests.add(new Object[]{"2M1D", true}); + tests.add(new Object[]{"2M1I", true}); + tests.add(new Object[]{"1I2M", true}); + tests.add(new Object[]{"1M1I2M", false}); + tests.add(new Object[]{"1M1D2M", false}); + tests.add(new Object[]{"1M1I2M1I", true}); + tests.add(new Object[]{"1M1I2M1D", true}); + tests.add(new Object[]{"1D1M1I2M", true}); + tests.add(new Object[]{"1I1M1I2M", true}); + tests.add(new Object[]{"1M1I2M1I1M", false}); + tests.add(new Object[]{"1M1I2M1D1M", false}); + tests.add(new Object[]{"1M1D2M1D1M", false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true) + public void testStartsOrEndsWithInsertionOrDeletion(final String cigar, final boolean expected) { + Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected); + } + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index baf4bfbb0..eefc92799 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -31,15 +31,18 @@ import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.List; + 
public class GATKSAMRecordUnitTest extends BaseTest { GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; - final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets + final private static int[] REDUCED_READ_COUNTS = new int[]{10, 20, 30, 40, 1}; @BeforeClass public void init() { @@ -52,11 +55,13 @@ public class GATKSAMRecordUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); } @Test public void testReducedReads() { + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); @@ -66,8 +71,66 @@ public class GATKSAMRecordUnitTest extends BaseTest { } } + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetReducedCountOnNormalRead() { + read.getReducedCount(0); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetReducedTagOnNormalRead() { + read.setReducedCount(0, 2); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testAdjustReducedCountToNegativeNumber() { + reducedRead.setReducedCount(0, 1); + reducedRead.adjustReducedCount(0, -2); + } + + @Test + public void testSetReducedCountOnReducedRead() { + for (int i = 0; i < reducedRead.getReadLength(); i++) { + final byte newCount = (byte)i; + reducedRead.setReducedCount(i, newCount); + Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read 
count not set to the expected value at " + i); + } + + for (int i = 0; i < reducedRead.getReadLength(); i++) { + final int newCount = reducedRead.getReducedCount(i) + i; + reducedRead.adjustReducedCount(i, i); + Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); + } + } + + @Test + public void testReducedReadEncodeAndDecode() { + + // encode + byte[] encoded = GATKSAMRecord.encodeReduceReadCounts(REDUCED_READ_COUNTS); + + // decode + int[] decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); + + // for the heck of it, let's encode and decode again! + encoded = GATKSAMRecord.encodeReduceReadCounts(decoded); + decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); + + for (int i = 0; i < decoded.length; i++) + Assert.assertEquals(decoded[i], REDUCED_READ_COUNTS[i]); + } + + @Test + public void testByteBoundsOnReducedTag() { + reducedRead.setReducedCount(0, 1000); + reducedRead.setReducedReadCountsTag(); + reducedRead.adjustReducedCount(0, -255); + Assert.assertEquals(reducedRead.getReducedCount(0), 0); + } + @Test public void testReducedReadPileupElement() { + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); + PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0); PileupElement reducedreadp = LocusIteratorByState.createPileupForReadAndOffset(reducedRead, 0); @@ -103,7 +166,66 @@ public class GATKSAMRecordUnitTest extends BaseTest { read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null); Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); - } + @Test + public void testStrandlessReads() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + 
Assert.assertEquals(read.isStrandless(), false); + + read.setReadNegativeStrandFlag(false); + Assert.assertEquals(read.isStrandless(), false); + Assert.assertEquals(read.getReadNegativeStrandFlag(), false); + + read.setReadNegativeStrandFlag(true); + Assert.assertEquals(read.isStrandless(), false); + Assert.assertEquals(read.getReadNegativeStrandFlag(), true); + + read.setReadNegativeStrandFlag(true); + read.setIsStrandless(true); + Assert.assertEquals(read.isStrandless(), true); + Assert.assertEquals(read.getReadNegativeStrandFlag(), false, "negative strand flag should return false even through its set for a strandless read"); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testStrandlessReadsFailSetStrand() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + read.setIsStrandless(true); + read.setReadNegativeStrandFlag(true); + } + + @Test + public void testGetReducedCountsIsCorrect() { + final int[] counts = reducedRead.getReducedReadCounts(); + Assert.assertNotSame(counts, reducedRead.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); + for ( int i = 0; i < counts.length; i++ ) + Assert.assertEquals(counts[i], reducedRead.getReducedCount(i), "Reduced counts vector not equal to getReducedCount(i) at " + i); + } + + @DataProvider(name = "ReducedReadCountConversionProvider") + public Object[][] ReducedReadCountConversionTestData() { + List tests = new ArrayList(); + + tests.add(new Object[]{new int[] {100, 100, 100, 101}, new byte[] {100, 0, 0, 1}}); + tests.add(new Object[]{new int[] {1, 100, 100, 0}, new byte[] {1, 99, 99, -1}}); + tests.add(new Object[]{new int[] {127, 100, 0, 1}, new byte[] {127, -27, -127, -126}}); + tests.add(new Object[]{new int[] {1, 127, 51, 126}, new byte[] {1, 126, 50, 125}}); + tests.add(new Object[]{new int[] {300, 127, 1, 255}, new 
byte[] {-1, -128, 2, 0}}); + tests.add(new Object[]{new int[] {1, 300, 51, 126}, new byte[] {1, -2, 50, 125}}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReducedReadCountConversionProvider", enabled = true) + public void reducedReadCountConversionTest(final int[] counts, final byte[] expectedConversion) { + + reducedRead.setReducedReadCountsTag(counts); + final byte[] actualConversion = reducedRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG); + for ( int i = 0; i < actualConversion.length; i++ ) + Assert.assertEquals(actualConversion[i], expectedConversion[i], "Conversion differs at position " + i + ": " + actualConversion[i] + " vs. " + expectedConversion[i]); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index baad67d53..331121c55 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -25,13 +25,19 @@ package org.broadinstitute.sting.utils.sam; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; @@ -179,4 +185,20 @@ public class ReadUtilsUnitTest extends BaseTest { final List reads = new LinkedList(); Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); } + + @Test (enabled = true) + public void testReadWithNs() throws 
FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 3); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 2a15d709a..fcc7c7998 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.utils.variant; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; @@ -692,6 +694,15 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { root.alleles(Arrays.asList(CAref, C)).stop(11).make(), root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); + final Allele twoCopies = Allele.create("GTTTTATTTTA", true); + final Allele zeroCopies = Allele.create("G", false); + final Allele oneCopies = Allele.create("GTTTTA", false); + tests.add(new 
Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), + Arrays.asList( + root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), + root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); + return tests.toArray(new Object[][]{}); } @@ -976,4 +987,119 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); } } + + // -------------------------------------------------------------------------------- + // + // test primitive allele splitting + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "PrimitiveAlleleSplittingData") + public Object[][] makePrimitiveAlleleSplittingData() { + List tests = new ArrayList(); + + // no split + tests.add(new Object[]{"A", "C", 0, null}); + tests.add(new Object[]{"A", "AC", 0, null}); + tests.add(new Object[]{"AC", "A", 0, null}); + + // one split + tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); + tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); + tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); + + // two splits + tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); + tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); + tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); + + // three splits + tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PrimitiveAlleleSplittingData") + public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { + + final int start = 10; + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + + final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); + + if ( expectedSplit > 0 ) { + 
Assert.assertEquals(result.size(), expectedSplit); + for ( int i = 0; i < variantPositions.size(); i++ ) { + Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); + } + } else { + Assert.assertEquals(result.size(), 1); + Assert.assertEquals(vc, result.get(0)); + } + } + + // -------------------------------------------------------------------------------- + // + // test allele remapping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "AlleleRemappingData") + public Object[][] makeAlleleRemappingData() { + List tests = new ArrayList(); + + final Allele originalBase1 = Allele.create((byte)'A'); + final Allele originalBase2 = Allele.create((byte)'T'); + + for ( final byte base1 : BaseUtils.BASES ) { + for ( final byte base2 : BaseUtils.BASES ) { + for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { + Map map = new HashMap(2); + map.put(originalBase1, Allele.create(base1)); + map.put(originalBase2, Allele.create(base2)); + + tests.add(new Object[]{map, numGenotypes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AlleleRemappingData") + public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { + + final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); + + final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); + + final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); + + for ( int i = 0; i < numGenotypes; i++ ) { + final Genotype originalG = originalGC.get(String.format("%d", i)); + final Genotype remappedG = remappedGC.get(String.format("%d", i)); + + Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); + for ( int j = 0; j < originalG.getAlleles().size(); j++ ) + Assert.assertEquals(remappedG.getAllele(j), 
alleleMap.get(originalG.getAllele(j))); + } + } + + private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { + GenomeAnalysisEngine.resetRandomGenerator(); + final Random random = GenomeAnalysisEngine.getRandomGenerator(); + + final GenotypesContext gc = GenotypesContext.create(); + for ( int i = 0; i < numGenotypes; i++ ) { + // choose alleles at random + final List myAlleles = new ArrayList(); + myAlleles.add(alleles.get(random.nextInt(2))); + myAlleles.add(alleles.get(random.nextInt(2))); + + final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); + gc.add(g); + } + + return gc; + } } \ No newline at end of file diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 8a8c76806..55e56889a 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -160,6 +160,9 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", "1000G_phase1.indels", b37, true, false)) + addResource(new Resource("/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf", + "1000G_phase1.snps.high_confidence", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", "Mills_and_1000G_gold_standard.indels", b37, true, false)) @@ -171,7 +174,7 @@ class GATKResourcesBundle extends QScript { "CEUTrio.HiSeq.WGS.b37.bestPractices.phased",b37,true,false)) // - // example call 
set for documentation guide tutorial // addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala new file mode 100644 index 000000000..7b9e657bf --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala @@ -0,0 +1,104 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.queue.extensions.picard + +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +class FastqToSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction /*with PicardBamFunction*/ { + analysisName = "FastqToSam" + javaMainClass = "net.sf.picard.sam.FastqToSam" + + @Input(shortName = "fq1", fullName = "input_fq_file1", required = true, doc = "Input Fastq file to extract reads from (single-end fastq or, if paired, first end of the pair fastq)") + var fastq: File = _ + + @Input(shortName = "fq2", fullName = "input_fq_file2", required = false, doc = "Input Fastq file to extract reads from (if paired, second end of the pair fastq).") + var secondEndFastQ: File = _ + + @Output(shortName = "bam", fullName = "output_bam_file", required = true, doc = "Output bam file .") + var bam: File = _ + + @Argument(shortName = "SM", fullName = "SM", required = false, doc = "SM") + var SM: String = "SM" + + @Argument(shortName = "LIB", fullName = "LIB", required = false, doc = "LIB") + var LIB: String = "LIB" + + @Argument(shortName = "PU", fullName = "PU", required = false, doc = "PU") + var PU: String = "PU" + + @Argument(shortName = "RG", fullName = "RG", required = false, doc = "RG") + var RG: String = "RG" + + @Argument(shortName = "PL", fullName = "PL", required = false, doc = "PL") + var PL: String = "illumina" + + @Argument(shortName = "CN", fullName = "CN", required = false, doc = "CN") + var CN: String = "CN" + + +// override def inputBams = Seq(fastq) +// override def outputBam = bam +// this.sortOrder = null + val createIndex:Boolean = true + override def commandLine = super.commandLine + + required("FASTQ=" + fastq) + + optional("FASTQ2=", secondEndFastQ, spaceSeparated=false) + + required("OUTPUT=" + bam) + + optional("READ_GROUP_NAME=", RG, spaceSeparated=false) + + required("SAMPLE_NAME=" + SM) + + optional("LIBRARY_NAME=", LIB, spaceSeparated=false) + + optional("PLATFORM_UNIT=", PU, spaceSeparated=false) + + optional("PLATFORM=", PL, 
spaceSeparated=false) + + optional("CREATE_INDEX=", createIndex, spaceSeparated=false) + + optional("SEQUENCING_CENTER=", CN, spaceSeparated=false) +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala index 344f5fe5b..529615c24 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala @@ -31,7 +31,7 @@ import org.broadinstitute.sting.commandline.{Argument, Output, Input} /** * Basic snpEff support. - * See: http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator + * See: http://www.broadinstitute.org/gatk/guide/article?id=50 */ class SnpEff extends JavaCommandLineFunction { javaMainClass = "ca.mcgill.mcb.pcingola.snpEffect.commandLine.SnpEff" diff --git a/public/testdata/exampleGRP.grp b/public/testdata/exampleGRP.grp index 2ec55ec57..61a10ac4a 100644 --- a/public/testdata/exampleGRP.grp +++ b/public/testdata/exampleGRP.grp @@ -1,1518 +1,1118 @@ #:GATKReport.v1.1:5 -#:GATKTable:2:14::; +#:GATKTable:2:18:%s:%s:; #:GATKTable:Arguments:Recalibration argument collection values used in this run -Argument Value -covariate null -default_platform null -deletions_context_size 8 -force_platform null -insertions_context_size 8 -insertions_default_quality 45 -low_quality_tail 2 -mismatches_context_size 2 -mismatches_default_quality -1 -no_standard_covs false -quantizing_levels 16 -run_without_dbsnp false -solid_nocall_strategy THROW_EXCEPTION -solid_recal_mode SET_Q_ZERO +Argument Value +binary_tag_name null +covariate ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate +default_platform null +deletions_default_quality 45 +force_platform null +indels_context_size 3 +insertions_default_quality 45 +low_quality_tail 2 +maximum_cycle_value 
500 +mismatches_context_size 2 +mismatches_default_quality -1 +no_standard_covs false +plot_pdf_file null +quantizing_levels 16 +recalibration_report null +run_without_dbsnp false +solid_nocall_strategy THROW_EXCEPTION +solid_recal_mode SET_Q_ZERO -#:GATKTable:3:94:::; +#:GATKTable:3:94:%s:%s:%s:; #:GATKTable:Quantized:Quality quantization map QualityScore Count QuantizedScore -0 20 3 -1 0 3 -2 6 3 -3 1041 3 -4 8 3 -5 190 3 -6 102 3 -7 28 7 -8 795 8 -9 0 93 -10 0 93 -11 0 93 -12 0 93 -13 0 93 -14 0 93 -15 0 93 -16 0 93 -17 0 93 -18 0 93 -19 0 93 -20 0 93 -21 0 93 -22 0 93 -23 0 93 -24 0 93 -25 0 93 -26 0 93 -27 0 93 -28 0 93 -29 0 93 -30 0 93 -31 0 93 -32 0 93 -33 0 93 -34 0 93 -35 0 93 -36 0 93 -37 0 93 -38 0 93 -39 0 93 -40 0 93 -41 0 93 -42 0 93 -43 0 93 -44 0 93 -45 0 93 -46 0 93 -47 0 93 -48 0 93 -49 0 93 -50 0 93 -51 0 93 -52 0 93 -53 0 93 -54 0 93 -55 0 93 -56 0 93 -57 0 93 -58 0 93 -59 0 93 -60 0 93 -61 0 93 -62 0 93 -63 0 93 -64 0 93 -65 0 93 -66 0 93 -67 0 93 -68 0 93 -69 0 93 -70 0 93 -71 0 93 -72 0 93 -73 0 93 -74 0 93 -75 0 93 -76 0 93 -77 0 93 -78 0 93 -79 0 93 -80 0 93 -81 0 93 -82 0 82 -83 0 83 -84 0 84 -85 0 85 -86 0 86 -87 0 87 -88 0 88 -89 0 89 -90 0 90 -91 0 91 -92 0 92 -93 0 93 + 0 0 8 + 1 0 8 + 2 0 8 + 3 0 8 + 4 0 8 + 5 0 8 + 6 11 8 + 7 0 8 + 8 7 8 + 9 4 8 + 10 1 8 + 11 2 8 + 12 4 19 + 13 3 19 + 14 1 19 + 15 5 19 + 16 10 19 + 17 6 19 + 18 7 19 + 19 15 19 + 20 5 19 + 21 17 19 + 22 9 19 + 23 15 23 + 24 20 24 + 25 15 13 + 26 6 13 + 27 22 27 + 28 15 28 + 29 20 29 + 30 20 30 + 31 25 31 + 32 32 32 + 33 35 33 + 34 36 34 + 35 0 93 + 36 0 93 + 37 0 93 + 38 0 93 + 39 0 93 + 40 0 93 + 41 0 93 + 42 0 93 + 43 0 93 + 44 0 93 + 45 736 45 + 46 0 93 + 47 0 93 + 48 0 93 + 49 0 93 + 50 0 93 + 51 0 93 + 52 0 93 + 53 0 93 + 54 0 93 + 55 0 93 + 56 0 93 + 57 0 93 + 58 0 93 + 59 0 93 + 60 0 93 + 61 0 93 + 62 0 93 + 63 0 93 + 64 0 93 + 65 0 93 + 66 0 93 + 67 0 93 + 68 0 93 + 69 0 93 + 70 0 93 + 71 0 93 + 72 0 93 + 73 0 93 + 74 0 93 + 75 0 93 + 76 0 93 + 77 0 93 + 78 0 
93 + 79 0 93 + 80 0 93 + 81 0 93 + 82 0 93 + 83 0 93 + 84 0 93 + 85 0 93 + 86 0 93 + 87 0 93 + 88 0 93 + 89 0 93 + 90 0 93 + 91 0 93 + 92 0 93 + 93 0 93 -#:GATKTable:6:3:%s:%s:%.4f:%.4f:%d:%d:; +#:GATKTable:6:3:%s:%s:%.4f:%.4f:%d:%.2f:; #:GATKTable:RecalTable0: -ReadGroup EventType EmpiricalQuality EstimatedQReported Observations Errors -exampleBAM.bam.bam D 25.8092 45.0000 380 0 -exampleBAM.bam.bam M 14.0483 15.4820 380 14 -exampleBAM.bam.bam I 25.8092 45.0000 380 0 +ReadGroup EventType EmpiricalQuality EstimatedQReported Observations Errors +exampleBAM.bam M 17.0000 17.4959 368 11.00 +exampleBAM.bam I 45.0000 45.0000 368 0.00 +exampleBAM.bam D 45.0000 45.0000 368 0.00 -#:GATKTable:6:32:%s:%s:%s:%.4f:%d:%d:; +#:GATKTable:6:30:%s:%s:%s:%.4f:%d:%.2f:; #:GATKTable:RecalTable1: -ReadGroup QualityScore EventType EmpiricalQuality Observations Errors -exampleBAM.bam.bam 32 M 15.1851 32 0 -exampleBAM.bam.bam 19 M 9.0309 15 1 -exampleBAM.bam.bam 33 M 15.5630 35 0 -exampleBAM.bam.bam 18 M 6.0206 7 1 -exampleBAM.bam.bam 34 M 15.6820 36 0 -exampleBAM.bam.bam 17 M 5.4407 6 1 -exampleBAM.bam.bam 16 M 7.4036 10 1 -exampleBAM.bam.bam 23 M 12.0412 15 0 -exampleBAM.bam.bam 6 M 4.7712 11 3 -exampleBAM.bam.bam 45 I 25.8092 380 0 -exampleBAM.bam.bam 22 M 10.0000 9 0 -exampleBAM.bam.bam 4 M 4.7712 5 1 -exampleBAM.bam.bam 21 M 12.5527 17 0 -exampleBAM.bam.bam 5 M 4.2597 7 2 -exampleBAM.bam.bam 20 M 4.7712 5 1 -exampleBAM.bam.bam 27 M 13.6173 22 0 -exampleBAM.bam.bam 10 M 3.0103 1 0 -exampleBAM.bam.bam 26 M 8.4510 6 0 -exampleBAM.bam.bam 11 M 1.7609 2 1 -exampleBAM.bam.bam 8 M 6.0206 7 1 -exampleBAM.bam.bam 25 M 12.0412 15 0 -exampleBAM.bam.bam 9 M 6.9897 4 0 -exampleBAM.bam.bam 24 M 10.2119 20 1 -exampleBAM.bam.bam 31 M 14.1497 25 0 -exampleBAM.bam.bam 14 M 3.0103 1 0 -exampleBAM.bam.bam 30 M 13.2222 20 0 -exampleBAM.bam.bam 15 M 7.7815 5 0 -exampleBAM.bam.bam 12 M 6.9897 4 0 -exampleBAM.bam.bam 29 M 13.2222 20 0 -exampleBAM.bam.bam 45 D 25.8092 380 0 -exampleBAM.bam.bam 13 M 6.0206 3 0 
-exampleBAM.bam.bam 28 M 12.0412 15 0 +ReadGroup QualityScore EventType EmpiricalQuality Observations Errors +exampleBAM.bam 6 M 6.0000 11 3.00 +exampleBAM.bam 8 M 8.0000 7 1.00 +exampleBAM.bam 9 M 9.0000 4 0.00 +exampleBAM.bam 10 M 10.0000 1 0.00 +exampleBAM.bam 11 M 11.0000 2 1.00 +exampleBAM.bam 12 M 12.0000 4 0.00 +exampleBAM.bam 13 M 13.0000 3 0.00 +exampleBAM.bam 14 M 14.0000 1 0.00 +exampleBAM.bam 15 M 15.0000 5 0.00 +exampleBAM.bam 16 M 16.0000 10 1.00 +exampleBAM.bam 17 M 17.0000 6 1.00 +exampleBAM.bam 18 M 18.0000 7 1.00 +exampleBAM.bam 19 M 19.0000 15 1.00 +exampleBAM.bam 20 M 20.0000 5 1.00 +exampleBAM.bam 21 M 21.0000 17 0.00 +exampleBAM.bam 22 M 22.0000 9 0.00 +exampleBAM.bam 23 M 23.0000 15 0.00 +exampleBAM.bam 24 M 24.0000 20 1.00 +exampleBAM.bam 25 M 25.0000 15 0.00 +exampleBAM.bam 26 M 26.0000 6 0.00 +exampleBAM.bam 27 M 27.0000 22 0.00 +exampleBAM.bam 28 M 28.0000 15 0.00 +exampleBAM.bam 29 M 29.0000 20 0.00 +exampleBAM.bam 30 M 30.0000 20 0.00 +exampleBAM.bam 31 M 31.0000 25 0.00 +exampleBAM.bam 32 M 32.0000 32 0.00 +exampleBAM.bam 33 M 33.0000 35 0.00 +exampleBAM.bam 34 M 34.0000 36 0.00 +exampleBAM.bam 45 I 45.0000 368 0.00 +exampleBAM.bam 45 D 45.0000 368 0.00 -#:GATKTable:8:1354:%s:%s:%s:%s:%s:%.4f:%d:%d:; +#:GATKTable:8:952:%s:%s:%s:%s:%s:%.4f:%d:%.2f:; #:GATKTable:RecalTable2: -ReadGroup QualityScore CovariateValue CovariateName EventType EmpiricalQuality Observations Errors -exampleBAM.bam.bam 45 TGAAAGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTATTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTCGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTGTGTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTAAGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTATTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 23 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 27 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ATTCTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTAATCTC Context I 3.0103 1 0 
-exampleBAM.bam.bam 34 GC Context M 4.7712 2 0 -exampleBAM.bam.bam 8 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 TAGAGTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 9 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTCGGG Context I 6.0206 3 0 -exampleBAM.bam.bam 45 AGTTTCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CATTTCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 16 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 5 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGATAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 53 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 57 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 25 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCAGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 33 CT Context M 8.4510 6 0 -exampleBAM.bam.bam 45 AAGTGACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGACAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGAGTTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTCTTTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTGAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 12 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 41 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 21 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 26 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ACCTGGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACAGCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 20 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGTGGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCAAAATC Context I 3.0103 1 0 -exampleBAM.bam.bam 27 TA Context M 6.9897 4 0 -exampleBAM.bam.bam 27 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AAAATCTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 22 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 26 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 33 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTATTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTCAATGT Context I 3.0103 1 0 -exampleBAM.bam.bam 21 73 Cycle M 3.0103 1 
0 -exampleBAM.bam.bam 17 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 17 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATCGTGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGATCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATCGTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 52 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 56 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 9 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 23 CT Context M 4.7712 2 0 -exampleBAM.bam.bam 31 26 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 ATGTGAAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTACTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACACAGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 26 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTTGG Context D 4.7712 2 0 -exampleBAM.bam.bam 33 8 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 34 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTCTTAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GAGCCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 20 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAGGG Context D 4.7712 2 0 -exampleBAM.bam.bam 33 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGCAAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 6 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 32 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 29 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 13 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 21 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTAATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATTATTG Context D 3.0103 1 0 -exampleBAM.bam.bam 24 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTCAGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GACATGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCATGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 21 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 25 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 34 47 Cycle M 3.0103 1 0 
-exampleBAM.bam.bam 31 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 GG Context M 3.9794 4 1 -exampleBAM.bam.bam 9 16 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCAGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAAGTGAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGACATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 55 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 59 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CATGATCG Context I 3.0103 1 0 -exampleBAM.bam.bam 16 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 32 43 Cycle M 6.0206 3 0 -exampleBAM.bam.bam 19 33 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GTATTTGC Context D 3.0103 1 0 -exampleBAM.bam.bam 26 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTAAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 33 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 11 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 18 1 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ACCCAGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGACAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTTTGC Context D 3.0103 1 0 -exampleBAM.bam.bam 27 16 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 32 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 21 44 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTACTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGCTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 16 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 25 21 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 22 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGCCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 20 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 24 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 30 26 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGTATTT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 53 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 CC 
Context M 3.0103 1 0 -exampleBAM.bam.bam 19 70 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 25 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCCACC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 54 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 58 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ACTTTCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGTGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTGATAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AATGTGAA Context I 3.0103 1 0 -exampleBAM.bam.bam 9 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 19 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 28 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTTG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 TCTTTGTA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GTTCGGGT Context I 6.0206 3 0 -exampleBAM.bam.bam 27 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 27 17 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCAAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGCAGCCT Context I 3.0103 1 0 -exampleBAM.bam.bam 20 GT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 TGGAGCCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTGGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 28 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGTGTCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCTCCAGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 49 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 61 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CCTCGTCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCACCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 44 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 AGGTTATC Context I 3.0103 1 0 -exampleBAM.bam.bam 34 41 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 
45 TTGGGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTGTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGTTGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTCACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 9 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 19 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 31 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 25 TA Context M 6.0206 3 0 -exampleBAM.bam.bam 34 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 28 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAGATTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 GGCTGGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATTAGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 GG Context M 3.0103 3 1 -exampleBAM.bam.bam 32 15 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 22 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCAGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGCCAGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTCTTTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGAACTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 26 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTCTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGATAACC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTTTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCTTTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 46 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 29 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATCCATTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 48 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 60 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GATCCAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATGAGTC Context D 3.0103 1 0 
-exampleBAM.bam.bam 24 TT Context M 3.0103 3 1 -exampleBAM.bam.bam 45 TCTTTATA Context I 3.0103 1 0 -exampleBAM.bam.bam 6 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 23 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 34 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 18 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 30 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CAAAATCT Context I 3.0103 1 0 -exampleBAM.bam.bam 22 15 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCATGGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAATCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 CA Context M 6.9897 4 0 -exampleBAM.bam.bam 45 CCCAGATC Context D 3.0103 1 0 -exampleBAM.bam.bam 18 36 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 16 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGTATTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 33 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTAGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 19 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 32 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 32 14 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 12 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGCCT Context I 3.0103 1 0 -exampleBAM.bam.bam 4 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 27 53 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 23 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTATTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATAAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 51 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 63 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CACCCAGA Context I 3.0103 1 0 
-exampleBAM.bam.bam 45 CGTGAGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCTTTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 34 CT Context M 4.7712 2 0 -exampleBAM.bam.bam 4 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTT Context I 4.7712 2 0 -exampleBAM.bam.bam 24 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCATGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CACATGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 17 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 29 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ATCAATAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ACCATGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 32 GT Context M 8.4510 6 0 -exampleBAM.bam.bam 19 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCATTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATAACCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AACTGGGA Context I 3.0103 1 0 -exampleBAM.bam.bam 4 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 33 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCAGGCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCACTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTCCAGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 6 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 23 15 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GATATAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTAGAGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 50 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 62 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GCCACCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTCGG Context D 6.0206 3 0 -exampleBAM.bam.bam 24 TC Context M 6.0206 3 0 -exampleBAM.bam.bam 25 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 16 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 28 Cycle D 
7.7815 5 0 -exampleBAM.bam.bam 45 ACATGGTA Context I 3.0103 1 0 -exampleBAM.bam.bam 16 34 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 AATCTCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTCACT Context I 3.0103 1 0 -exampleBAM.bam.bam 22 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 ATATCAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAATGTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTCAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGCAATCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTAATGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 30 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 GG Context M 7.7815 5 0 -exampleBAM.bam.bam 20 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 20 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTAGATT Context D 3.0103 1 0 -exampleBAM.bam.bam 33 44 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCTGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGAGATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 21 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATATAAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTTTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACTTTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CCATTTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGGCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 11 TT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 TTTCACTG Context I 3.0103 1 0 -exampleBAM.bam.bam 33 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCGTGAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TACTCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAATGAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 
GTGTCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCTTTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 22 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTTTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGCCAGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 1 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TTTCAGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATTCTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGATAACC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTCTTTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGAACTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 21 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 32 33 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGCTGGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GATTAGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 33 35 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAGATTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGG Context D 4.7712 2 0 -exampleBAM.bam.bam 19 CT Context M 1.7609 2 1 -exampleBAM.bam.bam 45 19 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 31 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TGTTGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTGTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTCACA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 30 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 30 17 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 33 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 36 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 21 64 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 16 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTCGTCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 49 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 61 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGGTTATC Context D 3.0103 1 0 
-exampleBAM.bam.bam 45 GGCACCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGTGTCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTCCAGG Context D 3.0103 1 0 -exampleBAM.bam.bam 6 AA Context M 4.7712 2 0 -exampleBAM.bam.bam 31 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 31 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGCCT Context D 3.0103 1 0 -exampleBAM.bam.bam 18 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 18 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 27 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TGTATTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTAGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 13 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 20 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCCAGATC Context I 3.0103 1 0 -exampleBAM.bam.bam 32 2 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCATGGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAATCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 30 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 18 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 30 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CCAGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAAAATCT Context D 3.0103 1 0 -exampleBAM.bam.bam 25 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 6 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 17 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 23 35 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTTATA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GATCCAGT Context D 
3.0103 1 0 -exampleBAM.bam.bam 45 48 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 60 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATCCATTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATGAGTC Context I 3.0103 1 0 -exampleBAM.bam.bam 31 TA Context M 4.7712 2 0 -exampleBAM.bam.bam 21 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 34 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTCCAGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 18 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 33 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCAGGCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCACTT Context D 3.0103 1 0 -exampleBAM.bam.bam 28 53 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 19 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 32 1 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GATAACCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AACTGGGA Context D 3.0103 1 0 -exampleBAM.bam.bam 16 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCATTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 21 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 AT Context M 8.4510 6 0 -exampleBAM.bam.bam 16 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CACATGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 17 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 29 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATCAATAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACCATGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTT Context D 4.7712 2 0 -exampleBAM.bam.bam 45 TCCATGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 6 AG Context M -0.0000 1 1 -exampleBAM.bam.bam 6 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATAAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 51 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 63 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CGTGAGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CACCCAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 16 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 5 70 Cycle M 3.0103 1 
0 -exampleBAM.bam.bam 45 GCTTTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTATTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 34 64 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 AC Context M 6.0206 3 0 -exampleBAM.bam.bam 33 2 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCACTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCGTGAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAATGAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TACTCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CACTTTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCATTTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATATAAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTTTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGGCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 29 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TTTCTGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 33 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGAGATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTAGATT Context I 3.0103 1 0 -exampleBAM.bam.bam 34 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGCAATCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTAATGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 30 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 TA Context M 6.9897 4 0 -exampleBAM.bam.bam 45 16 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 28 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ACATGGTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTCAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAATGTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATCTCCA Context 
I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTCACT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATATCAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 8 57 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 34 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 16 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 GGGTTCGG Context I 6.0206 3 0 -exampleBAM.bam.bam 45 CTAGAGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 50 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 62 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GATATAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCACCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACCTGGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 5 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGTGGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCAAAATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACAGCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 28 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 33 39 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 23 64 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AAGTGACA Context D 3.0103 1 0 -exampleBAM.bam.bam 5 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGAGTTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGACAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTGAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTCTTTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 33 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGGCAGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 4 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 29 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 34 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGTTTCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CATTTCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 53 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 57 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CATGATAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGAGTTT Context D 3.0103 1 0 
-exampleBAM.bam.bam 45 GGTTCGGG Context D 6.0206 3 0 -exampleBAM.bam.bam 45 CTTTATTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTCGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTGTGTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTAAGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTCTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTAATCTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 23 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 27 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 30 21 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGAAAGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTATTA Context I 3.0103 1 0 -exampleBAM.bam.bam 23 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAGGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 GTGCAAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 28 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 ATTCTTAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGCCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 27 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 19 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 4 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTTGG Context I 4.7712 2 0 -exampleBAM.bam.bam 33 AG Context M 6.0206 3 0 -exampleBAM.bam.bam 28 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTACTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ACACAGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGTGAAC Context I 3.0103 1 0 -exampleBAM.bam.bam 32 36 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 29 TA Context M 4.7712 2 0 -exampleBAM.bam.bam 34 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 76 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 30 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATCGTGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GATCGTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 52 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 56 
Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CCAGATCC Context D 3.0103 1 0 -exampleBAM.bam.bam 16 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 8 63 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 14 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 23 AT Context M 6.0206 3 0 -exampleBAM.bam.bam 19 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTATTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTCAATGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AAAATCTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 22 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 26 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 34 2 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 6 68 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 23 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 28 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 5 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTACTC Context D 3.0103 1 0 -exampleBAM.bam.bam 33 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGCTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 28 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 4 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 29 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 18 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGACAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTTTGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACCCAGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTAAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 13 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTATTTGC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 23 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 8 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 22 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGATCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 55 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 59 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TCCAGTTC Context D 3.0103 1 0 
-exampleBAM.bam.bam 45 GTGACATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAAGTGAC Context D 3.0103 1 0 -exampleBAM.bam.bam 4 64 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 25 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 22 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 CTTTCAGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCATGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 21 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 25 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GACATGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTATTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTAATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 32 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 23 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGAGCCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTGGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 28 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCAAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCAGCCT Context D 3.0103 1 0 -exampleBAM.bam.bam 34 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTTGTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTCGGGT Context D 6.0206 3 0 -exampleBAM.bam.bam 28 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 18 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTTG Context D 4.7712 2 0 -exampleBAM.bam.bam 34 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 30 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCCACC Context D 3.0103 1 0 -exampleBAM.bam.bam 20 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGTGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTGATAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATGTGAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 54 Cycle D 
7.7815 5 0 -exampleBAM.bam.bam 45 58 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ACTTTCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 23 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 15 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGTATTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 20 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 24 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CAGGCCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 23 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 CG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGATATA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTTAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 15 14 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GAACTGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 6 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 10 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GGGCTGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 31 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 31 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 30 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTCTAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TATTTGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 24 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTTTGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGCACC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 36 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 40 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 29 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 21 29 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAATCTCC Context I 3.0103 1 0 -exampleBAM.bam.bam 15 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 33 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCTGGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 66 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 
45 CTTGGCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCCACCA Context D 3.0103 1 0 -exampleBAM.bam.bam 19 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TTCAGGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGAGC Context I 3.0103 1 0 -exampleBAM.bam.bam 28 GG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 GAGATTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 7 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 11 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 TTACTCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTATATC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTTAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTATTACT Context D 3.0103 1 0 -exampleBAM.bam.bam 31 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 34 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 36 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ACAGCAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGCAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 37 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 41 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 TCCAGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAGTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTATCATG Context D 3.0103 1 0 -exampleBAM.bam.bam 24 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 29 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 32 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 67 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 18 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTGGAGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGATTTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAATCTAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTGAAAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCACCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTGTGTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 28 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 19 TT 
Context M 4.7712 2 0 -exampleBAM.bam.bam 29 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTGGAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCAGGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTATTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 33 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTATTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAACCTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 30 CA Context M 6.0206 3 0 -exampleBAM.bam.bam 15 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GACACAGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AACCTGGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 4 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 8 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 25 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 6 63 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TTTGCAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGCACT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTAAGTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGAGTCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTCGTCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 38 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 42 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 34 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 CG Context M 3.0103 1 0 -exampleBAM.bam.bam 31 8 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 26 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATAAAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGTTGG Context D 4.7712 2 0 -exampleBAM.bam.bam 45 64 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 76 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GATTCTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGACACAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGTTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 29 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 GG Context M 6.9897 4 0 -exampleBAM.bam.bam 8 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 
GTGAACTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 9 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTGAAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 20 29 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 12 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCACCCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 16 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATCGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 5 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 9 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 30 CC Context M 4.7712 2 0 -exampleBAM.bam.bam 23 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 43 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATAACCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 39 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 43 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GAAAGTGC Context D 3.0103 1 0 -exampleBAM.bam.bam 24 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 24 6 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TTATTGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 34 63 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 65 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 18 TT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 GATTTTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGTTCTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAAAGACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGAGTGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCACAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGAGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 19 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 5 26 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 AAGTGCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTGCAA 
Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTAATC Context I 3.0103 1 0 -exampleBAM.bam.bam 20 28 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 GGTATTAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGTGAACT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCCTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TCGTCCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGATTCTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATCCAGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 32 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 44 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CATGATTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAATCCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGTTCTA Context I 3.0103 1 0 -exampleBAM.bam.bam 34 26 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 AT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 GGGTTAGG Context D 4.7712 2 0 -exampleBAM.bam.bam 30 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATATCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCAATCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGAGCCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGATCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 2 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 14 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GAGTGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 32 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 21 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGTCTTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCTTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 13 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATT Context D 3.0103 1 0 -exampleBAM.bam.bam 29 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 19 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATCAATA Context I 3.0103 1 0 
-exampleBAM.bam.bam 45 TTTGGGCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGTTAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGCACTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAGAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 26 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 20 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTCGTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 70 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 74 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 18 22 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 15 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 31 GC Context M 6.0206 3 0 -exampleBAM.bam.bam 45 33 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 45 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GGAGATTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGATCCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 16 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTCCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 13 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATCATGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGACATGG Context I 3.0103 1 0 -exampleBAM.bam.bam 17 TT Context M 3.0103 3 1 -exampleBAM.bam.bam 31 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 34 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 3 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 15 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 TTATATCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGATATAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTATCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCACTGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGCCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 19 21 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 32 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CACTGATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATAAAGAC Context I 
3.0103 1 0 -exampleBAM.bam.bam 45 GCACTTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCCTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 28 CT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 71 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 75 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 AGCAAAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCAATC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 29 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 26 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTTGGG Context D 4.7712 2 0 -exampleBAM.bam.bam 45 GGGTTGGG Context D 6.0206 3 0 -exampleBAM.bam.bam 24 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTTCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTAGATTT Context D 3.0103 1 0 -exampleBAM.bam.bam 16 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 34 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 46 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ATGAGTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 27 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 GG Context M 6.9897 4 0 -exampleBAM.bam.bam 34 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 33 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 15 8 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 26 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 12 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GGCCTGAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGATTAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCAGCCTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AATCCATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTATAT Context D 3.0103 1 0 -exampleBAM.bam.bam 29 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GTTAGGGT Context I 6.0206 3 0 -exampleBAM.bam.bam 45 ACTCTTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACATGATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 
ATTATTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 32 28 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 29 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 AT Context M 6.9897 4 0 -exampleBAM.bam.bam 45 TGGGTTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGTTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 26 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTCTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 68 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 72 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 AGTCAATG Context I 3.0103 1 0 -exampleBAM.bam.bam 29 8 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 CG Context M 4.7712 2 0 -exampleBAM.bam.bam 4 29 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 16 TT Context M 3.9794 4 1 -exampleBAM.bam.bam 45 CACCATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 35 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 47 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CTATTCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATCTAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTTGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 30 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCACATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 9 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GTCCATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 31 13 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 34 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AAGACACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCACCATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 1 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 13 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 16 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CGTCCATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTGGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCGGGTT Context I 6.0206 3 0 -exampleBAM.bam.bam 45 TTAGGGTT Context I 6.0206 3 0 
-exampleBAM.bam.bam 45 TGGGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 9 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTATCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 17 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 34 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATA Context D 3.0103 1 0 -exampleBAM.bam.bam 28 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTGATA Context D 3.0103 1 0 -exampleBAM.bam.bam 29 43 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGTTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGTTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 69 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 73 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 28 41 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGATCGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 29 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 12 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 29 6 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTCGTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 70 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 74 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TTTGGGCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TATCAATA Context D 3.0103 1 0 -exampleBAM.bam.bam 33 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 TTGGTTAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAGAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGCACTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 4 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 10 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 27 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 TT Context M 1.7609 2 1 -exampleBAM.bam.bam 18 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCTTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGTCTTTA Context D 3.0103 1 0 
-exampleBAM.bam.bam 45 TCAATGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 12 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGAGCCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGATCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 2 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 14 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GCAATCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 15 AA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GGGTTAGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 TATATCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 17 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGATTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 32 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 44 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATCCAGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGTTCTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAATCCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGATTCTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCGTCCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 24 13 Cycle M 6.0206 3 0 -exampleBAM.bam.bam 30 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 29 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 39 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCAATC Context D 3.0103 1 0 -exampleBAM.bam.bam 33 TT Context M 6.9897 4 0 -exampleBAM.bam.bam 30 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 71 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 75 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGCAAAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 32 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 TC Context M 6.0206 3 0 -exampleBAM.bam.bam 29 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 
CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 ATAAAGAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACTGATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCCTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCACTTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 25 14 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 52 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 TGATATAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTATCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTATATCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCACTGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGCCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 3 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 15 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 17 63 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 30 35 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TATCATGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGACATGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGATCCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 33 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 45 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GGAGATTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTCCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 68 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 72 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGTCAATG Context D 3.0103 1 0 -exampleBAM.bam.bam 33 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGTTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGTTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTCTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 4 TT Context M -0.0000 1 1 -exampleBAM.bam.bam 29 4 
Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ACTCTTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 18 58 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 ATTATTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACATGATC Context I 3.0103 1 0 -exampleBAM.bam.bam 28 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 33 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTAGGGT Context D 6.0206 3 0 -exampleBAM.bam.bam 32 16 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 32 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GGCCTGAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 12 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGATTAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCAGCCTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATCCATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTATAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 22 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 24 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 GT Context M 6.0206 3 0 -exampleBAM.bam.bam 31 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 34 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 46 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATGAGTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTTCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTGGG Context I 6.0206 3 0 -exampleBAM.bam.bam 45 GGTTTGGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 TTAGATTT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 25 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 10 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 11 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 33 TC Context M 8.4510 6 0 -exampleBAM.bam.bam 45 TGATCGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGTTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 
45 CCAGTTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 69 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 73 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 32 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 29 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTGATA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATA Context I 3.0103 1 0 -exampleBAM.bam.bam 32 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGGGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTAGGGTT Context D 6.0206 3 0 -exampleBAM.bam.bam 45 TTCGGGTT Context D 6.0206 3 0 -exampleBAM.bam.bam 45 TTGGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTATCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CGTCCATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCACCATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAGACACA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 1 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 13 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CTGGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 22 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 25 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 8 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 34 21 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTTGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCACATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTCCATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACCATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 35 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 47 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CTATTCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AATCTAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 25 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 1 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 18 Cycle M 3.0103 1 0 
-exampleBAM.bam.bam 31 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GAGATTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCAGGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 13 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGAGC Context D 3.0103 1 0 -exampleBAM.bam.bam 21 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 21 17 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 12 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGCCACCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCTGGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTGGCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 66 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 26 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TAATCTCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 28 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 17 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 6 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTTTGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 36 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 40 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CAGGCACC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTCTAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATTTGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 34 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 25 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 22 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GAACTGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 6 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 10 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GGGCTGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGATATA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTTAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 27 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 27 14 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAACCTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTATTCT 
Context I 3.0103 1 0 -exampleBAM.bam.bam 11 40 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 TTTATTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCTGGAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCAGGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 12 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 32 53 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 26 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTGTGTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAATCTAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 67 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CTGGAGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGATTTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCACCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTGAAAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 8 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCAGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAGTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 CG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTATCATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACAGCAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 37 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 41 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGTGCAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 34 TC Context M 6.0206 3 0 -exampleBAM.bam.bam 25 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 30 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTATATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTACTCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTATTACT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTTAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 7 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 11 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CCTGAAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAACTG Context D 3.0103 1 0 
-exampleBAM.bam.bam 45 TTGGCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 28 2 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 64 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 76 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGTGTTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATTCTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGACACAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGTTGG Context I 4.7712 2 0 -exampleBAM.bam.bam 15 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATAAAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 22 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 12 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 32 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTCGTCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 38 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 42 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TTAAGTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGCAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGCACT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 CC Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGAGTCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 6 TT Context M 1.7609 2 1 -exampleBAM.bam.bam 31 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 34 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GACACAGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AACCTGGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 4 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 8 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 16 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 AA Context M 4.7712 2 0 -exampleBAM.bam.bam 24 41 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 29 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 26 44 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTATTAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 
TGTGAACT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCCTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 5 22 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AAGTGCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTGCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTAATC Context D 3.0103 1 0 -exampleBAM.bam.bam 27 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 21 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGAGTGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 13 39 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAAAGACA Context D 3.0103 1 0 -exampleBAM.bam.bam 33 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGAGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCACAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 65 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GATTTTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTTCTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 19 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 15 35 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 24 10 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 TTATTGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATAACCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GAAAGTGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 39 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 43 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 31 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 31 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATCGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 5 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 9 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GCACCCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 34 TT Context M 8.4510 6 0 -exampleBAM.bam.bam 31 39 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 14 33 Cycle M 3.0103 1 0 +ReadGroup QualityScore CovariateValue CovariateName EventType EmpiricalQuality 
Observations Errors +exampleBAM.bam 6 AA Context M 6.0000 1 0.00 +exampleBAM.bam 6 GA Context M 6.0000 1 0.00 +exampleBAM.bam 6 GC Context M 6.0000 2 2.00 +exampleBAM.bam 6 TG Context M 6.0000 1 0.00 +exampleBAM.bam 6 AT Context M 6.0000 1 0.00 +exampleBAM.bam 6 CT Context M 6.0000 1 0.00 +exampleBAM.bam 6 GT Context M 6.0000 2 0.00 +exampleBAM.bam 6 TT Context M 6.0000 2 1.00 +exampleBAM.bam 8 AA Context M 8.0000 1 0.00 +exampleBAM.bam 8 GA Context M 8.0000 2 0.00 +exampleBAM.bam 8 GC Context M 8.0000 1 0.00 +exampleBAM.bam 8 TG Context M 8.0000 1 1.00 +exampleBAM.bam 8 GT Context M 8.0000 2 0.00 +exampleBAM.bam 9 TG Context M 9.0000 1 0.00 +exampleBAM.bam 9 AT Context M 9.0000 1 0.00 +exampleBAM.bam 9 CT Context M 9.0000 1 0.00 +exampleBAM.bam 9 GT Context M 9.0000 1 0.00 +exampleBAM.bam 10 TT Context M 10.0000 1 0.00 +exampleBAM.bam 11 TC Context M 11.0000 1 1.00 +exampleBAM.bam 11 GT Context M 11.0000 1 0.00 +exampleBAM.bam 12 GA Context M 12.0000 1 0.00 +exampleBAM.bam 12 CC Context M 12.0000 1 0.00 +exampleBAM.bam 12 TC Context M 12.0000 1 0.00 +exampleBAM.bam 12 AG Context M 12.0000 1 0.00 +exampleBAM.bam 13 AA Context M 13.0000 1 0.00 +exampleBAM.bam 13 AG Context M 13.0000 1 0.00 +exampleBAM.bam 13 AT Context M 13.0000 1 0.00 +exampleBAM.bam 14 GA Context M 14.0000 1 0.00 +exampleBAM.bam 15 AA Context M 15.0000 2 0.00 +exampleBAM.bam 15 GA Context M 15.0000 1 0.00 +exampleBAM.bam 15 GT Context M 15.0000 2 0.00 +exampleBAM.bam 16 AA Context M 16.0000 1 0.00 +exampleBAM.bam 16 TA Context M 16.0000 4 1.00 +exampleBAM.bam 16 TC Context M 16.0000 1 0.00 +exampleBAM.bam 16 GG Context M 16.0000 1 0.00 +exampleBAM.bam 16 TG Context M 16.0000 1 0.00 +exampleBAM.bam 16 CT Context M 16.0000 1 0.00 +exampleBAM.bam 16 GT Context M 16.0000 1 0.00 +exampleBAM.bam 17 CA Context M 17.0000 1 0.00 +exampleBAM.bam 17 TA Context M 17.0000 1 0.00 +exampleBAM.bam 17 TG Context M 17.0000 1 1.00 +exampleBAM.bam 17 GT Context M 17.0000 1 0.00 +exampleBAM.bam 17 TT Context M 17.0000 
2 0.00 +exampleBAM.bam 18 AC Context M 18.0000 1 0.00 +exampleBAM.bam 18 TC Context M 18.0000 1 1.00 +exampleBAM.bam 18 AT Context M 18.0000 1 0.00 +exampleBAM.bam 18 GT Context M 18.0000 1 0.00 +exampleBAM.bam 18 TT Context M 18.0000 2 0.00 +exampleBAM.bam 19 AA Context M 19.0000 2 0.00 +exampleBAM.bam 19 CA Context M 19.0000 2 0.00 +exampleBAM.bam 19 GA Context M 19.0000 2 0.00 +exampleBAM.bam 19 TA Context M 19.0000 3 0.00 +exampleBAM.bam 19 GC Context M 19.0000 1 0.00 +exampleBAM.bam 19 GG Context M 19.0000 1 0.00 +exampleBAM.bam 19 TG Context M 19.0000 2 1.00 +exampleBAM.bam 19 GT Context M 19.0000 1 0.00 +exampleBAM.bam 19 TT Context M 19.0000 1 0.00 +exampleBAM.bam 20 AA Context M 20.0000 1 0.00 +exampleBAM.bam 20 CA Context M 20.0000 1 0.00 +exampleBAM.bam 20 CC Context M 20.0000 1 0.00 +exampleBAM.bam 20 TG Context M 20.0000 1 0.00 +exampleBAM.bam 20 TT Context M 20.0000 1 1.00 +exampleBAM.bam 21 CA Context M 21.0000 1 0.00 +exampleBAM.bam 21 GA Context M 21.0000 1 0.00 +exampleBAM.bam 21 TA Context M 21.0000 1 0.00 +exampleBAM.bam 21 CC Context M 21.0000 1 0.00 +exampleBAM.bam 21 TC Context M 21.0000 2 0.00 +exampleBAM.bam 21 AG Context M 21.0000 1 0.00 +exampleBAM.bam 21 GG Context M 21.0000 4 0.00 +exampleBAM.bam 21 TG Context M 21.0000 1 0.00 +exampleBAM.bam 21 AT Context M 21.0000 2 0.00 +exampleBAM.bam 21 CT Context M 21.0000 1 0.00 +exampleBAM.bam 21 GT Context M 21.0000 2 0.00 +exampleBAM.bam 22 CA Context M 22.0000 1 0.00 +exampleBAM.bam 22 GA Context M 22.0000 3 0.00 +exampleBAM.bam 22 TA Context M 22.0000 1 0.00 +exampleBAM.bam 22 GC Context M 22.0000 1 0.00 +exampleBAM.bam 22 GG Context M 22.0000 1 0.00 +exampleBAM.bam 22 TG Context M 22.0000 1 0.00 +exampleBAM.bam 22 TT Context M 22.0000 1 0.00 +exampleBAM.bam 23 AA Context M 23.0000 1 0.00 +exampleBAM.bam 23 CA Context M 23.0000 1 0.00 +exampleBAM.bam 23 TA Context M 23.0000 1 0.00 +exampleBAM.bam 23 CC Context M 23.0000 2 0.00 +exampleBAM.bam 23 GC Context M 23.0000 1 0.00 +exampleBAM.bam 23 
TC Context M 23.0000 2 0.00 +exampleBAM.bam 23 GG Context M 23.0000 1 0.00 +exampleBAM.bam 23 TG Context M 23.0000 3 0.00 +exampleBAM.bam 23 AT Context M 23.0000 1 0.00 +exampleBAM.bam 23 TT Context M 23.0000 2 0.00 +exampleBAM.bam 24 CA Context M 24.0000 3 0.00 +exampleBAM.bam 24 GA Context M 24.0000 2 0.00 +exampleBAM.bam 24 TA Context M 24.0000 2 1.00 +exampleBAM.bam 24 GC Context M 24.0000 1 0.00 +exampleBAM.bam 24 AG Context M 24.0000 2 0.00 +exampleBAM.bam 24 CG Context M 24.0000 1 0.00 +exampleBAM.bam 24 GG Context M 24.0000 3 0.00 +exampleBAM.bam 24 AT Context M 24.0000 1 0.00 +exampleBAM.bam 24 CT Context M 24.0000 1 0.00 +exampleBAM.bam 24 GT Context M 24.0000 1 0.00 +exampleBAM.bam 24 TT Context M 24.0000 3 0.00 +exampleBAM.bam 25 AA Context M 25.0000 2 0.00 +exampleBAM.bam 25 CA Context M 25.0000 1 0.00 +exampleBAM.bam 25 GG Context M 25.0000 2 0.00 +exampleBAM.bam 25 TG Context M 25.0000 2 0.00 +exampleBAM.bam 25 AT Context M 25.0000 2 0.00 +exampleBAM.bam 25 GT Context M 25.0000 1 0.00 +exampleBAM.bam 25 TT Context M 25.0000 5 0.00 +exampleBAM.bam 26 TA Context M 26.0000 1 0.00 +exampleBAM.bam 26 GG Context M 26.0000 1 0.00 +exampleBAM.bam 26 TG Context M 26.0000 1 0.00 +exampleBAM.bam 26 AT Context M 26.0000 1 0.00 +exampleBAM.bam 26 GT Context M 26.0000 1 0.00 +exampleBAM.bam 26 TT Context M 26.0000 1 0.00 +exampleBAM.bam 27 AA Context M 27.0000 2 0.00 +exampleBAM.bam 27 CA Context M 27.0000 1 0.00 +exampleBAM.bam 27 TA Context M 27.0000 1 0.00 +exampleBAM.bam 27 TC Context M 27.0000 2 0.00 +exampleBAM.bam 27 AG Context M 27.0000 3 0.00 +exampleBAM.bam 27 GG Context M 27.0000 3 0.00 +exampleBAM.bam 27 TG Context M 27.0000 2 0.00 +exampleBAM.bam 27 AT Context M 27.0000 4 0.00 +exampleBAM.bam 27 CT Context M 27.0000 2 0.00 +exampleBAM.bam 27 TT Context M 27.0000 2 0.00 +exampleBAM.bam 28 AA Context M 28.0000 1 0.00 +exampleBAM.bam 28 CA Context M 28.0000 1 0.00 +exampleBAM.bam 28 TA Context M 28.0000 2 0.00 +exampleBAM.bam 28 AG Context M 28.0000 3 
0.00 +exampleBAM.bam 28 GG Context M 28.0000 3 0.00 +exampleBAM.bam 28 TG Context M 28.0000 1 0.00 +exampleBAM.bam 28 GT Context M 28.0000 4 0.00 +exampleBAM.bam 29 CA Context M 29.0000 1 0.00 +exampleBAM.bam 29 TA Context M 29.0000 1 0.00 +exampleBAM.bam 29 AC Context M 29.0000 1 0.00 +exampleBAM.bam 29 CC Context M 29.0000 1 0.00 +exampleBAM.bam 29 GC Context M 29.0000 1 0.00 +exampleBAM.bam 29 AG Context M 29.0000 3 0.00 +exampleBAM.bam 29 CG Context M 29.0000 1 0.00 +exampleBAM.bam 29 GG Context M 29.0000 4 0.00 +exampleBAM.bam 29 TG Context M 29.0000 1 0.00 +exampleBAM.bam 29 AT Context M 29.0000 1 0.00 +exampleBAM.bam 29 GT Context M 29.0000 1 0.00 +exampleBAM.bam 29 TT Context M 29.0000 4 0.00 +exampleBAM.bam 30 AA Context M 30.0000 2 0.00 +exampleBAM.bam 30 CA Context M 30.0000 1 0.00 +exampleBAM.bam 30 AC Context M 30.0000 4 0.00 +exampleBAM.bam 30 CC Context M 30.0000 1 0.00 +exampleBAM.bam 30 TC Context M 30.0000 2 0.00 +exampleBAM.bam 30 AG Context M 30.0000 3 0.00 +exampleBAM.bam 30 GG Context M 30.0000 1 0.00 +exampleBAM.bam 30 TG Context M 30.0000 1 0.00 +exampleBAM.bam 30 AT Context M 30.0000 2 0.00 +exampleBAM.bam 30 CT Context M 30.0000 2 0.00 +exampleBAM.bam 30 GT Context M 30.0000 1 0.00 +exampleBAM.bam 31 CA Context M 31.0000 1 0.00 +exampleBAM.bam 31 GA Context M 31.0000 1 0.00 +exampleBAM.bam 31 CC Context M 31.0000 2 0.00 +exampleBAM.bam 31 GC Context M 31.0000 2 0.00 +exampleBAM.bam 31 AG Context M 31.0000 2 0.00 +exampleBAM.bam 31 GG Context M 31.0000 6 0.00 +exampleBAM.bam 31 TG Context M 31.0000 2 0.00 +exampleBAM.bam 31 AT Context M 31.0000 2 0.00 +exampleBAM.bam 31 CT Context M 31.0000 2 0.00 +exampleBAM.bam 31 GT Context M 31.0000 1 0.00 +exampleBAM.bam 31 TT Context M 31.0000 3 0.00 +exampleBAM.bam 32 CA Context M 32.0000 2 0.00 +exampleBAM.bam 32 TA Context M 32.0000 1 0.00 +exampleBAM.bam 32 AC Context M 32.0000 1 0.00 +exampleBAM.bam 32 CC Context M 32.0000 1 0.00 +exampleBAM.bam 32 TC Context M 32.0000 1 0.00 +exampleBAM.bam 32 
AG Context M 32.0000 1 0.00 +exampleBAM.bam 32 CG Context M 32.0000 1 0.00 +exampleBAM.bam 32 GG Context M 32.0000 8 0.00 +exampleBAM.bam 32 TG Context M 32.0000 3 0.00 +exampleBAM.bam 32 AT Context M 32.0000 4 0.00 +exampleBAM.bam 32 CT Context M 32.0000 2 0.00 +exampleBAM.bam 32 TT Context M 32.0000 6 0.00 +exampleBAM.bam 33 CA Context M 33.0000 2 0.00 +exampleBAM.bam 33 GA Context M 33.0000 2 0.00 +exampleBAM.bam 33 TA Context M 33.0000 1 0.00 +exampleBAM.bam 33 AC Context M 33.0000 1 0.00 +exampleBAM.bam 33 CC Context M 33.0000 2 0.00 +exampleBAM.bam 33 GC Context M 33.0000 1 0.00 +exampleBAM.bam 33 TC Context M 33.0000 3 0.00 +exampleBAM.bam 33 AG Context M 33.0000 2 0.00 +exampleBAM.bam 33 CG Context M 33.0000 1 0.00 +exampleBAM.bam 33 GG Context M 33.0000 2 0.00 +exampleBAM.bam 33 TG Context M 33.0000 5 0.00 +exampleBAM.bam 33 AT Context M 33.0000 2 0.00 +exampleBAM.bam 33 CT Context M 33.0000 4 0.00 +exampleBAM.bam 33 GT Context M 33.0000 1 0.00 +exampleBAM.bam 33 TT Context M 33.0000 4 0.00 +exampleBAM.bam 34 AA Context M 34.0000 1 0.00 +exampleBAM.bam 34 CA Context M 34.0000 3 0.00 +exampleBAM.bam 34 GA Context M 34.0000 1 0.00 +exampleBAM.bam 34 TA Context M 34.0000 2 0.00 +exampleBAM.bam 34 AC Context M 34.0000 1 0.00 +exampleBAM.bam 34 CC Context M 34.0000 1 0.00 +exampleBAM.bam 34 GC Context M 34.0000 2 0.00 +exampleBAM.bam 34 TC Context M 34.0000 6 0.00 +exampleBAM.bam 34 AG Context M 34.0000 1 0.00 +exampleBAM.bam 34 CG Context M 34.0000 1 0.00 +exampleBAM.bam 34 GG Context M 34.0000 1 0.00 +exampleBAM.bam 34 TG Context M 34.0000 4 0.00 +exampleBAM.bam 34 AT Context M 34.0000 4 0.00 +exampleBAM.bam 34 CT Context M 34.0000 2 0.00 +exampleBAM.bam 34 GT Context M 34.0000 1 0.00 +exampleBAM.bam 34 TT Context M 34.0000 5 0.00 +exampleBAM.bam 45 AAA Context I 45.0000 5 0.00 +exampleBAM.bam 45 AAA Context D 45.0000 5 0.00 +exampleBAM.bam 45 CAA Context I 45.0000 5 0.00 +exampleBAM.bam 45 CAA Context D 45.0000 5 0.00 +exampleBAM.bam 45 GAA Context I 45.0000 
2 0.00 +exampleBAM.bam 45 GAA Context D 45.0000 2 0.00 +exampleBAM.bam 45 TAA Context I 45.0000 6 0.00 +exampleBAM.bam 45 TAA Context D 45.0000 6 0.00 +exampleBAM.bam 45 ACA Context I 45.0000 4 0.00 +exampleBAM.bam 45 ACA Context D 45.0000 4 0.00 +exampleBAM.bam 45 CCA Context I 45.0000 8 0.00 +exampleBAM.bam 45 CCA Context D 45.0000 8 0.00 +exampleBAM.bam 45 GCA Context I 45.0000 5 0.00 +exampleBAM.bam 45 GCA Context D 45.0000 5 0.00 +exampleBAM.bam 45 TCA Context I 45.0000 6 0.00 +exampleBAM.bam 45 TCA Context D 45.0000 6 0.00 +exampleBAM.bam 45 AGA Context I 45.0000 5 0.00 +exampleBAM.bam 45 AGA Context D 45.0000 5 0.00 +exampleBAM.bam 45 GGA Context I 45.0000 3 0.00 +exampleBAM.bam 45 GGA Context D 45.0000 3 0.00 +exampleBAM.bam 45 TGA Context I 45.0000 10 0.00 +exampleBAM.bam 45 TGA Context D 45.0000 10 0.00 +exampleBAM.bam 45 ATA Context I 45.0000 6 0.00 +exampleBAM.bam 45 ATA Context D 45.0000 6 0.00 +exampleBAM.bam 45 CTA Context I 45.0000 3 0.00 +exampleBAM.bam 45 CTA Context D 45.0000 3 0.00 +exampleBAM.bam 45 GTA Context I 45.0000 2 0.00 +exampleBAM.bam 45 GTA Context D 45.0000 2 0.00 +exampleBAM.bam 45 TTA Context I 45.0000 11 0.00 +exampleBAM.bam 45 TTA Context D 45.0000 11 0.00 +exampleBAM.bam 45 CAC Context I 45.0000 6 0.00 +exampleBAM.bam 45 CAC Context D 45.0000 6 0.00 +exampleBAM.bam 45 GAC Context I 45.0000 2 0.00 +exampleBAM.bam 45 GAC Context D 45.0000 2 0.00 +exampleBAM.bam 45 TAC Context I 45.0000 1 0.00 +exampleBAM.bam 45 TAC Context D 45.0000 1 0.00 +exampleBAM.bam 45 ACC Context I 45.0000 3 0.00 +exampleBAM.bam 45 ACC Context D 45.0000 3 0.00 +exampleBAM.bam 45 CCC Context I 45.0000 1 0.00 +exampleBAM.bam 45 CCC Context D 45.0000 1 0.00 +exampleBAM.bam 45 GCC Context I 45.0000 5 0.00 +exampleBAM.bam 45 GCC Context D 45.0000 5 0.00 +exampleBAM.bam 45 TCC Context I 45.0000 4 0.00 +exampleBAM.bam 45 TCC Context D 45.0000 4 0.00 +exampleBAM.bam 45 AGC Context I 45.0000 3 0.00 +exampleBAM.bam 45 AGC Context D 45.0000 3 0.00 +exampleBAM.bam 45 
GGC Context I 45.0000 6 0.00 +exampleBAM.bam 45 GGC Context D 45.0000 6 0.00 +exampleBAM.bam 45 TGC Context I 45.0000 4 0.00 +exampleBAM.bam 45 TGC Context D 45.0000 4 0.00 +exampleBAM.bam 45 ATC Context I 45.0000 7 0.00 +exampleBAM.bam 45 ATC Context D 45.0000 7 0.00 +exampleBAM.bam 45 CTC Context I 45.0000 3 0.00 +exampleBAM.bam 45 CTC Context D 45.0000 3 0.00 +exampleBAM.bam 45 GTC Context I 45.0000 3 0.00 +exampleBAM.bam 45 GTC Context D 45.0000 3 0.00 +exampleBAM.bam 45 TTC Context I 45.0000 9 0.00 +exampleBAM.bam 45 TTC Context D 45.0000 9 0.00 +exampleBAM.bam 45 AAG Context I 45.0000 4 0.00 +exampleBAM.bam 45 AAG Context D 45.0000 4 0.00 +exampleBAM.bam 45 CAG Context I 45.0000 7 0.00 +exampleBAM.bam 45 CAG Context D 45.0000 7 0.00 +exampleBAM.bam 45 GAG Context I 45.0000 6 0.00 +exampleBAM.bam 45 GAG Context D 45.0000 6 0.00 +exampleBAM.bam 45 TAG Context I 45.0000 5 0.00 +exampleBAM.bam 45 TAG Context D 45.0000 5 0.00 +exampleBAM.bam 45 TCG Context I 45.0000 5 0.00 +exampleBAM.bam 45 TCG Context D 45.0000 5 0.00 +exampleBAM.bam 45 AGG Context I 45.0000 7 0.00 +exampleBAM.bam 45 AGG Context D 45.0000 7 0.00 +exampleBAM.bam 45 CGG Context I 45.0000 3 0.00 +exampleBAM.bam 45 CGG Context D 45.0000 3 0.00 +exampleBAM.bam 45 GGG Context I 45.0000 16 0.00 +exampleBAM.bam 45 GGG Context D 45.0000 16 0.00 +exampleBAM.bam 45 TGG Context I 45.0000 16 0.00 +exampleBAM.bam 45 TGG Context D 45.0000 16 0.00 +exampleBAM.bam 45 ATG Context I 45.0000 8 0.00 +exampleBAM.bam 45 ATG Context D 45.0000 8 0.00 +exampleBAM.bam 45 CTG Context I 45.0000 6 0.00 +exampleBAM.bam 45 CTG Context D 45.0000 6 0.00 +exampleBAM.bam 45 GTG Context I 45.0000 8 0.00 +exampleBAM.bam 45 GTG Context D 45.0000 8 0.00 +exampleBAM.bam 45 TTG Context I 45.0000 11 0.00 +exampleBAM.bam 45 TTG Context D 45.0000 11 0.00 +exampleBAM.bam 45 AAT Context I 45.0000 7 0.00 +exampleBAM.bam 45 AAT Context D 45.0000 7 0.00 +exampleBAM.bam 45 CAT Context I 45.0000 6 0.00 +exampleBAM.bam 45 CAT Context D 45.0000 6 
0.00 +exampleBAM.bam 45 GAT Context I 45.0000 8 0.00 +exampleBAM.bam 45 GAT Context D 45.0000 8 0.00 +exampleBAM.bam 45 TAT Context I 45.0000 9 0.00 +exampleBAM.bam 45 TAT Context D 45.0000 9 0.00 +exampleBAM.bam 45 ACT Context I 45.0000 4 0.00 +exampleBAM.bam 45 ACT Context D 45.0000 4 0.00 +exampleBAM.bam 45 CCT Context I 45.0000 4 0.00 +exampleBAM.bam 45 CCT Context D 45.0000 4 0.00 +exampleBAM.bam 45 GCT Context I 45.0000 2 0.00 +exampleBAM.bam 45 GCT Context D 45.0000 2 0.00 +exampleBAM.bam 45 TCT Context I 45.0000 8 0.00 +exampleBAM.bam 45 TCT Context D 45.0000 8 0.00 +exampleBAM.bam 45 AGT Context I 45.0000 5 0.00 +exampleBAM.bam 45 AGT Context D 45.0000 5 0.00 +exampleBAM.bam 45 CGT Context I 45.0000 2 0.00 +exampleBAM.bam 45 CGT Context D 45.0000 2 0.00 +exampleBAM.bam 45 GGT Context I 45.0000 13 0.00 +exampleBAM.bam 45 GGT Context D 45.0000 13 0.00 +exampleBAM.bam 45 TGT Context I 45.0000 5 0.00 +exampleBAM.bam 45 TGT Context D 45.0000 5 0.00 +exampleBAM.bam 45 ATT Context I 45.0000 9 0.00 +exampleBAM.bam 45 ATT Context D 45.0000 9 0.00 +exampleBAM.bam 45 CTT Context I 45.0000 7 0.00 +exampleBAM.bam 45 CTT Context D 45.0000 7 0.00 +exampleBAM.bam 45 GTT Context I 45.0000 17 0.00 +exampleBAM.bam 45 GTT Context D 45.0000 17 0.00 +exampleBAM.bam 45 TTT Context I 45.0000 12 0.00 +exampleBAM.bam 45 TTT Context D 45.0000 12 0.00 +exampleBAM.bam 6 -4 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 31 Cycle M 6.0000 1 1.00 +exampleBAM.bam 6 36 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -52 Cycle M 6.0000 1 1.00 +exampleBAM.bam 6 -62 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 63 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -63 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -65 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 67 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -68 Cycle M 6.0000 1 1.00 +exampleBAM.bam 6 75 Cycle M 6.0000 1 0.00 +exampleBAM.bam 8 17 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 46 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 57 Cycle M 8.0000 1 1.00 +exampleBAM.bam 8 58 Cycle M 8.0000 1 
0.00 +exampleBAM.bam 8 60 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 63 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 71 Cycle M 8.0000 1 0.00 +exampleBAM.bam 9 -16 Cycle M 9.0000 1 0.00 +exampleBAM.bam 9 38 Cycle M 9.0000 1 0.00 +exampleBAM.bam 9 52 Cycle M 9.0000 1 0.00 +exampleBAM.bam 9 69 Cycle M 9.0000 1 0.00 +exampleBAM.bam 10 -75 Cycle M 10.0000 1 0.00 +exampleBAM.bam 11 -20 Cycle M 11.0000 1 0.00 +exampleBAM.bam 11 -40 Cycle M 11.0000 1 1.00 +exampleBAM.bam 12 25 Cycle M 12.0000 1 0.00 +exampleBAM.bam 12 40 Cycle M 12.0000 1 0.00 +exampleBAM.bam 12 62 Cycle M 12.0000 1 0.00 +exampleBAM.bam 12 68 Cycle M 12.0000 1 0.00 +exampleBAM.bam 13 39 Cycle M 13.0000 1 0.00 +exampleBAM.bam 13 55 Cycle M 13.0000 1 0.00 +exampleBAM.bam 13 75 Cycle M 13.0000 1 0.00 +exampleBAM.bam 14 -33 Cycle M 14.0000 1 0.00 +exampleBAM.bam 15 -8 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 -14 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 35 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 68 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 74 Cycle M 15.0000 1 0.00 +exampleBAM.bam 16 7 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 19 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -34 Cycle M 16.0000 1 1.00 +exampleBAM.bam 16 -47 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 51 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -55 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -58 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -65 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 70 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 73 Cycle M 16.0000 1 0.00 +exampleBAM.bam 17 -4 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 -20 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 58 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 62 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 -63 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 -76 Cycle M 17.0000 1 1.00 +exampleBAM.bam 18 -1 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 10 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 -19 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 22 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 36 Cycle M 18.0000 1 0.00 
+exampleBAM.bam 18 -56 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 -58 Cycle M 18.0000 1 1.00 +exampleBAM.bam 19 5 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 -7 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 10 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 21 Cycle M 19.0000 2 0.00 +exampleBAM.bam 19 -30 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 32 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 33 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 49 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 54 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 61 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 65 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 -70 Cycle M 19.0000 1 1.00 +exampleBAM.bam 19 -71 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 72 Cycle M 19.0000 1 0.00 +exampleBAM.bam 20 9 Cycle M 20.0000 1 0.00 +exampleBAM.bam 20 -28 Cycle M 20.0000 1 1.00 +exampleBAM.bam 20 -29 Cycle M 20.0000 1 0.00 +exampleBAM.bam 20 -57 Cycle M 20.0000 1 0.00 +exampleBAM.bam 20 69 Cycle M 20.0000 1 0.00 +exampleBAM.bam 21 -3 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 11 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 17 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 29 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -42 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -44 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 48 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -50 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 59 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -60 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -61 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 64 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 66 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 67 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 71 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 73 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 74 Cycle M 21.0000 1 0.00 +exampleBAM.bam 22 -9 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -15 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -23 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 38 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 44 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -44 Cycle M 22.0000 1 0.00 
+exampleBAM.bam 22 51 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -59 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -70 Cycle M 22.0000 1 0.00 +exampleBAM.bam 23 -12 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -15 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 18 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 19 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -35 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 37 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -38 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 56 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 59 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 61 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 64 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -64 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 66 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -67 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -75 Cycle M 23.0000 1 0.00 +exampleBAM.bam 24 3 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 5 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 6 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -6 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -10 Cycle M 24.0000 1 1.00 +exampleBAM.bam 24 13 Cycle M 24.0000 2 0.00 +exampleBAM.bam 24 -13 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -25 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 27 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 33 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 41 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 45 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -48 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -49 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 50 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 52 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 53 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 56 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -62 Cycle M 24.0000 1 0.00 +exampleBAM.bam 25 -9 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 14 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -21 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -24 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 31 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -32 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -36 Cycle M 25.0000 1 0.00 
+exampleBAM.bam 25 37 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 46 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 47 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -51 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -52 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 55 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -73 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -74 Cycle M 25.0000 1 0.00 +exampleBAM.bam 26 -3 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 7 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 20 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 -44 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 50 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 -67 Cycle M 26.0000 1 0.00 +exampleBAM.bam 27 11 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 14 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 16 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -17 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -18 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 22 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -27 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 28 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 30 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -31 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 40 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 53 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -53 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -55 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -56 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 57 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 65 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -66 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -69 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -72 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -73 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 76 Cycle M 27.0000 1 0.00 +exampleBAM.bam 28 -2 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -11 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 25 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -27 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 30 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 34 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 39 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -41 Cycle M 28.0000 1 0.00 
+exampleBAM.bam 28 47 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 48 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -50 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -53 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 54 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -61 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -71 Cycle M 28.0000 1 0.00 +exampleBAM.bam 29 4 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -5 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 6 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -7 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -8 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 9 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 12 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -24 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 27 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -28 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -37 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 42 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -43 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -45 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -47 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -48 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -54 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -60 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -68 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -76 Cycle M 29.0000 1 0.00 +exampleBAM.bam 30 -9 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 12 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -17 Cycle M 30.0000 2 0.00 +exampleBAM.bam 30 18 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 20 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -21 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 23 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 24 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 26 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -30 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 32 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 34 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 35 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -35 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -42 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -45 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -52 Cycle M 30.0000 1 0.00 
+exampleBAM.bam 30 -54 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -69 Cycle M 30.0000 1 0.00 +exampleBAM.bam 31 -1 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 4 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -5 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -6 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 8 Cycle M 31.0000 2 0.00 +exampleBAM.bam 31 -10 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -11 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -12 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -13 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 15 Cycle M 31.0000 2 0.00 +exampleBAM.bam 31 -16 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -19 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -25 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 26 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -26 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -32 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -34 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -39 Cycle M 31.0000 2 0.00 +exampleBAM.bam 31 43 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -45 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 60 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -66 Cycle M 31.0000 1 0.00 +exampleBAM.bam 32 1 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 2 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -2 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -14 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 -15 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 16 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -16 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -18 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -19 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 23 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 24 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 28 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -28 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -30 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -31 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -33 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 -36 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 41 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -41 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 43 Cycle M 32.0000 1 
0.00 +exampleBAM.bam 32 -43 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 -46 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -49 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -51 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -53 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -54 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -57 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -72 Cycle M 32.0000 1 0.00 +exampleBAM.bam 33 1 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -1 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 2 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 3 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -7 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -8 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -10 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -12 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -18 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -22 Cycle M 33.0000 2 0.00 +exampleBAM.bam 33 -23 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -24 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -29 Cycle M 33.0000 2 0.00 +exampleBAM.bam 33 -31 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -32 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -35 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -37 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -39 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -40 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 42 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 44 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 45 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -46 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -48 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -49 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -57 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -59 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -66 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -67 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -69 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -72 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -73 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -76 Cycle M 33.0000 1 0.00 +exampleBAM.bam 34 -2 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -3 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -4 Cycle M 
34.0000 1 0.00 +exampleBAM.bam 34 -5 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -6 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -11 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -13 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -20 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -21 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -23 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -25 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -26 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -27 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -34 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -38 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -40 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -41 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -42 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -47 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -50 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -51 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -55 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -56 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -58 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -59 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -60 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -61 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -62 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -63 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -64 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -65 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -68 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -70 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -71 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -74 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -75 Cycle M 34.0000 1 0.00 +exampleBAM.bam 45 5 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 5 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -5 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -5 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 6 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 6 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -6 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -6 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 7 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 7 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -7 
Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -7 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 8 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 8 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -8 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -8 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 9 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 9 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -9 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -9 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 10 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 10 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -10 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -10 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 11 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 11 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -11 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -11 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 12 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 12 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -12 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -12 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 13 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 13 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -13 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -13 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 14 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 14 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -14 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -14 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 15 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 15 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -15 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -15 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 16 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 16 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -16 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -16 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 17 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 17 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -17 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -17 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 18 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 18 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -18 Cycle 
I 45.0000 3 0.00 +exampleBAM.bam 45 -18 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 19 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 19 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -19 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -19 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 20 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 20 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -20 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -20 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 21 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 21 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -21 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -21 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 22 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 22 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -22 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -22 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 23 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 23 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -23 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -23 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 24 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 24 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -24 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -24 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 25 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 25 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -25 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -25 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 26 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 26 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -26 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -26 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 27 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 27 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -27 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -27 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 28 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 28 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -28 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -28 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 29 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 29 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -29 
Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -29 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 30 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 30 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -30 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -30 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 31 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 31 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -31 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -31 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 32 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 32 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -32 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -32 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 33 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 33 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -33 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -33 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 34 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 34 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -34 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -34 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 35 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 35 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -35 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -35 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 36 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 36 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -36 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -36 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 37 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 37 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -37 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -37 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 38 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 38 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -38 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -38 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 39 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 39 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -39 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -39 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 40 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 40 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 
-40 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -40 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 41 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 41 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -41 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -41 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 42 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 42 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -42 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -42 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 43 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 43 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -43 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -43 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 44 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 44 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -44 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -44 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 45 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 45 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -45 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -45 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 46 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 46 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -46 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -46 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 47 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 47 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -47 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -47 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 48 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 48 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -48 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -48 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 49 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 49 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -49 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -49 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 50 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 50 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -50 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -50 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 51 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 51 Cycle D 45.0000 2 0.00 +exampleBAM.bam 
45 -51 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -51 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 52 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 52 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -52 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -52 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 53 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 53 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -53 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -53 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 54 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 54 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -54 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -54 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 55 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 55 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -55 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -55 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 56 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 56 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -56 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -56 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 57 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 57 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -57 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -57 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 58 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 58 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -58 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -58 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 59 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 59 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -59 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -59 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 60 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 60 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -60 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -60 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 61 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 61 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -61 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -61 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 62 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 62 Cycle D 45.0000 2 0.00 
+exampleBAM.bam 45 -62 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -62 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 63 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 63 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -63 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -63 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 64 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 64 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -64 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -64 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 65 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 65 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -65 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -65 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 66 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 66 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -66 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -66 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 67 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 67 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -67 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -67 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 68 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 68 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -68 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -68 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 69 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 69 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -69 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -69 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 70 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 70 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -70 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -70 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 71 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 71 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -71 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -71 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 72 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 72 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -72 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -72 Cycle D 45.0000 3 0.00 diff --git a/settings/helpTemplates/generic.template.html 
b/settings/helpTemplates/generic.template.html index 587828d1e..b05ad65c0 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -130,7 +130,7 @@
      -

      Introduction

      +

      Overview

      ${description} <#-- Create references to additional capabilities if appropriate --> diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2662.jar b/settings/repository/edu.mit.broad/picard-private-parts-2872.jar similarity index 68% rename from settings/repository/edu.mit.broad/picard-private-parts-2662.jar rename to settings/repository/edu.mit.broad/picard-private-parts-2872.jar index 54ef6d5e2..b6e685684 100644 Binary files a/settings/repository/edu.mit.broad/picard-private-parts-2662.jar and b/settings/repository/edu.mit.broad/picard-private-parts-2872.jar differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2662.xml b/settings/repository/edu.mit.broad/picard-private-parts-2872.xml similarity index 63% rename from settings/repository/edu.mit.broad/picard-private-parts-2662.xml rename to settings/repository/edu.mit.broad/picard-private-parts-2872.xml index 119255e8d..677d27d80 100644 --- a/settings/repository/edu.mit.broad/picard-private-parts-2662.xml +++ b/settings/repository/edu.mit.broad/picard-private-parts-2872.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/net.sf/picard-1.84.1337.xml b/settings/repository/net.sf/picard-1.84.1337.xml deleted file mode 100644 index 99f746ff6..000000000 --- a/settings/repository/net.sf/picard-1.84.1337.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/picard-1.84.1337.jar b/settings/repository/net.sf/picard-1.90.1442.jar similarity index 75% rename from settings/repository/net.sf/picard-1.84.1337.jar rename to settings/repository/net.sf/picard-1.90.1442.jar index 68db41848..caf2bc09d 100644 Binary files a/settings/repository/net.sf/picard-1.84.1337.jar and b/settings/repository/net.sf/picard-1.90.1442.jar differ diff --git a/settings/repository/net.sf/picard-1.90.1442.xml b/settings/repository/net.sf/picard-1.90.1442.xml new file mode 100644 index 000000000..4ec267817 --- /dev/null +++ b/settings/repository/net.sf/picard-1.90.1442.xml @@ -0,0 
+1,3 @@ + + + diff --git a/settings/repository/net.sf/sam-1.84.1337.xml b/settings/repository/net.sf/sam-1.84.1337.xml deleted file mode 100644 index 4d31fe250..000000000 --- a/settings/repository/net.sf/sam-1.84.1337.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/sam-1.84.1337.jar b/settings/repository/net.sf/sam-1.90.1442.jar similarity index 86% rename from settings/repository/net.sf/sam-1.84.1337.jar rename to settings/repository/net.sf/sam-1.90.1442.jar index 3d28e1928..f6e003657 100644 Binary files a/settings/repository/net.sf/sam-1.84.1337.jar and b/settings/repository/net.sf/sam-1.90.1442.jar differ diff --git a/settings/repository/net.sf/sam-1.90.1442.xml b/settings/repository/net.sf/sam-1.90.1442.xml new file mode 100644 index 000000000..918ea6ff0 --- /dev/null +++ b/settings/repository/net.sf/sam-1.90.1442.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/org.broad/tribble-1.84.1337.jar b/settings/repository/org.broad/tribble-1.90.1442.jar similarity index 74% rename from settings/repository/org.broad/tribble-1.84.1337.jar rename to settings/repository/org.broad/tribble-1.90.1442.jar index a4c336101..75b4c2fc5 100644 Binary files a/settings/repository/org.broad/tribble-1.84.1337.jar and b/settings/repository/org.broad/tribble-1.90.1442.jar differ diff --git a/settings/repository/org.broad/tribble-1.84.1337.xml b/settings/repository/org.broad/tribble-1.90.1442.xml similarity index 76% rename from settings/repository/org.broad/tribble-1.84.1337.xml rename to settings/repository/org.broad/tribble-1.90.1442.xml index f14af794e..01b944fe4 100644 --- a/settings/repository/org.broad/tribble-1.84.1337.xml +++ b/settings/repository/org.broad/tribble-1.90.1442.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/org.broadinstitute/variant-1.85.1357.jar b/settings/repository/org.broadinstitute/variant-1.90.1442.jar similarity index 94% rename from settings/repository/org.broadinstitute/variant-1.85.1357.jar rename to 
settings/repository/org.broadinstitute/variant-1.90.1442.jar index d341e1cf5..cf06f592e 100644 Binary files a/settings/repository/org.broadinstitute/variant-1.85.1357.jar and b/settings/repository/org.broadinstitute/variant-1.90.1442.jar differ diff --git a/settings/repository/org.broadinstitute/variant-1.85.1357.xml b/settings/repository/org.broadinstitute/variant-1.90.1442.xml similarity index 71% rename from settings/repository/org.broadinstitute/variant-1.85.1357.xml rename to settings/repository/org.broadinstitute/variant-1.90.1442.xml index f6d7a2caa..3838b8b6f 100644 --- a/settings/repository/org.broadinstitute/variant-1.85.1357.xml +++ b/settings/repository/org.broadinstitute/variant-1.90.1442.xml @@ -1,3 +1,3 @@ - +