diff --git a/.gitignore b/.gitignore
index 9a20b68ca..65f111587 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
/*.bam
/*.bai
/*.bed
-*.idx
*~
/*.vcf
/*.txt
diff --git a/build.xml b/build.xml
index 03f3232f2..56bf4f0cd 100644
--- a/build.xml
+++ b/build.xml
@@ -91,9 +91,8 @@
This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities(ref bases vs. bases of the alternate allele). The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. This annotation tool outputs the following:
+ *
+ * Caveat
+ *
+ *
This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele).
+ * + *The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ * + * @author rpoplin + * @since 6/28/12 */ public class ClippingRankSumTest extends RankSumTest { @@ -83,12 +85,12 @@ public class ClippingRankSumTest extends RankSumTest { for (Map.EntryWhile the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for * N samples with -dcov D is N * D + *
*/ public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 5acea12f6..1cf91f181 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; @@ -72,11 +73,11 @@ import java.util.Map; /** - * The depth of coverage of each VCF allele in this sample. + * The depth of coverage of each allele per sample * - * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this + *The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the - * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of + * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of * REF and ALT fields) is the unfiltered count of all reads that carried with them the * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the * power I have to determine the genotype of the sample at this site, while the AD tells me how many times @@ -86,10 +87,12 @@ import java.util.Map; * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are - * many non-informatice reads. - * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation, + * many non-informative reads.
+ * + *Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation, * one should not base assumptions about the underlying genotype based on it; - * instead, the genotype likelihoods (PLs) are what determine the genotype calls. + * instead, the genotype likelihoods (PLs) are what determine the genotype calls.
+ * */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { @@ -139,12 +142,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa } for (Map.EntryPhred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation + * being seen on only the forward or only the reverse strand) in the reads. More bias is + * indicative of false positive calls. + *
+ * + *The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.
*/ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(FisherStrand.class); + private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; @@ -95,6 +104,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); +// logger.info("VC " + vc); +// printTable(table, 0.0); return pValueForBestTable(table, null); } else @@ -131,9 +142,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private MapThe GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.
+ * + *The window size used to calculate the GC content around the site is set by the tool used for annotation + * (currently UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Technical Document for each tool + * to find out what window size they use.
*/ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class GCContent extends InfoFieldAnnotation { public MapThis annotation calculates the Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium.
+ * + *Right now we just ignore genotypes that are not confident, but this throws off our HW ratios. + * More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is het or hom var.
*/ -public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation { +public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation { private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index c25cb6820..4039241ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -50,6 +50,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.GenomeLoc; @@ -63,9 +64,16 @@ import java.util.List; import java.util.Map; /** - * Largest contiguous homopolymer run of the variant allele in either direction on the reference. Computed only for bi-allelic sites. + * Largest contiguous homopolymer run of the variant allele + * + *Calculates the length of the largest contiguous homopolymer run of the variant allele in either direction on the reference.
+ * + *This can only be computed for bi-allelic sites.
+ *This needs to be computed in a more accurate manner. We currently look only at direct runs of the alternate allele adjacent to this position.
*/ -public class HomopolymerRun extends InfoFieldAnnotation { +public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalAnnotation { private boolean ANNOTATE_INDELS = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index 19f32bae0..ad974a083 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -65,13 +65,20 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** - * Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation - * versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is - * diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than - * the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios. + * Likelihood of being a Mendelian Violation + * + *Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation + * versus the likelihood of the site transmitting according to mendelian rules.
+ * + *Note that this annotation requires a valid ped file.
+ * + *This tool assumes that the organism is diploid. When multiple trios are present, the annotation is simply the maximum + * of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain + * sites and many trios.
*/ -public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { +public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; public static final String MVLR_KEY = "MVLR"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 8c401eecd..3873138a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -59,8 +60,12 @@ import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) - * Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities + * + *This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele).
+ * + *The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
*/ public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation { @@ -88,13 +93,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn return; } for (Map.EntryThis tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error.
+ * + *The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
*/ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { @@ -103,8 +108,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio } for (Map.EntryNote that this annotation is currently not compatible with HaplotypeCaller.
*/ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { @@ -86,10 +89,12 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn int deletions = 0; int depth = 0; for ( Map.EntryThis tool outputs the number of times the tandem repeat unit is repeated, for each allele (including reference).
+ * + *This annotation is currently not compatible with HaplotypeCaller.
+ */ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation { private static final String STR_PRESENT = "STR"; private static final String REPEAT_UNIT_KEY = "RU"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index b3f5728a2..f8efd7c3f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -65,12 +65,21 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** - * Created by IntelliJ IDEA. - * User: rpoplin, lfran, ebanks - * Date: 11/14/11 + * Wittkowski transmission disequilibrium test + * + *Test statistic from Wittkowski transmission disequilibrium test. + * The calculation is based on the following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
+ * + *Note that this annotation requires a valid ped file.
+ * + *This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).
+ * + * @author rpoplin, lfran, ebanks + * @since 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation { private SetThis tool assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.). + * It also specifies whether the variant is multiallelic (>2 alleles).
*/ -public class VariantType extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class VariantType extends InfoFieldAnnotation { public Map* - *
* The input read data whose base quality scores need to be assessed. *
* A database of known polymorphic sites to skip over. *
* - ** A GATK Report file with many tables: *
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T BaseRecalibrator \
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
index 5ab296a5f..0a4899f1c 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -61,7 +61,7 @@ import java.util.List;
* User: rpoplin
* Date: Nov 27, 2009
*
- * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
+ * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker.
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
*/
@@ -91,7 +91,7 @@ public class RecalibrationArgumentCollection {
* If not provided, then no plots will be generated (useful for queue scatter/gathering).
* However, we *highly* recommend that users generate these plots whenever possible for QC checking.
*/
- @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false)
+ @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false, defaultToStdout = false)
public File RECAL_PDF_FILE = null;
/**
@@ -131,14 +131,14 @@ public class RecalibrationArgumentCollection {
public boolean RUN_WITHOUT_DBSNP = false;
/**
- * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
+ * BaseRecalibrator accepts a --solid_recal_mode flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
*/
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
/**
- * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles
+ * BaseRecalibrator accepts a --solid_nocall_strategy flag which governs how the recalibrator handles
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
* their color space tag can not be recalibrated.
*/
@@ -146,38 +146,38 @@ public class RecalibrationArgumentCollection {
public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
/**
- * The context covariate will use a context of this size to calculate it's covariate value for base mismatches
+ * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
*/
- @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
+ @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false)
public int MISMATCHES_CONTEXT_SIZE = 2;
/**
- * The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
+ * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
*/
- @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
+ @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false)
public int INDELS_CONTEXT_SIZE = 3;
/**
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
* This argument is ignored if the Cycle covariate is not used.
*/
- @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
+ @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false)
public int MAXIMUM_CYCLE_VALUE = 500;
/**
- * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
+ * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off]
*/
@Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
public byte MISMATCHES_DEFAULT_QUALITY = -1;
/**
- * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
+ * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on]
*/
@Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
public byte INSERTIONS_DEFAULT_QUALITY = 45;
/**
- * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
+ * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on]
*/
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45;
@@ -220,7 +220,7 @@ public class RecalibrationArgumentCollection {
public String FORCE_PLATFORM = null;
@Hidden
- @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
+ @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
index 5e6e2a8d9..9f33234cf 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
@@ -178,7 +178,7 @@ public class RecalibrationEngine {
final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable();
// iterate over all values in the qual table
- for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) {
+ for ( final NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) {
final int rgKey = leaf.keys[0];
final int eventIndex = leaf.keys[2];
final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex);
@@ -206,7 +206,9 @@ public class RecalibrationEngine {
*/
@Requires("! finalized")
private RecalibrationTables mergeThreadLocalRecalibrationTables() {
- if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty");
+ if ( recalibrationTablesList.isEmpty() ) {
+ recalibrationTablesList.add( new RecalibrationTables(covariates, numReadGroups, maybeLogStream) );
+ }
RecalibrationTables merged = null;
for ( final RecalibrationTables table : recalibrationTablesList ) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
index fb11f6249..271617059 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
@@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
@@ -55,18 +56,27 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.recalibration.*;
import java.io.*;
/**
+ * Evaluate the performance of the base recalibration process
+ *
+ * This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.
+ *
+ * Caveat
+ * This tool is currently experimental. We do not provide documentation nor support for its operation.
+ *
*/
-
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@PartitionBy(PartitionType.READ)
public class RecalibrationPerformance extends RodWalker implements NanoSchedulable {
- @Output(doc="Write output to this file", required = true)
+ @Output
public PrintStream out;
@Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file")
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java
index c7b990a88..28a48c212 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java
@@ -69,10 +69,45 @@ public class BaseAndQualsCounts extends BaseCounts {
private long sumInsertionQual_N = 0;
private long sumDeletionQual_N = 0;
+ /*
+ * Increments the count
+ *
+ * @param base the base
+ * @param baseQual the base quality
+ * @param insQual the insertion quality
+ * @param delQual the deletion quality
+ * @param baseMappingQual the mapping quality
+ * @param isLowQualBase true if the base is low quality
+ */
+ public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
+ incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false);
+ }
+
+ /*
+ * Increments the count
+ *
+ * @param base the base
+ * @param baseQual the base quality
+ * @param insQual the insertion quality
+ * @param delQual the deletion quality
+ * @param baseMappingQual the mapping quality
+ * @param isLowQualBase true if the base is low quality
+ * @param isSoftClip true if is soft-clipped
+ */
+ public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) {
+ // if we already have high quality bases, ignore low quality ones
+ if ( isLowQualBase && !isLowQuality() )
+ return;
+
+ // if this is a high quality base then remove any low quality bases and start from scratch
+ if ( !isLowQualBase && isLowQuality() ) {
+ if ( totalCount() > 0 )
+ clear();
+ setLowQuality(false);
+ }
- public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
final BaseIndex i = BaseIndex.byteToBase(base);
- super.incr(i, baseQual);
+ super.incr(i, baseQual, baseMappingQual, isSoftClip);
switch (i) {
case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break;
case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break;
@@ -84,9 +119,38 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
- public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
+ /*
+ * Decrements the count
+ *
+ * @param base the base
+ * @param baseQual the base quality
+ * @param insQual the insertion quality
+ * @param delQual the deletion quality
+ * @param baseMappingQual the mapping quality
+ * @param isLowQualBase true if the base is low quality
+ */
+ public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
+ decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false);
+ }
+
+ /*
+ * Decrements the count
+ *
+ * @param base the base
+ * @param baseQual the base quality
+ * @param insQual the insertion quality
+ * @param delQual the deletion quality
+ * @param baseMappingQual the mapping quality
+ * @param isLowQualBase true if the base is low quality
+ * @param isSoftClip true if is soft-clipped
+ */
+ public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) {
+ // if this is not the right type of base, ignore it
+ if ( isLowQualBase != isLowQuality() )
+ return;
+
final BaseIndex i = BaseIndex.byteToBase(base);
- super.decr(i, baseQual);
+ super.decr(i, baseQual, baseMappingQual, isSoftClip);
switch (i) {
case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break;
case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break;
@@ -131,4 +195,13 @@ public class BaseAndQualsCounts extends BaseCounts {
default: throw new IllegalArgumentException(base.name());
}
}
+
+ /**
+ * Clears out all stored data in this object
+ */
+ public void clear() {
+ super.clear();
+ sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0;
+ sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0;
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java
index 17ce3c90d..e1329db3b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java
@@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import org.broadinstitute.sting.utils.MathUtils;
/**
@@ -78,6 +80,9 @@ import com.google.java.contract.Requires;
private int count_N = 0;
private int sumQual_N = 0;
private int totalCount = 0; // keeps track of total count since this is requested so often
+ private int nSoftClippedBases = 0;
+ private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this
+ private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise
public static BaseCounts createWithCounts(int[] countsACGT) {
@@ -100,6 +105,8 @@ import com.google.java.contract.Requires;
this.count_I += other.count_I;
this.count_N += other.count_N;
this.totalCount += other.totalCount;
+ this.nSoftClippedBases = other.nSoftClippedBases;
+ this.mappingQualities.addAll(other.mappingQualities);
}
@Requires("other != null")
@@ -112,6 +119,8 @@ import com.google.java.contract.Requires;
this.count_I -= other.count_I;
this.count_N -= other.count_N;
this.totalCount -= other.totalCount;
+ this.nSoftClippedBases -= other.nSoftClippedBases;
+ this.mappingQualities.removeAll(other.mappingQualities);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
@@ -120,7 +129,7 @@ import com.google.java.contract.Requires;
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
- public void incr(final BaseIndex base, final byte qual) {
+ public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) {
switch (base) {
case A: ++count_A; sumQual_A += qual; break;
case C: ++count_C; sumQual_C += qual; break;
@@ -131,6 +140,8 @@ import com.google.java.contract.Requires;
case N: ++count_N; sumQual_N += qual; break;
}
++totalCount;
+ nSoftClippedBases += isSoftclip ? 1 : 0;
+ mappingQualities.add(mappingQuality);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
@@ -152,7 +163,7 @@ import com.google.java.contract.Requires;
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
- public void decr(final BaseIndex base, final byte qual) {
+ public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) {
switch (base) {
case A: --count_A; sumQual_A -= qual; break;
case C: --count_C; sumQual_C -= qual; break;
@@ -163,6 +174,8 @@ import com.google.java.contract.Requires;
case N: --count_N; sumQual_N -= qual; break;
}
--totalCount;
+ nSoftClippedBases -= isSoftclip ? 1 : 0;
+ mappingQualities.remove((Integer) mappingQuality);
}
@Ensures("result >= 0")
@@ -223,12 +236,25 @@ import com.google.java.contract.Requires;
return (byte) (sumQualsOfBase(base) / countOfBase(base));
}
+ @Ensures("result >= 0")
+ public int nSoftclips() {
+ return nSoftClippedBases;
+ }
@Ensures("result >= 0")
public int totalCount() {
return totalCount;
}
+ /**
+ * The RMS of the mapping qualities of all reads that contributed to this object
+ *
+ * @return the RMS of the mapping qualities of all reads that contributed to this object
+ */
+ public double getRMS() {
+ return MathUtils.rms(mappingQualities);
+ }
+
/**
* Given a base , it returns the proportional count of this base compared to all other bases
*
@@ -264,22 +290,42 @@ import com.google.java.contract.Requires;
return baseIndexWithMostCounts().getByte();
}
+ /**
+ * @return the base index for which the count is highest, including indel indexes
+ */
@Ensures("result != null")
public BaseIndex baseIndexWithMostCounts() {
- BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
- for (final BaseIndex i : BaseIndex.values()) {
- if (countOfBase(i) > countOfBase(maxI))
- maxI = i;
- }
- return maxI;
+ return baseIndexWithMostCounts(true);
}
+ /**
+ * @return the base index for which the count is highest, excluding indel indexes
+ */
@Ensures("result != null")
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
+ return baseIndexWithMostCounts(false);
+ }
+
+ /**
+ * Finds the base index with the most counts
+ *
+ * @param allowIndels should we allow base indexes representing indels?
+ * @return non-null base index
+ */
+ @Ensures("result != null")
+ protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
+ int maxCount = countOfBase(maxI);
+
for (final BaseIndex i : BaseIndex.values()) {
- if (i.isNucleotide() && countOfBase(i) > countOfBase(maxI))
+ if ( !allowIndels && !i.isNucleotide() )
+ continue;
+
+ final int myCount = countOfBase(i);
+ if (myCount > maxCount) {
maxI = i;
+ maxCount = myCount;
+ }
}
return maxI;
}
@@ -290,22 +336,36 @@ import com.google.java.contract.Requires;
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbability() {
- BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
- for (final BaseIndex i : BaseIndex.values()) {
- if (getSumQuals(i) > getSumQuals(maxI))
- maxI = i;
- }
- return (getSumQuals(maxI) > 0L ? maxI : baseIndexWithMostCounts());
+ return baseIndexWithMostProbability(true);
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
+ return baseIndexWithMostProbability(false);
+ }
+
+ /**
+ * Finds the base index with the most probability
+ *
+ * @param allowIndels should we allow base indexes representing indels?
+ * @return non-null base index
+ */
+ @Ensures("result != null")
+ public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
+ long maxSum = getSumQuals(maxI);
+
for (final BaseIndex i : BaseIndex.values()) {
- if (i.isNucleotide() && getSumQuals(i) > getSumQuals(maxI))
+ if ( !allowIndels && !i.isNucleotide() )
+ continue;
+
+ final long mySum = getSumQuals(i);
+ if (mySum > maxSum) {
maxI = i;
+ maxSum = mySum;
+ }
}
- return (getSumQuals(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
+ return (maxSum > 0L ? maxI : baseIndexWithMostCounts(allowIndels));
}
@Ensures("result >=0")
@@ -325,4 +385,27 @@ import com.google.java.contract.Requires;
final int total = totalCountWithoutIndels();
return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total;
}
+
+ /**
+ * @return true if this instance represents low quality bases
+ */
+ public boolean isLowQuality() { return isLowQuality; }
+
+ /**
+ * Sets the low quality value
+ *
+ * @param value true if this instance represents low quality bases false otherwise
+ */
+ public void setLowQuality(final boolean value) { isLowQuality = value; }
+
+ /**
+ * Clears out all stored data in this object
+ */
+ public void clear() {
+ count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0;
+ sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0;
+ totalCount = 0;
+ nSoftClippedBases = 0;
+ mappingQualities.clear();
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java
index e41878a0b..665e3e7ce 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java
@@ -121,7 +121,7 @@ public enum BaseIndex {
*
* @return whether or not it is a nucleotide, given the definition above
*/
- public boolean isNucleotide() {
+ public final boolean isNucleotide() {
return !isIndel();
}
@@ -130,7 +130,7 @@ public enum BaseIndex {
*
* @return true for I or D, false otherwise
*/
- public boolean isIndel() {
+ public final boolean isIndel() {
return this == D || this == I;
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
index a8a765ddc..36da92b4f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
@@ -69,15 +69,15 @@ import java.util.Map;
*
* This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files.
*
- * Input
+ * Input
*
* Two BAM files (using -I) with different read group IDs
*
- * Output
+ * Output
*
* [Output description]
*
- * Examples
+ * Examples
*
* java
* -jar GenomeAnalysisTK.jar
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
index 1cd9c1bc0..38b9e957b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
@@ -46,7 +46,7 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
-import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -62,9 +62,9 @@ public class HeaderElement {
private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site.
private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site.
private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right
- private int nSoftClippedBases; // How many bases in this site came from soft clipped bases
private int location; // Genome location of this site (the sliding window knows which contig we're at
- private IntArrayList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site)
+
+ protected static final int MIN_COUNT_FOR_USING_PVALUE = 2;
public int getLocation() {
return location;
@@ -85,7 +85,7 @@ public class HeaderElement {
* @param location the reference location for the new element
*/
public HeaderElement(final int location) {
- this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new IntArrayList());
+ this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location);
}
/**
@@ -95,7 +95,7 @@ public class HeaderElement {
* @param location the reference location for the new element
*/
public HeaderElement(final int location, final int insertionsToTheRight) {
- this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new IntArrayList());
+ this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location);
}
/**
@@ -104,55 +104,67 @@ public class HeaderElement {
* @param consensusBaseCounts the BaseCounts object for the running consensus synthetic read
* @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read
* @param insertionsToTheRight number of insertions to the right of this HeaderElement
- * @param nSoftClippedBases number of softclipped bases of this HeaderElement
* @param location the reference location of this reference element
- * @param mappingQuality the list of mapping quality values of all reads that contributed to this
* HeaderElement
*/
- public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, IntArrayList mappingQuality) {
+ public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int location) {
this.consensusBaseCounts = consensusBaseCounts;
this.filteredBaseCounts = filteredBaseCounts;
this.insertionsToTheRight = insertionsToTheRight;
- this.nSoftClippedBases = nSoftClippedBases;
this.location = location;
- this.mappingQuality = mappingQuality;
}
/**
* Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
* by insertion, deletion and mismatches.
*
+ * @param minVariantPvalue min p-value for deciding that a position is or is not variable due to mismatches
+ * @param minVariantProportion min proportion for deciding that a position is or is not variable due to mismatches
+ * @param minIndelProportion min proportion for deciding that a position is or is not variable due to indels
* @return true if site is variant by any definition. False otherwise.
*/
- public boolean isVariant(double minVariantProportion, double minIndelProportion) {
- return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
+ public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) {
+ return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
}
/**
* Adds a new base to the HeaderElement updating all counts accordingly
*
- * @param base the base to add
+ * @param base the base to add
* @param baseQual the base quality
+ * @param insQual the base insertion quality
+ * @param delQual the base deletion quality
* @param baseMappingQuality the mapping quality of the read this base belongs to
+ * @param minBaseQual the minimum base qual allowed to be a good base
+ * @param minMappingQual the minimum mapping qual allowed to be a good read
+ * @param isSoftClipped true if the base is soft-clipped in the original read
*/
public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
- if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
- consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
+ // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
+ if ( baseMappingQuality >= minMappingQual )
+ consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
else
- filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
-
- this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
- nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
+ filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
}
+ /**
+ * Adds a new base to the HeaderElement updating all counts accordingly
+ *
+ * @param base the base to add
+ * @param baseQual the base quality
+ * @param insQual the base insertion quality
+ * @param delQual the base deletion quality
+ * @param baseMappingQuality the mapping quality of the read this base belongs to
+ * @param minBaseQual the minimum base qual allowed to be a good base
+ * @param minMappingQual the minimum mapping qual allowed to be a good read
+ * @param isSoftClipped true if the base is soft-clipped in the original read
+ */
public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
- if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
- consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
+ // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
+ if ( baseMappingQuality >= minMappingQual )
+ consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
else
- filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
-
- this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
- nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
+ filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
}
/**
* Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions
@@ -189,15 +201,6 @@ public class HeaderElement {
return (!hasFilteredData() && !hasConsensusData());
}
- /**
- * The RMS of the mapping qualities of all reads that contributed to this HeaderElement
- *
- * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
- */
- public double getRMS() {
- return MathUtils.rms(mappingQuality);
- }
-
/**
* removes an insertion from this element (if you removed a read that had an insertion)
*/
@@ -232,7 +235,7 @@ public class HeaderElement {
/**
* Whether or not the HeaderElement is variant due to excess deletions
*
- * @return whether or not the HeaderElement is variant due to excess insertions
+ * @return whether or not the HeaderElement is variant due to excess deletions
*/
private boolean isVariantFromDeletions(double minIndelProportion) {
return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
@@ -241,12 +244,15 @@ public class HeaderElement {
/**
* Whether or not the HeaderElement is variant due to excess mismatches
*
- * @return whether or not the HeaderElement is variant due to excess insertions
+ * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage).
+ * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage).
+ * @return whether or not the HeaderElement is variant due to excess mismatches
*/
- protected boolean isVariantFromMismatches(double minVariantProportion) {
- BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
- double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
- return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
+ protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) {
+ final int totalCount = consensusBaseCounts.totalCountWithoutIndels();
+ final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
+ final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon);
+ return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion);
}
/**
@@ -256,37 +262,88 @@ public class HeaderElement {
* @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
*/
protected boolean isVariantFromSoftClips() {
+ final int nSoftClippedBases = consensusBaseCounts.nSoftclips();
return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
}
- protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
- return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
+ /**
+ * Calculates the number of alleles necessary to represent this site.
+ *
+ * @param minVariantPvalue the minimum pvalue to call a site variant.
+ * @param minVariantProportion the minimum proportion to call a site variant.
+ * @return the number of alleles necessary to represent this site or -1 if there are too many indels
+ */
+ public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) {
+ final ObjectArrayList alleles = getAlleles(minVariantPvalue, minVariantProportion);
+ return alleles == null ? -1 : alleles.size();
}
/**
- * Calculates the number of haplotypes necessary to represent this site.
+ * Calculates the alleles necessary to represent this site.
*
+ * @param minVariantPvalue the minimum pvalue to call a site variant.
* @param minVariantProportion the minimum proportion to call a site variant.
- * @return the number of alleles necessary to represent this site.
+ * @return the list of alleles necessary to represent this site or null if there are too many indels
*/
- public int getNumberOfAlleles(final double minVariantProportion) {
+ public ObjectArrayList getAlleles(final double minVariantPvalue, final double minVariantProportion) {
+ // make sure we have bases at all
final int totalBaseCount = consensusBaseCounts.totalCount();
- if (totalBaseCount == 0)
- return 0;
+ if ( totalBaseCount == 0 )
+ return new ObjectArrayList(0);
- final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount);
+ // next, check for insertions; technically, the insertion count can be greater than totalBaseCount
+ // (because of the way insertions are counted), so we need to account for that
+ if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) )
+ return null;
- int nAlleles = 0;
- for ( BaseIndex base : BaseIndex.values() ) {
+ // finally, check for the bases themselves (including deletions)
+ final ObjectArrayList alleles = new ObjectArrayList(4);
+ for ( final BaseIndex base : BaseIndex.values() ) {
final int baseCount = consensusBaseCounts.countOfBase(base);
-
- // don't consider this allele if the count is 0
if ( baseCount == 0 )
continue;
- if ( baseCount >= minBaseCountForRelevantAlleles )
- nAlleles++;
+ if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) {
+ if ( base == BaseIndex.D )
+ return null;
+ alleles.add(base);
+ }
}
- return nAlleles;
+ return alleles;
+ }
+
+ /*
+ * Checks whether there are a significant number of softclips.
+ *
+ * @param minVariantPvalue the minimum pvalue to call a site variant.
+ * @param minVariantProportion the minimum proportion to call a site variant.
+ * @return true if there are significant softclips, false otherwise
+ */
+ public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) {
+ return hasSignificantCount(consensusBaseCounts.nSoftclips(), consensusBaseCounts.totalCount(), minVariantPvalue, minVariantProportion);
+ }
+
+ /*
+ * Checks whether there are a significant number of count.
+ *
+ * @param count the count (k) to test against
+ * @param total the total (n) to test against
+ * @param minVariantPvalue the minimum pvalue to call a site variant.
+ * @param minVariantProportion the minimum proportion to call a site variant.
+ * @return true if there is a significant count given the provided pvalue, false otherwise
+ */
+ private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) {
+ if ( count == 0 || total == 0 )
+ return false;
+
+ // use p-values for low counts of k
+ if ( count <= MIN_COUNT_FOR_USING_PVALUE ) {
+ final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count);
+ return pvalue > minVariantPvalue;
+ }
+
+ // otherwise, use straight proportions
+ final int minBaseCountForSignificance = (int)(minVariantProportion * total);
+ return count >= minBaseCountForSignificance;
}
}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
index 2f377bee8..bdd407fba 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
@@ -46,9 +46,11 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+import com.google.java.contract.Ensures;
import it.unimi.dsi.fastutil.objects.*;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -94,46 +96,66 @@ public class MultiSampleCompressor {
final int contextSize,
final int downsampleCoverage,
final int minMappingQuality,
+ final double minAltPValueToTriggerVariant,
final double minAltProportionToTriggerVariant,
final double minIndelProportionToTriggerVariant,
final int minBaseQual,
- final ReduceReads.DownsampleStrategy downsampleStrategy,
- final boolean allowPolyploidReduction) {
+ final ReduceReads.DownsampleStrategy downsampleStrategy) {
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
compressorsPerSample.put(name,
new SingleSampleCompressor(contextSize, downsampleCoverage,
- minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction));
+ minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
}
}
- public ObjectSet addAlignment(GATKSAMRecord read) {
+ /**
+ * Add an alignment to the compressor
+ *
+ * @param read the read to be added
+ * @param knownSnpPositions the set of known SNP positions
+ * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
+ */
+ public ObjectSet addAlignment(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) {
String sampleName = read.getReadGroup().getSample();
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
if ( compressor == null )
throw new ReviewedStingException("No compressor for sample " + sampleName);
- Pair, CompressionStash> readsAndStash = compressor.addAlignment(read);
+ Pair, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions);
ObjectSet reads = readsAndStash.getFirst();
CompressionStash regions = readsAndStash.getSecond();
- reads.addAll(closeVariantRegionsInAllSamples(regions));
+ reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions));
return reads;
}
- public ObjectSet close() {
+ /**
+ * Properly closes the compressor.
+ *
+ * @param knownSnpPositions the set of known SNP positions
+ * @return A non-null set/list of all reads generated
+ */
+ @Ensures("result != null")
+ public ObjectSet close(final ObjectSortedSet knownSnpPositions) {
ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
- Pair, CompressionStash> readsAndStash = sample.close();
- reads = readsAndStash.getFirst();
+ Pair, CompressionStash> readsAndStash = sample.close(knownSnpPositions);
+ reads.addAll(readsAndStash.getFirst());
}
return reads;
}
- private ObjectSet closeVariantRegionsInAllSamples(CompressionStash regions) {
+ /**
+ * Finalizes current variant regions.
+ *
+ * @param knownSnpPositions the set of known SNP positions
+ * @return A non-null set/list of all reads generated
+ */
+ private ObjectSet closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) {
ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
if (!regions.isEmpty()) {
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
- reads.addAll(sample.closeVariantRegions(regions));
+ reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions));
}
}
return reads;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
index e89158412..71910e566 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
@@ -54,9 +54,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.util.SequenceUtil;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Hidden;
-import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -69,11 +67,16 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Collections;
+import java.util.List;
/**
@@ -86,17 +89,17 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
* savings in file size and performance of the downstream tools.
*
- * Input
+ * Input
*
* The BAM file to be compressed
*
*
- * Output
+ * Output
*
* The compressed (reduced) BAM file.
*
*
- * Examples
+ * Examples
*
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
@@ -112,7 +115,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40)
public class ReduceReads extends ReadWalker, ReduceReadsStash> {
- @Output
+ @Output(required = false, defaultToStdout = false)
private StingSAMFileWriter out = null;
private SAMFileWriter writerToUse = null;
@@ -120,7 +123,7 @@ public class ReduceReads extends ReadWalker, Redu
* The number of bases to keep around mismatches (potential variation)
*/
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
- private int contextSize = 10;
+ public int contextSize = 10;
/**
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
@@ -128,7 +131,7 @@ public class ReduceReads extends ReadWalker, Redu
* towards variable regions.
*/
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
- private int minMappingQuality = 20;
+ public int minMappingQuality = 20;
/**
* The minimum base quality to be considered for the consensus synthetic read. Reads that have
@@ -136,41 +139,45 @@ public class ReduceReads extends ReadWalker, Redu
* towards variable regions.
*/
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
- private byte minBaseQual = 20;
+ public byte minBaseQual = 15;
/**
- * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
- * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
+ * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with
+ * quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
*/
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
- private byte minTailQuality = 2;
+ public byte minTailQuality = 2;
/**
- * Allow the experimental polyploid-based reduction capabilities of this tool
+ * Any number of VCF files representing known SNPs to be used for the polyploid-based reduction.
+ * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored.
+ * If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present
+ * in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will
+ * be triggered anywhere there is a single SNP present in a consensus window.
*/
- @Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
- private boolean USE_POLYPLOID_REDUCTION = false;
+ @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false)
+ public List> known = Collections.emptyList();
/**
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
* and read group).
*/
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
- private boolean DONT_SIMPLIFY_READS = false;
+ public boolean DONT_SIMPLIFY_READS = false;
/**
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
* The program will behave correctly in those cases.
*/
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
- private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
+ public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
/**
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
* quality.
*/
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
- private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
+ public boolean DONT_CLIP_LOW_QUAL_TAILS = false;
/**
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
@@ -178,7 +185,7 @@ public class ReduceReads extends ReadWalker, Redu
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
*/
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
- private boolean DONT_USE_SOFTCLIPPED_BASES = false;
+ public boolean DONT_USE_SOFTCLIPPED_BASES = false;
/**
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
@@ -186,55 +193,68 @@ public class ReduceReads extends ReadWalker, Redu
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
*/
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
- private boolean DONT_COMPRESS_READ_NAMES = false;
+ public boolean DONT_COMPRESS_READ_NAMES = false;
/**
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
* border.
*/
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
- private boolean HARD_CLIP_TO_INTERVAL = false;
+ public boolean HARD_CLIP_TO_INTERVAL = false;
/**
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
- * considered consensus.
+ * considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that
+ * this value is used only regions with high coverage.
*/
+ @Advanced
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
- private double minAltProportionToTriggerVariant = 0.05;
+ public double minAltProportionToTriggerVariant = 0.05;
+
+ /**
+ * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region.
+ * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to
+ * trigger polyploid compression). Note that this value is used only regions with low coverage.
+ */
+ @Advanced
+ @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false)
+ public double minAltPValueToTriggerVariant = 0.01;
/**
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
* considered consensus.
*/
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
- private double minIndelProportionToTriggerVariant = 0.05;
+ public double minIndelProportionToTriggerVariant = 0.05;
/**
- * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
+ * The number of reads emitted per sample in a variant region can be downsampled for better compression.
+ * This level of downsampling only happens after the region has been evaluated, therefore it can
+ * be combined with the engine level downsampling.
* A value of 0 turns downsampling off.
*/
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
- private int downsampleCoverage = 250;
+ public int downsampleCoverage = 250;
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
- private boolean nwayout = false;
+ public boolean nwayout = false;
@Hidden
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
- private int debugLevel = 0;
+ public int debugLevel = 0;
@Hidden
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
- private String debugRead = "";
+ public String debugRead = "";
@Hidden
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
- private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
+ public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
@Hidden
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
- private boolean NO_PG_TAG = false;
+ public boolean NO_PG_TAG = false;
public enum DownsampleStrategy {
Normal,
@@ -248,6 +268,8 @@ public class ReduceReads extends ReadWalker, Redu
ObjectSortedSet intervalList;
+ ObjectSortedSet knownSnpPositions;
+
// IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
@@ -259,6 +281,24 @@ public class ReduceReads extends ReadWalker, Redu
@Override
public void initialize() {
super.initialize();
+
+ if ( !nwayout && out == null )
+ throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes");
+
+ if ( nwayout && out != null )
+ throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other");
+
+ if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 )
+ throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
+
+ if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
+ throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
+
+ if ( known.isEmpty() )
+ knownSnpPositions = null;
+ else
+ knownSnpPositions = new ObjectAVLTreeSet();
+
GenomeAnalysisEngine toolkit = getToolkit();
readNameHash = new Object2LongOpenHashMap(100000); // prepare the read name hash to keep track of what reads have had their read names compressed
intervalList = new ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
@@ -266,10 +306,8 @@ public class ReduceReads extends ReadWalker, Redu
if (toolkit.getIntervals() != null)
intervalList.addAll(toolkit.getIntervals());
-
final boolean preSorted = true;
final boolean indexOnTheFly = true;
- final boolean keep_records = true;
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
if (nwayout) {
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
@@ -279,7 +317,7 @@ public class ReduceReads extends ReadWalker, Redu
writerToUse = out;
out.setPresorted(false);
if (!NO_PG_TAG) {
- Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
+ Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
}
}
}
@@ -352,8 +390,22 @@ public class ReduceReads extends ReadWalker, Redu
for (GATKSAMRecord mappedRead : mappedReads)
System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
- return mappedReads;
+ // add the SNPs to the list of known positions
+ populateKnownSNPs(metaDataTracker);
+ return mappedReads;
+ }
+
+ /*
+ * Add the positions of known SNPs to the set so that we can keep track of it
+ *
+ * @param metaDataTracker the ref meta data tracker
+ */
+ protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) {
+ for ( final VariantContext vc : metaDataTracker.getValues(known) ) {
+ if ( vc.isSNP() )
+ knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc));
+ }
}
/**
@@ -366,7 +418,7 @@ public class ReduceReads extends ReadWalker, Redu
*/
@Override
public ReduceReadsStash reduceInit() {
- return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION));
+ return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
}
/**
@@ -398,9 +450,16 @@ public class ReduceReads extends ReadWalker, Redu
if (debugLevel == 1)
System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
- for (GATKSAMRecord compressedRead : stash.compress(readReady))
+ for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions))
outputRead(compressedRead);
+ // We only care about maintaining the link between read pairs if they are in the same variant
+ // region. Since an entire variant region's worth of reads is returned in a single call to
+ // stash.compress(), the readNameHash can be cleared after the for() loop above.
+ // The advantage of clearing the hash is that otherwise it holds all reads that have been encountered,
+ // which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory.
+ readNameHash.clear();
+
}
} else
stash.add(read);
@@ -408,6 +467,10 @@ public class ReduceReads extends ReadWalker, Redu
firstRead = false;
}
+ // reduce memory requirements by removing old positions
+ if ( !mappedReads.isEmpty() )
+ clearStaleKnownPositions(mappedReads.get(0));
+
return stash;
}
@@ -420,13 +483,38 @@ public class ReduceReads extends ReadWalker, Redu
public void onTraversalDone(ReduceReadsStash stash) {
// output any remaining reads in the compressor
- for (GATKSAMRecord read : stash.close())
+ for (GATKSAMRecord read : stash.close(knownSnpPositions))
outputRead(read);
if (nwayout)
writerToUse.close();
}
+ /**
+ * Removes known positions that are no longer relevant for use with het compression.
+ *
+ * @param read the current read, used for checking whether there are stale positions we can remove
+ */
+ protected void clearStaleKnownPositions(final GATKSAMRecord read) {
+ // nothing to clear if not used or empty
+ if ( knownSnpPositions == null || knownSnpPositions.isEmpty() )
+ return;
+
+ // not ready to be cleared until we encounter a read from a different contig
+ final int contigIndexOfRead = read.getReferenceIndex();
+ if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead )
+ return;
+
+ // because we expect most elements to be stale, it's not going to be efficient to remove them one at a time
+ final ObjectAVLTreeSet goodLocs = new ObjectAVLTreeSet();
+ for ( final GenomeLoc loc : knownSnpPositions ) {
+ if ( loc.getContigIndex() == contigIndexOfRead )
+ goodLocs.add(loc);
+ }
+ knownSnpPositions.clear();
+ knownSnpPositions.addAll(goodLocs);
+ }
+
/**
* Hard clips away all parts of the read that doesn't agree with the intervals selected.
*
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
index 0a446bab7..52c5f0903 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
@@ -46,6 +46,8 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
+import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@@ -106,11 +108,12 @@ public class ReduceReadsStash {
/**
* sends the read to the MultiSampleCompressor
*
- * @param read the read to be compressed
+ * @param read the read to be compressed
+ * @param knownSnpPositions the set of known SNP positions
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
*/
- public Iterable compress(GATKSAMRecord read) {
- return compressor.addAlignment(read);
+ public Iterable compress(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) {
+ return compressor.addAlignment(read, knownSnpPositions);
}
/**
@@ -125,18 +128,19 @@ public class ReduceReadsStash {
/**
* Close the stash, processing all remaining reads in order
*
+ * @param knownSnpPositions the set of known SNP positions
* @return a list of all the reads produced by the SlidingWindow machinery)
*/
- public Iterable close() {
+ public Iterable close(final ObjectSortedSet knownSnpPositions) {
LinkedList result = new LinkedList();
// compress all the stashed reads (in order)
for (GATKSAMRecord read : outOfOrderReads)
- for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
+ for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions))
result.add(compressedRead);
// output any remaining reads from the compressor
- for (GATKSAMRecord read : compressor.close())
+ for (GATKSAMRecord read : compressor.close(knownSnpPositions))
result.add(read);
return result;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
index 42db83c04..61c34b6a0 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
@@ -46,7 +46,9 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+import com.google.java.contract.Ensures;
import it.unimi.dsi.fastutil.objects.*;
+import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -60,11 +62,11 @@ public class SingleSampleCompressor {
final private int contextSize;
final private int downsampleCoverage;
final private int minMappingQuality;
+ final private double minAltPValueToTriggerVariant;
final private double minAltProportionToTriggerVariant;
final private double minIndelProportionToTriggerVariant;
final private int minBaseQual;
final private ReduceReads.DownsampleStrategy downsampleStrategy;
- final private boolean allowPolyploidReduction;
private SlidingWindow slidingWindow;
private int slidingWindowCounter;
@@ -74,23 +76,30 @@ public class SingleSampleCompressor {
public SingleSampleCompressor(final int contextSize,
final int downsampleCoverage,
final int minMappingQuality,
+ final double minAltPValueToTriggerVariant,
final double minAltProportionToTriggerVariant,
final double minIndelProportionToTriggerVariant,
final int minBaseQual,
- final ReduceReads.DownsampleStrategy downsampleStrategy,
- final boolean allowPolyploidReduction) {
+ final ReduceReads.DownsampleStrategy downsampleStrategy) {
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
this.minMappingQuality = minMappingQuality;
this.slidingWindowCounter = 0;
+ this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant;
this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
this.minBaseQual = minBaseQual;
this.downsampleStrategy = downsampleStrategy;
- this.allowPolyploidReduction = allowPolyploidReduction;
}
- public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) {
+ /**
+ * Add an alignment to the compressor
+ *
+ * @param read the read to be added
+ * @param knownSnpPositions the set of known SNP positions
+ * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
+ */
+ public Pair, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions ) {
ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
CompressionStash stash = new CompressionStash();
int readOriginalStart = read.getUnclippedStart();
@@ -101,14 +110,16 @@ public class SingleSampleCompressor {
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
// close the current sliding window
- Pair, CompressionStash> readsAndStash = slidingWindow.close();
+ Pair, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions);
reads = readsAndStash.getFirst();
stash = readsAndStash.getSecond();
slidingWindow = null; // so we create a new one on the next if
}
if ( slidingWindow == null) { // this is the first read
- slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction);
+ slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(),
+ slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant,
+ minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
slidingWindowCounter++;
}
@@ -116,12 +127,26 @@ public class SingleSampleCompressor {
return new Pair, CompressionStash>(reads, stash);
}
- public Pair, CompressionStash> close() {
- return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
+ /**
+ * Properly closes the compressor.
+ *
+ * @param knownSnpPositions the set of known SNP positions
+ * @return A non-null set/list of all reads generated
+ */
+ @Ensures("result != null")
+ public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) {
+ return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair;
}
- public ObjectSet closeVariantRegions(CompressionStash regions) {
- return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions);
+ /**
+ * Finalizes current variant regions.
+ *
+ * @param knownSnpPositions the set of known SNP positions
+ * @return A non-null set/list of all reads generated
+ */
+ @Ensures("result != null")
+ public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) {
+ return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions);
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
index 7124b4772..d3ca037be 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
@@ -50,26 +50,22 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import it.unimi.dsi.fastutil.bytes.Byte2IntArrayMap;
import it.unimi.dsi.fastutil.bytes.Byte2IntMap;
-import it.unimi.dsi.fastutil.bytes.Byte2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.*;
-import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
+import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.ListIterator;
+import java.util.*;
/**
@@ -81,8 +77,8 @@ import java.util.ListIterator;
public class SlidingWindow {
// Sliding Window data
- final private ObjectAVLTreeSet readsInWindow;
- final private LinkedList windowHeader;
+ final protected PriorityQueue readsInWindow;
+ final protected LinkedList windowHeader;
protected int contextSize; // the largest context size (between mismatches and indels)
protected String contig;
protected int contigIndex;
@@ -100,9 +96,9 @@ public class SlidingWindow {
protected int filteredDataConsensusCounter;
protected String filteredDataReadName;
-
// Additional parameters
- protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches
+ protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches
+ protected double MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches
protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions
protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value
protected int MIN_MAPPING_QUALITY;
@@ -110,8 +106,6 @@ public class SlidingWindow {
protected ReduceReads.DownsampleStrategy downsampleStrategy;
private boolean hasIndelQualities;
- private boolean allowPolyploidReductionInGeneral;
-
private static CompressionStash emptyRegions = new CompressionStash();
/**
@@ -127,8 +121,8 @@ public class SlidingWindow {
return getStopLocation(windowHeader);
}
- private int getStopLocation(LinkedList header) {
- return getStartLocation(header) + header.size() - 1;
+ private int getStopLocation(final LinkedList header) {
+ return header.isEmpty() ? -1 : header.peekLast().getLocation();
}
public String getContig() {
@@ -139,7 +133,7 @@ public class SlidingWindow {
return contigIndex;
}
- public int getStartLocation(LinkedList header) {
+ public int getStartLocation(final LinkedList header) {
return header.isEmpty() ? -1 : header.peek().getLocation();
}
@@ -152,24 +146,33 @@ public class SlidingWindow {
this.windowHeader = new LinkedList();
windowHeader.addFirst(new HeaderElement(startLocation));
- this.readsInWindow = new ObjectAVLTreeSet();
+ this.readsInWindow = new PriorityQueue(100, new Comparator() {
+ @Override
+ public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
+ return read1.getSoftEnd() - read2.getSoftEnd();
+ }
+ });
}
- public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) {
+ public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader,
+ final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber,
+ final double minAltPValueToTriggerVariant, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant,
+ final int minBaseQual, final int minMappingQuality, final int downsampleCoverage,
+ final ReduceReads.DownsampleStrategy downsampleStrategy, final boolean hasIndelQualities) {
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
- this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
+ this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant;
+ this.MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
this.MIN_MAPPING_QUALITY = minMappingQuality;
this.windowHeader = new LinkedList();
- this.readsInWindow = new ObjectAVLTreeSet(new Comparator() {
+ this.readsInWindow = new PriorityQueue(1000, new Comparator() {
@Override
public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
- final int difference = read1.getSoftEnd() - read2.getSoftEnd();
- return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
+ return read1.getSoftEnd() - read2.getSoftEnd();
}
});
@@ -189,8 +192,6 @@ public class SlidingWindow {
this.downsampleStrategy = downsampleStrategy;
this.hasIndelQualities = hasIndelQualities;
-
- this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
}
/**
@@ -294,8 +295,8 @@ public class SlidingWindow {
regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
}
- while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
- readsInWindow.remove(readsInWindow.first());
+ while (!readsInWindow.isEmpty() && readsInWindow.peek().getSoftEnd() < windowHeaderStartLocation) {
+ readsInWindow.poll();
}
return regions;
@@ -348,10 +349,16 @@ public class SlidingWindow {
private final MarkedSites markedSites = new MarkedSites();
/**
- * returns an array marked with variant and non-variant regions (it uses
- * markVariantRegions to make the marks)
+ * returns the MarkedSites object so that it can be tested after adding data to the Sliding Window
*
- * @param stop check the window from start to stop (not-inclusive)
+ * @return the Marked Sites object used by this Sliding Window
+ */
+ protected MarkedSites getMarkedSitesForTesting() { return markedSites; }
+
+ /**
+ * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks)
+ *
+ * @param stop check the window from start to stop (not-inclusive); given in global coordinates
*/
protected void markSites(final int stop) {
@@ -361,22 +368,17 @@ public class SlidingWindow {
// copy over as many bits as we can from the previous calculation. Note that we can't trust the
// last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there.
final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1;
- final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize);
+ final int locationToProcess = Math.max(windowHeaderStartLocation, Math.min(lastPositionMarked, stop - contextSize));
- // update the iterator to the correct position
- Iterator headerElementIterator = windowHeader.iterator();
- for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
- if (headerElementIterator.hasNext())
- headerElementIterator.next();
- }
+ final ListIterator headerElementIterator = windowHeader.listIterator(locationToProcess - windowHeaderStartLocation);
// process a contextSize worth of region from scratch in case there's a variant there
for (int i = locationToProcess; i < stop; i++) {
if (headerElementIterator.hasNext()) {
HeaderElement headerElement = headerElementIterator.next();
- if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
- markVariantRegion(markedSites, i - windowHeaderStartLocation);
+ if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
+ markVariantRegion(i - windowHeaderStartLocation);
} else
break;
@@ -386,33 +388,44 @@ public class SlidingWindow {
/**
* Marks the sites around the variant site (as true)
*
- * @param markedSites the boolean array to bear the marks
* @param variantSiteLocation the location where a variant site was found
*/
- protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
+ protected void markVariantRegion(final int variantSiteLocation) {
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
- int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
- for (int i = from; i < to; i++)
- markedSites.getVariantSiteBitSet()[i] = true;
+ int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize;
+ markRegionAs(from, to, true);
}
/**
- * Adds bases to the running consensus or filtered data accordingly
+ * Marks the sites around the variant site (as true)
+ *
+ * @param from the start index (inclusive) to mark
+ * @param to the end index (inclusive) to mark
+ * @param isVariant mark the region with this boolean value
+ */
+ private void markRegionAs(final int from, final int to, final boolean isVariant) {
+ for (int i = from; i <= to; i++)
+ markedSites.getVariantSiteBitSet()[i] = isVariant;
+ }
+
+ /**
+ * Adds bases to the running consensus
*
* If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
*
* @param header the window header
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
- * @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
+ * @param strandType the strandedness that the synthetic read should be represented as having
* @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated.
*/
@Requires({"start >= 0 && (end >= start || end == 0)"})
@Ensures("result != null")
- protected ObjectArrayList addToSyntheticReads(LinkedList header, int start, int end, boolean isNegativeStrand) {
- ObjectArrayList reads = new ObjectArrayList();
- if (start < end) {
- ListIterator headerElementIterator = header.listIterator(start);
+ protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) {
+ final ObjectArrayList reads = new ObjectArrayList();
+
+ if ( start < end ) {
+ final ListIterator headerElementIterator = header.listIterator(start);
if (!headerElementIterator.hasNext())
throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end));
@@ -420,37 +433,29 @@ public class SlidingWindow {
HeaderElement headerElement = headerElementIterator.next();
if (headerElement.hasConsensusData()) {
- reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
-
- int endOfConsensus = findNextNonConsensusElement(header, start, end);
- addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand);
+ // find the end of the consecutive consensus data in the window
+ final int endOfConsensus = findNextNonConsensusElement(header, start, end);
if (endOfConsensus <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
- reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand));
- } else if (headerElement.hasFilteredData()) {
+ // add to running consensus and recurse
+ addToRunningConsensus(header, start, endOfConsensus, strandType);
+ reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType));
+
+ } else {
+
+ // add any outstanding consensus data
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
- int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
- reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
-
- if (endOfFilteredData <= start)
- throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
-
- reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand));
- } else if (headerElement.isEmpty()) {
- reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
-
- int endOfEmptyData = findNextNonEmptyElement(header, start, end);
-
+ // find the end of the consecutive empty data in the window
+ final int endOfEmptyData = findNextConsensusElement(header, start, end);
if (endOfEmptyData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
- reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand));
- } else
- throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
-
+ // recurse out of the empty region
+ reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType));
+ }
}
return reads;
@@ -462,24 +467,21 @@ public class SlidingWindow {
* @param type the synthetic reads you want to close
* @return a possibly null list of GATKSAMRecords generated by finalizing the synthetic reads
*/
- private ObjectArrayList finalizeAndAdd(ConsensusType type) {
- GATKSAMRecord read = null;
- ObjectArrayList list = new ObjectArrayList();
+ private ObjectArrayList finalizeAndAdd(final ConsensusType type) {
- switch (type) {
- case CONSENSUS:
- read = finalizeRunningConsensus();
- break;
- case FILTERED:
- read = finalizeFilteredDataConsensus();
- break;
- case BOTH:
- read = finalizeRunningConsensus();
- if (read != null) list.add(read);
- read = finalizeFilteredDataConsensus();
+ final ObjectArrayList list = new ObjectArrayList();
+
+ if ( type == ConsensusType.CONSENSUS || type == ConsensusType.BOTH ) {
+ final GATKSAMRecord read = finalizeRunningConsensus();
+ if ( read != null )
+ list.add(read);
+ }
+
+ if ( type == ConsensusType.FILTERED || type == ConsensusType.BOTH ) {
+ final GATKSAMRecord read = finalizeFilteredDataConsensus();
+ if ( read != null )
+ list.add(read);
}
- if (read != null)
- list.add(read);
return list;
}
@@ -487,19 +489,145 @@ public class SlidingWindow {
/**
* Looks for the next position without consensus data
*
- * @param start beginning of the filtered region
- * @param upTo limit to search for another consensus element
+ * @param header the header to check
+ * @param start beginning of the filtered region
+ * @param upTo limit to search for another consensus element
* @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
*/
- private int findNextNonConsensusElement(LinkedList header, int start, int upTo) {
- Iterator headerElementIterator = header.listIterator(start);
+ private int findNextNonConsensusElement(final LinkedList header, final int start, final int upTo) {
+ final Iterator headerElementIterator = header.listIterator(start);
int index = start;
while (index < upTo) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("There are no more header elements in this window");
- HeaderElement headerElement = headerElementIterator.next();
+ if (!headerElementIterator.next().hasConsensusData())
+ break;
+ index++;
+ }
+ return index;
+ }
+
+ /**
+ * Looks for the next position witho consensus data
+ *
+ * @param header the header to check
+ * @param start beginning of the filtered region
+ * @param upTo limit to search for another consensus element
+ * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
+ */
+ private int findNextConsensusElement(final LinkedList header, final int start, final int upTo) {
+ final Iterator headerElementIterator = header.listIterator(start);
+ int index = start;
+ while (index < upTo) {
+ if (!headerElementIterator.hasNext())
+ throw new ReviewedStingException("There are no more header elements in this window");
+
+ if (headerElementIterator.next().hasConsensusData())
+ break;
+ index++;
+ }
+ return index;
+ }
+
+ /**
+ * Adds bases to the filtered data synthetic read.
+ *
+ * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
+ * bases.
+ *
+ * @param header the window header
+ * @param start the first header index to add to consensus
+ * @param end the first header index NOT TO add to consensus
+ * @param strandType the strandedness that the synthetic read should be represented as having
+ */
+ @Requires({"start >= 0 && (end >= start || end == 0)"})
+ private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) {
+ if (runningConsensus == null)
+ runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType);
+
+ final Iterator headerElementIterator = header.listIterator(start);
+
+ for (int index = start; index < end; index++) {
+ if (!headerElementIterator.hasNext())
+ throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
+
+ final HeaderElement headerElement = headerElementIterator.next();
if (!headerElement.hasConsensusData())
+ throw new ReviewedStingException("No CONSENSUS data in " + index);
+
+ genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts());
+ }
+ }
+
+ /**
+ * Adds bases to the running filtered data accordingly
+ *
+ * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
+ *
+ * @param header the window header
+ * @param start the first header index to add to consensus
+ * @param end the first header index NOT TO add to consensus
+ * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated.
+ */
+ @Requires({"start >= 0 && (end >= start || end == 0)"})
+ @Ensures("result != null")
+ protected ObjectArrayList addToFilteredReads(final LinkedList header, final int start, final int end) {
+ final ObjectArrayList reads = new ObjectArrayList();
+
+ if ( start < end ) {
+ final ListIterator headerElementIterator = header.listIterator(start);
+
+ if (!headerElementIterator.hasNext())
+ throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end));
+
+ HeaderElement headerElement = headerElementIterator.next();
+
+ if (headerElement.hasFilteredData()) {
+
+ // find the end of the consecutive filtered data in the window
+ final int endOfFiltered = findNextNonFilteredElement(header, start, end);
+ if (endOfFiltered <= start)
+ throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFiltered, start));
+
+ // add to running filtered consensus and recurse
+ addToFilteredData(header, start, endOfFiltered);
+ reads.addAll(addToFilteredReads(header, endOfFiltered, end));
+
+ } else {
+
+ // add any outstanding filtered data
+ reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
+
+ // find the end of the consecutive empty data in the window
+ final int endOfEmptyData = findNextFilteredElement(header, start, end);
+ if (endOfEmptyData <= start)
+ throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
+
+ // recurse out of the empty region
+ reads.addAll(addToFilteredReads(header, endOfEmptyData, end));
+ }
+ }
+
+ return reads;
+ }
+
+ /**
+ * Looks for the next position without consensus data
+ *
+ * @param header the header to check
+ * @param start beginning of the filtered region
+ * @param upTo limit to search for another consensus element
+ * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
+ */
+ private int findNextNonFilteredElement(final LinkedList header, final int start, final int upTo) {
+ final Iterator headerElementIterator = header.listIterator(start);
+ int index = start;
+ while (index < upTo) {
+ if (!headerElementIterator.hasNext())
+ throw new ReviewedStingException("There are no more header elements in this window");
+
+ if (!headerElementIterator.next().hasFilteredData())
break;
index++;
}
@@ -507,43 +635,21 @@ public class SlidingWindow {
}
/**
- * Looks for the next position without filtered data
+ * Looks for the next position witho consensus data
*
- * @param start beginning of the region
- * @param upTo limit to search for
- * @return next position in local coordinates (relative to the windowHeader) with no filtered data; otherwise, the start position
+ * @param header the header to check
+ * @param start beginning of the filtered region
+ * @param upTo limit to search for another consensus element
+ * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
*/
- private int findNextNonFilteredDataElement(LinkedList header, int start, int upTo) {
- Iterator headerElementIterator = header.listIterator(start);
+ private int findNextFilteredElement(final LinkedList header, final int start, final int upTo) {
+ final Iterator headerElementIterator = header.listIterator(start);
int index = start;
while (index < upTo) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("There are no more header elements in this window");
- HeaderElement headerElement = headerElementIterator.next();
- if (!headerElement.hasFilteredData() || headerElement.hasConsensusData())
- break;
- index++;
- }
- return index;
- }
-
- /**
- * Looks for the next non-empty header element
- *
- * @param start beginning of the region
- * @param upTo limit to search for
- * @return next position in local coordinates (relative to the windowHeader) with non-empty element; otherwise, the start position
- */
- private int findNextNonEmptyElement(LinkedList header, int start, int upTo) {
- ListIterator headerElementIterator = header.listIterator(start);
- int index = start;
- while (index < upTo) {
- if (!headerElementIterator.hasNext())
- throw new ReviewedStingException("There are no more header elements in this window");
-
- HeaderElement headerElement = headerElementIterator.next();
- if (!headerElement.isEmpty())
+ if (headerElementIterator.next().hasFilteredData())
break;
index++;
}
@@ -559,66 +665,25 @@ public class SlidingWindow {
* @param header the window header
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
- * @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
- * @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated.
*/
@Requires({"start >= 0 && (end >= start || end == 0)"})
@Ensures("result != null")
- private ObjectArrayList addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) {
- ObjectArrayList result = new ObjectArrayList();
+ private void addToFilteredData(final LinkedList header, final int start, final int end) {
if (filteredDataConsensus == null)
- filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
+ filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, SyntheticRead.StrandType.STRANDLESS);
ListIterator headerElementIterator = header.listIterator(start);
for (int index = start; index < end; index++) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
- HeaderElement headerElement = headerElementIterator.next();
- if (headerElement.hasConsensusData())
- throw new ReviewedStingException("Found consensus data inside region to add to filtered data.");
+ final HeaderElement headerElement = headerElementIterator.next();
if (!headerElement.hasFilteredData())
throw new ReviewedStingException("No filtered data in " + index);
- if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
- result.add(finalizeFilteredDataConsensus());
- filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
- }
-
- genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
- }
-
- return result;
- }
-
- /**
- * Adds bases to the filtered data synthetic read.
- *
- * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
- * bases.
- *
- * @param header the window header
- * @param start the first header index to add to consensus
- * @param end the first header index NOT TO add to consensus
- * @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
- */
- @Requires({"start >= 0 && (end >= start || end == 0)"})
- private void addToRunningConsensus(LinkedList header, int start, int end, boolean isNegativeStrand) {
- if (runningConsensus == null)
- runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
-
- Iterator headerElementIterator = header.listIterator(start);
- for (int index = start; index < end; index++) {
- if (!headerElementIterator.hasNext())
- throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
-
- HeaderElement headerElement = headerElementIterator.next();
- if (!headerElement.hasConsensusData())
- throw new ReviewedStingException("No CONSENSUS data in " + index);
-
- genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
+ genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts());
}
}
@@ -627,15 +692,14 @@ public class SlidingWindow {
*
* @param syntheticRead the synthetic read to add to
* @param baseCounts the base counts object in the header element
- * @param rms the rms mapping quality in the header element
*/
- private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
+ private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) {
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfBase(base);
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
- syntheticRead.add(base, count, qual, insQual, delQual, rms);
+ syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS());
}
/**
@@ -643,117 +707,219 @@ public class SlidingWindow {
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
- * @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here?
- * @return a non-null list of all reads contained in the variant region
+ * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
+ * @return a non-null object representing all reads contained in the variant region
*/
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
- protected ObjectList compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
- ObjectList allReads = new ObjectArrayList();
+ protected CloseVariantRegionResult compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) {
+ final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop);
// Try to compress into a polyploid consensus
- int nVariantPositions = 0;
- int hetRefPosition = -1;
- boolean canCompress = true;
- Object[] header = windowHeader.toArray();
+ // Optimization: don't bother if there are no known SNPs here
+ final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop);
- // foundEvent will remain false if we don't allow polyploid reduction
- if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
- for (int i = start; i<=stop; i++) {
-
- int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
-
- // we will only work on diploid cases because we just don't want to handle/test other scenarios
- if ( nAlleles > 2 ) {
- canCompress = false;
- break;
- } else if ( nAlleles == 2 ) {
- nVariantPositions++;
-
- // make sure that there is only 1 site in the variant region that contains more than one allele
- if ( nVariantPositions == 1 ) {
- hetRefPosition = i;
- } else if ( nVariantPositions > 1 ) {
- canCompress = false;
- break;
- }
- }
- }
+ // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by
+ // insertions (which we don't want because we can't confirm that they represent the same allele).
+ // Also, we only allow polyploid consensus creation at known sites if provided.
+ if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) {
+ // try to create the polyploid consensus
+ allReads.reads.addAll(createPolyploidConsensus(hetRefPosition));
+ allReads.stopPerformed = hetRefPosition; // we stopped at the het position
}
-
- // Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress
- // variant regions that are created by insertions (since we can't confirm here that they represent the same allele)
- if ( canCompress && hetRefPosition != -1 ) {
- allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation());
- }
-
- // Return all reads that overlap the variant region and remove them from the window header entirely
- // also remove all reads preceding the variant region (since they will be output as consensus right after compression
+ // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them
+ // from the window header entirely; also remove all reads preceding the variant region (since they will be output
+ // as consensus right after compression)
else {
final int refStart = windowHeader.get(start).getLocation();
final int refStop = windowHeader.get(stop).getLocation();
- ObjectList toRemove = new ObjectArrayList();
- for (GATKSAMRecord read : readsInWindow) {
- if (read.getSoftStart() <= refStop) {
- if (read.getAlignmentEnd() >= refStart) {
- allReads.add(read);
+ final ObjectList toRemove = new ObjectArrayList();
+ for ( final GATKSAMRecord read : readsInWindow ) {
+ if ( read.getSoftStart() <= refStop ) {
+ if ( read.getAlignmentEnd() >= refStart ) {
+ allReads.reads.add(read);
removeFromHeader(windowHeader, read);
}
toRemove.add(read);
}
}
- removeReadsFromWindow(toRemove);
+
+ // remove all used reads
+ for ( final GATKSAMRecord read : toRemove )
+ readsInWindow.remove(read);
}
+
return allReads;
}
+ /**
+ * Determines whether the given position match one of the known sites
+ *
+ * @param targetPosition the position of the het site
+ * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
+ * @return true if the targetPosition matches a known SNP position, false otherwise
+ */
+ @Requires({"targetPosition >= 1 && knownSnpPositions != null"})
+ protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) {
+ final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition);
+ return knownSnpPositions == null || knownSnpPositions.contains(targetLoc);
+ }
+
+ /*
+ * Finds the het variant position located within start and stop (inclusive) if one exists.
+ *
+ * @param start the first header index in the region to check (inclusive)
+ * @param stop the last header index of the region to check (inclusive)
+ * @return the window header index of the single het position or -1 if either none or more than one exists
+ */
+ @Requires("start >= 0 && (stop >= start || stop == 0)")
+ protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) {
+ int hetRefPosition = -1;
+
+ for ( int i = start; i <= stop; i++ ) {
+
+ final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT);
+
+ // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios
+ if ( nAlleles > 2 || nAlleles == -1 )
+ return -1;
+
+ if ( nAlleles == 2 ) {
+
+ // make sure that there is only 1 site in the region that contains more than one allele
+ if ( hetRefPosition != -1 )
+ return -1;
+
+ hetRefPosition = i;
+ }
+ }
+
+ return hetRefPosition;
+ }
+
+ /*
+ * Checks whether there's a position in the header with a significant number of softclips or a variant.
+ *
+ * @param header the window header to examine
+ * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument)
+ * @return true if there exists a position with significant softclips, false otherwise
+ */
+ @Requires("header != null")
+ protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List header, final int positionToSkip) {
+
+ for ( final HeaderElement headerElement : header ) {
+
+ if ( headerElement.getLocation() == positionToSkip )
+ continue;
+
+ if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ||
+ headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) > 1 )
+ return true;
+ }
+
+ return false;
+ }
+
/**
* Finalizes a variant region, any adjacent synthetic reads.
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
- * @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here?
- * @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads
+ * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
+ * @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads
*/
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
- protected ObjectList closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
- ObjectList allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
+ protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) {
+ final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions);
- ObjectList result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
- result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
- result.addAll(finalizeAndAdd(ConsensusType.BOTH));
+ final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed);
+ result.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(allReads.reads) : allReads.reads);
+ result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS));
+ result.reads.addAll(addToFilteredReads(windowHeader, 0, allReads.stopPerformed + 1));
+ result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
return result; // finalized reads will be downsampled if necessary
}
- public ObjectSet closeVariantRegions(CompressionStash regions) {
- ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
- if (!regions.isEmpty()) {
- int lastStop = -1;
- int windowHeaderStart = getStartLocation(windowHeader);
+ /*
+ * @see #closeVariantRegions(CompressionStash, ObjectSortedSet, boolean) with forceCloseFullRegions set to false
+ */
+ public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) {
+ return closeVariantRegions(regions, knownSnpPositions, false);
+ }
- for (GenomeLoc region : regions) {
+ private static final class CloseVariantRegionResult {
+ final private ObjectList reads = new ObjectArrayList();
+ private int stopPerformed;
+
+ public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; }
+ }
+
+ /*
+ * Finalizes the list of regions requested (and any regions preceding them)
+ *
+ * @param regions the list of regions to finalize
+ * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere)
+ * @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. during het compression)
+ * @return a non-null set of reduced reads representing the finalized regions
+ */
+ public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions, final boolean forceCloseFullRegions) {
+ final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
+ if ( !regions.isEmpty() ) {
+
+ int windowHeaderStart = getStartLocation(windowHeader);
+ HeaderElement lastCleanedElement = null;
+
+ for ( final GenomeLoc region : regions ) {
if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) {
- int start = region.getStart() - windowHeaderStart;
+ final int start = region.getStart() - windowHeaderStart;
int stop = region.getStop() - windowHeaderStart;
- allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
- lastStop = stop;
+ CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
+ allReads.addAll(closeVariantRegionResult.reads);
+
+ // check whether we didn't close the whole region that was requested
+ if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) {
+ // we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant"
+ markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false);
+
+ // if the calling method said that it didn't care then we are okay so update the stop
+ if ( !forceCloseFullRegions ) {
+ stop = closeVariantRegionResult.stopPerformed;
+ }
+ // otherwise, we need to forcibly push the stop that we originally requested
+ else {
+ while ( closeVariantRegionResult.stopPerformed < stop ) {
+ // first clean up used header elements so they don't get reused
+ for ( int i = 0; i <= closeVariantRegionResult.stopPerformed; i++ )
+ windowHeader.remove();
+ stop -= (closeVariantRegionResult.stopPerformed + 1);
+
+ closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions);
+ allReads.addAll(closeVariantRegionResult.reads);
+ }
+ }
+ }
+
+ // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions.
+ // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads
+ // (in the case where there are multiple regions to close we'd reuse the reads for each region).
+ if ( stop >= 0 ) {
+ for ( int i = 0; i < stop; i++ )
+ windowHeader.remove();
+ lastCleanedElement = windowHeader.remove();
+ windowHeaderStart = getStartLocation(windowHeader);
+ }
}
}
- // clean up the window header elements up until the end of the variant region.
- // note that we keep the last element of the region in the event that the following element has a read that starts with insertion.
- if ( lastStop >= 0 ) {
- for (int i = 0; i < lastStop; i++)
- windowHeader.remove();
- final HeaderElement lastOfRegion = windowHeader.remove();
- if ( lastOfRegion.hasInsertionToTheRight() )
- windowHeader.addFirst(new HeaderElement(lastOfRegion.getLocation(), lastOfRegion.numInsertionsToTheRight()));
- }
+ // we need to keep the last element of the last cleaned region in the event that the following element has a read that starts with an insertion.
+ if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() )
+ windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight()));
}
+
return allReads;
}
@@ -786,22 +952,23 @@ public class SlidingWindow {
* regions that still exist regardless of being able to fulfill the
* context size requirement in the end.
*
+ * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere)
* @return A non-null set/list of all reads generated
*/
@Ensures("result != null")
- public Pair, CompressionStash> close() {
+ public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) {
// mark variant regions
ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator());
CompressionStash regions = new CompressionStash();
- boolean forceCloseUnfinishedRegions = true;
if (!windowHeader.isEmpty()) {
markSites(getStopLocation(windowHeader) + 1);
- regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
- finalizedReads = closeVariantRegions(regions);
+ regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true);
+ finalizedReads = closeVariantRegions(regions, knownSnpPositions, true);
if (!windowHeader.isEmpty()) {
- finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
+ finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS));
+ finalizedReads.addAll(addToFilteredReads(windowHeader, 0, windowHeader.size()));
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
}
}
@@ -845,86 +1012,120 @@ public class SlidingWindow {
return finalizedRead;
}
+ // define this so that we can use Java generics below
+ private final static class HeaderElementList extends LinkedList {}
+
+ private final static class SingleStrandConsensusData {
+ final HeaderElementList consensus = new HeaderElementList();
+ final ObjectList reads = new ObjectArrayList();
+ }
+
/**
- * Finalizes a variant region, any adjacent synthetic reads.
+ * Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not
+ * supported) with polyploid compression.
*
- * @param start the first window header index in the variant region (inclusive)
- * @param stop the last window header index of the variant region (inclusive)
- * @param hetRefPosition reference position (in global coordinates) of the het site
+ * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE!
* @return a non-null list of all reads contained in the variant region as a polyploid consensus
*/
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
- @Ensures("result != null")
- private ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) {
- // we will create two (positive strand, negative strand) headers for each contig
- ObjectList> headersPosStrand = new ObjectArrayList>();
- ObjectList> headersNegStrand = new ObjectArrayList>();
- ObjectList hetReads = new ObjectArrayList();
- Byte2IntMap haplotypeHeaderMap = new Byte2IntArrayMap(2);
- int currentHaplotype = 0;
- int refStart = windowHeader.get(start).getLocation();
- int refStop = windowHeader.get(stop).getLocation();
- ObjectList toRemove = new ObjectArrayList();
- for (GATKSAMRecord read : readsInWindow) {
- int haplotype;
+ @Ensures({"result != null"})
+ protected ObjectList createPolyploidConsensus(final int hetRefPosition) {
+ // we will create two (positive strand, negative strand) headers for each haplotype
+ final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2];
+ final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2];
- // check if the read is either before or inside the variant region
- if (read.getSoftStart() <= refStop) {
- // check if the read is inside the variant region
- if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) {
- // check if the read contains the het site
- if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) {
- int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
- // TODO -- THIS IS A HUGE BUG AS IT WILL NOT WORK FOR DELETIONS; see commented out unit test
- byte base = read.getReadBases()[readPos];
- byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos];
+ final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation();
- // check if base passes the filters!
- if (qual >= MIN_BASE_QUAL_TO_COUNT) {
- // check which haplotype this read represents and take the index of it from the list of headers
- if (haplotypeHeaderMap.containsKey(base)) {
- haplotype = haplotypeHeaderMap.get(base);
- }
- // create new lists if this haplotype has not been seen yet
- else {
- haplotype = currentHaplotype;
- haplotypeHeaderMap.put(base, currentHaplotype);
- headersPosStrand.add(new LinkedList());
- headersNegStrand.add(new LinkedList());
- currentHaplotype++;
- }
- LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype);
- // add to the polyploid header
- addToHeader(header, read);
- // remove from the standard header so that we don't double count it
- removeFromHeader(windowHeader, read);
- }
- }
- }
+ // initialize the mapping from base (allele) to header
+ final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2);
+ for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ) {
+ final int currentIndex = alleleHeaderMap.size();
+ if ( currentIndex > 1 )
+ throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus");
- // we remove all reads before and inside the variant region from the window
- toRemove.add(read);
+ alleleHeaderMap.put(allele.b, currentIndex);
+ headersPosStrand[currentIndex] = new SingleStrandConsensusData();
+ headersNegStrand[currentIndex] = new SingleStrandConsensusData();
+ }
+
+ // sanity check that we saw 2 alleles
+ if ( alleleHeaderMap.size() != 2 )
+ throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size());
+
+ final ObjectList readsToRemove = new ObjectArrayList();
+
+ for ( final GATKSAMRecord read : readsInWindow ) {
+
+ // if the read falls after the het position, just skip it for now (we'll get to it later)
+ if ( read.getSoftStart() > globalHetRefPosition )
+ continue;
+
+ // remove all other reads from the read cache since we're going to use them here
+ readsToRemove.add(read);
+
+ // if the read falls before the het position or has low MQ, we don't need to look at it
+ if ( read.getSoftEnd() < globalHetRefPosition || read.getMappingQuality() < MIN_MAPPING_QUALITY)
+ continue;
+
+ // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead
+ removeFromHeader(windowHeader, read);
+
+ // where on the read is the het position?
+ final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
+
+ // this is safe because indels are not supported
+ final byte base = read.getReadBases()[readPosOfHet];
+
+ // check which allele this read represents
+ final Integer allele = alleleHeaderMap.get(base);
+
+ // ignore the read if it represents a base that's not part of the consensus
+ if ( allele != null ) {
+ // add to the appropriate polyploid header
+ final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele];
+ header.reads.add(read);
+ addToHeader(header.consensus, read);
}
}
- for (LinkedList header : headersPosStrand) {
- if (header.size() > 0)
- hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false));
- if (runningConsensus != null)
- hetReads.add(finalizeRunningConsensus());
- }
- for (LinkedList header : headersNegStrand) {
- if (header.size() > 0)
- hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true));
- if (runningConsensus != null)
- hetReads.add(finalizeRunningConsensus());
- }
+ for ( final GATKSAMRecord read : readsToRemove )
+ readsInWindow.remove(read);
- removeReadsFromWindow(toRemove);
+ // create the polyploid synthetic reads if we can
+ final ObjectList hetReads = new ObjectArrayList();
+
+ // sanity check that no new "variant region" exists on just a single consensus strand due to softclips
+ // or multi-allelic sites now that we've broken everything out into their component parts. if one does
+ // exist then we need to back out the consensus for that strand only.
+ for ( final SingleStrandConsensusData header : headersPosStrand ) {
+ if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) )
+ hetReads.addAll(header.reads);
+ else
+ finalizeHetConsensus(header.consensus, false, hetReads);
+ }
+ for ( final SingleStrandConsensusData header : headersNegStrand ) {
+ if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) )
+ hetReads.addAll(header.reads);
+ else
+ finalizeHetConsensus(header.consensus, true, hetReads);
+ }
return hetReads;
}
+ /*
+ * Finalizes a particular het consensus for the given header representation
+ *
+ * @param header the list of header elements representing the header for the consensus
+ * @param isNegativeStrand does this header represent reads on the negative strand?
+ * @param result list in which to store results
+ */
+ protected void finalizeHetConsensus(final LinkedList header, final boolean isNegativeStrand, final ObjectList result) {
+ if ( header.size() > 0 )
+ result.addAll(addToSyntheticReads(header, 0, header.size(), isNegativeStrand ? SyntheticRead.StrandType.NEGATIVE : SyntheticRead.StrandType.POSITIVE));
+ if ( runningConsensus != null )
+ result.add(finalizeRunningConsensus());
+ }
private void addToHeader(LinkedList header, GATKSAMRecord read) {
updateHeaderCounts(header, read, false);
@@ -934,115 +1135,158 @@ public class SlidingWindow {
updateHeaderCounts(header, read, true);
}
-
/**
* Updates the sliding window's header counts with the incoming read bases, insertions
* and deletions.
*
- * @param header the sliding window header to use
- * @param read the incoming read to be added to the sliding window
- * @param removeRead if we are removing the read from the header or adding
+ * @param header the sliding window header to use
+ * @param read the incoming read to be added to the sliding window
+ * @param removeRead if we are removing the read from the header or adding
*/
- private void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) {
- byte[] bases = read.getReadBases();
- byte[] quals = read.getBaseQualities();
- byte[] insQuals = read.getExistingBaseInsertionQualities();
- byte[] delQuals = read.getExistingBaseDeletionQualities();
- int readStart = read.getSoftStart();
- int readEnd = read.getSoftEnd();
- Cigar cigar = read.getCigar();
+ protected void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) {
+ final int readStart = read.getSoftStart();
+ final int headerStart = getStartLocation(header);
+ int locationIndex = headerStart < 0 ? 0 : readStart - headerStart;
- int readBaseIndex = 0;
- int startLocation = getStartLocation(header);
- int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
- int stopLocation = getStopLocation(header);
+ if ( removeRead && locationIndex < 0 )
+ throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header));
- if (removeRead && locationIndex < 0)
- throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
+ // we only need to create new header elements if we are adding the read, not when we're removing it
+ if ( !removeRead )
+ locationIndex = createNewHeaderElements(header, read, locationIndex);
- if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
- if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
- for (int i = 1; i <= -locationIndex; i++)
- header.addFirst(new HeaderElement(startLocation - i));
+ actuallyUpdateHeaderForRead(header, read, removeRead, locationIndex);
+ }
- startLocation = readStart; // update start location accordingly
- locationIndex = 0;
- }
+ /*
+ * Creates new header elements if needed for the given read.
+ *
+ * @param header the sliding window header to use
+ * @param read the incoming read to be added to the sliding window
+ * @param startIndex the start location index into the header for this read
+ *
+ * @return an updated index into the modified header
+ */
+ @Requires("header != null && read != null")
+ protected int createNewHeaderElements(final LinkedList header, final GATKSAMRecord read, final int startIndex) {
- if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
- int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
- while (elementsToAdd-- > 0)
- header.addLast(new HeaderElement(readEnd - elementsToAdd));
- }
+ int headerStart = getStartLocation(header);
+ int locationIndex = startIndex;
- // Special case for leading insertions before the beginning of the sliding read
- if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
- header.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
- locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
- }
+ // Do we need to add extra elements before the start of the header? This could happen if the previous read was
+ // clipped and this alignment starts before the beginning of the window
+ final int readStart = read.getSoftStart();
+ if ( startIndex < 0 ) {
+ for ( int i = 1; i <= -startIndex; i++ )
+ header.addFirst(new HeaderElement(headerStart - i));
+
+ // update the start location accordingly
+ headerStart = readStart;
+ locationIndex = 0;
}
- Iterator headerElementIterator = header.listIterator(locationIndex);
+ // Do we need to add extra elements to the end of the header?
+ final int headerStop = getStopLocation(header);
+ final int readEnd = read.getSoftEnd();
+ if ( headerStop < readEnd ) {
+ final int elementsToAdd = (headerStop < 0) ? readEnd - readStart + 1 : readEnd - headerStop;
+ for ( int i = elementsToAdd - 1; i >= 0; i-- )
+ header.addLast(new HeaderElement(readEnd - i));
+ }
+
+ // Special case for leading insertions before the beginning of the sliding read
+ if ( ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == headerStart || headerStart < 0) ) {
+ // create a new first element to the window header with no bases added
+ header.addFirst(new HeaderElement(readStart - 1));
+ // this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing
+ locationIndex = 1;
+ }
+
+ return locationIndex;
+ }
+
+ /*
+ * Actually updates the sliding window's header counts with the incoming read bases and quals (including insertion and deletion quals).
+ *
+ * @param header the sliding window header to use
+ * @param read the incoming read to be added to the sliding window
+ * @param removeRead if we are removing the read from the header or adding
+ * @param startIndex the start location index into the header for this read
+ */
+ @Requires("header != null && read != null && startIndex >= 0")
+ protected void actuallyUpdateHeaderForRead(final LinkedList header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) {
+
+ final Iterator headerElementIterator = header.listIterator(startIndex);
+ final byte mappingQuality = (byte) read.getMappingQuality();
+
+ // iterator variables
+ int locationIndex = startIndex;
+ int readBaseIndex = 0;
HeaderElement headerElement;
- for (CigarElement cigarElement : cigar.getCigarElements()) {
- switch (cigarElement.getOperator()) {
+
+ for ( final CigarElement cigarElement : read.getCigar().getCigarElements() ) {
+ switch ( cigarElement.getOperator() ) {
case H:
break;
case I:
- if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
- break;
- }
-
- headerElement = header.get(locationIndex - 1); // insertions are added to the base to the left (previous element)
-
- if (removeRead) {
- headerElement.removeInsertionToTheRight();
- }
- else {
- headerElement.addInsertionToTheRight();
- }
readBaseIndex += cigarElement.getLength();
- break; // just ignore the insertions at the beginning of the read
- case D:
- int nDeletions = cigarElement.getLength();
- while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score
- headerElement = headerElementIterator.next();
- byte mq = (byte) read.getMappingQuality();
- if (removeRead)
- headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
- else
- headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
- locationIndex++;
+ // special case, if we don't have the previous header element anymore, don't worry about it.
+ if ( locationIndex == 0 )
+ break;
+
+ // insertions are added to the base to the left (previous element)
+ headerElement = header.get(locationIndex - 1);
+
+ if ( removeRead )
+ headerElement.removeInsertionToTheRight();
+ else
+ headerElement.addInsertionToTheRight();
+
+ break;
+ case D:
+ // deletions are added to the baseCounts with the read mapping quality as it's quality score
+ final int nDeletionBases = cigarElement.getLength();
+ for ( int i = 0; i < nDeletionBases; i++ ) {
+ headerElement = headerElementIterator.next();
+ if (removeRead)
+ headerElement.removeBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
+ else
+ headerElement.addBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
}
+ locationIndex += nDeletionBases;
break;
case S:
case M:
case P:
case EQ:
case X:
- int nBasesToAdd = cigarElement.getLength();
- while (nBasesToAdd-- > 0) {
+ final int nBasesToAdd = cigarElement.getLength();
+ final boolean isSoftClip = cigarElement.getOperator() == CigarOperator.S;
+ final byte[] readBases = read.getReadBases();
+ final byte[] readQuals = read.getBaseQualities();
+ final boolean readHasIndelQuals = read.hasBaseIndelQualities();
+ final byte[] insertionQuals = readHasIndelQuals ? read.getBaseInsertionQualities() : null;
+ final byte[] deletionQuals = readHasIndelQuals ? read.getBaseDeletionQualities() : null;
+
+ for ( int i = 0; i < nBasesToAdd; i++ ) {
headerElement = headerElementIterator.next();
- byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything)
- byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex];
- if (removeRead)
- headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
+ final byte insertionQuality = readHasIndelQuals ? insertionQuals[readBaseIndex] : -1;
+ final byte deletionQuality = readHasIndelQuals ? deletionQuals[readBaseIndex] : -1;
+
+ if ( removeRead )
+ headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip);
else
- headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
+ headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip);
readBaseIndex++;
- locationIndex++;
}
+ locationIndex += nBasesToAdd;
+ break;
+ default:
break;
}
}
}
-
- private void removeReadsFromWindow (ObjectList readsToRemove) {
- for (GATKSAMRecord read : readsToRemove) {
- readsInWindow.remove(read);
- }
- }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
index 72fd52ebe..9d16ea06f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
@@ -47,13 +47,11 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Requires;
-import it.unimi.dsi.fastutil.bytes.ByteArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
-import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
@@ -76,17 +74,25 @@ import java.util.Iterator;
* @since 8/26/11
*/
public class SyntheticRead {
- // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce
- // memory footprint.
- // TODO: better name
+
+ /**
+ * The types of strandedness for synthetic reads
+ */
+ public enum StrandType {
+ POSITIVE,
+ NEGATIVE,
+ STRANDLESS
+ }
+
+ // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint.
private static class SingleBaseInfo {
byte baseIndexOrdinal; // enum BaseIndex.ordinal
- byte count;
+ int count;
byte qual;
byte insertionQual;
byte deletionQual;
- SingleBaseInfo(byte baseIndexOrdinal, byte count, byte qual, byte insertionQual, byte deletionQual) {
+ SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) {
this.baseIndexOrdinal = baseIndexOrdinal;
this.count = count;
this.qual = qual;
@@ -124,8 +130,7 @@ public class SyntheticRead {
private final ObjectArrayList basesCountsQuals;
- private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
- private String readTag;
+ private double mappingQuality;
// Information to produce a GATKSAMRecord
private SAMFileHeader header;
@@ -135,7 +140,7 @@ public class SyntheticRead {
private String readName;
private int refStart;
private boolean hasIndelQualities = false;
- private boolean isNegativeStrand = false;
+ private StrandType strandType = StrandType.STRANDLESS;
/**
* Full initialization of the running consensus if you have all the information and are ready to
@@ -147,14 +152,12 @@ public class SyntheticRead {
* @param contigIndex the read's contig index
* @param readName the read's name
* @param refStart the alignment start (reference based)
- * @param readTag the reduce reads tag for the synthetic read
*/
- public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
+ public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) {
final int initialCapacity = 10000;
basesCountsQuals = new ObjectArrayList(initialCapacity);
mappingQuality = 0.0;
- this.readTag = readTag;
this.header = header;
this.readGroupRecord = readGroupRecord;
this.contig = contig;
@@ -162,24 +165,7 @@ public class SyntheticRead {
this.readName = readName;
this.refStart = refStart;
this.hasIndelQualities = hasIndelQualities;
- this.isNegativeStrand = isNegativeRead;
- }
-
- public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
- basesCountsQuals = new ObjectArrayList(bases.size());
- for (int i = 0; i < bases.size(); ++i) {
- basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i)));
- }
- this.mappingQuality = mappingQuality;
- this.readTag = readTag;
- this.header = header;
- this.readGroupRecord = readGroupRecord;
- this.contig = contig;
- this.contigIndex = contigIndex;
- this.readName = readName;
- this.refStart = refStart;
- this.hasIndelQualities = hasIndelQualities;
- this.isNegativeStrand = isNegativeRead;
+ this.strandType = strandType;
}
/**
@@ -190,7 +176,7 @@ public class SyntheticRead {
* @param count number of reads with this base
*/
@Requires("count <= Byte.MAX_VALUE")
- public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
+ public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) {
basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual));
this.mappingQuality += mappingQuality;
}
@@ -220,15 +206,18 @@ public class SyntheticRead {
read.setReferenceIndex(contigIndex);
read.setReadPairedFlag(false);
read.setReadUnmappedFlag(false);
- read.setReadNegativeStrandFlag(isNegativeStrand);
- read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
+ if ( strandType != StrandType.STRANDLESS ) {
+ read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line
+ read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE);
+ }
+ read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
read.setAlignmentStart(refStart);
read.setReadName(readName);
read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
read.setReadBases(convertReadBases());
read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size()));
read.setReadGroup(readGroupRecord);
- read.setAttribute(readTag, convertBaseCounts());
+ read.setReducedReadCountsTag(convertBaseCounts());
if (hasIndelQualities) {
read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
@@ -278,22 +267,14 @@ public class SyntheticRead {
});
}
- protected byte [] convertBaseCounts() {
- byte[] countsArray = convertVariableGivenBases(new SingleBaseInfoIterator() {
- public Byte next() {
- return it.next().count;
- }
- });
-
- if (countsArray.length == 0)
- throw new ReviewedStingException("Reduced read has counts array of length 0");
-
- byte[] compressedCountsArray = new byte [countsArray.length];
- compressedCountsArray[0] = countsArray[0];
- for (int i = 1; i < countsArray.length; i++)
- compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);
-
- return compressedCountsArray;
+ protected int[] convertBaseCounts() {
+ int[] variableArray = new int[getReadLengthWithNoDeletions()];
+ int i = 0;
+ for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
+ if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte())
+ variableArray[i++] = singleBaseInfo.count;
+ }
+ return variableArray;
}
private byte [] convertReadBases() {
@@ -369,7 +350,6 @@ public class SyntheticRead {
variableArray[i++] = count;
}
return variableArray;
-
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
similarity index 97%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
index 37e82a90c..417da9d79 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
@@ -44,10 +44,11 @@
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
-package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+package org.broadinstitute.sting.gatk.walkers.diagnostics;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -55,6 +56,8 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
import java.io.PrintStream;
import java.util.ArrayList;
@@ -63,25 +66,25 @@ import java.util.LinkedList;
import java.util.Map;
/**
- * Simple walker to plot the coverage distribution per base.
+ * Simple walker to plot the coverage distribution per base
*
*
* Features of this walker:
- *
- includes a smart counting of uncovered bases without visiting the uncovered loci.
+ * - includes a smart counting of uncovered bases without visiting the uncovered loci
* - includes reads with deletions in the loci (optionally can be turned off)
*
*
- * Input
+ * Input
*
* The BAM file and an optional interval list (works for WGS as well)
*
*
- * Output
+ * Output
*
* A GATK Report with the coverage distribution per base
*
*
- * Examples
+ * Examples
*
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
@@ -91,15 +94,16 @@ import java.util.Map;
* -fd \
* -o report.grp
*
- * User: carneiro
- * Date: 1/27/13
- * Time: 11:16 AM
+ *
+ * @author carneiro
+ * @since 1/27/13
*/
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
public class BaseCoverageDistribution extends LocusWalker, Map>> {
/**
* The output GATK Report table
*/
- @Output(required = true, doc = "The output GATK Report table")
+ @Output(doc = "The output GATK Report table")
private PrintStream out;
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java
similarity index 93%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java
index b1a26b7a2..ad6023579 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java
@@ -44,7 +44,7 @@
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
-package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+package org.broadinstitute.sting.gatk.walkers.diagnostics;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
@@ -63,11 +63,36 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
import java.io.PrintStream;
+/**
+ * Outputs a list of intervals that are covered above a given threshold.
+ *
+ * The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.
+ *
+ * Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * List of covered (or uncovered) intervals.
+ *
+ *
+ * Example
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -T FindCoveredIntervals \
+ * -R ref.fasta \
+ * -I my_file.bam \
+ * -o output.list
+ *
+ *
+ */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.CONTIG)
@ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000)
public class FindCoveredIntervals extends ActiveRegionWalker {
- @Output(required = true)
+ @Output
private PrintStream out;
@Argument(fullName = "uncovered", shortName = "u", required = false, doc = "output intervals that fail the coverage threshold instead")
@@ -80,7 +105,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker {
// Look to see if the region has sufficient coverage
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
- int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
+ int depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage();
// note the linear probability scale
return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
new file mode 100644
index 000000000..dca83af44
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
@@ -0,0 +1,150 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
+
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Generic code for Diagnose Target Statistics
+ *
+ * @author Mauricio Carneiro
+ * @since 4/23/13
+ */
+abstract class AbstractStratification {
+
+ private long preComputedTotalCoverage = -1;
+ private Map statusTally = null;
+ protected ThresHolder thresholds;
+
+ /**
+ * Calculates the average "good" coverage of this sample. Good means "passes the base and
+ * mapping quality requirements.
+ *
+ * @return the average "good" coverage
+ */
+ public double averageCoverage(final int size) {
+ if (preComputedTotalCoverage < 0)
+ preComputedTotalCoverage = calculateTotalCoverage(getElements());
+ return (double) preComputedTotalCoverage / size;
+ }
+
+ /**
+ * Calculates the total "good" coverage of this sample. Good means "passes the base and
+ * mapping quality requirements.
+ *
+ * @return the total "good" coverage across the interval for this sample
+ */
+ public long getCoverage() {
+ if (preComputedTotalCoverage < 0)
+ preComputedTotalCoverage = calculateTotalCoverage(getElements());
+ return preComputedTotalCoverage;
+ }
+
+
+ /**
+ * This is how the extending class will calculate it's own total coverage
+ *
+ * @return the total coverage
+ */
+ private long calculateTotalCoverage(Iterable elements) {
+ long cov = 0;
+ for (AbstractStratification element : elements) {
+ cov += element.getCoverage();
+ }
+ return cov;
+ }
+
+ /**
+ * What are the list of elements in your class? For example:
+ *
+ * IntervalStatistics => List
+ * SampleStatistics => List
+ *
+ * @return the corresponding list of elements of the extending class
+ */
+ public abstract Iterable getElements();
+
+ /**
+ * Calculates the Callable statuses for the statistic as a whole (interval, sample or locus)
+ *
+ * @return the callable status(es) for the whole object
+ */
+ public abstract Iterable callableStatuses();
+
+
+ /**
+ * Tally up all the callable status of all the loci in this sample.
+ *
+ * @return a map of callable status and counts
+ */
+ public Map getStatusTally() {
+ if (statusTally == null) {
+ statusTally = new HashMap(CallableStatus.values().length);
+ for (AbstractStratification stats : getElements()) {
+ for (CallableStatus status : stats.callableStatuses()) {
+ statusTally.put(status, !statusTally.containsKey(status) ? 1 : statusTally.get(status) + 1);
+ }
+ }
+ }
+ return statusTally;
+ }
+
+ public static List queryStatus(List statList, AbstractStratification stratification) {
+ List output = new LinkedList();
+ for (Metric stat : statList) {
+ final CallableStatus status = stat.status(stratification);
+ if (status != null) {
+ output.add(status);
+ }
+ }
+ return output;
+ }
+
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java
similarity index 96%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java
index 4bc318b02..d38736f4f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java
@@ -44,7 +44,7 @@
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
-package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
/**
* Short one line description of the walker.
@@ -52,9 +52,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
* @author Mauricio Carneiro
* @since 2/1/12
*/
-public enum CallableStatus {
-
- REF_N("the reference base was an N, which is not considered callable the GATK"),
+enum CallableStatus {
PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
@@ -68,12 +66,7 @@ public enum CallableStatus {
BAD_MATE("the reads are not properly mated, suggesting mapping errors"),
- NO_READS("there are no reads contained in the interval"),
-
- //
- // Interval-level statuses
- //
- LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples");
+ NO_READS("there are no reads contained in the interval");
public final String description;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
similarity index 61%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
index 8b9b37c18..32f87b973 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
@@ -44,10 +44,10 @@
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
-package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
import net.sf.picard.util.PeekableIterator;
-import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@@ -56,13 +56,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
-import org.broadinstitute.sting.utils.help.HelpConstants;
-import org.broadinstitute.variant.vcf.VCFConstants;
-import org.broadinstitute.variant.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.classloader.PluginManager;
+import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
+import org.broadinstitute.variant.vcf.*;
import java.util.*;
@@ -75,7 +76,7 @@ import java.util.*;
*
*
*
- * Input
+ * Input
*
*
* - A reference file
@@ -84,12 +85,12 @@ import java.util.*;
*
*
*
- * Output
+ * Output
*
* A modified VCF detailing each interval by sample
*
*
- * Examples
+ * Examples
*
* java
* -jar GenomeAnalysisTK.jar
@@ -110,79 +111,52 @@ import java.util.*;
@PartitionBy(PartitionType.INTERVAL)
public class DiagnoseTargets extends LocusWalker {
- @Output(doc = "File to which variants should be written", required = true)
+ private static final String AVG_INTERVAL_DP_KEY = "IDP";
+
+ @Output(doc = "File to which interval statistics should be written")
private VariantContextWriter vcfWriter = null;
- @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false)
- private int minimumBaseQuality = 20;
+ @ArgumentCollection
+ private ThresHolder thresholds = new ThresHolder();
- @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false)
- private int minimumMappingQuality = 20;
+ private Map intervalMap = null; // maps each interval => statistics
+ private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
+ private Set