From dca7c7fa9cb183a57a08952147847ad97f334cd4 Mon Sep 17 00:00:00 2001 From: Kristian Cibulskis Date: Wed, 3 Oct 2012 16:25:34 -0400 Subject: [PATCH 01/18] initial cancer pipeline with mutations and partial indel support --- .../queue/extensions/cancer/MuTect.scala | 378 ++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala new file mode 100644 index 000000000..623d397d4 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala @@ -0,0 +1,378 @@ +package org.broadinstitute.sting.queue.extensions.cancer + +import java.io.File +import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.commandline.Gather +import org.broadinstitute.sting.commandline.Input +import org.broadinstitute.sting.commandline.Output +import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction +import org.broadinstitute.sting.queue.extensions.gatk.{LocusScatterFunction, TaggedFile} + +class MuTect extends org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK with ScatterGatherableFunction { + analysisName = "MuTect" + analysis_type = "MuTect" + scatterClass = classOf[LocusScatterFunction] + + /** used for debugging, basically exit as soon as we get the reads */ + @Argument(fullName="noop", shortName="", doc="used for debugging, basically exit as soon as we get the reads", required=false, exclusiveOf="", validation="") + var noop: Boolean = _ + + /** add many additional columns of statistics to the output file */ + @Argument(fullName="enable_extended_output", shortName="", doc="add many additional columns of statistics to the output file", required=false, exclusiveOf="", validation="") + var enable_extended_output: Boolean = _ + + /** used when running the caller on a normal (as if it were a tumor) to detect artifacts */ + @Argument(fullName="artifact_detection_mode", shortName="", doc="used when running the caller on a normal (as if it were a tumor) to detect artifacts", required=false, exclusiveOf="", validation="") + var artifact_detection_mode: Boolean = _ + + /** name to use for tumor in output files */ + @Argument(fullName="tumor_sample_name", shortName="", doc="name to use for tumor in output files", required=false, exclusiveOf="", validation="") + var tumor_sample_name: String = _ + + /** if the tumor bam contains multiple samples, only use read groups with SM equal to this value */ + @Argument(fullName="bam_tumor_sample_name", shortName="", doc="if the tumor bam contains multiple samples, only use read groups with SM equal to this value", required=false, exclusiveOf="", validation="") + var bam_tumor_sample_name: String = _ + + /** name to use for normal in output files */ + @Argument(fullName="normal_sample_name", shortName="", doc="name to use for normal in output files", required=false, exclusiveOf="", validation="") + var normal_sample_name: String = _ + + /** force output for each site */ + @Argument(fullName="force_output", shortName="", doc="force output for each site", required=false, exclusiveOf="", validation="") + var force_output: Boolean = _ + + /** force output for all alleles at each site */ + @Argument(fullName="force_alleles", shortName="", doc="force output for all alleles at each site", required=false, exclusiveOf="", validation="") + var force_alleles: Boolean = _ + + /** Initial LOD threshold for calling tumor variant */ + @Argument(fullName="initial_tumor_lod", shortName="", doc="Initial LOD threshold for calling tumor variant", required=false, exclusiveOf="", validation="") + var initial_tumor_lod: Option[Float] = None + + /** Format string for initial_tumor_lod */ + @Argument(fullName="initial_tumor_lodFormat", shortName="", doc="Format string for initial_tumor_lod", required=false, exclusiveOf="", validation="") + var initial_tumor_lodFormat: String = "%s" + + /** LOD threshold for calling tumor variant */ + @Argument(fullName="tumor_lod", shortName="", doc="LOD threshold for calling tumor variant", required=false, exclusiveOf="", validation="") + var tumor_lod: Option[Float] = None + + /** Format string for tumor_lod */ + @Argument(fullName="tumor_lodFormat", shortName="", doc="Format string for tumor_lod", required=false, exclusiveOf="", validation="") + var tumor_lodFormat: String = "%s" + + /** estimate of fraction (0-1) of physical contamination with other unrelated samples */ + @Argument(fullName="fraction_contamination", shortName="", doc="estimate of fraction (0-1) of physical contamination with other unrelated samples", required=false, exclusiveOf="", validation="") + var fraction_contamination: Option[Float] = None + + /** Format string for fraction_contamination */ + @Argument(fullName="fraction_contaminationFormat", shortName="", doc="Format string for fraction_contamination", required=false, exclusiveOf="", validation="") + var fraction_contaminationFormat: String = "%s" + + /** minimum fraction of cells which are presumed to have a mutation, used to handle non-clonality and contamination */ + @Argument(fullName="minimum_mutation_cell_fraction", shortName="", doc="minimum fraction of cells which are presumed to have a mutation, used to handle non-clonality and contamination", required=false, exclusiveOf="", validation="") + var minimum_mutation_cell_fraction: Option[Float] = None + + /** Format string for minimum_mutation_cell_fraction */ + @Argument(fullName="minimum_mutation_cell_fractionFormat", shortName="", doc="Format string for minimum_mutation_cell_fraction", required=false, exclusiveOf="", validation="") + var minimum_mutation_cell_fractionFormat: String = "%s" + + /** LOD threshold for calling normal non-germline */ + @Argument(fullName="normal_lod", shortName="", doc="LOD threshold for calling normal non-germline", required=false, exclusiveOf="", validation="") + var normal_lod: Option[Float] = None + + /** Format string for normal_lod */ + @Argument(fullName="normal_lodFormat", shortName="", doc="Format string for normal_lod", required=false, exclusiveOf="", validation="") + var normal_lodFormat: String = "%s" + + /** LOD threshold for calling normal non-variant */ + @Argument(fullName="normal_artifact_lod", shortName="", doc="LOD threshold for calling normal non-variant", required=false, exclusiveOf="", validation="") + var normal_artifact_lod: Option[Float] = None + + /** Format string for normal_artifact_lod */ + @Argument(fullName="normal_artifact_lodFormat", shortName="", doc="Format string for normal_artifact_lod", required=false, exclusiveOf="", validation="") + var normal_artifact_lodFormat: String = "%s" + + /** LOD threshold for calling strand bias */ + @Argument(fullName="strand_artifact_lod", shortName="", doc="LOD threshold for calling strand bias", required=false, exclusiveOf="", validation="") + var strand_artifact_lod: Option[Float] = None + + /** Format string for strand_artifact_lod */ + @Argument(fullName="strand_artifact_lodFormat", shortName="", doc="Format string for strand_artifact_lod", required=false, exclusiveOf="", validation="") + var strand_artifact_lodFormat: String = "%s" + + /** power threshold for calling strand bias */ + @Argument(fullName="strand_artifact_power_threshold", shortName="", doc="power threshold for calling strand bias", required=false, exclusiveOf="", validation="") + var strand_artifact_power_threshold: Option[Float] = None + + /** Format string for strand_artifact_power_threshold */ + @Argument(fullName="strand_artifact_power_thresholdFormat", shortName="", doc="Format string for strand_artifact_power_threshold", required=false, exclusiveOf="", validation="") + var strand_artifact_power_thresholdFormat: String = "%s" + + /** LOD threshold for calling normal non-variant at dbsnp sites */ + @Argument(fullName="dbsnp_normal_lod", shortName="", doc="LOD threshold for calling normal non-variant at dbsnp sites", required=false, exclusiveOf="", validation="") + var dbsnp_normal_lod: Option[Float] = None + + /** Format string for dbsnp_normal_lod */ + @Argument(fullName="dbsnp_normal_lodFormat", shortName="", doc="Format string for dbsnp_normal_lod", required=false, exclusiveOf="", validation="") + var dbsnp_normal_lodFormat: String = "%s" + + /** Power threshold for normal to determine germline vs variant */ + @Argument(fullName="somatic_classification_normal_power_threshold", shortName="", doc="Power threshold for normal to determine germline vs variant", required=false, exclusiveOf="", validation="") + var somatic_classification_normal_power_threshold: Option[Float] = None + + /** Format string for somatic_classification_normal_power_threshold */ + @Argument(fullName="somatic_classification_normal_power_thresholdFormat", shortName="", doc="Format string for somatic_classification_normal_power_threshold", required=false, exclusiveOf="", validation="") + var somatic_classification_normal_power_thresholdFormat: String = "%s" + + /** minimum allele fraction to be considered in normal, useful for normal sample contaminated with tumor */ + @Argument(fullName="minimum_normal_allele_fraction", shortName="", doc="minimum allele fraction to be considered in normal, useful for normal sample contaminated with tumor", required=false, exclusiveOf="", validation="") + var minimum_normal_allele_fraction: Option[Float] = None + + /** Format string for minimum_normal_allele_fraction */ + @Argument(fullName="minimum_normal_allele_fractionFormat", shortName="", doc="Format string for minimum_normal_allele_fraction", required=false, exclusiveOf="", validation="") + var minimum_normal_allele_fractionFormat: String = "%s" + + /** for computational efficiency, reject sites with allelic fraction below this threshold */ + @Argument(fullName="tumor_f_pretest", shortName="", doc="for computational efficiency, reject sites with allelic fraction below this threshold", required=false, exclusiveOf="", validation="") + var tumor_f_pretest: Option[Float] = None + + /** Format string for tumor_f_pretest */ + @Argument(fullName="tumor_f_pretestFormat", shortName="", doc="Format string for tumor_f_pretest", required=false, exclusiveOf="", validation="") + var tumor_f_pretestFormat: String = "%s" + + /** threshold for minimum base quality score */ + @Argument(fullName="min_qscore", shortName="", doc="threshold for minimum base quality score", required=false, exclusiveOf="", validation="") + var min_qscore: Option[Int] = None + + /** how many gapped events (ins/del) are allowed in proximity to this candidate */ + @Argument(fullName="gap_events_threshold", shortName="", doc="how many gapped events (ins/del) are allowed in proximity to this candidate", required=false, exclusiveOf="", validation="") + var gap_events_threshold: Option[Int] = None + + /** if this fraction or more of the bases in a read are soft/hard clipped, do not use this read for mutation calling */ + @Argument(fullName="heavily_clipped_read_fraction", shortName="", doc="if this fraction or more of the bases in a read are soft/hard clipped, do not use this read for mutation calling", required=false, exclusiveOf="", validation="") + var heavily_clipped_read_fraction: Option[Float] = None + + /** Format string for heavily_clipped_read_fraction */ + @Argument(fullName="heavily_clipped_read_fractionFormat", shortName="", doc="Format string for heavily_clipped_read_fraction", required=false, exclusiveOf="", validation="") + var heavily_clipped_read_fractionFormat: String = "%s" + + /** pvalue threshold for fishers exact test of clipping bias in mutant reads vs ref reads */ + @Argument(fullName="clipping_bias_pvalue_threshold", shortName="", doc="pvalue threshold for fishers exact test of clipping bias in mutant reads vs ref reads", required=false, exclusiveOf="", validation="") + var clipping_bias_pvalue_threshold: Option[Float] = None + + /** Format string for clipping_bias_pvalue_threshold */ + @Argument(fullName="clipping_bias_pvalue_thresholdFormat", shortName="", doc="Format string for clipping_bias_pvalue_threshold", required=false, exclusiveOf="", validation="") + var clipping_bias_pvalue_thresholdFormat: String = "%s" + + /** threshold for determining if there is relatedness between the alt and ref allele read piles */ + @Argument(fullName="fraction_mapq0_threshold", shortName="", doc="threshold for determining if there is relatedness between the alt and ref allele read piles", required=false, exclusiveOf="", validation="") + var fraction_mapq0_threshold: Option[Float] = None + + /** Format string for fraction_mapq0_threshold */ + @Argument(fullName="fraction_mapq0_thresholdFormat", shortName="", doc="Format string for fraction_mapq0_threshold", required=false, exclusiveOf="", validation="") + var fraction_mapq0_thresholdFormat: String = "%s" + + /** threshold for clustered read position artifact median */ + @Argument(fullName="pir_median_threshold", shortName="", doc="threshold for clustered read position artifact median", required=false, exclusiveOf="", validation="") + var pir_median_threshold: Option[Double] = None + + /** Format string for pir_median_threshold */ + @Argument(fullName="pir_median_thresholdFormat", shortName="", doc="Format string for pir_median_threshold", required=false, exclusiveOf="", validation="") + var pir_median_thresholdFormat: String = "%s" + + /** threshold for clustered read position artifact MAD */ + @Argument(fullName="pir_mad_threshold", shortName="", doc="threshold for clustered read position artifact MAD", required=false, exclusiveOf="", validation="") + var pir_mad_threshold: Option[Double] = None + + /** Format string for pir_mad_threshold */ + @Argument(fullName="pir_mad_thresholdFormat", shortName="", doc="Format string for pir_mad_threshold", required=false, exclusiveOf="", validation="") + var pir_mad_thresholdFormat: String = "%s" + + /** required minimum value for tumor alt allele maximum mapping quality score */ + @Argument(fullName="required_maximum_alt_allele_mapping_quality_score", shortName="", doc="required minimum value for tumor alt allele maximum mapping quality score", required=false, exclusiveOf="", validation="") + var required_maximum_alt_allele_mapping_quality_score: Option[Int] = None + + /** threshold for maximum alternate allele counts in normal */ + @Argument(fullName="max_alt_alleles_in_normal_count", shortName="", doc="threshold for maximum alternate allele counts in normal", required=false, exclusiveOf="", validation="") + var max_alt_alleles_in_normal_count: Option[Int] = None + + /** threshold for maximum alternate allele quality score sum in normal */ + @Argument(fullName="max_alt_alleles_in_normal_qscore_sum", shortName="", doc="threshold for maximum alternate allele quality score sum in normal", required=false, exclusiveOf="", validation="") + var max_alt_alleles_in_normal_qscore_sum: Option[Int] = None + + /** threshold for maximum alternate allele fraction in normal */ + @Argument(fullName="max_alt_allele_in_normal_fraction", shortName="", doc="threshold for maximum alternate allele fraction in normal", required=false, exclusiveOf="", validation="") + var max_alt_allele_in_normal_fraction: Option[Double] = None + + /** Format string for max_alt_allele_in_normal_fraction */ + @Argument(fullName="max_alt_allele_in_normal_fractionFormat", shortName="", doc="Format string for max_alt_allele_in_normal_fraction", required=false, exclusiveOf="", validation="") + var max_alt_allele_in_normal_fractionFormat: String = "%s" + + /** Phred scale quality score constant to use in power calculations */ + @Argument(fullName="power_constant_qscore", shortName="", doc="Phred scale quality score constant to use in power calculations", required=false, exclusiveOf="", validation="") + var power_constant_qscore: Option[Int] = None + + /** Absolute Copy Number Data, as defined by Absolute, to use in power calculations */ + @Argument(fullName="absolute_copy_number_data", shortName="", doc="Absolute Copy Number Data, as defined by Absolute, to use in power calculations", required=false, exclusiveOf="", validation="") + var absolute_copy_number_data: File = _ + + /** Allelic fraction constant to use in power calculations */ + @Argument(fullName="power_constant_af", shortName="", doc="Allelic fraction constant to use in power calculations", required=false, exclusiveOf="", validation="") + var power_constant_af: Option[Double] = None + + /** Format string for power_constant_af */ + @Argument(fullName="power_constant_afFormat", shortName="", doc="Format string for power_constant_af", required=false, exclusiveOf="", validation="") + var power_constant_afFormat: String = "%s" + + /** Call-stats output */ + @Output(fullName="out", shortName="o", doc="Call-stats output", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var out: File = _ + + /** + * Short name of out + * @return Short name of out + */ + def o = this.out + + /** + * Short name of out + * @param value Short name of out + */ + def o_=(value: File) { this.out = value } + + /** VCF file of DBSNP information */ + @Input(fullName="dbsnp", shortName="dbsnp", doc="VCF file of DBSNP information", required=false, exclusiveOf="", validation="") + var dbsnp: Seq[File] = Nil + + /** Dependencies on any indexes of dbsnp */ + @Input(fullName="dbsnpIndexes", shortName="", doc="Dependencies on any indexes of dbsnp", required=false, exclusiveOf="", validation="") + private var dbsnpIndexes: Seq[File] = Nil + + /** VCF file of COSMIC sites */ + @Input(fullName="cosmic", shortName="cosmic", doc="VCF file of COSMIC sites", required=false, exclusiveOf="", validation="") + var cosmic: Seq[File] = Nil + + /** Dependencies on any indexes of cosmic */ + @Input(fullName="cosmicIndexes", shortName="", doc="Dependencies on any indexes of cosmic", required=false, exclusiveOf="", validation="") + private var cosmicIndexes: Seq[File] = Nil + + /** VCF file of sites observed in normal */ + @Input(fullName="normal_panel", shortName="normal_panel", doc="VCF file of sites observed in normal", required=false, exclusiveOf="", validation="") + var normal_panel: Seq[File] = Nil + + /** Dependencies on any indexes of normal_panel */ + @Input(fullName="normal_panelIndexes", shortName="", doc="Dependencies on any indexes of normal_panel", required=false, exclusiveOf="", validation="") + private var normal_panelIndexes: Seq[File] = Nil + + /** write out coverage in WIGGLE format to this file */ + @Output(fullName="coverage_file", shortName="cov", doc="write out coverage in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var coverage_file: File = _ + + /** + * Short name of coverage_file + * @return Short name of coverage_file + */ + def cov = this.coverage_file + + /** + * Short name of coverage_file + * @param value Short name of coverage_file + */ + def cov_=(value: File) { this.coverage_file = value } + + /** write out 20x of Q20 coverage in WIGGLE format to this file */ + @Output(fullName="coverage_20_q20_file", shortName="cov_q20", doc="write out 20x of Q20 coverage in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var coverage_20_q20_file: File = _ + + /** + * Short name of coverage_20_q20_file + * @return Short name of coverage_20_q20_file + */ + def cov_q20 = this.coverage_20_q20_file + + /** + * Short name of coverage_20_q20_file + * @param value Short name of coverage_20_q20_file + */ + def cov_q20_=(value: File) { this.coverage_20_q20_file = value } + + /** write out power in WIGGLE format to this file */ + @Output(fullName="power_file", shortName="pow", doc="write out power in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var power_file: File = _ + + /** + * Short name of power_file + * @return Short name of power_file + */ + def pow = this.power_file + + /** + * Short name of power_file + * @param value Short name of power_file + */ + def pow_=(value: File) { this.power_file = value } + + /** write out tumor read depth in WIGGLE format to this file */ + @Output(fullName="tumor_depth_file", shortName="tdf", doc="write out tumor read depth in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var tumor_depth_file: File = _ + + /** + * Short name of tumor_depth_file + * @return Short name of tumor_depth_file + */ + def tdf = this.tumor_depth_file + + /** + * Short name of tumor_depth_file + * @param value Short name of tumor_depth_file + */ + def tdf_=(value: File) { this.tumor_depth_file = value } + + /** write out normal read depth in WIGGLE format to this file */ + @Output(fullName="normal_depth_file", shortName="ndf", doc="write out normal read depth in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var normal_depth_file: File = _ + + /** + * Short name of normal_depth_file + * @return Short name of normal_depth_file + */ + def ndf = this.normal_depth_file + + /** + * Short name of normal_depth_file + * @param value Short name of normal_depth_file + */ + def ndf_=(value: File) { this.normal_depth_file = value } + + /** if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up. */ + @Argument(fullName="filter_mismatching_base_and_quals", shortName="filterMBQ", doc="if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required=false, exclusiveOf="", validation="") + var filter_mismatching_base_and_quals: Boolean = _ + + /** + * Short name of filter_mismatching_base_and_quals + * @return Short name of filter_mismatching_base_and_quals + */ + def filterMBQ = this.filter_mismatching_base_and_quals + + /** + * Short name of filter_mismatching_base_and_quals + * @param value Short name of filter_mismatching_base_and_quals + */ + def filterMBQ_=(value: Boolean) { this.filter_mismatching_base_and_quals = value } + + override def freezeFieldValues() { + super.freezeFieldValues() + dbsnpIndexes ++= dbsnp.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) + cosmicIndexes ++= cosmic.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) + normal_panelIndexes ++= normal_panel.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) + } + + override def commandLine = super.commandLine + conditional(noop, "--noop", escape=true, format="%s") + conditional(enable_extended_output, "--enable_extended_output", escape=true, format="%s") + conditional(artifact_detection_mode, "--artifact_detection_mode", escape=true, format="%s") + optional("--tumor_sample_name", tumor_sample_name, spaceSeparated=true, escape=true, format="%s") + optional("--bam_tumor_sample_name", bam_tumor_sample_name, spaceSeparated=true, escape=true, format="%s") + optional("--normal_sample_name", normal_sample_name, spaceSeparated=true, escape=true, format="%s") + conditional(force_output, "--force_output", escape=true, format="%s") + conditional(force_alleles, "--force_alleles", escape=true, format="%s") + optional("--initial_tumor_lod", initial_tumor_lod, spaceSeparated=true, escape=true, format=initial_tumor_lodFormat) + optional("--tumor_lod", tumor_lod, spaceSeparated=true, escape=true, format=tumor_lodFormat) + optional("--fraction_contamination", fraction_contamination, spaceSeparated=true, escape=true, format=fraction_contaminationFormat) + optional("--minimum_mutation_cell_fraction", minimum_mutation_cell_fraction, spaceSeparated=true, escape=true, format=minimum_mutation_cell_fractionFormat) + optional("--normal_lod", normal_lod, spaceSeparated=true, escape=true, format=normal_lodFormat) + optional("--normal_artifact_lod", normal_artifact_lod, spaceSeparated=true, escape=true, format=normal_artifact_lodFormat) + optional("--strand_artifact_lod", strand_artifact_lod, spaceSeparated=true, escape=true, format=strand_artifact_lodFormat) + optional("--strand_artifact_power_threshold", strand_artifact_power_threshold, spaceSeparated=true, escape=true, format=strand_artifact_power_thresholdFormat) + optional("--dbsnp_normal_lod", dbsnp_normal_lod, spaceSeparated=true, escape=true, format=dbsnp_normal_lodFormat) + optional("--somatic_classification_normal_power_threshold", somatic_classification_normal_power_threshold, spaceSeparated=true, escape=true, format=somatic_classification_normal_power_thresholdFormat) + optional("--minimum_normal_allele_fraction", minimum_normal_allele_fraction, spaceSeparated=true, escape=true, format=minimum_normal_allele_fractionFormat) + optional("--tumor_f_pretest", tumor_f_pretest, spaceSeparated=true, escape=true, format=tumor_f_pretestFormat) + optional("--min_qscore", min_qscore, spaceSeparated=true, escape=true, format="%s") + optional("--gap_events_threshold", gap_events_threshold, spaceSeparated=true, escape=true, format="%s") + optional("--heavily_clipped_read_fraction", heavily_clipped_read_fraction, spaceSeparated=true, escape=true, format=heavily_clipped_read_fractionFormat) + optional("--clipping_bias_pvalue_threshold", clipping_bias_pvalue_threshold, spaceSeparated=true, escape=true, format=clipping_bias_pvalue_thresholdFormat) + optional("--fraction_mapq0_threshold", fraction_mapq0_threshold, spaceSeparated=true, escape=true, format=fraction_mapq0_thresholdFormat) + optional("--pir_median_threshold", pir_median_threshold, spaceSeparated=true, escape=true, format=pir_median_thresholdFormat) + optional("--pir_mad_threshold", pir_mad_threshold, spaceSeparated=true, escape=true, format=pir_mad_thresholdFormat) + optional("--required_maximum_alt_allele_mapping_quality_score", required_maximum_alt_allele_mapping_quality_score, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_alleles_in_normal_count", max_alt_alleles_in_normal_count, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_alleles_in_normal_qscore_sum", max_alt_alleles_in_normal_qscore_sum, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_allele_in_normal_fraction", max_alt_allele_in_normal_fraction, spaceSeparated=true, escape=true, format=max_alt_allele_in_normal_fractionFormat) + optional("--power_constant_qscore", power_constant_qscore, spaceSeparated=true, escape=true, format="%s") + optional("--absolute_copy_number_data", absolute_copy_number_data, spaceSeparated=true, escape=true, format="%s") + optional("--power_constant_af", power_constant_af, spaceSeparated=true, escape=true, format=power_constant_afFormat) + optional("-o", out, spaceSeparated=true, escape=true, format="%s") + repeat("-dbsnp", dbsnp, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + repeat("-cosmic", cosmic, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + repeat("-normal_panel", normal_panel, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + optional("-cov", coverage_file, spaceSeparated=true, escape=true, format="%s") + optional("-cov_q20", coverage_20_q20_file, spaceSeparated=true, escape=true, format="%s") + optional("-pow", power_file, spaceSeparated=true, escape=true, format="%s") + optional("-tdf", tumor_depth_file, spaceSeparated=true, escape=true, format="%s") + optional("-ndf", normal_depth_file, spaceSeparated=true, escape=true, format="%s") + conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape=true, format="%s") +} From 3ffba77656606f4378daf0474b18794741b4423f Mon Sep 17 00:00:00 2001 From: Scott Frazer Date: Thu, 4 Oct 2012 11:37:54 -0400 Subject: [PATCH 02/18] Revert "initial cancer pipeline with mutations and partial indel support" This reverts commit 4a2e5b1fcc3ad53dbb26d43eed1220b0257e9901. --- .../queue/extensions/cancer/MuTect.scala | 378 ------------------ 1 file changed, 378 deletions(-) delete mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala deleted file mode 100644 index 623d397d4..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala +++ /dev/null @@ -1,378 +0,0 @@ -package org.broadinstitute.sting.queue.extensions.cancer - -import java.io.File -import org.broadinstitute.sting.commandline.Argument -import org.broadinstitute.sting.commandline.Gather -import org.broadinstitute.sting.commandline.Input -import org.broadinstitute.sting.commandline.Output -import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction -import org.broadinstitute.sting.queue.extensions.gatk.{LocusScatterFunction, TaggedFile} - -class MuTect extends org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK with ScatterGatherableFunction { - analysisName = "MuTect" - analysis_type = "MuTect" - scatterClass = classOf[LocusScatterFunction] - - /** used for debugging, basically exit as soon as we get the reads */ - @Argument(fullName="noop", shortName="", doc="used for debugging, basically exit as soon as we get the reads", required=false, exclusiveOf="", validation="") - var noop: Boolean = _ - - /** add many additional columns of statistics to the output file */ - @Argument(fullName="enable_extended_output", shortName="", doc="add many additional columns of statistics to the output file", required=false, exclusiveOf="", validation="") - var enable_extended_output: Boolean = _ - - /** used when running the caller on a normal (as if it were a tumor) to detect artifacts */ - @Argument(fullName="artifact_detection_mode", shortName="", doc="used when running the caller on a normal (as if it were a tumor) to detect artifacts", required=false, exclusiveOf="", validation="") - var artifact_detection_mode: Boolean = _ - - /** name to use for tumor in output files */ - @Argument(fullName="tumor_sample_name", shortName="", doc="name to use for tumor in output files", required=false, exclusiveOf="", validation="") - var tumor_sample_name: String = _ - - /** if the tumor bam contains multiple samples, only use read groups with SM equal to this value */ - @Argument(fullName="bam_tumor_sample_name", shortName="", doc="if the tumor bam contains multiple samples, only use read groups with SM equal to this value", required=false, exclusiveOf="", validation="") - var bam_tumor_sample_name: String = _ - - /** name to use for normal in output files */ - @Argument(fullName="normal_sample_name", shortName="", doc="name to use for normal in output files", required=false, exclusiveOf="", validation="") - var normal_sample_name: String = _ - - /** force output for each site */ - @Argument(fullName="force_output", shortName="", doc="force output for each site", required=false, exclusiveOf="", validation="") - var force_output: Boolean = _ - - /** force output for all alleles at each site */ - @Argument(fullName="force_alleles", shortName="", doc="force output for all alleles at each site", required=false, exclusiveOf="", validation="") - var force_alleles: Boolean = _ - - /** Initial LOD threshold for calling tumor variant */ - @Argument(fullName="initial_tumor_lod", shortName="", doc="Initial LOD threshold for calling tumor variant", required=false, exclusiveOf="", validation="") - var initial_tumor_lod: Option[Float] = None - - /** Format string for initial_tumor_lod */ - @Argument(fullName="initial_tumor_lodFormat", shortName="", doc="Format string for initial_tumor_lod", required=false, exclusiveOf="", validation="") - var initial_tumor_lodFormat: String = "%s" - - /** LOD threshold for calling tumor variant */ - @Argument(fullName="tumor_lod", shortName="", doc="LOD threshold for calling tumor variant", required=false, exclusiveOf="", validation="") - var tumor_lod: Option[Float] = None - - /** Format string for tumor_lod */ - @Argument(fullName="tumor_lodFormat", shortName="", doc="Format string for tumor_lod", required=false, exclusiveOf="", validation="") - var tumor_lodFormat: String = "%s" - - /** estimate of fraction (0-1) of physical contamination with other unrelated samples */ - @Argument(fullName="fraction_contamination", shortName="", doc="estimate of fraction (0-1) of physical contamination with other unrelated samples", required=false, exclusiveOf="", validation="") - var fraction_contamination: Option[Float] = None - - /** Format string for fraction_contamination */ - @Argument(fullName="fraction_contaminationFormat", shortName="", doc="Format string for fraction_contamination", required=false, exclusiveOf="", validation="") - var fraction_contaminationFormat: String = "%s" - - /** minimum fraction of cells which are presumed to have a mutation, used to handle non-clonality and contamination */ - @Argument(fullName="minimum_mutation_cell_fraction", shortName="", doc="minimum fraction of cells which are presumed to have a mutation, used to handle non-clonality and contamination", required=false, exclusiveOf="", validation="") - var minimum_mutation_cell_fraction: Option[Float] = None - - /** Format string for minimum_mutation_cell_fraction */ - @Argument(fullName="minimum_mutation_cell_fractionFormat", shortName="", doc="Format string for minimum_mutation_cell_fraction", required=false, exclusiveOf="", validation="") - var minimum_mutation_cell_fractionFormat: String = "%s" - - /** LOD threshold for calling normal non-germline */ - @Argument(fullName="normal_lod", shortName="", doc="LOD threshold for calling normal non-germline", required=false, exclusiveOf="", validation="") - var normal_lod: Option[Float] = None - - /** Format string for normal_lod */ - @Argument(fullName="normal_lodFormat", shortName="", doc="Format string for normal_lod", required=false, exclusiveOf="", validation="") - var normal_lodFormat: String = "%s" - - /** LOD threshold for calling normal non-variant */ - @Argument(fullName="normal_artifact_lod", shortName="", doc="LOD threshold for calling normal non-variant", required=false, exclusiveOf="", validation="") - var normal_artifact_lod: Option[Float] = None - - /** Format string for normal_artifact_lod */ - @Argument(fullName="normal_artifact_lodFormat", shortName="", doc="Format string for normal_artifact_lod", required=false, exclusiveOf="", validation="") - var normal_artifact_lodFormat: String = "%s" - - /** LOD threshold for calling strand bias */ - @Argument(fullName="strand_artifact_lod", shortName="", doc="LOD threshold for calling strand bias", required=false, exclusiveOf="", validation="") - var strand_artifact_lod: Option[Float] = None - - /** Format string for strand_artifact_lod */ - @Argument(fullName="strand_artifact_lodFormat", shortName="", doc="Format string for strand_artifact_lod", required=false, exclusiveOf="", validation="") - var strand_artifact_lodFormat: String = "%s" - - /** power threshold for calling strand bias */ - @Argument(fullName="strand_artifact_power_threshold", shortName="", doc="power threshold for calling strand bias", required=false, exclusiveOf="", validation="") - var strand_artifact_power_threshold: Option[Float] = None - - /** Format string for strand_artifact_power_threshold */ - @Argument(fullName="strand_artifact_power_thresholdFormat", shortName="", doc="Format string for strand_artifact_power_threshold", required=false, exclusiveOf="", validation="") - var strand_artifact_power_thresholdFormat: String = "%s" - - /** LOD threshold for calling normal non-variant at dbsnp sites */ - @Argument(fullName="dbsnp_normal_lod", shortName="", doc="LOD threshold for calling normal non-variant at dbsnp sites", required=false, exclusiveOf="", validation="") - var dbsnp_normal_lod: Option[Float] = None - - /** Format string for dbsnp_normal_lod */ - @Argument(fullName="dbsnp_normal_lodFormat", shortName="", doc="Format string for dbsnp_normal_lod", required=false, exclusiveOf="", validation="") - var dbsnp_normal_lodFormat: String = "%s" - - /** Power threshold for normal to determine germline vs variant */ - @Argument(fullName="somatic_classification_normal_power_threshold", shortName="", doc="Power threshold for normal to determine germline vs variant", required=false, exclusiveOf="", validation="") - var somatic_classification_normal_power_threshold: Option[Float] = None - - /** Format string for somatic_classification_normal_power_threshold */ - @Argument(fullName="somatic_classification_normal_power_thresholdFormat", shortName="", doc="Format string for somatic_classification_normal_power_threshold", required=false, exclusiveOf="", validation="") - var somatic_classification_normal_power_thresholdFormat: String = "%s" - - /** minimum allele fraction to be considered in normal, useful for normal sample contaminated with tumor */ - @Argument(fullName="minimum_normal_allele_fraction", shortName="", doc="minimum allele fraction to be considered in normal, useful for normal sample contaminated with tumor", required=false, exclusiveOf="", validation="") - var minimum_normal_allele_fraction: Option[Float] = None - - /** Format string for minimum_normal_allele_fraction */ - @Argument(fullName="minimum_normal_allele_fractionFormat", shortName="", doc="Format string for minimum_normal_allele_fraction", required=false, exclusiveOf="", validation="") - var minimum_normal_allele_fractionFormat: String = "%s" - - /** for computational efficiency, reject sites with allelic fraction below this threshold */ - @Argument(fullName="tumor_f_pretest", shortName="", doc="for computational efficiency, reject sites with allelic fraction below this threshold", required=false, exclusiveOf="", validation="") - var tumor_f_pretest: Option[Float] = None - - /** Format string for tumor_f_pretest */ - @Argument(fullName="tumor_f_pretestFormat", shortName="", doc="Format string for tumor_f_pretest", required=false, exclusiveOf="", validation="") - var tumor_f_pretestFormat: String = "%s" - - /** threshold for minimum base quality score */ - @Argument(fullName="min_qscore", shortName="", doc="threshold for minimum base quality score", required=false, exclusiveOf="", validation="") - var min_qscore: Option[Int] = None - - /** how many gapped events (ins/del) are allowed in proximity to this candidate */ - @Argument(fullName="gap_events_threshold", shortName="", doc="how many gapped events (ins/del) are allowed in proximity to this candidate", required=false, exclusiveOf="", validation="") - var gap_events_threshold: Option[Int] = None - - /** if this fraction or more of the bases in a read are soft/hard clipped, do not use this read for mutation calling */ - @Argument(fullName="heavily_clipped_read_fraction", shortName="", doc="if this fraction or more of the bases in a read are soft/hard clipped, do not use this read for mutation calling", required=false, exclusiveOf="", validation="") - var heavily_clipped_read_fraction: Option[Float] = None - - /** Format string for heavily_clipped_read_fraction */ - @Argument(fullName="heavily_clipped_read_fractionFormat", shortName="", doc="Format string for heavily_clipped_read_fraction", required=false, exclusiveOf="", validation="") - var heavily_clipped_read_fractionFormat: String = "%s" - - /** pvalue threshold for fishers exact test of clipping bias in mutant reads vs ref reads */ - @Argument(fullName="clipping_bias_pvalue_threshold", shortName="", doc="pvalue threshold for fishers exact test of clipping bias in mutant reads vs ref reads", required=false, exclusiveOf="", validation="") - var clipping_bias_pvalue_threshold: Option[Float] = None - - /** Format string for clipping_bias_pvalue_threshold */ - @Argument(fullName="clipping_bias_pvalue_thresholdFormat", shortName="", doc="Format string for clipping_bias_pvalue_threshold", required=false, exclusiveOf="", validation="") - var clipping_bias_pvalue_thresholdFormat: String = "%s" - - /** threshold for determining if there is relatedness between the alt and ref allele read piles */ - @Argument(fullName="fraction_mapq0_threshold", shortName="", doc="threshold for determining if there is relatedness between the alt and ref allele read piles", required=false, exclusiveOf="", validation="") - var fraction_mapq0_threshold: Option[Float] = None - - /** Format string for fraction_mapq0_threshold */ - @Argument(fullName="fraction_mapq0_thresholdFormat", shortName="", doc="Format string for fraction_mapq0_threshold", required=false, exclusiveOf="", validation="") - var fraction_mapq0_thresholdFormat: String = "%s" - - /** threshold for clustered read position artifact median */ - @Argument(fullName="pir_median_threshold", shortName="", doc="threshold for clustered read position artifact median", required=false, exclusiveOf="", validation="") - var pir_median_threshold: Option[Double] = None - - /** Format string for pir_median_threshold */ - @Argument(fullName="pir_median_thresholdFormat", shortName="", doc="Format string for pir_median_threshold", required=false, exclusiveOf="", validation="") - var pir_median_thresholdFormat: String = "%s" - - /** threshold for clustered read position artifact MAD */ - @Argument(fullName="pir_mad_threshold", shortName="", doc="threshold for clustered read position artifact MAD", required=false, exclusiveOf="", validation="") - var pir_mad_threshold: Option[Double] = None - - /** Format string for pir_mad_threshold */ - @Argument(fullName="pir_mad_thresholdFormat", shortName="", doc="Format string for pir_mad_threshold", required=false, exclusiveOf="", validation="") - var pir_mad_thresholdFormat: String = "%s" - - /** required minimum value for tumor alt allele maximum mapping quality score */ - @Argument(fullName="required_maximum_alt_allele_mapping_quality_score", shortName="", doc="required minimum value for tumor alt allele maximum mapping quality score", required=false, exclusiveOf="", validation="") - var required_maximum_alt_allele_mapping_quality_score: Option[Int] = None - - /** threshold for maximum alternate allele counts in normal */ - @Argument(fullName="max_alt_alleles_in_normal_count", shortName="", doc="threshold for maximum alternate allele counts in normal", required=false, exclusiveOf="", validation="") - var max_alt_alleles_in_normal_count: Option[Int] = None - - /** threshold for maximum alternate allele quality score sum in normal */ - @Argument(fullName="max_alt_alleles_in_normal_qscore_sum", shortName="", doc="threshold for maximum alternate allele quality score sum in normal", required=false, exclusiveOf="", validation="") - var max_alt_alleles_in_normal_qscore_sum: Option[Int] = None - - /** threshold for maximum alternate allele fraction in normal */ - @Argument(fullName="max_alt_allele_in_normal_fraction", shortName="", doc="threshold for maximum alternate allele fraction in normal", required=false, exclusiveOf="", validation="") - var max_alt_allele_in_normal_fraction: Option[Double] = None - - /** Format string for max_alt_allele_in_normal_fraction */ - @Argument(fullName="max_alt_allele_in_normal_fractionFormat", shortName="", doc="Format string for max_alt_allele_in_normal_fraction", required=false, exclusiveOf="", validation="") - var max_alt_allele_in_normal_fractionFormat: String = "%s" - - /** Phred scale quality score constant to use in power calculations */ - @Argument(fullName="power_constant_qscore", shortName="", doc="Phred scale quality score constant to use in power calculations", required=false, exclusiveOf="", validation="") - var power_constant_qscore: Option[Int] = None - - /** Absolute Copy Number Data, as defined by Absolute, to use in power calculations */ - @Argument(fullName="absolute_copy_number_data", shortName="", doc="Absolute Copy Number Data, as defined by Absolute, to use in power calculations", required=false, exclusiveOf="", validation="") - var absolute_copy_number_data: File = _ - - /** Allelic fraction constant to use in power calculations */ - @Argument(fullName="power_constant_af", shortName="", doc="Allelic fraction constant to use in power calculations", required=false, exclusiveOf="", validation="") - var power_constant_af: Option[Double] = None - - /** Format string for power_constant_af */ - @Argument(fullName="power_constant_afFormat", shortName="", doc="Format string for power_constant_af", required=false, exclusiveOf="", validation="") - var power_constant_afFormat: String = "%s" - - /** Call-stats output */ - @Output(fullName="out", shortName="o", doc="Call-stats output", required=false, exclusiveOf="", validation="") - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var out: File = _ - - /** - * Short name of out - * @return Short name of out - */ - def o = this.out - - /** - * Short name of out - * @param value Short name of out - */ - def o_=(value: File) { this.out = value } - - /** VCF file of DBSNP information */ - @Input(fullName="dbsnp", shortName="dbsnp", doc="VCF file of DBSNP information", required=false, exclusiveOf="", validation="") - var dbsnp: Seq[File] = Nil - - /** Dependencies on any indexes of dbsnp */ - @Input(fullName="dbsnpIndexes", shortName="", doc="Dependencies on any indexes of dbsnp", required=false, exclusiveOf="", validation="") - private var dbsnpIndexes: Seq[File] = Nil - - /** VCF file of COSMIC sites */ - @Input(fullName="cosmic", shortName="cosmic", doc="VCF file of COSMIC sites", required=false, exclusiveOf="", validation="") - var cosmic: Seq[File] = Nil - - /** Dependencies on any indexes of cosmic */ - @Input(fullName="cosmicIndexes", shortName="", doc="Dependencies on any indexes of cosmic", required=false, exclusiveOf="", validation="") - private var cosmicIndexes: Seq[File] = Nil - - /** VCF file of sites observed in normal */ - @Input(fullName="normal_panel", shortName="normal_panel", doc="VCF file of sites observed in normal", required=false, exclusiveOf="", validation="") - var normal_panel: Seq[File] = Nil - - /** Dependencies on any indexes of normal_panel */ - @Input(fullName="normal_panelIndexes", shortName="", doc="Dependencies on any indexes of normal_panel", required=false, exclusiveOf="", validation="") - private var normal_panelIndexes: Seq[File] = Nil - - /** write out coverage in WIGGLE format to this file */ - @Output(fullName="coverage_file", shortName="cov", doc="write out coverage in WIGGLE format to this file", required=false, exclusiveOf="", validation="") - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var coverage_file: File = _ - - /** - * Short name of coverage_file - * @return Short name of coverage_file - */ - def cov = this.coverage_file - - /** - * Short name of coverage_file - * @param value Short name of coverage_file - */ - def cov_=(value: File) { this.coverage_file = value } - - /** write out 20x of Q20 coverage in WIGGLE format to this file */ - @Output(fullName="coverage_20_q20_file", shortName="cov_q20", doc="write out 20x of Q20 coverage in WIGGLE format to this file", required=false, exclusiveOf="", validation="") - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var coverage_20_q20_file: File = _ - - /** - * Short name of coverage_20_q20_file - * @return Short name of coverage_20_q20_file - */ - def cov_q20 = this.coverage_20_q20_file - - /** - * Short name of coverage_20_q20_file - * @param value Short name of coverage_20_q20_file - */ - def cov_q20_=(value: File) { this.coverage_20_q20_file = value } - - /** write out power in WIGGLE format to this file */ - @Output(fullName="power_file", shortName="pow", doc="write out power in WIGGLE format to this file", required=false, exclusiveOf="", validation="") - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var power_file: File = _ - - /** - * Short name of power_file - * @return Short name of power_file - */ - def pow = this.power_file - - /** - * Short name of power_file - * @param value Short name of power_file - */ - def pow_=(value: File) { this.power_file = value } - - /** write out tumor read depth in WIGGLE format to this file */ - @Output(fullName="tumor_depth_file", shortName="tdf", doc="write out tumor read depth in WIGGLE format to this file", required=false, exclusiveOf="", validation="") - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var tumor_depth_file: File = _ - - /** - * Short name of tumor_depth_file - * @return Short name of tumor_depth_file - */ - def tdf = this.tumor_depth_file - - /** - * Short name of tumor_depth_file - * @param value Short name of tumor_depth_file - */ - def tdf_=(value: File) { this.tumor_depth_file = value } - - /** write out normal read depth in WIGGLE format to this file */ - @Output(fullName="normal_depth_file", shortName="ndf", doc="write out normal read depth in WIGGLE format to this file", required=false, exclusiveOf="", validation="") - @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) - var normal_depth_file: File = _ - - /** - * Short name of normal_depth_file - * @return Short name of normal_depth_file - */ - def ndf = this.normal_depth_file - - /** - * Short name of normal_depth_file - * @param value Short name of normal_depth_file - */ - def ndf_=(value: File) { this.normal_depth_file = value } - - /** if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up. */ - @Argument(fullName="filter_mismatching_base_and_quals", shortName="filterMBQ", doc="if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required=false, exclusiveOf="", validation="") - var filter_mismatching_base_and_quals: Boolean = _ - - /** - * Short name of filter_mismatching_base_and_quals - * @return Short name of filter_mismatching_base_and_quals - */ - def filterMBQ = this.filter_mismatching_base_and_quals - - /** - * Short name of filter_mismatching_base_and_quals - * @param value Short name of filter_mismatching_base_and_quals - */ - def filterMBQ_=(value: Boolean) { this.filter_mismatching_base_and_quals = value } - - override def freezeFieldValues() { - super.freezeFieldValues() - dbsnpIndexes ++= dbsnp.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) - cosmicIndexes ++= cosmic.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) - normal_panelIndexes ++= normal_panel.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) - } - - override def commandLine = super.commandLine + conditional(noop, "--noop", escape=true, format="%s") + conditional(enable_extended_output, "--enable_extended_output", escape=true, format="%s") + conditional(artifact_detection_mode, "--artifact_detection_mode", escape=true, format="%s") + optional("--tumor_sample_name", tumor_sample_name, spaceSeparated=true, escape=true, format="%s") + optional("--bam_tumor_sample_name", bam_tumor_sample_name, spaceSeparated=true, escape=true, format="%s") + optional("--normal_sample_name", normal_sample_name, spaceSeparated=true, escape=true, format="%s") + conditional(force_output, "--force_output", escape=true, format="%s") + conditional(force_alleles, "--force_alleles", escape=true, format="%s") + optional("--initial_tumor_lod", initial_tumor_lod, spaceSeparated=true, escape=true, format=initial_tumor_lodFormat) + optional("--tumor_lod", tumor_lod, spaceSeparated=true, escape=true, format=tumor_lodFormat) + optional("--fraction_contamination", fraction_contamination, spaceSeparated=true, escape=true, format=fraction_contaminationFormat) + optional("--minimum_mutation_cell_fraction", minimum_mutation_cell_fraction, spaceSeparated=true, escape=true, format=minimum_mutation_cell_fractionFormat) + optional("--normal_lod", normal_lod, spaceSeparated=true, escape=true, format=normal_lodFormat) + optional("--normal_artifact_lod", normal_artifact_lod, spaceSeparated=true, escape=true, format=normal_artifact_lodFormat) + optional("--strand_artifact_lod", strand_artifact_lod, spaceSeparated=true, escape=true, format=strand_artifact_lodFormat) + optional("--strand_artifact_power_threshold", strand_artifact_power_threshold, spaceSeparated=true, escape=true, format=strand_artifact_power_thresholdFormat) + optional("--dbsnp_normal_lod", dbsnp_normal_lod, spaceSeparated=true, escape=true, format=dbsnp_normal_lodFormat) + optional("--somatic_classification_normal_power_threshold", somatic_classification_normal_power_threshold, spaceSeparated=true, escape=true, format=somatic_classification_normal_power_thresholdFormat) + optional("--minimum_normal_allele_fraction", minimum_normal_allele_fraction, spaceSeparated=true, escape=true, format=minimum_normal_allele_fractionFormat) + optional("--tumor_f_pretest", tumor_f_pretest, spaceSeparated=true, escape=true, format=tumor_f_pretestFormat) + optional("--min_qscore", min_qscore, spaceSeparated=true, escape=true, format="%s") + optional("--gap_events_threshold", gap_events_threshold, spaceSeparated=true, escape=true, format="%s") + optional("--heavily_clipped_read_fraction", heavily_clipped_read_fraction, spaceSeparated=true, escape=true, format=heavily_clipped_read_fractionFormat) + optional("--clipping_bias_pvalue_threshold", clipping_bias_pvalue_threshold, spaceSeparated=true, escape=true, format=clipping_bias_pvalue_thresholdFormat) + optional("--fraction_mapq0_threshold", fraction_mapq0_threshold, spaceSeparated=true, escape=true, format=fraction_mapq0_thresholdFormat) + optional("--pir_median_threshold", pir_median_threshold, spaceSeparated=true, escape=true, format=pir_median_thresholdFormat) + optional("--pir_mad_threshold", pir_mad_threshold, spaceSeparated=true, escape=true, format=pir_mad_thresholdFormat) + optional("--required_maximum_alt_allele_mapping_quality_score", required_maximum_alt_allele_mapping_quality_score, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_alleles_in_normal_count", max_alt_alleles_in_normal_count, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_alleles_in_normal_qscore_sum", max_alt_alleles_in_normal_qscore_sum, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_allele_in_normal_fraction", max_alt_allele_in_normal_fraction, spaceSeparated=true, escape=true, format=max_alt_allele_in_normal_fractionFormat) + optional("--power_constant_qscore", power_constant_qscore, spaceSeparated=true, escape=true, format="%s") + optional("--absolute_copy_number_data", absolute_copy_number_data, spaceSeparated=true, escape=true, format="%s") + optional("--power_constant_af", power_constant_af, spaceSeparated=true, escape=true, format=power_constant_afFormat) + optional("-o", out, spaceSeparated=true, escape=true, format="%s") + repeat("-dbsnp", dbsnp, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + repeat("-cosmic", cosmic, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + repeat("-normal_panel", normal_panel, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + optional("-cov", coverage_file, spaceSeparated=true, escape=true, format="%s") + optional("-cov_q20", coverage_20_q20_file, spaceSeparated=true, escape=true, format="%s") + optional("-pow", power_file, spaceSeparated=true, escape=true, format="%s") + optional("-tdf", tumor_depth_file, spaceSeparated=true, escape=true, format="%s") + optional("-ndf", normal_depth_file, spaceSeparated=true, escape=true, format="%s") + conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape=true, format="%s") -} From d27ae67bb65e9fe4edd9cecbe6961405d55bb5d4 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 15 Oct 2012 22:30:01 -0400 Subject: [PATCH 04/18] Updating the multi-step UG integration test. --- .../walkers/haplotypecaller/LikelihoodCalculationEngine.java | 4 +++- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 072f81db9..1eba893fc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -348,12 +348,14 @@ public class LikelihoodCalculationEngine { } } } + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele a : call.getFirst().getAlleles() ) + for( final Allele a : call.getFirst().getAlleles() ) { likelihoodMap.add(read, a, 0.0); + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e2ea47d9c..df088a4ad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -349,7 +349,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("beee9457d7cea42006ac45400db5e873")); + Arrays.asList("f3ff7fe0f15f31eadd726c711d6bf3de")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } From 31be8076649ae0d2682c9f8fb0d899c59ae6bb00 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 15 Oct 2012 22:31:52 -0400 Subject: [PATCH 05/18] Updating missed integration test. --- .../walkers/haplotypecaller/LikelihoodCalculationEngine.java | 2 +- .../sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 1eba893fc..14c1cd59d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -348,7 +348,7 @@ public class LikelihoodCalculationEngine { } } } - + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index 24ffde9c3..93099f82a 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "78ce72d8f9d029313f5f2ceb02bb9822", nt, nct }); + tests.add(new Object[]{ "BOTH", "f8184e336dec7632408aa9afa98e6914", nt, nct }); } return tests.toArray(new Object[][]{}); From d1511e38ad0fdb10ab0b9f2146331e7a43099b6b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 15 Oct 2012 16:06:11 -0400 Subject: [PATCH 06/18] Removing ConstrainedAFCalculationModel; AFCalcPerformanceTest -- Superceded by IndependentAFCalc -- Added support to read in an ExactModelLog in AFCalcPerformanceTest and run the independent alleles model on it. -- A few misc. bug fixes discovered during running the performance test --- .../afcalc/AFCalcPerformanceTest.java | 36 ++++- .../genotyper/afcalc/AFCalcUnitTest.java | 8 +- ...ConstrainedAFCalculationModelUnitTest.java | 124 ------------------ .../gatk/walkers/genotyper/afcalc/AFCalc.java | 92 +++++++++++-- .../genotyper/afcalc/AFCalcFactory.java | 4 - .../genotyper/afcalc/AFCalcResult.java | 13 +- .../afcalc/ConstrainedDiploidExactAFCalc.java | 107 --------------- .../sting/utils/variantcontext/Genotype.java | 14 +- 8 files changed, 135 insertions(+), 263 deletions(-) delete mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index 68b068509..f019d8f8e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -5,15 +5,16 @@ import org.apache.log4j.Logger; import org.apache.log4j.TTCCLayout; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import java.io.FileOutputStream; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -190,7 +191,8 @@ public class AFCalcPerformanceTest { public enum Operation { ANALYZE, - SINGLE + SINGLE, + EXACT_LOG } public static void main(final String[] args) throws Exception { final TTCCLayout layout = new TTCCLayout(); @@ -204,10 +206,37 @@ public class AFCalcPerformanceTest { switch ( op ) { case ANALYZE: analyze(args); break; case SINGLE: profileBig(args); break; + case EXACT_LOG: exactLog(args); break; default: throw new IllegalAccessException("unknown operation " + op); } } + private static void exactLog(final String[] args) throws Exception { + final File ref = new File(args[1]); + final File exactLogFile = new File(args[2]); + final List startsToUse = new LinkedList(); + + for ( int i = 3; i < args.length; i++ ) + startsToUse.add(Integer.valueOf(args[i])); + + final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref); + final GenomeLocParser parser = new GenomeLocParser(seq); + final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile)); + final List loggedCalls = AFCalc.readExactLog(reader, startsToUse, parser); + + for ( final AFCalc.ExactCall call : loggedCalls ) { + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1, + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcTestBuilder.PriorType.human); + final SimpleTimer timer = new SimpleTimer().start(); + final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors()); + call.newNanoTime = timer.getElapsedTimeNano(); + call.newPNonRef = result.getLog10PosteriorOfAFGT0(); + logger.info(call); + logger.info("\t\t" + result); + } + } + private static void profileBig(final String[] args) throws Exception { final int nSamples = Integer.valueOf(args[1]); final int ac = Integer.valueOf(args[2]); @@ -234,7 +263,6 @@ public class AFCalcPerformanceTest { final List modelParams = Arrays.asList( new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10), // new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10), - new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100), new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000)); final boolean ONLY_HUMAN_PRIORS = false; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 4ac4692d7..e2407989b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -372,16 +372,14 @@ public class AFCalcUnitTest extends BaseTest { final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat; - final List constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED); - final double TOLERANCE = 0.5; final List initialPNonRefData = Arrays.asList( // bi-allelic sites new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true), - new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel), - new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel), - new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel), + new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false), + new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false), + new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false), new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true), new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true), diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java deleted file mode 100644 index 31ec28af4..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java +++ /dev/null @@ -1,124 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -public class ConstrainedAFCalculationModelUnitTest extends BaseTest { - static Allele A = Allele.create("A", true); - static Allele C = Allele.create("C"); - static Allele G = Allele.create("G"); - - protected static Genotype makePL(final List expectedGT, int ... pls) { - return AFCalcUnitTest.makePL(expectedGT, pls); - } - - @DataProvider(name = "MaxACsToVisit") - public Object[][] makeMaxACsToVisit() { - List tests = new ArrayList(); - - final int nSamples = 10; - - for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) { - final int nChrom = (nSamples - nNonInformative) * 2; - for ( int i = 0; i < nChrom; i++ ) { - // bi-allelic - tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED}); - - // tri-allelic - for ( int j = 0; j < (nChrom - i); j++) - tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "MaxACsToVisit") - public void testMaxACsToVisit(final int nSamples, final List requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) { - final int nAlts = requestedACs.size(); - final AFCalcTestBuilder testBuilder - = new AFCalcTestBuilder(nSamples, nAlts, modelType, - AFCalcTestBuilder.PriorType.human); - - final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100); - final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc); - - testExpectedACs(vc, maxACsToVisit); - } - - private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) { - // this is necessary because cannot ensure that the tester gives us back the - // requested ACs due to rounding errors - final List ACs = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - ACs.add(vc.getCalledChrCount(a)); - - for ( int i = 0; i < maxACsToVisit.length; i++ ) { - Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i); - } - } - - @DataProvider(name = "MaxACsGenotypes") - public Object[][] makeMaxACsForGenotype() { - List tests = new ArrayList(); - - final List AA = Arrays.asList(A, A); - final List AC = Arrays.asList(A, C); - final List CC = Arrays.asList(C, C); - final List AG = Arrays.asList(A, G); - final List GG = Arrays.asList(G, G); - final List CG = Arrays.asList(C, G); - - final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make(); - final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); - - tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)}); - tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)}); - tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)}); - - // make sure non-informative => 0 - tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)}); - tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)}); - - // multi-allelics - tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)}); - tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)}); - tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)}); - - // deal with non-informatives third alleles - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)}); - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)}); - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)}); - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "MaxACsGenotypes") - private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) { - final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make(); - - final AFCalcTestBuilder testBuilder - = new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED, - AFCalcTestBuilder.PriorType.human); - - final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc); - - testExpectedACs(vc, maxACsToVisit); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index f87084a9c..8cb6bcabc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -28,19 +28,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; +import java.io.*; +import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedList; import java.util.List; @@ -218,7 +213,84 @@ public abstract class AFCalc implements Cloneable { callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); } + public static class ExactCall { + final VariantContext vc; + final long origNanoTime; + long newNanoTime = -1; + final double origPNonRef; + double newPNonRef = -1; + + public ExactCall(VariantContext vc, long origNanoTime, double origPNonRef) { + this.vc = vc; + this.origNanoTime = origNanoTime; + this.origPNonRef = origPNonRef; + } + + @Override + public String toString() { + return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s new.pNonRef=%.2f new.runtime=%s", + vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), + origPNonRef, + new AutoFormattingTime(origNanoTime / 1e9).toString(), + newPNonRef, + newNanoTime == -1 ? "not.run" : new AutoFormattingTime(newNanoTime / 1e9).toString()); + } + } + + public static List readExactLog(final BufferedReader reader, final List startsToKeep, GenomeLocParser parser) throws IOException { + List calls = new LinkedList(); + + // skip the header line + reader.readLine(); + + while ( true ) { + final VariantContextBuilder builder = new VariantContextBuilder(); + final List alleles = new ArrayList(); + final List genotypes = new ArrayList(); + long runtimeNano = -1; + + GenomeLoc currentLoc = null; + while ( true ) { + final String line = reader.readLine(); + if ( line == null ) + return calls; + + final String[] parts = line.split("\t"); + final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); + final String variable = parts[1]; + final String key = parts[2]; + final String value = parts[3]; + + if ( currentLoc == null ) + currentLoc = lineLoc; + + if ( variable.equals("log10PosteriorOfAFzero") ) { + if ( startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart()) ) { + builder.alleles(alleles); + final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; + builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); + builder.genotypes(genotypes); + calls.add(new ExactCall(builder.make(), runtimeNano, Double.valueOf(value))); + } + break; + } else if ( variable.equals("allele") ) { + final boolean isRef = key.equals("0"); + alleles.add(Allele.create(value, isRef)); + } else if ( variable.equals("PL") ) { + final GenotypeBuilder gb = new GenotypeBuilder(key); + gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); + genotypes.add(gb.make()); + } else if ( variable.equals("runtime.nano") ) { + runtimeNano = Long.valueOf(value); + } else { + // nothing to do + } + } + } + } + public AFCalcResultTracker getResultTracker() { return resultTracker; } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 981100eaa..7d67815cf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -30,10 +30,6 @@ public class AFCalcFactory { /** reference implementation of multi-allelic EXACT model */ EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), - /** expt. implementation */ - @Deprecated - EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1), - /** expt. implementation -- for testing only */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index da7fd08ce..a42795593 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -31,10 +31,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Describes the results of the AFCalc @@ -217,6 +214,14 @@ public class AFCalcResult { return log10PriorsOfAC[AF1p]; } + @Override + public String toString() { + final List byAllele = new LinkedList(); + for ( final Allele a : getAllelesUsedInGenotyping() ) + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele)); + } + /** * Are we sufficiently confidence in being non-ref that the site is considered polymorphic? * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java deleted file mode 100644 index 36d53ceaa..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -@Deprecated -public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc { - protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - } - - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - final int[] maxACsToConsider = computeMaxACs(vc); - resultTracker.setAClimits(maxACsToConsider); - return new StateTracker(maxACsToConsider); - } - - /** - * Computes the maximum ACs we need to consider for each alt allele - * - * Walks over the genotypes in VC, and computes for each alt allele the maximum - * AC we need to consider in that alt allele dimension. Does the calculation - * based on the PLs in each genotype g, choosing to update the max AC for the - * alt alleles corresponding to that PL. Only takes the first lowest PL, - * if there are multiple genotype configurations with the same PL value. It - * takes values in the order of the alt alleles. - * - * @param vc the variant context we will compute max alt alleles for - * @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the - * first alt allele. - */ - @Ensures("result != null") - protected final int[] computeMaxACs(final VariantContext vc) { - final int[] maxACs = new int[vc.getNAlleles()-1]; - - for ( final Genotype g : vc.getGenotypes() ) - updateMaxACs(g, maxACs); - - return maxACs; - } - - /** - * Update the maximum achievable allele counts in maxAC according to the PLs in g - * - * Selects the maximum genotype configuration from the PLs in g, and updates - * the maxAC for this configure. For example, if the lowest PL is for 0/1, updates - * the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for - * many number of alt alleles (determined by length of maxACs). - * - * If the max PL occurs at 0/0, updates nothing - * Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have - * the same PL value, then updates the first one. - * - * Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL, - * then only first one (1) will be updated - * - * @param g the genotype to update - * @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele) - */ - @Requires({ - "g != null", - "maxACs != null", - "goodMaxACs(maxACs)"}) - private void updateMaxACs(final Genotype g, final int[] maxACs) { - final int[] PLs = g.getLikelihoods().getAsPLs(); - - int minPLi = 0; - int minPL = PLs[0]; - - for ( int i = 0; i < PLs.length; i++ ) { - if ( PLs[i] < minPL ) { - minPL = PLs[i]; - minPLi = i; - } - } - - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi); - updateMaxACs(maxACs, pair.alleleIndex1); - updateMaxACs(maxACs, pair.alleleIndex2); - } - - /** - * Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref) - * - * If alleleI == 0 => doesn't update anything - * else maxACs[alleleI - 1]++ - * - * @param maxACs array of max alt allele ACs - * @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles. - */ - @Requires({ - "alleleI >= 0", - "(alleleI - 1) < maxACs.length", - "goodMaxACs(maxACs)"}) - private void updateMaxACs(final int[] maxACs, final int alleleI) { - if ( alleleI > 0 ) - maxACs[alleleI-1]++; - } - - private static boolean goodMaxACs(final int[] maxACs) { - return MathUtils.sum(maxACs) >= 0; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index aa801c2b9..67e80cf3c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -298,12 +298,16 @@ public abstract class Genotype implements Comparable { * @return true if all samples PLs are equal and == 0 */ public boolean isNonInformative() { - for ( final int PL : getPL() ) { - if ( PL != 0 ) - return false; - } + if ( getPL() == null ) + return true; + else { + for ( final int PL : getPL() ) { + if ( PL != 0 ) + return false; + } - return true; + return true; + } } /** From 6bd0ec8de40f74548b2c39a8767ceaf100d30a0f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 15 Oct 2012 20:23:07 -0400 Subject: [PATCH 07/18] Proper likelihoods and posterior probability of the joint allele frequency in IndependentAllelesDiploidExactAFCalc -- Fixed minor numerical stability issue in AFCalcResult -- posterior of joint A/B/C is 1 - (1 - P(D | AF_b == 0)) x (1 - P(D | AF_c == 0)), for any number of alleles, obviously. Now computes the joint posterior like this, and then back-calculates likelihoods that generate these posteriors given the priors. It's not pretty but it's the best thing to do --- .../genotyper/afcalc/AFCalcResult.java | 4 +- .../IndependentAllelesDiploidExactAFCalc.java | 155 +++++++++--------- 2 files changed, 81 insertions(+), 78 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index a42795593..209a21d82 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -275,11 +275,11 @@ public class AFCalcResult { // necessary because the posteriors may be so skewed that the log-space normalized value isn't // good, so we have to try both log-space normalization as well as the real-space normalization if the // result isn't good - final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true); + final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) ) return logNormalized; else - return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); + return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 0ac964c9c..ba1e5bbb8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -32,13 +32,74 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; -public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { +/** + * Computes the conditional bi-allelic exact results + * + * Suppose vc contains 2 alt allele: A* with C and T. This function first computes: + * + * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] + * + * it then computes the conditional probability on AF_c == 0: + * + * (2) P(D | AF_t > 0 && AF_c == 0) + * + * Thinking about this visually, we have the following likelihood matrix where each cell is + * the P(D | AF_c == i && AF_t == j): + * + * 0 AF_c > 0 + * ----------------- + * 0 | | + * |--|------------- + * a | | + * f | | + * _ | | + * t | | + * > | | + * 0 | | + * + * What we really want to know how + * + * (3) P(D | AF_c == 0 & AF_t == 0) + * + * compares with + * + * (4) P(D | AF_c > 0 || AF_t > 0) + * + * This is effectively asking for the value in the upper left vs. the sum of all cells. + * + * This class implements the conditional likelihoods summation for any number of alt + * alleles, where each alt allele has its EXACT probability of segregating calculated by + * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * After each allele has its probability calculated we compute the joint posterior + * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i + * prior for the ith least likely allele. + */ + public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + /** + * The min. confidence of an allele to be included in the joint posterior. + */ + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20); + private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + /** + * Sorts AFCalcResults by their posteriors of AF > 0, so the + */ private final static class CompareAFCalcResultsByPNonRef implements Comparator { @Override public int compare(AFCalcResult o1, AFCalcResult o2) { - return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0()); + return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); } } @@ -68,76 +129,13 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc); final List independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors); final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); - return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors); + return combineIndependentPNonRefs(vc, withMultiAllelicPriors); } - protected final double computelog10LikelihoodOfRef(final VariantContext vc) { - // this value just the likelihood of AF == 0 in the special constrained multi-allelic calculation - final List allGLs = getGLs(vc.getGenotypes(), false); - double log10LikelihoodOfHomRef = 0.0; - - // TODO -- can be easily optimized (currently looks at all GLs via getGLs) - for ( int i = 0; i < allGLs.size(); i++ ) { - final double[] GLs = allGLs.get(i); - log10LikelihoodOfHomRef += GLs[0]; - //log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0]; - } - - return log10LikelihoodOfHomRef; - } /** - * Computes the conditional bi-allelic exact results - * - * Suppose vc contains 2 alt allele: A* with C and T. This function first computes: - * - * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] - * - * it then computes the conditional probability on AF_c == 0: - * - * (2) P(D | AF_t > 0 && AF_c == 0) - * - * Thinking about this visually, we have the following likelihood matrix where each cell is - * the P(D | AF_c == i && AF_t == j): - * - * 0 AF_c > 0 - * ----------------- - * 0 | | - * |--|------------- - * a | | - * f | | - * _ | | - * t | | - * > | | - * 0 | | - * - * What we really want to know how - * - * (3) P(D | AF_c == 0 & AF_t == 0) - * - * compares with - * - * (4) P(D | AF_c > 0 || AF_t > 0) - * - * This is effectively asking for the value in the upper left vs. the sum of all cells. - * - * The quantity (1) is the same of all cells except those with AF_c == 0, while (2) is the - * band at the top where AF_t > 0 and AF_c == 0 - * - * So (4) is actually (1) + (2). - * - * (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating - * - * (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything] - * (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use] - * - * This function implements the conditional likelihoods summation for any number of alt - * alleles (not just the tri-allelic case), where each subsequent variant context is - * further constrained such that each already considered allele x has AF_x == 0 in the - * compute. * * @param vc * @param log10AlleleFrequencyPriors @@ -294,7 +292,6 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, - final double log10LikelihoodsOfACEq0, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; final int nAltAlleles = sortedResultsWithThetaNPriors.size(); @@ -302,8 +299,8 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { final double[] log10PriorsOfAC = new double[2]; final Map log10pNonRefByAllele = new HashMap(nAltAlleles); - // this value is a sum in real space so we need to store values to sum up later - final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles]; + // this value is a sum in log space + double log10PosteriorOfACEq0Sum = 0.0; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); @@ -316,7 +313,8 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); // the AF > 0 case requires us to store the normalized likelihood for later summation - log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0(); + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) + log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); @@ -325,14 +323,19 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { nEvaluations += sortedResultWithThetaNPriors.nEvaluations; } - // the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles - final double[] log10LikelihoodsOfAC = new double[]{ - log10LikelihoodsOfACEq0, - MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)}; + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, + // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently + // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 + final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + final double[] log10LikelihoodsOfAC = new double[] { + // L + prior = posterior => L = poster - prior + log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], + log10PosteriorOfACGt0 - log10PriorsOfAC[1] + }; return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized log10pNonRefByAllele, sortedResultsWithThetaNPriors); } } From 9b0ab4e9417c4cb61d56acac0a0cf0de1add078a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 15 Oct 2012 20:58:55 -0400 Subject: [PATCH 08/18] Cleanup IndependentAllelesDiploidExactAFCalc -- Remove capability to truncate genotype likelihoods -- this wasn't used and isn't really useful after all -- Added lots of contracts and docs, still more to come. -- Created a default makeMaxLikelihoods function in ReferenceDiploidExactAFCalc and DiploidExactAFCalc so that multiple subclasses don't just do the default thing -- Generalized reference bi-allelic model in IndependentAllelesDiploidExactAFCalc so that in principle any bi-allelic reference model can be used. --- ...dentAllelesDiploidExactAFCalcUnitTest.java | 48 ++----- .../genotyper/afcalc/DiploidExactAFCalc.java | 4 +- .../IndependentAllelesDiploidExactAFCalc.java | 118 ++++++++++-------- .../afcalc/ReferenceDiploidExactAFCalc.java | 6 - 4 files changed, 77 insertions(+), 99 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index ed164f245..391c99990 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -55,48 +55,14 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @DataProvider(name = "TestCombineGLsWithDrops") - public Object[][] makeTestCombineGLsWithDrops() { - List tests = new ArrayList(); - - final Set noDrops = Collections.emptySet(); - final Set drop1 = Collections.singleton(1); - final Set drop2 = Collections.singleton(2); - - // AA AB BB AC BC CC - // drop1 (B): AA AC CC - // drop2 (C): AA AB BB - tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops}); - tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops}); - tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2}); - tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1}); - - tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops}); - tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops}); - tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2}); - tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1}); - - tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops}); - tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops}); - tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2}); - tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1}); - - return tests.toArray(new Object[][]{}); - } - private Genotype makePL(final int ... PLs) { return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); } @Test(enabled = true, dataProvider = "TestCombineGLs") private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { - testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.emptySet()); - } - - @Test(enabled = true, dataProvider = "TestCombineGLsWithDrops") - private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set allelesToDrop) { final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts); + final Genotype combined = calc.combineGLs(testg, altIndex, nAlts); Assert.assertEquals(combined.getPL(), expected.getPL(), "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); @@ -120,22 +86,21 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); final Genotype gACcombined = makePL(0, 2, 5); + final Genotype gACcombined2 = makePL(0, 1, 4); final Genotype gAGcombined = makePL(0, 4, 9); - final Genotype gACdropped = makePL(0, 1, 2); - final Genotype gAGdropped = makePL(0, 3, 5); // biallelic tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); // tri-allelic - tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())}); - tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())}); + tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())}); + tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())}); return tests.toArray(new Object[][]{}); } - @Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts") + @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts") private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); @@ -148,7 +113,8 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); for ( int j = 0; j < actual.getNSamples(); j++ ) - Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL()); + Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(), + "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL())); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 8b12dff61..49915c515 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -36,7 +36,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker); + protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { + return new StateTracker(); + } @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index ba1e5bbb8..c0edee291 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -91,6 +91,7 @@ import java.util.*; */ private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20); + private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); /** @@ -105,19 +106,23 @@ import java.util.*; private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); - final ReferenceDiploidExactAFCalc refModel; + /** + * The AFCalc model we are using to do the bi-allelic computation + */ + final AFCalc biAlleleExactModel; protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); - } - - @Override - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return refModel.makeMaxLikelihood(vc, resultTracker); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); } + /** + * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call + */ private static class MyAFCalcResult extends AFCalcResult { + /** + * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call + */ final List supporting; private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { @@ -129,58 +134,89 @@ import java.util.*; @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - final List independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors); + final List independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors); final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); return combineIndependentPNonRefs(vc, withMultiAllelicPriors); } /** + * Compute the conditional exact AFCalcResult for each allele in vc independently, returning + * the result of each, in order of the alt alleles in VC * - * @param vc - * @param log10AlleleFrequencyPriors - * @return + * @param vc the VariantContext we want to analyze + * @param log10AlleleFrequencyPriors the priors + * @return a list of the AFCalcResults for each bi-allelic sub context of vc */ - protected List computeAlleleConditionalExact(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + @Ensures("goodIndependentResult(vc, result)") + protected final List computeAlleleIndependentExact(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { final List results = new LinkedList(); for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { - final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); + final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); results.add(resultTracker); } return results; } - protected List makeAlleleConditionalContexts(final VariantContext vc) { + /** + * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results + */ + private static boolean goodIndependentResult(final VariantContext vc, final List results) { + if ( results.size() != vc.getNAlleles() - 1) return false; + for ( int i = 0; i < results.size(); i++ ) { + if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) + return false; + if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) + return false; + } + + return true; + } + + /** + * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order + * + * @param vc the variant context to split. Must have n.alt.alleles > 1 + * @return a bi-allelic variant context for each alt allele in vc + */ + @Requires({"vc != null", "vc.getNAlleles() > 1"}) + @Ensures("result.size() == vc.getNAlleles() - 1") + protected final List makeAlleleConditionalContexts(final VariantContext vc) { final int nAltAlleles = vc.getNAlleles() - 1; final List vcs = new LinkedList(); - final List afZeroAlleles = new LinkedList(); for ( int altI = 0; altI < nAltAlleles; altI++ ) { - final Allele altAllele = vc.getAlternateAllele(altI); - final List biallelic = Arrays.asList(vc.getReference(), altAllele); - vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1)); - //afZeroAlleles.add(altAllele); + vcs.add(biallelicCombinedGLs(vc, altI + 1)); } return vcs; } - protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List biallelic, final List afZeroAlleles, final int allele2) { + /** + * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex + * + * @param rootVC the root (potentially multi-allelic) variant context + * @param altAlleleIndex index of the alt allele, from 0 == first alt allele + * @return a bi-allelic variant context based on rootVC + */ + @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) + @Ensures({"result.isBiallelic()"}) + protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { if ( rootVC.isBiallelic() ) { - if ( ! afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles); return rootVC; } else { - final Set allelesToDiscard = new HashSet(rootVC.getAlleleIndices(afZeroAlleles)); final int nAlts = rootVC.getNAlleles() - 1; final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); for ( final Genotype g : rootVC.getGenotypes() ) - biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts)); + biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts)); final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); - vcb.alleles(biallelic); + final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); + vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); vcb.genotypes(biallelicGenotypes); return vcb.make(); } @@ -201,30 +237,16 @@ import java.util.*; * XB = AB + BC * BB = BB * - * Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard. This is - * useful in the case where you want to drop alleles (not combine them), such as above: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2) - * - * XX = AA (since X = A and C is dropped) - * XB = AB - * BB = BB - * - * This allows us to recover partial GLs the correspond to any allele in allelesToDiscard having strictly - * AF == 0. - * * @param original the original multi-allelic genotype * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 * @param nAlts the total number of alt alleles * @return a new biallelic genotype with appropriate PLs */ - @Requires({"original.hasLikelihoods()", "! allelesToDiscard.contains(altIndex)"}) + @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) - protected Genotype combineGLs(final Genotype original, final int altIndex, final Set allelesToDiscard, final int nAlts ) { + protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { if ( original.isNonInformative() ) - return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make(); + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); @@ -234,10 +256,6 @@ import java.util.*; for ( int index = 0; index < normalizedPr.length; index++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); - // just continue if we shouldn't include the pair because it's in the discard set - if ( discardAllelePair(pair, allelesToDiscard) ) - continue; - if ( pair.alleleIndex1 == altIndex ) { if ( pair.alleleIndex2 == altIndex ) // hom-alt case @@ -261,11 +279,7 @@ import java.util.*; return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); } - protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set allelesToDiscard) { - return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2); - } - - protected List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { final ArrayList sorted = new ArrayList(conditionalPNonRefResults); // sort the results, so the most likely allele is first @@ -289,6 +303,8 @@ import java.util.*; /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * + * TODO -- add more docs + * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java index 4de983508..b4e7b2ab1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -1,13 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } - - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); - } } From c74d7061fe389510187cf4aa362a0c6028d7591d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 16 Oct 2012 08:10:22 -0400 Subject: [PATCH 09/18] Added AFCalcResultUnitTest -- Ensures that the posteriors remain within reasonable ranges. Fixed bug where normalization of posteriors = {-1e30, 0.0} => {-100000, 0.0} which isn't good. Now tests ensure that the normalization process preserves log10 precision where possible -- Updated MathUtils to make this possible --- .../afcalc/AFCalcResultUnitTest.java | 77 +++++++++++++++++++ .../genotyper/afcalc/AFCalcUnitTest.java | 37 +++++---- .../genotyper/afcalc/AFCalcResult.java | 10 +-- .../broadinstitute/sting/utils/MathUtils.java | 8 +- 4 files changed, 103 insertions(+), 29 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java new file mode 100644 index 000000000..1070642e9 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class AFCalcResultUnitTest extends BaseTest { + private static class MyTest { + final double[] Ls, expectedPosteriors; + + private MyTest(double[] ls, double[] expectedPosteriors) { + Ls = ls; + this.expectedPosteriors = expectedPosteriors; + } + + @Override + public String toString() { + return "Ls [" + Utils.join(",", Ls) + "] expectedPosteriors [" + Utils.join(",", expectedPosteriors) + "]"; + } + } + + @DataProvider(name = "TestComputePosteriors") + public Object[][] makeTestCombineGLs() { + List tests = new ArrayList(); + + tests.add(new Object[]{new MyTest(log10Even, log10Even)}); + + for ( double L0 = -1e9; L0 < 0.0; L0 /= 10.0 ) { + for ( double L1 = -1e2; L1 < 0.0; L1 /= 100.0 ) { + final double[] input = new double[]{L0, L1}; + final double[] expected = MathUtils.normalizeFromLog10(input, true); + tests.add(new Object[]{new MyTest(input, expected)}); + } + } + + for ( double bigBadL = -1e50; bigBadL < -1e200; bigBadL *= 10 ) { + // test that a huge bad likelihood remains, even with a massive better result + for ( final double betterL : Arrays.asList(-1000.0, -100.0, -10.0, -1.0, -0.1, -0.01, -0.001, 0.0)) { + tests.add(new Object[]{new MyTest(new double[]{bigBadL, betterL}, new double[]{bigBadL, 0.0})}); + tests.add(new Object[]{new MyTest(new double[]{betterL, bigBadL}, new double[]{0.0, bigBadL})}); + } + } + + // test that a modest bad likelihood with an ~0.0 value doesn't get lost + for ( final double badL : Arrays.asList(-10000.0, -1000.0, -100.0, -10.0)) { + tests.add(new Object[]{new MyTest(new double[]{badL, -1e-9}, new double[]{badL, 0.0})}); + tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})}); + } + + return tests.toArray(new Object[][]{}); + } + + + final static double[] log10Even = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); + final static Allele C = Allele.create("C"); + final static List alleles = Arrays.asList(Allele.create("A", true), C); + + @Test(enabled = true, dataProvider = "TestComputePosteriors") + private void testComputingPosteriors(final MyTest data) { + final AFCalcResult result = new AFCalcResult(new int[]{0}, 1, alleles, data.Ls, log10Even, Collections.singletonMap(C, -1.0)); + + Assert.assertEquals(result.getLog10PosteriorOfAFEq0(), data.expectedPosteriors[0], 1e-3, "AF = 0 not expected"); + Assert.assertEquals(result.getLog10PosteriorOfAFGT0(), data.expectedPosteriors[1], 1e-3, "AF > 0 not expected"); + + final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()}; + Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index e2407989b..25df0f6d2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -446,7 +446,7 @@ public class AFCalcUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(enabled = true & ! DEBUG_ONLY, dataProvider = "Models") + @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models") public void testBiallelicPriors(final AFCalc model) { for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) { @@ -454,26 +454,29 @@ public class AFCalcUnitTest extends BaseTest { for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) { final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); - final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true); - GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); - final AFCalcResult resultTracker = cfg.execute(); - final int actualAC = resultTracker.getAlleleCountsOfMLE()[0]; + final double nonRefPrior = (1-refPrior) / 2; + final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior}), true); + if ( ! Double.isInfinite(priors[1]) ) { + GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); + final AFCalcResult resultTracker = cfg.execute(); + final int actualAC = resultTracker.getAlleleCountsOfMLE()[0]; - final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; - final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5); - final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior)); - final double log10NonRefPost = Math.log10(nonRefPost); + final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5); + final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior)); + final double log10NonRefPost = Math.log10(nonRefPost); - if ( ! Double.isInfinite(log10NonRefPost) ) - Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2); + if ( ! Double.isInfinite(log10NonRefPost) ) + Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2); - if ( nonRefPost >= 0.9 ) - Assert.assertTrue(resultTracker.isPolymorphic(C, -1)); + if ( nonRefPost >= 0.9 ) + Assert.assertTrue(resultTracker.isPolymorphic(C, -1)); - final int expectedMLEAC = 1; // the MLE is independent of the prior - Assert.assertEquals(actualAC, expectedMLEAC, - "actual AC with priors " + log10NonRefPrior + " not expected " - + expectedMLEAC + " priors " + Utils.join(",", priors)); + final int expectedMLEAC = 1; // the MLE is independent of the prior + Assert.assertEquals(actualAC, expectedMLEAC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedMLEAC + " priors " + Utils.join(",", priors)); + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index 209a21d82..c737416c5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -271,15 +271,7 @@ public class AFCalcResult { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i]; - - // necessary because the posteriors may be so skewed that the log-space normalized value isn't - // good, so we have to try both log-space normalization as well as the real-space normalization if the - // result isn't good - final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); - if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) ) - return logNormalized; - else - return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true); + return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index a1d6907a2..3740d5d7c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -596,7 +596,6 @@ public class MathUtils { if (keepInLogSpace) { for (int i = 0; i < array.length; i++) { array[i] -= maxValue; - array[i] = Math.max(array[i], LOG10_P_OF_ZERO); } return array; } @@ -613,8 +612,11 @@ public class MathUtils { sum += normalized[i]; for (int i = 0; i < array.length; i++) { double x = normalized[i] / sum; - if (takeLog10OfOutput) - x = Math.max(Math.log10(x), LOG10_P_OF_ZERO); + if (takeLog10OfOutput) { + x = Math.log10(x); + if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) + x = array[i] - maxValue; + } normalized[i] = x; } From 9bcefadd4efaf6094f51268d2113c3328d27a585 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 16 Oct 2012 13:30:09 -0400 Subject: [PATCH 10/18] Refactor ExactCallLogger into a separate class -- Update minor integration tests with NanoSchedule due to qual accuracy update --- .../afcalc/AFCalcPerformanceTest.java | 11 +- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 145 +-------------- .../genotyper/afcalc/ExactCallLogger.java | 166 ++++++++++++++++++ .../NanoSchedulerIntegrationTest.java | 2 +- 4 files changed, 180 insertions(+), 144 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index f019d8f8e..fab26c9d2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -18,11 +18,8 @@ import java.io.*; import java.util.*; /** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 10/2/12 - * Time: 10:25 AM - * To change this template use File | Settings | File Templates. + * A simple GATK utility (i.e, runs from command-line) for assessing the performance of + * the exact model */ public class AFCalcPerformanceTest { final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class); @@ -222,9 +219,9 @@ public class AFCalcPerformanceTest { final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref); final GenomeLocParser parser = new GenomeLocParser(seq); final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile)); - final List loggedCalls = AFCalc.readExactLog(reader, startsToUse, parser); + final List loggedCalls = ExactCallLogger.readExactLog(reader, startsToUse, parser); - for ( final AFCalc.ExactCall call : loggedCalls ) { + for ( final ExactCallLogger.ExactCall call : loggedCalls ) { final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1, AFCalcFactory.Calculation.EXACT_INDEPENDENT, AFCalcTestBuilder.PriorType.human); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 8cb6bcabc..07f88c9e3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -28,20 +28,17 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.*; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; +import java.io.File; import java.util.List; /** * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods - * */ public abstract class AFCalc implements Cloneable { private final static Logger defaultLogger = Logger.getLogger(AFCalc.class); @@ -53,8 +50,8 @@ public abstract class AFCalc implements Cloneable { protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private PrintStream callReport = null; private final AFCalcResultTracker resultTracker; + private ExactCallLogger exactCallLogger = null; protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); @@ -69,7 +66,7 @@ public abstract class AFCalc implements Cloneable { } public void enableProcessLog(final File exactCallsLog) { - initializeOutputFile(exactCallsLog); + exactCallLogger = new ExactCallLogger(exactCallsLog); } public void setLogger(Logger logger) { @@ -97,8 +94,8 @@ public abstract class AFCalc implements Cloneable { final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors); final long nanoTime = callTimer.getElapsedTimeNano(); - if ( callReport != null ) - printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, resultTracker.getLog10PosteriorOfAFzero()); + if ( exactCallLogger != null ) + exactCallLogger.printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result); return result; } @@ -165,130 +162,6 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - - // --------------------------------------------------------------------------- - // - // Print information about the call to the calls log - // - // --------------------------------------------------------------------------- - - private void initializeOutputFile(final File outputFile) { - try { - if (outputFile != null) { - callReport = new PrintStream( new FileOutputStream(outputFile) ); - callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(outputFile, e); - } - } - - private void printCallInfo(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final long runtimeNano, - final double log10PosteriorOfAFzero) { - printCallElement(vc, "type", "ignore", vc.getType()); - - int allelei = 0; - for ( final Allele a : vc.getAlleles() ) - printCallElement(vc, "allele", allelei++, a.getDisplayString()); - - for ( final Genotype g : vc.getGenotypes() ) - printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); - - for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ ) - printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); - - printCallElement(vc, "runtime.nano", "ignore", runtimeNano); - printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero); - - callReport.flush(); - } - - private void printCallElement(final VariantContext vc, - final Object variable, - final Object key, - final Object value) { - final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); - callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); - } - - public static class ExactCall { - final VariantContext vc; - final long origNanoTime; - long newNanoTime = -1; - final double origPNonRef; - double newPNonRef = -1; - - public ExactCall(VariantContext vc, long origNanoTime, double origPNonRef) { - this.vc = vc; - this.origNanoTime = origNanoTime; - this.origPNonRef = origPNonRef; - } - - @Override - public String toString() { - return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s new.pNonRef=%.2f new.runtime=%s", - vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), - origPNonRef, - new AutoFormattingTime(origNanoTime / 1e9).toString(), - newPNonRef, - newNanoTime == -1 ? "not.run" : new AutoFormattingTime(newNanoTime / 1e9).toString()); - } - } - - public static List readExactLog(final BufferedReader reader, final List startsToKeep, GenomeLocParser parser) throws IOException { - List calls = new LinkedList(); - - // skip the header line - reader.readLine(); - - while ( true ) { - final VariantContextBuilder builder = new VariantContextBuilder(); - final List alleles = new ArrayList(); - final List genotypes = new ArrayList(); - long runtimeNano = -1; - - GenomeLoc currentLoc = null; - while ( true ) { - final String line = reader.readLine(); - if ( line == null ) - return calls; - - final String[] parts = line.split("\t"); - final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); - final String variable = parts[1]; - final String key = parts[2]; - final String value = parts[3]; - - if ( currentLoc == null ) - currentLoc = lineLoc; - - if ( variable.equals("log10PosteriorOfAFzero") ) { - if ( startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart()) ) { - builder.alleles(alleles); - final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; - builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); - builder.genotypes(genotypes); - calls.add(new ExactCall(builder.make(), runtimeNano, Double.valueOf(value))); - } - break; - } else if ( variable.equals("allele") ) { - final boolean isRef = key.equals("0"); - alleles.add(Allele.create(value, isRef)); - } else if ( variable.equals("PL") ) { - final GenotypeBuilder gb = new GenotypeBuilder(key); - gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); - genotypes.add(gb.make()); - } else if ( variable.equals("runtime.nano") ) { - runtimeNano = Long.valueOf(value); - } else { - // nothing to do - } - } - } - } - public AFCalcResultTracker getResultTracker() { return resultTracker; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java new file mode 100644 index 000000000..3794ba240 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -0,0 +1,166 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * Allows us to write out and read in information about exact calls (site, alleles, PLs, etc) in tabular format + */ +public class ExactCallLogger implements Cloneable { + private PrintStream callReport = null; + + /** + * Create a new ExactCallLogger writing it's output to outputFile + * + * @param outputFile + */ + public ExactCallLogger(final File outputFile) { + try { + callReport = new PrintStream(new FileOutputStream(outputFile)); + callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); + } + } + + /** + * Summarizes information about an exact call that happened + */ + public static class ExactCall { + final VariantContext vc; + final long origNanoTime; + long newNanoTime = -1; + final double origPNonRef; + double newPNonRef = -1; + + public ExactCall(VariantContext vc, long origNanoTime, double origPNonRef) { + this.vc = vc; + this.origNanoTime = origNanoTime; + this.origPNonRef = origPNonRef; + } + + @Override + public String toString() { + return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s new.pNonRef=%.2f new.runtime=%s", + vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), + origPNonRef, + new AutoFormattingTime(origNanoTime / 1e9).toString(), + newPNonRef, + newNanoTime == -1 ? "not.run" : new AutoFormattingTime(newNanoTime / 1e9).toString()); + } + } + + protected void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final AFCalcResult result) { + printCallElement(vc, "type", "ignore", vc.getType()); + + int allelei = 0; + for (final Allele a : vc.getAlleles()) + printCallElement(vc, "allele", allelei++, a.getDisplayString()); + + for (final Genotype g : vc.getGenotypes()) + printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); + + for (int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++) + printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); + + printCallElement(vc, "runtime.nano", "ignore", runtimeNano); + printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0()); + printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0()); + + for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { + if ( allele.isNonReference() ) { + printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele)); + printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + } + } + + callReport.flush(); + } + + private void printCallElement(final VariantContext vc, + final Object variable, + final Object key, + final Object value) { + final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); + callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); + } + + /** + * Read in a list of ExactCall objects from reader, keeping only those + * with starts in startsToKeep or all sites (if this is empty) + * + * @param reader + * @param startsToKeep + * @param parser + * @return + * @throws IOException + */ + public static List readExactLog(final BufferedReader reader, final List startsToKeep, GenomeLocParser parser) throws IOException { + if ( reader == null ) throw new IllegalArgumentException("reader cannot be null"); + if ( startsToKeep == null ) throw new IllegalArgumentException("startsToKeep cannot be null"); + if ( parser == null ) throw new IllegalArgumentException("GenomeLocParser cannot be null"); + + List calls = new LinkedList(); + + // skip the header line + reader.readLine(); + + while (true) { + final VariantContextBuilder builder = new VariantContextBuilder(); + final List alleles = new ArrayList(); + final List genotypes = new ArrayList(); + long runtimeNano = -1; + + GenomeLoc currentLoc = null; + while (true) { + final String line = reader.readLine(); + if (line == null) + return calls; + + final String[] parts = line.split("\t"); + final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); + final String variable = parts[1]; + final String key = parts[2]; + final String value = parts[3]; + + if (currentLoc == null) + currentLoc = lineLoc; + + if (variable.equals("log10PosteriorOfAFzero")) { + if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { + builder.alleles(alleles); + final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; + builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); + builder.genotypes(genotypes); + calls.add(new ExactCall(builder.make(), runtimeNano, Double.valueOf(value))); + } + break; + } else if (variable.equals("allele")) { + final boolean isRef = key.equals("0"); + alleles.add(Allele.create(value, isRef)); + } else if (variable.equals("PL")) { + final GenotypeBuilder gb = new GenotypeBuilder(key); + gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); + genotypes.add(gb.make()); + } else if (variable.equals("runtime.nano")) { + runtimeNano = Long.valueOf(value); + } else { + // nothing to do + } + } + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index 93099f82a..c1b28314c 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "f8184e336dec7632408aa9afa98e6914", nt, nct }); + tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct }); } return tests.toArray(new Object[][]{}); From b30e2a5b7dc5387df5b8da91a9f8b802455de18b Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 15 Oct 2012 11:53:57 -0400 Subject: [PATCH 11/18] BQSR: tool to profile the effects of more-granular locking on scalability by # of threads --- .../LoggingNestedIntegerArray.java | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java index 617391714..6fda0245b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -34,7 +34,12 @@ import java.io.PrintStream; * to the provided output stream. For testing/debugging purposes. * * Log entries are of the following form (fields are tab-separated): - * LABEL VALUE KEY1 KEY2 ... KEY_N + * LABEL OPERATION VALUE KEY1 KEY2 ... KEY_N + * + * A header line is written before the log entries giving the dimensions of this NestedIntegerArray. + * It has the form: + * + * # LABEL SIZE_OF_FIRST_DIMENSION SIZE_OF_SECOND_DIMENSION ... SIZE_OF_NTH_DIMENSION * * @author David Roazen */ @@ -43,6 +48,9 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { private PrintStream log; private String logEntryLabel; + public static final String HEADER_LINE_PREFIX = "# "; + public enum NestedIntegerArrayOperation { GET, PUT }; + /** * * @param log output stream to which to log update operations @@ -57,6 +65,37 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { } this.log = log; this.logEntryLabel = logEntryLabel != null ? logEntryLabel : ""; + + // Write the header line recording the dimensions of this NestedIntegerArray: + StringBuilder logHeaderLine = new StringBuilder(); + + logHeaderLine.append(HEADER_LINE_PREFIX); + logHeaderLine.append(this.logEntryLabel); + for ( int dimension : dimensions ) { + logHeaderLine.append("\t"); + logHeaderLine.append(dimension); + } + + this.log.println(logHeaderLine.toString()); + } + + @Override + public T get( final int... keys ) { + StringBuilder logEntry = new StringBuilder(); + + logEntry.append(logEntryLabel); + logEntry.append("\t"); + logEntry.append(NestedIntegerArrayOperation.GET); + logEntry.append("\t"); // empty field for the datum value + + for ( int key : keys ) { + logEntry.append("\t"); + logEntry.append(key); + } + + log.println(logEntry.toString()); + + return super.get(keys); } @Override @@ -67,6 +106,8 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { logEntry.append(logEntryLabel); logEntry.append("\t"); + logEntry.append(NestedIntegerArrayOperation.PUT); + logEntry.append("\t"); logEntry.append(value); for ( int key : keys ) { logEntry.append("\t"); From c9e7a947c26e0e632b3161eb457408fd75339717 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 08:34:47 -0400 Subject: [PATCH 13/18] Improve interface of ExactCallLogger, use it to have a more informative AFCalcPerformanceTest --- .../afcalc/AFCalcPerformanceTest.java | 21 +++++- .../genotyper/afcalc/AFCalcResult.java | 13 ++++ .../genotyper/afcalc/ExactCallLogger.java | 73 +++++++++++-------- 3 files changed, 73 insertions(+), 34 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index fab26c9d2..e9ed6b153 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; @@ -225,12 +226,24 @@ public class AFCalcPerformanceTest { final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1, AFCalcFactory.Calculation.EXACT_INDEPENDENT, AFCalcTestBuilder.PriorType.human); + logger.info(call); final SimpleTimer timer = new SimpleTimer().start(); final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors()); - call.newNanoTime = timer.getElapsedTimeNano(); - call.newPNonRef = result.getLog10PosteriorOfAFGT0(); - logger.info(call); - logger.info("\t\t" + result); + final long newNanoTime = timer.getElapsedTimeNano(); + if ( call.originalCall.anyPolymorphic(-1) || result.anyPolymorphic(-1) ) { + logger.info("**** ONE IS POLY"); + } + logger.info("\t\t getLog10PosteriorOfAFGT0: " + call.originalCall.getLog10PosteriorOfAFGT0() + " vs " + result.getLog10PosteriorOfAFGT0()); + final double speedup = call.runtime / (1.0 * newNanoTime); + logger.info("\t\t runtime: " + call.runtime + " vs " + newNanoTime + " speedup " + String.format("%.2f", speedup) + "x"); + for ( final Allele a : call.originalCall.getAllelesUsedInGenotyping() ) { + if ( a.isNonReference() ) { + final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : ""; + logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE); + final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; + logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost); + } + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index c737416c5..7cacb2060 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -238,6 +238,19 @@ public class AFCalcResult { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } + /** + * Are any of the alleles polymorphic w.r.t. #isPolymorphic? + * + * @param log10minPNonRef the confidence threshold, in log10 space + * @return true if any are poly, false otherwise + */ + public boolean anyPolymorphic(final double log10minPNonRef) { + for ( final Allele a : getAllelesUsedInGenotyping() ) + if ( a.isNonReference() && isPolymorphic(a, log10minPNonRef) ) + return true; + return false; + } + /** * Returns the log10 probability that allele is segregating * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index 3794ba240..9394c99d7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -1,20 +1,18 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** * Allows us to write out and read in information about exact calls (site, alleles, PLs, etc) in tabular format + * + * Once opened, calls can be writen to disk with printCallInfo */ public class ExactCallLogger implements Cloneable { private PrintStream callReport = null; @@ -38,32 +36,28 @@ public class ExactCallLogger implements Cloneable { */ public static class ExactCall { final VariantContext vc; - final long origNanoTime; - long newNanoTime = -1; - final double origPNonRef; - double newPNonRef = -1; + final long runtime; + final AFCalcResult originalCall; - public ExactCall(VariantContext vc, long origNanoTime, double origPNonRef) { + public ExactCall(VariantContext vc, final long runtime, final AFCalcResult originalCall) { this.vc = vc; - this.origNanoTime = origNanoTime; - this.origPNonRef = origPNonRef; + this.runtime = runtime; + this.originalCall = originalCall; } @Override public String toString() { - return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s new.pNonRef=%.2f new.runtime=%s", + return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s", vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), - origPNonRef, - new AutoFormattingTime(origNanoTime / 1e9).toString(), - newPNonRef, - newNanoTime == -1 ? "not.run" : new AutoFormattingTime(newNanoTime / 1e9).toString()); + originalCall.getLog10PosteriorOfAFGT0(), + new AutoFormattingTime(runtime / 1e9).toString()); } } - protected void printCallInfo(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final long runtimeNano, - final AFCalcResult result) { + protected final void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final AFCalcResult result) { printCallElement(vc, "type", "ignore", vc.getType()); int allelei = 0; @@ -90,6 +84,7 @@ public class ExactCallLogger implements Cloneable { callReport.flush(); } + @Requires({"vc != null", "variable != null", "key != null", "value != null", "callReport != null"}) private void printCallElement(final VariantContext vc, final Object variable, final Object key, @@ -102,10 +97,10 @@ public class ExactCallLogger implements Cloneable { * Read in a list of ExactCall objects from reader, keeping only those * with starts in startsToKeep or all sites (if this is empty) * - * @param reader - * @param startsToKeep - * @param parser - * @return + * @param reader a just-opened reader sitting at the start of the file + * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should be kept + * @param parser a genome loc parser to create genome locs + * @return a list of ExactCall objects in reader * @throws IOException */ public static List readExactLog(final BufferedReader reader, final List startsToKeep, GenomeLocParser parser) throws IOException { @@ -118,10 +113,17 @@ public class ExactCallLogger implements Cloneable { // skip the header line reader.readLine(); + // skip the first "type" line + reader.readLine(); + while (true) { final VariantContextBuilder builder = new VariantContextBuilder(); final List alleles = new ArrayList(); final List genotypes = new ArrayList(); + final double[] posteriors = new double[2]; + final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); + final List mle = new ArrayList(); + final Map log10pNonRefByAllele = new HashMap(); long runtimeNano = -1; GenomeLoc currentLoc = null; @@ -139,13 +141,15 @@ public class ExactCallLogger implements Cloneable { if (currentLoc == null) currentLoc = lineLoc; - if (variable.equals("log10PosteriorOfAFzero")) { + if (variable.equals("type")) { if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { builder.alleles(alleles); final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); - calls.add(new ExactCall(builder.make(), runtimeNano, Double.valueOf(value))); + final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; } else if (variable.equals("allele")) { @@ -155,6 +159,15 @@ public class ExactCallLogger implements Cloneable { final GenotypeBuilder gb = new GenotypeBuilder(key); gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); genotypes.add(gb.make()); + } else if (variable.equals("log10PosteriorOfAFEq0")) { + posteriors[0] = Double.valueOf(value); + } else if (variable.equals("log10PosteriorOfAFGt0")) { + posteriors[1] = Double.valueOf(value); + } else if (variable.equals("MLE")) { + mle.add(Integer.valueOf(value)); + } else if (variable.equals("pNonRefByAllele")) { + final Allele a = Allele.create(key); + log10pNonRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { From fa93681f513992b387387ec018f24d66232cc2dd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 14:14:41 -0400 Subject: [PATCH 14/18] Scalability test for EXACT models --- .../genotyper/afcalc/AFCalcTestBuilder.java | 6 +- .../afcalc/AFCalcPerformanceUnitTest.java | 87 +++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index b4d105507..cfb67164d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -45,12 +45,16 @@ public class AFCalcTestBuilder { human } + public int getNumAltAlleles() { + return numAltAlleles; + } + public int getnSamples() { return nSamples; } public AFCalc makeModel() { - return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2); + return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), getNumAltAlleles(), 2); } public double[] makePriors() { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java new file mode 100644 index 000000000..556b7451f --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java @@ -0,0 +1,87 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class AFCalcPerformanceUnitTest extends BaseTest { + @DataProvider(name = "ScalingTests") + public Object[][] makepolyTestProviderLotsOfAlleles() { + List tests = new ArrayList(); + + // list of all high-quality models in the system + final List biAllelicModels = Arrays.asList( + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcFactory.Calculation.EXACT_REFERENCE); + + final List multiAllelicModels = Arrays.asList( + AFCalcFactory.Calculation.EXACT_INDEPENDENT); + +// for ( final int nonTypePLs : Arrays.asList(100) ) { +// for ( final int nSamples : Arrays.asList(10000) ) { +// final List alleleCounts = Arrays.asList(50); +// for ( final int nAltAlleles : Arrays.asList(1) ) { + for ( final int nonTypePLs : Arrays.asList(100) ) { + for ( final int nSamples : Arrays.asList(100, 1000) ) { + final List alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 50, 500); + for ( final int nAltAlleles : Arrays.asList(1, 2, 3) ) { + final List models = nAltAlleles > 1 ? multiAllelicModels : biAllelicModels; + for ( final AFCalcFactory.Calculation model : models ) { + for ( final List ACs : Utils.makePermutations(alleleCounts, nAltAlleles, true) ) { + if ( MathUtils.sum(ACs) < nSamples * 2 ) { + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, nAltAlleles, model, AFCalcTestBuilder.PriorType.human); + tests.add(new Object[]{testBuilder, ACs, nonTypePLs}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Pair estNumberOfEvaluations(final AFCalcTestBuilder testBuilder, final VariantContext vc, final int nonTypePL) { + final int evalOverhead = 2; // 2 + final int maxEvalsPerSamplePerAC = 3; + + int minEvals = 0, maxEvals = 0; + + for ( final Allele alt : vc.getAlternateAlleles() ) { + final int AC = vc.getCalledChrCount(alt); + minEvals += AC + evalOverhead; // everyone is hom-var + maxEvals += AC * maxEvalsPerSamplePerAC + 10; + } + + return new Pair(minEvals, maxEvals); + } + + @Test(dataProvider = "ScalingTests") + private void testScaling(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL) { + final AFCalc calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL); + final AFCalcResult result = calc.getLog10PNonRef(vc, priors); + final Pair expectedNEvaluation = estNumberOfEvaluations(testBuilder, vc, nonTypePL); + final int minEvals = expectedNEvaluation.getFirst(); + final int maxEvals = expectedNEvaluation.getSecond(); + + logger.warn(" min " + minEvals + " obs " + result.getnEvaluations() + " max " + maxEvals + " for test " + testBuilder + " sum(ACs)=" + (int)MathUtils.sum(ACs)); + + Assert.assertTrue(result.getnEvaluations() >= minEvals, + "Actual number of evaluations " + result.getnEvaluations() + " < min number of evals " + minEvals); + Assert.assertTrue(result.getnEvaluations() <= maxEvals, + "Actual number of evaluations " + result.getnEvaluations() + " > max number of evals " + minEvals); + } +} \ No newline at end of file From 8288c30e364b1aa858ef56411195fcdef9364b99 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 14:14:55 -0400 Subject: [PATCH 15/18] Use buffered output for ExactCallLogger --- .../sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index 9394c99d7..f13fe4429 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -24,7 +24,7 @@ public class ExactCallLogger implements Cloneable { */ public ExactCallLogger(final File outputFile) { try { - callReport = new PrintStream(new FileOutputStream(outputFile)); + callReport = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFile), 10000000)); callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); } catch (FileNotFoundException e) { throw new UserException.CouldNotCreateOutputFile(outputFile, e); From 32ee2c7dffde3210e2c3b183f5f2fefd3a49af23 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 15 Oct 2012 17:04:29 -0400 Subject: [PATCH 16/18] Refactored the compression interface per sample in ReduceReadsa The CompressionStash is now responsible for keeping track of all intervals that must be kept uncompressed by all samples. In general this is a list generated by a tumor sample that will enforce all normal samples to abide. - Updated ReduceReads integration tests - Sliding Window is now using the CompressionStash (single sample). DEV-104 #resolve #time 3m --- .../reducereads/CompressionStash.java | 21 +++++++ .../reducereads/MultiSampleCompressor.java | 5 +- .../compression/reducereads/ReduceReads.java | 4 +- .../reducereads/SingleSampleCompressor.java | 7 ++- .../reducereads/SlidingWindow.java | 55 ++++++++++--------- .../ReduceReadsIntegrationTest.java | 30 +++++----- .../reducereads/SimpleGenomeLoc.java | 30 ++++++++++ 7 files changed, 107 insertions(+), 45 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java new file mode 100644 index 000000000..714a4df18 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.GenomeLocComparator; + +import java.util.TreeSet; + +/** + * A stash of regions that must be kept uncompressed in all samples + * + * In general, these are regions that were kept uncompressed by a tumor sample and we want to force + * all other samples (normals and/or tumors) to also keep these regions uncompressed + * + * User: carneiro + * Date: 10/15/12 + * Time: 4:08 PM + */ +public class CompressionStash extends TreeSet { + public CompressionStash() { + super(new GenomeLocComparator()); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 7c9fc101b..2c3439010 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -55,11 +55,12 @@ public class MultiSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction) { + final boolean allowPolyploidReduction, + final CompressionStash compressionStash) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 5810bc94f..b6761f4a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -222,6 +222,8 @@ public class ReduceReads extends ReadWalker, ReduceRea HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). Long nextReadNumber = 1L; // The next number to use for the compressed read name. + CompressionStash compressionStash = new CompressionStash(); + SortedSet intervalList; private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag @@ -328,7 +330,7 @@ public class ReduceReads extends ReadWalker, ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 6a086c53b..82a433300 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -20,6 +20,7 @@ public class SingleSampleCompressor implements Compressor { final private ReduceReads.DownsampleStrategy downsampleStrategy; final private int nContigs; final private boolean allowPolyploidReduction; + final CompressionStash compressionStash; private SlidingWindow slidingWindow; private int slidingWindowCounter; @@ -33,7 +34,8 @@ public class SingleSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction) { + final boolean allowPolyploidReduction, + final CompressionStash compressionStash) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -44,6 +46,7 @@ public class SingleSampleCompressor implements Compressor { this.downsampleStrategy = downsampleStrategy; this.nContigs = nContigs; this.allowPolyploidReduction = allowPolyploidReduction; + this.compressionStash = compressionStash; } /** @@ -65,7 +68,7 @@ public class SingleSampleCompressor implements Compressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6fdf85317..22c40c542 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -6,7 +6,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; @@ -56,6 +55,7 @@ public class SlidingWindow { private final int nContigs; private boolean allowPolyploidReductionInGeneral; + private CompressionStash compressionStash; /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -87,7 +87,7 @@ public class SlidingWindow { } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -118,6 +118,7 @@ public class SlidingWindow { this.nContigs = nContigs; this.allowPolyploidReductionInGeneral = allowPolyploidReduction; + this.compressionStash = compressionStash; } /** @@ -145,7 +146,7 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. */ - private Pair getNextVariantRegion(int from, int to, boolean[] variantSite) { + private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) { boolean foundStart = false; int variantRegionStartIndex = 0; for (int i=from; i(variantRegionStartIndex, i-1)); + return(new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, i-1, true)); } } - return (foundStart) ? new Pair(variantRegionStartIndex, -1) : null; + return (foundStart) ? new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, to-1, false) : null; } /** @@ -168,23 +169,22 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return a list with start/stops of variant regions following getNextVariantRegion description */ - private List> getAllVariantRegions(int from, int to, boolean[] variantSite) { - List> regions = new LinkedList>(); + private CompressionStash getVariantRegionsFromThisSample(int from, int to, boolean[] variantSite) { + CompressionStash regions = new CompressionStash(); int index = from; while(index < to) { - Pair result = getNextVariantRegion(index, to, variantSite); + SimpleGenomeLoc result = getNextVariantRegion(index, to, variantSite); if (result == null) break; regions.add(result); - if (result.getSecond() < 0) + if (result.getStop() < 0) break; - index = result.getSecond() + 1; + index = result.getStop() + 1; } return regions; } - /** * Determines if the window can be slid given the new incoming read. * @@ -203,7 +203,7 @@ public class SlidingWindow { boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex); int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - List> regions = getAllVariantRegions(0, breakpoint, variantSite); + CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, variantSite); finalizedReads = closeVariantRegions(regions, false); List readsToRemove = new LinkedList(); @@ -567,26 +567,31 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - return result; // finalized reads will be downsampled if necessary + return result; // finalized reads will be downsampled if necessary } - private List closeVariantRegions(List> regions, boolean forceClose) { + private List closeVariantRegions(CompressionStash regions, boolean forceClose) { List allReads = new LinkedList(); if (!regions.isEmpty()) { int lastStop = -1; - for (Pair region : regions) { - int start = region.getFirst(); - int stop = region.getSecond(); - if (stop < 0 && forceClose) - stop = windowHeader.size() - 1; - if (stop >= 0) { - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); - lastStop = stop; + for (SimpleGenomeLoc region : regions) { + int start = region.getStart(); + int stop = region.getStop(); + + if (!region.isFinished()) { + if(forceClose) // region is unfinished but we're forcing the close of this window + stop = windowHeader.size() - 1; + else + continue; // region is unfinished and we're not forcing the close of this window } + + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); + lastStop = stop; } - for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) - windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + + for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! } return allReads; } @@ -626,7 +631,7 @@ public class SlidingWindow { if (!windowHeader.isEmpty()) { boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1); - List> regions = getAllVariantRegions(0, windowHeader.size(), variantSite); + CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), variantSite); finalizedReads = closeVariantRegions(regions, true); if (!windowHeader.isEmpty()) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index db8ea4eb8..89f251ed4 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -21,36 +21,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest { executeTest(testName, spec); } - @Test(enabled = false) + @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f"); + RRTest("testDefaultCompression ", L, "1f95f3193bd9f120a73c34a0087abaf6"); } - @Test(enabled = false) + @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76"); + RRTest("testMultipleIntervals ", intervals, "79213d6ac68d56d4d72dcf511223e424"); } - @Test(enabled = false) + @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "dab2aa8e3655139974bbe12a568363d9"); } - @Test(enabled = false) + @Test(enabled = true) public void testLowCompression() { RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63"); } - @Test(enabled = false) + @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973"); + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "1255245ed4ebeacda90f0dbb4e4da081"); } - @Test(enabled = false) + @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb"))); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841"))); } /** @@ -61,20 +61,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest { * * This bam is simplified to replicate the exact bug with the three provided intervals. */ - @Test(enabled = false) + @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("4b590269cbe3574dbdd5bdc2bc6f5f1c"))); } /** * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get * filtered out. */ - @Test(enabled = false) + @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2"))); + executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("d8d066304f7c187f182bfb50f39baa0c"))); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java new file mode 100644 index 000000000..45e105751 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations + * with them. + * + * However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot + * be created easily for small tasks that do not require the rigors of the GenomeLocParser validation + * + * SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should + * only be used outside of the engine. + * + * User: carneiro + * Date: 10/16/12 + * Time: 2:07 PM + */ +public class SimpleGenomeLoc extends GenomeLoc { + private boolean finished; + + public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) { + super(contigName, contigIndex, start, stop); + this.finished = finished; + } + + public boolean isFinished() { + return finished; + } +} From 55ac4ba70bfe28cdde85b3efc002813e857ca88e Mon Sep 17 00:00:00 2001 From: kshakir Date: Wed, 17 Oct 2012 19:39:03 -0400 Subject: [PATCH 17/18] Added another utility that can convert to RemoteFiles. QScripts will now generate remote versions of files if the caller has not already passed in remote versions (or the QScript replaces the passed in remote references... not good) Instead of having yet another plugin, combined QStatusMessenger and RemoteFileConverter under general QCommandPlugin trait. --- .../sting/queue/QCommandLine.scala | 53 ++++++++++++------- .../sting/queue/QCommandPlugin.scala | 9 ++++ .../broadinstitute/sting/queue/QScript.scala | 25 +++++++++ .../sting/queue/util/ClassFieldCache.scala | 11 ++++ .../queue/util/RemoteFileConverter.scala | 21 ++++++++ 5 files changed, 100 insertions(+), 19 deletions(-) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala create mode 100644 public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 5b84bfd16..2afa66d9c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -24,7 +24,6 @@ package org.broadinstitute.sting.queue -import function.QFunction import java.io.File import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ @@ -96,18 +95,18 @@ class QCommandLine extends CommandLineProgram with Logging { new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) } - private lazy val qStatusMessengerPluginManager = { - new PluginManager[QStatusMessenger](classOf[QStatusMessenger]) + private lazy val qCommandPlugin = { + new PluginManager[QCommandPlugin](classOf[QCommandPlugin]) } - ClassFieldCache.parsingEngine = new ParsingEngine(this) - /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. */ def execute = { - val allStatusMessengers = qStatusMessengerPluginManager.createAllTypes() + ClassFieldCache.parsingEngine = this.parser + + val allCommandPlugins = qCommandPlugin.createAllTypes() if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) @@ -115,14 +114,24 @@ class QCommandLine extends CommandLineProgram with Logging { settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") qGraph.initializeWithSettings(settings) - for (statusMessenger <- allStatusMessengers) { - loadArgumentsIntoObject(statusMessenger) + for (commandPlugin <- allCommandPlugins) { + loadArgumentsIntoObject(commandPlugin) } - for (statusMessenger <- allStatusMessengers) { - statusMessenger.started() + for (commandPlugin <- allCommandPlugins) { + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.started() } + // TODO: Default command plugin argument? + val remoteFileConverter = ( + for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) + yield commandPlugin.remoteFileConverter + ).headOption.getOrElse(null) + + if (remoteFileConverter != null) + loadArgumentsIntoObject(remoteFileConverter) + val allQScripts = qScriptPluginManager.createAllTypes() for (script <- allQScripts) { logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) @@ -137,10 +146,15 @@ class QCommandLine extends CommandLineProgram with Logging { case e: Exception => throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) } + + if (remoteFileConverter != null) { + if (remoteFileConverter.convertToRemoteEnabled) + script.mkRemoteOutputs(remoteFileConverter) + } + script.functions.foreach(qGraph.add(_)) logger.info("Added " + script.functions.size + " functions") } - // Execute the job graph qGraph.run() @@ -162,14 +176,16 @@ class QCommandLine extends CommandLineProgram with Logging { if (!success) { logger.info("Done with errors") qGraph.logFailed() - for (statusMessenger <- allStatusMessengers) - statusMessenger.exit("Done with errors") + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.exit("Done with errors") 1 } else { if (settings.run) { allQScripts.foreach(_.pushOutputs()) - for (statusMessenger <- allStatusMessengers) - statusMessenger.done(allQScripts.map(_.remoteOutputs)) + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.done(allQScripts.map(_.remoteOutputs)) } 0 } @@ -189,7 +205,7 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentSources = { var plugins = Seq.empty[Class[_]] plugins ++= qScriptPluginManager.getPlugins - plugins ++= qStatusMessengerPluginManager.getPlugins + plugins ++= qCommandPlugin.getPlugins plugins.toArray } @@ -200,11 +216,10 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentSourceName(source: Class[_]) = { if (classOf[QScript].isAssignableFrom(source)) qScriptPluginManager.getName(source.asSubclass(classOf[QScript])) - else if (classOf[QStatusMessenger].isAssignableFrom(source)) - qStatusMessengerPluginManager.getName(source.asSubclass(classOf[QStatusMessenger])) + else if (classOf[QCommandPlugin].isAssignableFrom(source)) + qCommandPlugin.getName(source.asSubclass(classOf[QCommandPlugin])) else null - } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala new file mode 100644 index 000000000..499c31554 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala @@ -0,0 +1,9 @@ +package org.broadinstitute.sting.queue + +import engine.QStatusMessenger +import util.RemoteFileConverter + +trait QCommandPlugin { + def statusMessenger: QStatusMessenger = null + def remoteFileConverter: RemoteFileConverter = null +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 2dcfb916c..3df61b1e3 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -108,6 +108,24 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon functions.foreach( f => add(f) ) } + /** + * Convert all @Output files to remote output files. + * @param remoteFileConverter Converter for files to remote files. + */ + def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) { + for (field <- outputFields) { + val fieldFile = ClassFieldCache.getFieldFile(this, field) + if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) { + val fieldName = ClassFieldCache.fullName(field) + val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName) + ClassFieldCache.setFieldValue(this, field, remoteFile) + } + } + } + + /** + * Pull all remote files to the local disk. + */ def pullInputs() { val inputs = ClassFieldCache.getFieldFiles(this, inputFields) for (remoteFile <- filterRemoteFiles(inputs)) { @@ -116,6 +134,9 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } } + /** + * Push all remote files from the local disk. + */ def pushOutputs() { val outputs = ClassFieldCache.getFieldFiles(this, outputFields) for (remoteFile <- filterRemoteFiles(outputs)) { @@ -124,6 +145,10 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } } + /** + * List out the remote outputs + * @return the RemoteFile outputs by argument source + */ def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = outputFields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala index 870dd5617..ae3db6860 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala @@ -180,4 +180,15 @@ object ClassFieldCache { case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown)) } + + // + // other utilities + // + + /** + * Retrieves the fullName of the argument + * @param field ArgumentSource to check + * @return Full name of the argument source + */ + def fullName(field: ArgumentSource) = field.createArgumentDefinitions().get(0).fullName } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala new file mode 100644 index 000000000..c77c242d0 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File + +trait RemoteFileConverter { + type RemoteFileType <: RemoteFile + + /** + * If this remote file creator is capable of converting to a remote file. + * @return true if ready to convert + */ + def convertToRemoteEnabled: Boolean + + /** + * Converts to a remote file + * @param file The original file + * @param name A "name" to use for the remote file + * @return The new version of this remote file. + */ + def convertToRemote(file: File, name: String): RemoteFileType +}