diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 85615962c..b0e9ef4fe 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -34,7 +34,6 @@ public class BQSRIntegrationTest extends WalkerTest { " -I " + bam + " -L " + interval + args + - " --no_plots" + " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) + " -o %s"; } @@ -50,21 +49,21 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")}, - {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "90d70542076715a8605a8d4002614b34")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "be6c7bc0b79a2d0395d21cd0154540d5")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "65781095beb41d8feca26e93e04dcc0b")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "8ee1fed1713daca1f36e8b30bee2cd23")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "9449d8a8baac742f46673e9b8314220b")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "39313c6e3b85142548fee9b6c130e7b6")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "15eae9e834ed80b24660393c6df87f85")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "8485d8fd5e780e98d720dfbf79f26528")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "c423d1d443822dae404239bb9a746b96")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "fb0a6aef430f562ed5e0002d03e0c619")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "efee7bcb89abe36da1cfd8a635d37cd2")}, + {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "0e8a3238902a1ff0f0c657fb09b4c022")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "5e58d3dcf5ca38f008a64d1c0743ed83")}, + {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "1a8e5c85c7935eb1bd2203f5c86ce1db")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "c423d1d443822dae404239bb9a746b96")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "6762b39dc027056365280a9d582a6713")}, }; } @@ -88,7 +87,6 @@ public class BQSRIntegrationTest extends WalkerTest { " -R " + b36KGReference + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + " -L 1:10,000,000-10,200,000" + - " --no_plots" + " -o %s", 1, // just one output file UserException.CommandLineException.class); @@ -102,7 +100,6 @@ public class BQSRIntegrationTest extends WalkerTest { " -R " + b36KGReference + " -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" + " -L 1:50,000-80,000" + - " --no_plots" + " -o %s", 1, // just one output file UserException.class); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index b45c027a7..b4ac2b86d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,4 +73,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8d092b25f40456e618eef91fdce8adf0")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } + + @Test + public void HCTestStructuralIndels() { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c29e61810c056b52a47baae0696931ea")); + executeTest("HCTestStructuralIndels: ", spec); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 516ea8451..fc2546173 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -63,6 +63,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import java.io.File; +import java.io.OutputStream; import java.util.*; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java index 300e801e6..9f69a4144 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java @@ -50,7 +50,7 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage, StingSAMFileWrite * Retrieves the SAM file to (ultimately) be created. * @return The SAM file. Must not be null. */ - public File getSAMFile() { + public File getOutputFile() { return samFile; } @@ -162,7 +162,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite simplifyBAM = v; } - public OutputStream getSAMOutputStream() { + public OutputStream getOutputStream() { return samOutputStream; } @@ -220,7 +220,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite /** * Gets whether to generate an md5 on-the-fly for this BAM. - * @return True generates the md5. False means skip writing the file. + * @param generateMD5 True generates the md5. False means skip writing the file. */ public void setGenerateMD5(boolean generateMD5) { if(writeStarted) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java index b042144b6..873f5b7c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java @@ -27,6 +27,9 @@ package org.broadinstitute.sting.gatk.io.stubs; import org.broadinstitute.sting.gatk.io.OutputTracker; +import java.io.File; +import java.io.OutputStream; + /** * A stub used for managing IO. Acts as a proxy for IO streams * not yet created or streams that need significant external @@ -43,4 +46,14 @@ public interface Stub { * @param outputTracker The connector used to provide an appropriate stream. */ public void register( OutputTracker outputTracker ); + + /** + * Returns the OutputStream represented by this stub or null if not available. + */ + public OutputStream getOutputStream(); + + /** + * Returns the File represented by this stub or null if not available. + */ + public File getOutputFile(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index ee1dc63e6..f92d78bb5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -140,7 +140,7 @@ public class VariantContextWriterStub implements Stub, Var * Retrieves the file to (ultimately) be created. * @return The file. Can be null if genotypeStream is not. */ - public File getFile() { + public File getOutputFile() { return genotypeFile; } @@ -148,7 +148,7 @@ public class VariantContextWriterStub implements Stub, Var * Retrieves the output stearm to which to (ultimately) write. * @return The file. Can be null if genotypeFile is not. */ - public PrintStream getOutputStream() { + public OutputStream getOutputStream() { return genotypeStream; } @@ -196,7 +196,7 @@ public class VariantContextWriterStub implements Stub, Var if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY); - if ( forceBCF || (getFile() != null && VariantContextWriterFactory.isBCFOutput(getFile())) ) + if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) options.add(Options.FORCE_BCF); return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); @@ -271,7 +271,7 @@ public class VariantContextWriterStub implements Stub, Var public boolean alsoWriteBCFForTest() { return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded ! isCompressed() && // for non-compressed outputs - getFile() != null && // that are going to disk + getOutputFile() != null && // that are going to disk engine.getArguments().generateShadowBCF; // and we actually want to do it } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index cce106210..c4de9ed45 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -161,6 +161,9 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false) protected Boolean USE_ALL_ANNOTATIONS = false; + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index a6d82d5b3..dbb628135 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -49,7 +49,6 @@ public class BQSRGatherer extends Gatherer { @Override public void gather(List inputs, File output) { - RecalibrationReport generalReport = null; final PrintStream outputFile; try { outputFile = new PrintStream(output); @@ -57,6 +56,7 @@ public class BQSRGatherer extends Gatherer { throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); } + RecalibrationReport generalReport = null; for (File input : inputs) { final RecalibrationReport inputReport = new RecalibrationReport(input); if (generalReport == null) @@ -70,14 +70,15 @@ public class BQSRGatherer extends Gatherer { generalReport.calculateQuantizedQualities(); RecalibrationArgumentCollection RAC = generalReport.getRAC(); - if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { - final File recal_out = new File(output.getName() + ".original"); - final RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalUtils.generateRecalibrationPlot(recal_out, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); - } - else if (!RAC.NO_PLOTS) { - final File recal_out = new File(output.getName() + ".recal"); - RecalUtils.generateRecalibrationPlot(recal_out, generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); + if ( RAC.RECAL_PDF_FILE != null ) { + RAC.RECAL_TABLE_FILE = output; + if ( RAC.existingRecalibrationReport != null ) { + final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport); + RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } + else { + RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 43aa85a05..e78b9b6fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -50,7 +50,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; +import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.Constructor; import java.util.ArrayList; @@ -110,6 +110,7 @@ import java.util.ArrayList; @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { + @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates @@ -150,7 +151,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed RecalUtils.listAvailableCovariates(logger); System.exit(0); } - RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table + RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); @@ -169,6 +170,12 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection } + try { + RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); + } + int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); @@ -284,7 +291,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed generateReport(); logger.info("...done!"); - if (!RAC.NO_PLOTS) { + if (RAC.RECAL_PDF_FILE != null) { logger.info("Generating recalibration plots..."); generatePlots(); } @@ -296,10 +303,10 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, report.getRecalibrationTables(), recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), recalibrationTables, requestedCovariates); } else - RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); } @@ -313,14 +320,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed } private void generateReport() { - PrintStream output; - try { - output = new PrintStream(RAC.RECAL_FILE); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_FILE, "could not be created"); - } - - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, output); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index f4b00925e..f1f0ce38e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -28,10 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.recalibration.RecalUtils; import java.io.File; +import java.io.PrintStream; import java.util.Collections; import java.util.List; @@ -59,14 +59,29 @@ public class RecalibrationArgumentCollection { * After the header, data records occur one per line until the end of the file. The first several items on a line are the * values of the individual covariates and will change depending on which covariates were specified at runtime. The last * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. */ @Gather(BQSRGatherer.class) - @Output - public File RECAL_FILE; + @Output(doc = "The output recalibration table file to create", required = true) + public File RECAL_TABLE_FILE = null; + public PrintStream RECAL_TABLE; /** - * List all implemented covariates. + * If not provided, then no plots will be generated (useful for queue scatter/gathering). + * However, we *highly* recommend that users generate these plots whenever possible for QC checking. + */ + @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false) + public File RECAL_PDF_FILE = null; + + /** + * If not provided, then a temporary file is created and then deleted upon completion. + */ + @Hidden + @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) + public File RECAL_CSV_FILE = null; + + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. */ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) public boolean LIST_ONLY = false; @@ -166,14 +181,8 @@ public class RecalibrationArgumentCollection { @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; - @Hidden - @Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots") - public boolean KEEP_INTERMEDIATE_FILES = false; - @Hidden - @Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering") - public boolean NO_PLOTS = false; - public File recalibrationReport = null; + public File existingRecalibrationReport = null; public GATKReportTable generateReportTable(final String covariateNames) { GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); @@ -205,12 +214,10 @@ public class RecalibrationArgumentCollection { argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); argumentsTable.addRowID("quantizing_levels", true); argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); - argumentsTable.addRowID("keep_intermediate_files", true); - argumentsTable.set("keep_intermediate_files", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); - argumentsTable.addRowID("no_plots", true); - argumentsTable.set("no_plots", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); argumentsTable.addRowID("recalibration_report", true); - argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); + argumentsTable.addRowID("plot_pdf_file", true); + argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath()); argumentsTable.addRowID("binary_tag_name", true); argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); return argumentsTable; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a57c877e0..469d63b8a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -198,23 +197,7 @@ public class UnifiedGenotyperEngine { } } - return addMissingSamples(results, allSamples); - } - - private List addMissingSamples(final List calls, final Set allSamples) { - if ( calls.isEmpty() || allSamples == null ) return calls; - - final List withAllSamples = new ArrayList(calls.size()); - for ( final VariantCallContext call : calls ) { - if ( call == null ) - withAllSamples.add(null); - else { - final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); - withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); - } - } - - return withAllSamples; + return results; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 58cd14737..6971be807 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -126,7 +126,9 @@ public class VariantEval extends RodWalker implements TreeRedu @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false) public RodBinding goldStandard = null; - // Help arguments + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 555999bdb..b1d8dc91d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -313,7 +313,7 @@ public class CombineVariants extends RodWalker implements Tree VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 3d14308b6..9664a5bde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; @@ -325,6 +326,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(doc="indel size select",required=false,fullName="maxIndelSize") private int maxIndelSize = Integer.MAX_VALUE; + @Argument(doc="Allow a samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") + private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -386,10 +390,31 @@ public class SelectVariants extends RodWalker implements TreeR Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); - // first, add any requested samples - samples.addAll(samplesFromFile); - samples.addAll(samplesFromExpressions); + // first, check overlap between requested and present samples + Set commandLineUniqueSamples = new HashSet(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size()); + commandLineUniqueSamples.addAll(samplesFromFile); + commandLineUniqueSamples.addAll(samplesFromExpressions); + commandLineUniqueSamples.addAll(sampleNames); + commandLineUniqueSamples.removeAll(vcfSamples); + + // second, add the requested samples samples.addAll(sampleNames); + samples.addAll(samplesFromExpressions); + samples.addAll(samplesFromFile); + + logger.debug(Utils.join(",",commandLineUniqueSamples)); + + if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { + logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); + samples.removeAll(commandLineUniqueSamples); + } else if (commandLineUniqueSamples.size() > 0 ) { + throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", + "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", + "A list of these samples:", + Utils.join(",",commandLineUniqueSamples), + "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); + } + // if none were requested, we want all of them if ( samples.isEmpty() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 2e6a80462..6bc6153df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -278,7 +281,7 @@ public class VariantsToBinaryPed extends RodWalker { private byte getFlippedEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { + if ( ! checkGQIsGood(g) ) { b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_VAR; @@ -293,6 +296,16 @@ public class VariantsToBinaryPed extends RodWalker { return (byte) (b << (2*offset)); } + private boolean checkGQIsGood(Genotype genotype) { + if ( genotype.hasGQ() ) { + return genotype.getGQ() >= minGenotypeQuality; + } else if ( genotype.hasLikelihoods() ) { + return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + } + + return false; + } + private static String getID(VariantContext v) { if ( v.hasID() ) { return v.getID(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 78c9c4a1c..5f80f77a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -246,7 +246,6 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 667de3dea..5273806a7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -88,8 +88,8 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF case UNBOUNDED: return -1; case A: return vc.getNAlleles() - 1; case G: - final int ploidy = vc.getMaxPloidy(); - return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy == 0 ? 2 : ploidy); + final int ploidy = vc.getMaxPloidy(2); + return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy); default: throw new ReviewedStingException("Unknown count type: " + countType); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 20aabdb83..ca490789f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -47,7 +47,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; +import java.io.IOException; import java.io.PrintStream; import java.util.*; @@ -333,8 +333,8 @@ public class RecalUtils { return covariate.getClass().getSimpleName().split("Covariate")[0]; } - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { - outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { + outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), RAC.RECAL_TABLE); } /** @@ -362,46 +362,36 @@ public class RecalUtils { report.print(outputFile); } - private static Pair initializeRecalibrationPlot(File filename) { - final PrintStream deltaTableStream; - final File deltaTableFileName = new File(filename + ".csv"); - try { - deltaTableStream = new PrintStream(deltaTableFileName); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created"); - } - return new Pair(deltaTableStream, deltaTableFileName); - } - - private static void outputRecalibrationPlot(final File gatkReportFilename, Pair files, boolean keepIntermediates) { - final File csvFileName = files.getSecond(); - final File plotFileName = new File(csvFileName + ".pdf"); - files.getFirst().close(); + private static void outputRecalibrationPlot(final RecalibrationArgumentCollection RAC) { final RScriptExecutor executor = new RScriptExecutor(); executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFileName.getAbsolutePath()); - executor.addArgs(gatkReportFilename.getAbsolutePath()); - executor.addArgs(plotFileName.getAbsolutePath()); + executor.addArgs(RAC.RECAL_CSV_FILE.getAbsolutePath()); + executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); + executor.addArgs(RAC.RECAL_PDF_FILE.getAbsolutePath()); executor.exec(); - - if (!keepIntermediates) - if (!csvFileName.delete()) - throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath()); - } - public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) { - final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true); - outputRecalibrationPlot(filename, files, keepIntermediates); + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { + generateRecalibrationPlot(RAC, original, null, requestedCovariates); } - public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) { - final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false); - outputRecalibrationPlot(filename, files, keepIntermediates); + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { + final PrintStream csvFile; + try { + if ( RAC.RECAL_CSV_FILE == null ) { + RAC.RECAL_CSV_FILE = File.createTempFile("BQSR", ".csv"); + RAC.RECAL_CSV_FILE.deleteOnExit(); + } + csvFile = new PrintStream(RAC.RECAL_CSV_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_CSV_FILE, e); + } + + if ( recalibrated != null ) + writeCSV(csvFile, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(csvFile, original, "ORIGINAL", requestedCovariates, recalibrated == null); + outputRecalibrationPlot(RAC); } private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 271c07649..41b07832c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -284,14 +284,11 @@ public class RecalibrationReport { else if (argument.equals("quantizing_levels")) RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); - else if (argument.equals("keep_intermediate_files")) - RAC.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value); - - else if (argument.equals("no_plots")) - RAC.NO_PLOTS = Boolean.parseBoolean((String) value); - else if (argument.equals("recalibration_report")) - RAC.recalibrationReport = (value == null) ? null : new File((String) value); + RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); + + else if (argument.equals("plot_pdf_file")) + RAC.RECAL_PDF_FILE = (value == null) ? null : new File((String) value); else if (argument.equals("binary_tag_name")) RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 0ee32fa2e..8fd792d3b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -53,6 +53,9 @@ import java.util.*; */ @Invariant({"alleles != null"}) public final class GenotypeBuilder { + private static final List HAPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL); + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private String sampleName = null; private List alleles = Collections.emptyList(); @@ -90,6 +93,23 @@ public final class GenotypeBuilder { return new GenotypeBuilder(sampleName, alleles).PL(gls).make(); } + /** + * Create a new Genotype object for a sample that's missing from the VC (i.e., in + * the output header). Defaults to a diploid no call genotype ./. + * + * @param sampleName the name of this sample + * @return an initialized Genotype with sampleName that's a diploid ./. no call genotype + */ + public static Genotype createMissing(final String sampleName, final int ploidy) { + final GenotypeBuilder builder = new GenotypeBuilder(sampleName); + switch ( ploidy ) { + case 1: builder.alleles(HAPLOID_NO_CALL); break; + case 2: builder.alleles(DIPLOID_NO_CALL); break; + default: builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); break; + } + return builder.make(); + } + /** * Create a empty builder. Both a sampleName and alleles must be provided * before trying to make a Genotype from this builder. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7b4256b70..641eb5449 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.EnumMap; +import java.util.List; public class GenotypeLikelihoods { private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; @@ -167,10 +168,36 @@ public class GenotypeLikelihoods { //Return the neg log10 Genotype Quality (GQ) for the given genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype + + /** + * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context. + * Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List) in place of it. + * + * If you **know** you're biallelic, use getGQLog10FromLikelihoods directly. + * @param genotype - actually a genotype type (no call, hom ref, het, hom var) + * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best). + */ + @Deprecated public double getLog10GQ(GenotypeType genotype){ return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); } + @Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"}) + private double getLog10GQ(List genotypeAlleles,List contextAlleles) { + int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0)); + int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1)); + int plIndex = calculatePLindex(allele1Index,allele2Index); + return getGQLog10FromLikelihoods(plIndex,getAsVector()); + } + + public double getLog10GQ(Genotype genotype, List vcAlleles ) { + return getLog10GQ(genotype.getAlleles(),vcAlleles); + } + + public double getLog10GQ(Genotype genotype, VariantContext context) { + return getLog10GQ(genotype,context.getAlleles()); + } + public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ if(likelihoods == null) return Double.NEGATIVE_INFINITY; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index ba8668fa9..f306bac4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import java.util.*; @@ -413,14 +412,26 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + /** + * What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are present + * + * @param defaultPloidy the default ploidy, if all samples are no-called + * @return + */ @Ensures("result >= 0") - public int getMaxPloidy() { + public int getMaxPloidy(final int defaultPloidy) { + if ( defaultPloidy < 0 ) throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0"); + if ( maxPloidy == -1 ) { maxPloidy = 0; // necessary in the case where there are no genotypes for ( final Genotype g : getGenotypes() ) { maxPloidy = Math.max(g.getPloidy(), maxPloidy); } + + // everything is no called so we return the default ploidy + if ( maxPloidy == 0 ) maxPloidy = defaultPloidy; } + return maxPloidy; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index dd16cf7e1..abac84202 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -642,14 +642,15 @@ public class VariantContext implements Feature { // to enable tribble integratio } /** - * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * Returns the maximum ploidy of all samples in this VC, or default if there are no genotypes * * This function is caching, so it's only expensive on the first call * - * @return -1, or the max ploidy + * @param defaultPloidy the default ploidy, if all samples are no-called + * @return default, or the max ploidy */ - public int getMaxPloidy() { - return genotypes.getMaxPloidy(); + public int getMaxPloidy(final int defaultPloidy) { + return genotypes.getMaxPloidy(defaultPloidy); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index d7e4a7135..8abcf115a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -32,8 +32,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -47,7 +47,6 @@ public class VariantContextUtils { public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; - private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); final public static JexlEngine engine = new JexlEngine(); @@ -60,31 +59,6 @@ public class VariantContextUtils { engine.setDebug(false); } - /** - * Ensures that VC contains all of the samples in allSamples by adding missing samples to - * the resulting VC with default diploid ./. genotypes - * - * @param vc the VariantContext - * @param allSamples all of the samples needed - * @return a new VariantContext with missing samples added - */ - public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { - // TODO -- what's the fastest way to do this calculation? - final Set missingSamples = new HashSet(allSamples); - missingSamples.removeAll(vc.getSampleNames()); - - if ( missingSamples.isEmpty() ) - return vc; - else { - //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); - final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); - for ( final String missing : missingSamples ) { - gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); - } - return new VariantContextBuilder(vc).genotypes(gc).make(); - } - } - /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 5b81e7117..61c0129bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -272,11 +272,7 @@ public abstract class BCF2FieldWriter { encodingType = BCF2Type.INT8; buildAlleleMap(vc); - nValuesPerGenotype = vc.getMaxPloidy(); - - // deal with the case where we have no call everywhere, in which case we write out diploid - if ( nValuesPerGenotype == -1 ) - nValuesPerGenotype = 2; + nValuesPerGenotype = vc.getMaxPloidy(2); super.start(encoder, vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index e4c64b26b..536f07f90 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -32,7 +32,10 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion; -import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -345,10 +348,12 @@ class BCF2Writer extends IndexingVariantContextWriter { final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + assert writer != null; + writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) VCFWriter.missingSampleError(vc, header); + if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index db74f2263..f5306b6da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -339,13 +338,13 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { + final int ploidy = vc.getMaxPloidy(2); + for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); - if ( g == null ) { - missingSampleError(vc, mHeader); - } + if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy); final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { @@ -426,13 +425,6 @@ class VCFWriter extends IndexingVariantContextWriter { } } - public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { - final List badSampleNames = new ArrayList(); - for ( final String x : header.getGenotypeSamples() ) - if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); - throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames)); - } - private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 77e29f87b..ffd9c9b4a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -70,6 +70,20 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testComplexSelectionWithNonExistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); + } + @Test public void testNonExistingFieldSelection() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -98,6 +112,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleExclusion--" + testfile, spec); } + @Test + public void testSampleInclusionWithNonexistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, + 1, + UserException.BadInput.class + ); + spec.disableShadowBCF(); + + executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); + } + @Test public void testConcordance() { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 69f42e1f9..4ce32cee7 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; +import java.util.Arrays; import java.util.EnumMap; +import java.util.List; /** @@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest { double [] v = new double[]{-10.5, -1.25, -5.11}; final static String vGLString = "-10.50,-1.25,-5.11"; final static String vPLString = "93,0,39"; + double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC @Test public void testFromVector2() { @@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest { } } + // this test is completely broken, the method is wrong. + public void testGetQualFromLikelihoodsMultiAllelicBroken() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + double actualGQ = gl.getLog10GQ(GenotypeType.HET); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + + public void testGetQualFromLikelihoodsMultiAllelic() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + Allele ref = Allele.create(BaseUtils.A,true); + Allele alt1 = Allele.create(BaseUtils.C); + Allele alt2 = Allele.create(BaseUtils.T); + List allAlleles = Arrays.asList(ref,alt1,alt2); + List gtAlleles = Arrays.asList(alt1,alt2); + GenotypeBuilder gtBuilder = new GenotypeBuilder(); + gtBuilder.alleles(gtAlleles); + double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 26e2dbfbc..6785fa816 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -596,6 +596,51 @@ public class VariantContextTestProvider { return TEST_DATAs; } + public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { + final int nSamples = data.header.getNGenotypeSamples(); + if ( nSamples > 2 ) { + for ( final VariantContext vc : data.vcs ) + if ( vc.isSymbolic() ) + // cannot handle symbolic alleles because they may be weird non-call VCFs + return; + + final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); + tmpFile.deleteOnExit(); + + // write expected to disk + final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); + final VariantContextWriter writer = tester.makeWriter(tmpFile, options); + + final Set samplesInVCF = new HashSet(data.header.getGenotypeSamples()); + final List missingSamples = Arrays.asList("MISSING1", "MISSING2"); + final List allSamples = new ArrayList(missingSamples); + allSamples.addAll(samplesInVCF); + + final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples); + writeVCsToFile(writer, header, data.vcs); + + // ensure writing of expected == actual + final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getSecond(); + + int i = 0; + for ( final VariantContext readVC : actual ) { + if ( readVC == null ) continue; // sometimes we read null records... + final VariantContext expected = data.vcs.get(i++); + for ( final Genotype g : readVC.getGenotypes() ) { + Assert.assertTrue(allSamples.contains(g.getSampleName())); + if ( samplesInVCF.contains(g.getSampleName()) ) { + assertEquals(g, expected.getGenotype(g.getSampleName())); + } else { + // missing + Assert.assertTrue(g.isNoCall()); + } + } + } + + } + } + public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { testReaderWriter(tester, data.header, data.vcs, data.vcs, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 1b791bf6c..adf3eb235 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -82,6 +82,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData); } + @Test(dataProvider = "VariantContextTest_SingleContexts") + public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData); + } + private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() { @@ -110,6 +115,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData); } + @Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts") + public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData); + } + private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() {