From 91bdf069d3d6d67f03bedd757e4b9b80a75b296d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 13 Jan 2014 22:13:53 -0500 Subject: [PATCH 1/3] Some updates to CRCV. 1. Throw a user error when the input data for a given genotype does not contain PLs. 2. Add VCF header line for --dbsnp input 3. Need to check that the UG result is not null 4. Don't error out at positions with no gVCFs (which is possible when using a dbSNP rod) --- .../variantutils/CombineReferenceCalculationVariants.java | 7 +++++-- .../sting/utils/variant/GATKVariantContextUtils.java | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java index 2a004aaca..66bf562d7 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java @@ -153,9 +153,12 @@ public class CombineReferenceCalculationVariants extends RodWalker vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); - final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + if ( dbsnp != null && dbsnp.dbsnp.isBound() ) + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); + + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); vcfWriter.writeHeader(vcfHeader); @@ -204,7 +207,7 @@ public class CombineReferenceCalculationVariants extends RodWalker VCs, final GenomeLoc loc, final Byte refBase) { - - if ( VCs == null || VCs.size() == 0 ) throw new IllegalArgumentException("VCs cannot be null or empty"); + // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs + if ( VCs == null || VCs.size() == 0 ) + return null; // establish the baseline info (sometimes from the first VC) final VariantContext first = VCs.get(0); @@ -1536,6 +1537,8 @@ public class GATKVariantContextUtils { final List remappedAlleles, final List targetAlleles) { for ( final Genotype g : VC.getGenotypes() ) { + if ( !g.hasPL() ) + throw new UserException("cannot merge genotypes from samples without PLs; sample " + g.getSampleName() + " does not have likelihoods at position " + VC.getChr() + ":" + VC.getStart()); // only add if the name is new final String name = g.getSampleName(); From 9cac24d1e69dac7fe713505c1279280026e2bcb0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 4 Feb 2014 00:24:44 -0500 Subject: [PATCH 2/3] Moving logging status of VCF indexing to DEBUG instead of INFO, otherwise it's painful when reading in lots of files --- .../sting/commandline/ArgumentTypeDescriptor.java | 2 +- .../sting/gatk/refdata/tracks/RMDTrackBuilder.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 14b5118ad..8f0abe360 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -374,7 +374,7 @@ public abstract class ArgumentTypeDescriptor { FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); if ( featureDescriptor != null ) { tribbleType = featureDescriptor.getName(); - logger.info("Dynamically determined type of " + file + " to be " + tribbleType); + logger.debug("Dynamically determined type of " + file + " to be " + tribbleType); } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 4c50cfaae..3433b5342 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -169,7 +169,7 @@ public class RMDTrackBuilder { // extends PluginManager { */ private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { // we might not know the index type, try loading with the default reader constructor - logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); + logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); try { return new Pair(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)),null); } catch (TribbleException e) { @@ -318,7 +318,7 @@ public class RMDTrackBuilder { // extends PluginManager { * @return an Index, or null if we're unable to load */ protected Index loadFromDisk( final File inputFile, final File indexFile ) { - logger.info("Loading Tribble index from disk for file " + inputFile); + logger.debug("Loading Tribble index from disk for file " + inputFile); Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); // check if the file is up-to date (filestamp and version check) @@ -384,7 +384,7 @@ public class RMDTrackBuilder { // extends PluginManager { */ protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { // this can take a while, let them know what we're doing - logger.info("Creating Tribble index in memory for file " + inputFile); + logger.debug("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); return idx; From 740b33acbbd9b5e57a92454e4e4a575709d0af82 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 4 Feb 2014 01:21:17 -0500 Subject: [PATCH 3/3] We were never validating the sequence dictionary of tabix indexed VCFs for some reason. Fixed. These changes happened in Tribble, but Joel clobbered them with his commit. We can now change the logging priority on failures to validate the sequence dictionary to WARN. Thanks to Tim F for indirectly pointing this out. --- .../sting/gatk/refdata/tracks/IndexDictionaryUtils.java | 2 +- .../sting/gatk/refdata/tracks/RMDTrackBuilder.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java index e0b5dd4cb..fbbaa6636 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java @@ -96,7 +96,7 @@ public class IndexDictionaryUtils { final ValidationExclusion.TYPE validationExclusionType ) { // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation if (trackDict == null || trackDict.size() == 0) - logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); else { Set trackSequences = new TreeSet(); for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 3433b5342..a587a3984 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -34,6 +34,7 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; +import org.broad.tribble.util.TabixUtils; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -171,7 +172,9 @@ public class RMDTrackBuilder { // extends PluginManager { // we might not know the index type, try loading with the default reader constructor logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); try { - return new Pair(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)),null); + final File indexFile = new File(inputFile.getAbsoluteFile() + TabixUtils.STANDARD_INDEX_EXTENSION); + final SAMSequenceDictionary dict = TabixUtils.getSequenceDictionary(indexFile); + return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)), dict); } catch (TribbleException e) { throw new UserException(e.getMessage(), e); }