From 5d0f284305ec4274ec255474a1cd536574fc1a39 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 21 Sep 2011 20:26:28 -0400 Subject: [PATCH 1/3] Fixing exome specific arguments to the VQSR in the methods development calling pipeline --- .../qscripts/MethodsDevelopmentCallingPipeline.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 29ee2769b..d187c1e1a 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -266,13 +266,18 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") - if(t.nSamples >= 10) { + if(t.nSamples >= 10) { // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate this.use_annotation ++= List("InbreedingCoeff") } if(!t.isExome) { this.use_annotation ++= List("DP") - } else { + } else { // exome specific parameters + this.resource :+= new TaggedFile( badSites_1000G, "bad=true,prior=2.0" ) this.mG = 6 + if(t.nSamples <= 3) { // very few exome samples means very few variants + this.mG = 4 + this.percentBad = 0.04 + } } this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile } this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } From 8f8b59a932923d29fdeb91a7ef90d1eef24ca1fd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 21 Sep 2011 22:23:28 -0400 Subject: [PATCH 2/3] My interpretation of the VCF spec is that the FORMAT field should only be present if there is genotype/sample data. So the VCFCodec now throws an exception when it encounters such a case. I had to fix one of the integration test VCFs. --- .../utils/codecs/vcf/AbstractVCFCodec.java | 38 ++++++++++--------- .../sting/utils/codecs/vcf/VCFHeader.java | 17 +++------ .../sting/utils/exceptions/UserException.java | 6 +++ 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 83c7083d0..43b07476d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -115,15 +115,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } arrayIndex++; } + + boolean sawFormatTag = false; if ( arrayIndex < strings.length ) { if ( !strings[arrayIndex].equals("FORMAT") ) throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); + sawFormatTag = true; arrayIndex++; } - while (arrayIndex < strings.length) + while ( arrayIndex < strings.length ) auxTags.add(strings[arrayIndex++]); + if ( sawFormatTag && auxTags.size() == 0 ) + throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data"); + } else { if ( str.startsWith("##INFO=") ) { VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); @@ -200,28 +206,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @return a VariantContext */ public Feature decode(String line) { - return reallyDecode(line); - } + // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line + if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - private Feature reallyDecode(String line) { - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; + // our header cannot be null, we need the genotype sample names and counts + if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); + if (parts == null) + parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; - if (parts == null) - parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; + int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) + if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || + (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) + throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + + " tokens, and saw " + nParts + " )", lineNo); - // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) - if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || - (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + - " tokens, and saw " + nParts + " )", lineNo); - - return parseVCFLine(parts); + return parseVCFLine(parts); } protected void generateException(String message) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index fd1c74993..66e11bc1e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -35,9 +35,6 @@ public class VCFHeader { // the header string indicator public static final String HEADER_INDICATOR = "#"; - /** do we have genotying data? */ - private boolean hasGenotypingData = false; - // were the input samples sorted originally (or are we sorting them)? private boolean samplesWereAlreadySorted = true; @@ -57,17 +54,15 @@ public class VCFHeader { * create a VCF header, given a list of meta data and auxillary tags * * @param metaData the meta data associated with this header - * @param genotypeSampleNames the genotype format field, and the sample names + * @param genotypeSampleNames the sample names */ public VCFHeader(Set metaData, Set genotypeSampleNames) { mMetaData = new TreeSet(); if ( metaData != null ) mMetaData.addAll(metaData); - for (String col : genotypeSampleNames) { - if (!col.equals("FORMAT")) - mGenotypeSampleNames.add(col); - } - if (genotypeSampleNames.size() > 0) hasGenotypingData = true; + + mGenotypeSampleNames.addAll(genotypeSampleNames); + loadVCFVersion(); loadMetaDataMaps(); @@ -157,7 +152,7 @@ public class VCFHeader { * @return true if we have genotyping columns, false otherwise */ public boolean hasGenotypingData() { - return hasGenotypingData; + return mGenotypeSampleNames.size() > 0; } /** @@ -171,7 +166,7 @@ public class VCFHeader { /** @return the column count */ public int getColumnCount() { - return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); + return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 274c64f42..70f7387f4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -174,6 +174,12 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedVCFHeader extends UserException { + public MalformedVCFHeader(String message) { + super(String.format("The provided VCF file has a malformed header: %s", message)); + } + } + public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); From b8ea9ceb68f3ded503bac08cd95e3a15237d5930 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 21 Sep 2011 22:43:31 -0400 Subject: [PATCH 3/3] Adding integration test that uses the -V:dbsnp binding to make sure it won't fail later on if someone messes with Tribble. --- .../SelectVariantsIntegrationTest.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 20409d4ca..e4ded491b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), 1, Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") ); @@ -30,7 +30,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile, + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, 1, Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd") ); @@ -43,7 +43,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testfile = validationDataLocation + "test.dup.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile), + baseTestString(" -sn A -sn B -sn C --variant " + testfile), 1, Arrays.asList("b74038779fe6485dbb8734ae48178356") ); @@ -56,7 +56,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant:VCF " + b37hapmapGenotypes + " -disc:VCF " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s -NO_HEADER", 1, Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e") ); @@ -69,7 +69,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc:VCF " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER", 1, Arrays.asList("d2ba3ea30a810f6f0fbfb1b643292b6a") ); @@ -90,16 +90,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testVariantTypeSelection--" + testFile, spec); } - @Test(enabled=false) - public void testRemovePLs() { + @Test + public void testUsingDbsnpName() { String testFile = validationDataLocation + "combine.3.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s -NO_HEADER", 1, - Arrays.asList("") + Arrays.asList("167a1265df820978a74c267df44d5c43") ); - executeTest("testWithPLs--" + testFile, spec); + executeTest("testUsingDbsnpName--" + testFile, spec); } }