Merge branch 'master' of ssh://gsa2/humgen/gsa-scr1/chartl/dev/git

This commit is contained in:
Christopher Hartl 2011-09-22 11:01:58 -04:00
commit 4f4a0fc38a
5 changed files with 49 additions and 41 deletions

View File

@ -115,15 +115,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
}
arrayIndex++;
}
boolean sawFormatTag = false;
if ( arrayIndex < strings.length ) {
if ( !strings[arrayIndex].equals("FORMAT") )
throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
sawFormatTag = true;
arrayIndex++;
}
while (arrayIndex < strings.length)
while ( arrayIndex < strings.length )
auxTags.add(strings[arrayIndex++]);
if ( sawFormatTag && auxTags.size() == 0 )
throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
} else {
if ( str.startsWith("##INFO=") ) {
VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
@ -200,28 +206,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @return a VariantContext
*/
public Feature decode(String line) {
return reallyDecode(line);
}
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
private Feature reallyDecode(String line) {
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
if (parts == null)
parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
if (parts == null)
parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
// if we don't have a header, or we have a header with no genotyping data, check that we have eight columns. Otherwise check that we have nine (normal columns + genotyping data)
if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
(header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
" tokens, and saw " + nParts + " )", lineNo);
// if we don't have a header, or we have a header with no genotyping data, check that we have eight columns. Otherwise check that we have nine (normal columns + genotyping data)
if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
(header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
" tokens, and saw " + nParts + " )", lineNo);
return parseVCFLine(parts);
return parseVCFLine(parts);
}
protected void generateException(String message) {

View File

@ -35,9 +35,6 @@ public class VCFHeader {
// the header string indicator
public static final String HEADER_INDICATOR = "#";
/** do we have genotyping data? */
private boolean hasGenotypingData = false;
// were the input samples sorted originally (or are we sorting them)?
private boolean samplesWereAlreadySorted = true;
@ -57,17 +54,15 @@ public class VCFHeader {
* create a VCF header, given a list of meta data and auxiliary tags
*
* @param metaData the meta data associated with this header
* @param genotypeSampleNames the genotype format field, and the sample names
* @param genotypeSampleNames the sample names
*/
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<VCFHeaderLine>();
if ( metaData != null )
mMetaData.addAll(metaData);
for (String col : genotypeSampleNames) {
if (!col.equals("FORMAT"))
mGenotypeSampleNames.add(col);
}
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
mGenotypeSampleNames.addAll(genotypeSampleNames);
loadVCFVersion();
loadMetaDataMaps();
@ -157,7 +152,7 @@ public class VCFHeader {
* @return true if we have genotyping columns, false otherwise
*/
public boolean hasGenotypingData() {
return hasGenotypingData;
return mGenotypeSampleNames.size() > 0;
}
/**
@ -171,7 +166,7 @@ public class VCFHeader {
/** @return the column count */
public int getColumnCount() {
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
}
/**

View File

@ -174,6 +174,12 @@ public class UserException extends ReviewedStingException {
}
}
public static class MalformedVCFHeader extends UserException {
public MalformedVCFHeader(String message) {
super(String.format("The provided VCF file has a malformed header: %s", message));
}
}
public static class ReadMissingReadGroup extends MalformedBAM {
public ReadMissingReadGroup(SAMRecord read) {
super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));

View File

@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile),
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile),
1,
Arrays.asList("d18516c1963802e92cb9e425c0b75fd6")
);
@ -30,7 +30,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile,
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile,
1,
Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd")
);
@ -43,7 +43,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testfile = validationDataLocation + "test.dup.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile),
baseTestString(" -sn A -sn B -sn C --variant " + testfile),
1,
Arrays.asList("b74038779fe6485dbb8734ae48178356")
);
@ -56,7 +56,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = validationDataLocation + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant:VCF " + b37hapmapGenotypes + " -disc:VCF " + testFile + " -o %s -NO_HEADER",
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e")
);
@ -69,7 +69,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = validationDataLocation + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc:VCF " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER",
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("d2ba3ea30a810f6f0fbfb1b643292b6a")
);
@ -90,16 +90,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testVariantTypeSelection--" + testFile, spec);
}
@Test(enabled=false)
public void testRemovePLs() {
@Test
public void testUsingDbsnpName() {
String testFile = validationDataLocation + "combine.3.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s -NO_HEADER",
"-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("")
Arrays.asList("167a1265df820978a74c267df44d5c43")
);
executeTest("testWithPLs--" + testFile, spec);
executeTest("testUsingDbsnpName--" + testFile, spec);
}
}

View File

@ -266,13 +266,18 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
if(t.nSamples >= 10) {
if(t.nSamples >= 10) { // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate
this.use_annotation ++= List("InbreedingCoeff")
}
if(!t.isExome) {
this.use_annotation ++= List("DP")
} else {
} else { // exome specific parameters
this.resource :+= new TaggedFile( badSites_1000G, "bad=true,prior=2.0" )
this.mG = 6
if(t.nSamples <= 3) { // very few exome samples means very few variants
this.mG = 4
this.percentBad = 0.04
}
}
this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile }
this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile }