And now the UnifiedGenotyper can officially annotate genotype (FORMAT) fields too.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3039 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e757f6f078
commit
03480c955c
|
|
@@ -220,7 +220,7 @@ public class VariantContextAdaptors {
|
|||
return toVCF(vc, vcfRefBase, null, true);
|
||||
}
|
||||
|
||||
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List<String> vcfGenotypeAttributeKeys, boolean filtersWereAppliedToContext) {
|
||||
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List<String> allowedGenotypeAttributeKeys, boolean filtersWereAppliedToContext) {
|
||||
// deal with the reference
|
||||
String referenceBases = new String(vc.getReference().getBases());
|
||||
|
||||
|
|
@@ -277,11 +277,12 @@ public class VariantContextAdaptors {
|
|||
alleleMap.put(a, encoding);
|
||||
}
|
||||
|
||||
if ( vcfGenotypeAttributeKeys == null ) {
|
||||
vcfGenotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
vcfGenotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
|
||||
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
for ( String key : calcVCFGenotypeKeys(vc) ) {
|
||||
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
|
||||
vcfGenotypeAttributeKeys.add(key);
|
||||
}
|
||||
}
|
||||
String genotypeFormatString = Utils.join(VCFRecord.GENOTYPE_FIELD_SEPERATOR, vcfGenotypeAttributeKeys);
|
||||
|
|
|
|||
|
|
@@ -49,5 +49,5 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, ExperimentalA
|
|||
|
||||
public String getKeyName() { return "AD"; }
|
||||
|
||||
public VCFFormatHeaderLine getDescription() { return new VCFFormatHeaderLine(getKeyName(), 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Depth in genotypes for each ALT allele, in the same order as listed"); }
|
||||
public VCFFormatHeaderLine getDescription() { return new VCFFormatHeaderLine(getKeyName(), 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Depth in genotypes for each ALT allele, in the same order as listed"); }
|
||||
}
|
||||
|
|
@@ -119,6 +119,7 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
|
|||
CalledGenotype cg = new CalledGenotype(sample, myAlleles, AFbasedGenotype.second);
|
||||
cg.setLikelihoods(GLs.get(sample).getLikelihoods());
|
||||
cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup());
|
||||
cg.putAttribute(VCFGenotypeRecord.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
|
||||
|
||||
double[] posteriors = GLs.get(sample).getPosteriors();
|
||||
cg.setPosteriors(posteriors);
|
||||
|
|
|
|||
|
|
@@ -12,15 +12,15 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
*/
|
||||
public class VCFFormatHeaderLine extends VCFHeaderLine {
|
||||
|
||||
// the info field types
|
||||
public enum INFO_TYPE {
|
||||
// the format field types
|
||||
public enum FORMAT_TYPE {
|
||||
Integer, Float, String
|
||||
}
|
||||
|
||||
private String mName;
|
||||
private int mCount;
|
||||
private String mDescription;
|
||||
private INFO_TYPE mType;
|
||||
private FORMAT_TYPE mType;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@@ -31,7 +31,7 @@ public class VCFFormatHeaderLine extends VCFHeaderLine {
|
|||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
*/
|
||||
public VCFFormatHeaderLine(String name, int count, INFO_TYPE type, String description) {
|
||||
public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) {
|
||||
super("FORMAT", "");
|
||||
mName = name;
|
||||
mCount = count;
|
||||
|
|
@@ -52,7 +52,7 @@ public class VCFFormatHeaderLine extends VCFHeaderLine {
|
|||
|
||||
mName = pieces[0];
|
||||
mCount = Integer.valueOf(pieces[1]);
|
||||
mType = INFO_TYPE.valueOf(pieces[2]);
|
||||
mType = FORMAT_TYPE.valueOf(pieces[2]);
|
||||
mDescription = Utils.trim(pieces[3], '"');
|
||||
// just in case there were some commas in the description
|
||||
for (int i = 4; i < pieces.length; i++)
|
||||
|
|
@@ -63,6 +63,11 @@ public class VCFFormatHeaderLine extends VCFHeaderLine {
|
|||
return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
|
||||
}
|
||||
|
||||
public String getName() { return mName; }
|
||||
public int getCount() { return mCount; }
|
||||
public String getDescription() { return mDescription; }
|
||||
public FORMAT_TYPE getType() { return mType; }
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFFormatHeaderLine) )
|
||||
return false;
|
||||
|
|
|
|||
|
|
@@ -333,10 +333,10 @@ public class VCFGenotypeRecord {
|
|||
|
||||
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
|
||||
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.INFO_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -30,11 +30,8 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
// validation stringency
|
||||
private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT;
|
||||
|
||||
// standard genotype format strings
|
||||
private static String[] standardGenotypeFormatStrings = { VCFGenotypeRecord.GENOTYPE_KEY,
|
||||
VCFGenotypeRecord.DEPTH_KEY,
|
||||
VCFGenotypeRecord.GENOTYPE_QUALITY_KEY,
|
||||
VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY };
|
||||
// allowed genotype format strings
|
||||
private List<String> allowedGenotypeFormatStrings;
|
||||
|
||||
public VCFGenotypeWriterAdapter(File writeTo) {
|
||||
if (writeTo == null) throw new RuntimeException("VCF output file must not be null");
|
||||
|
|
@@ -55,14 +52,21 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
public void writeHeader(Set<String> sampleNames, Set<VCFHeaderLine> headerInfo) {
|
||||
mSampleNames.addAll(sampleNames);
|
||||
|
||||
// setup the header fields
|
||||
// set up the header fields
|
||||
Set<VCFHeaderLine> hInfo = new TreeSet<VCFHeaderLine>();
|
||||
hInfo.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION));
|
||||
hInfo.addAll(headerInfo);
|
||||
|
||||
// setup the sample names
|
||||
// set up the sample names
|
||||
mHeader = new VCFHeader(hInfo, mSampleNames);
|
||||
mWriter.writeHeader(mHeader);
|
||||
|
||||
// set up the allowed genotype format fields
|
||||
allowedGenotypeFormatStrings = new ArrayList<String>();
|
||||
for ( VCFHeaderLine field : headerInfo ) {
|
||||
if ( field instanceof VCFFormatHeaderLine )
|
||||
allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName());
|
||||
}
|
||||
}
|
||||
|
||||
/** finish writing, closing any open files. */
|
||||
|
|
@@ -79,12 +83,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added");
|
||||
|
||||
List<String> formatStrings;
|
||||
if ( vc.getChromosomeCount() > 0 )
|
||||
formatStrings = Arrays.asList(standardGenotypeFormatStrings);
|
||||
else
|
||||
formatStrings = new ArrayList<String>();
|
||||
VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), formatStrings, false);
|
||||
VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), allowedGenotypeFormatStrings, false);
|
||||
|
||||
Set<Allele> altAlleles = vc.getAlternateAlleles();
|
||||
StringBuffer altAlleleCountString = new StringBuffer();
|
||||
|
|
|
|||
|
|
@@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("d8af2cb687aa89d21c5492c98f100b5f"));
|
||||
Arrays.asList("2a31f997f9c07b6259cf3cdf2459c74b"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("724bc2b640e111df82b9ebd261ddb5d9"));
|
||||
Arrays.asList("2ab8e48cd7a3d0474e187c2af9694628"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("304ec09a459705f5738a9a82b603ae1f"));
|
||||
Arrays.asList("ed35f375beb0ac849bfe98daffc1cee2"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testParallelization() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
|
||||
Arrays.asList("33e9fe3b8c1ed729c22196d5db3e0d11"));
|
||||
Arrays.asList("75488092bb229d6345b84d9b793e4e55"));
|
||||
executeTest("test parallelization", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -77,11 +77,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-genotype", "fb3ffa0f101cf9f8ffc6892b0acab414" );
|
||||
e.put( "-all_bases", "3888d0856370f9a5b18c078e2caaec2a" );
|
||||
e.put( "--min_base_quality_score 26", "66f729d1948dc057486832731278c226" );
|
||||
e.put( "--min_mapping_quality_score 26", "80a7fca199b899a3d0bc1293eb7bf7e5" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "c6f8846865dcd9021372df917f6c962b" );
|
||||
e.put( "-genotype", "02316b8892d439e23d6fbbc65232f921" );
|
||||
e.put( "-all_bases", "bcce54904e4c3352168bbfb39f2b9a2f" );
|
||||
e.put( "--min_base_quality_score 26", "a8a286e61af8a6b9ba72c2eab19573bb" );
|
||||
e.put( "--min_mapping_quality_score 26", "c599b0c3a1e42772e9d1f9ff7112d1e4" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "46129c47e7d283d8950e029ed39dc1e8" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@@ -95,7 +95,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
|
||||
Arrays.asList("7854c02fcc0c8fcc879f6e35fef2e11f"));
|
||||
Arrays.asList("64c8fffc735f72663f27b2257fff5583"));
|
||||
executeTest("testConfidence", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -106,7 +106,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testOtherOutput() {
|
||||
String[] md5s = {"5f3b9abe1b2c30c2ede0007c43e1934c", "8cba0b8752f18fc620b4697840bc7291"};
|
||||
String[] md5s = {"e9d7c27ed63d2e92a17ba44501ab1138", "8cba0b8752f18fc620b4697840bc7291"};
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper" +
|
||||
" -R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
|
|
|
|||
Loading…
Reference in New Issue