From 03480c955c38366e0bd781687f20e0844e7f2d52 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 19 Mar 2010 04:58:37 +0000 Subject: [PATCH] And now the UnifiedGenotyper can officially annotate genotype (FORMAT) fields too. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3039 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/VariantContextAdaptors.java | 13 +++++----- .../annotator/DepthPerAlleleBySample.java | 2 +- .../DiploidGenotypeCalculationModel.java | 1 + .../genotype/vcf/VCFFormatHeaderLine.java | 15 +++++++---- .../utils/genotype/vcf/VCFGenotypeRecord.java | 8 +++--- .../vcf/VCFGenotypeWriterAdapter.java | 25 +++++++++---------- .../UnifiedGenotyperIntegrationTest.java | 22 ++++++++-------- 7 files changed, 46 insertions(+), 40 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index bc644a72a..d489e3d77 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -220,7 +220,7 @@ public class VariantContextAdaptors { return toVCF(vc, vcfRefBase, null, true); } - public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List vcfGenotypeAttributeKeys, boolean filtersWereAppliedToContext) { + public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List allowedGenotypeAttributeKeys, boolean filtersWereAppliedToContext) { // deal with the reference String referenceBases = new String(vc.getReference().getBases()); @@ -277,11 +277,12 @@ public class VariantContextAdaptors { alleleMap.put(a, encoding); } - if ( vcfGenotypeAttributeKeys == null ) { - vcfGenotypeAttributeKeys = new ArrayList(); - if ( vc.hasGenotypes() ) { - vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY); - vcfGenotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc)); + List vcfGenotypeAttributeKeys = new ArrayList(); + if ( vc.hasGenotypes() ) { + vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY); + for ( String key : calcVCFGenotypeKeys(vc) ) { + if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) ) + vcfGenotypeAttributeKeys.add(key); } } String genotypeFormatString = Utils.join(VCFRecord.GENOTYPE_FIELD_SEPERATOR, vcfGenotypeAttributeKeys); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 057c87142..55939176c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -49,5 +49,5 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, ExperimentalA public String getKeyName() { return "AD"; } - public VCFFormatHeaderLine getDescription() { return new VCFFormatHeaderLine(getKeyName(), 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Depth in genotypes for each ALT allele, in the same order as listed"); } + public VCFFormatHeaderLine getDescription() { return new VCFFormatHeaderLine(getKeyName(), 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Depth in genotypes for each ALT allele, in the same order as listed"); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java index 156f79ab0..daaf26278 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java @@ -119,6 +119,7 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul CalledGenotype cg = new CalledGenotype(sample, myAlleles, AFbasedGenotype.second); cg.setLikelihoods(GLs.get(sample).getLikelihoods()); cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()); + cg.putAttribute(VCFGenotypeRecord.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); double[] posteriors = GLs.get(sample).getPosteriors(); cg.setPosteriors(posteriors); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFFormatHeaderLine.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFFormatHeaderLine.java index adfc85896..577fff133 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFFormatHeaderLine.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFFormatHeaderLine.java @@ -12,15 +12,15 @@ import org.broadinstitute.sting.utils.Utils; */ public class VCFFormatHeaderLine extends VCFHeaderLine { - // the info field types - public enum INFO_TYPE { + // the format field types + public enum FORMAT_TYPE { Integer, Float, String } private String mName; private int mCount; private String mDescription; - private INFO_TYPE mType; + private FORMAT_TYPE mType; /** @@ -31,7 +31,7 @@ public class VCFFormatHeaderLine extends VCFHeaderLine { * @param type the type for this header line * @param description the description for this header line */ - public VCFFormatHeaderLine(String name, int count, INFO_TYPE type, String description) { + public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) { super("FORMAT", ""); mName = name; mCount = count; @@ -52,7 +52,7 @@ public class VCFFormatHeaderLine extends VCFHeaderLine { mName = pieces[0]; mCount = Integer.valueOf(pieces[1]); - mType = INFO_TYPE.valueOf(pieces[2]); + mType = FORMAT_TYPE.valueOf(pieces[2]); mDescription = Utils.trim(pieces[3], '"'); // just in case there were some commas in the description for (int i = 4; i < pieces.length; i++) @@ -63,6 +63,11 @@ public class VCFFormatHeaderLine extends VCFHeaderLine { return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription); } + public String getName() { return mName; } + public int getCount() { return mCount; } + public String getDescription() { return mDescription; } + public FORMAT_TYPE getType() { return mType; } + public boolean equals(Object o) { if ( !(o instanceof VCFFormatHeaderLine) ) return false; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 606866436..1631c26f3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -333,10 +333,10 @@ public class VCFGenotypeRecord { public static Set getSupportedHeaderStrings() { Set result = new HashSet(); - result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype")); - result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Float, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (only filtered reads used for calling)")); - result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.INFO_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype")); + result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality")); + result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); //result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality")); return result; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index cafc561cd..34d2462e9 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -30,11 +30,8 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { // validation stringency private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT; - // standard genotype format strings - private static String[] standardGenotypeFormatStrings = { VCFGenotypeRecord.GENOTYPE_KEY, - VCFGenotypeRecord.DEPTH_KEY, - VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, - VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY }; + // allowed genotype format strings + private List allowedGenotypeFormatStrings; public VCFGenotypeWriterAdapter(File writeTo) { if (writeTo == null) throw new RuntimeException("VCF output file must not be null"); @@ -55,14 +52,21 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { public void writeHeader(Set sampleNames, Set headerInfo) { mSampleNames.addAll(sampleNames); - // setup the header fields + // set up the header fields Set hInfo = new TreeSet(); hInfo.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION)); hInfo.addAll(headerInfo); - // setup the sample names + // set up the sample names mHeader = new VCFHeader(hInfo, mSampleNames); mWriter.writeHeader(mHeader); + + // set up the allowed genotype format fields + allowedGenotypeFormatStrings = new ArrayList(); + for ( VCFHeaderLine field : headerInfo ) { + if ( field instanceof VCFFormatHeaderLine ) + allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName()); + } } /** finish writing, closing any open files. */ @@ -79,12 +83,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added"); - List formatStrings; - if ( vc.getChromosomeCount() > 0 ) - formatStrings = Arrays.asList(standardGenotypeFormatStrings); - else - formatStrings = new ArrayList(); - VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), formatStrings, false); + VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), allowedGenotypeFormatStrings, false); Set altAlleles = vc.getAlternateAlleles(); StringBuffer altAlleleCountString = new StringBuffer(); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index df3257972..9dd6278f7 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("d8af2cb687aa89d21c5492c98f100b5f")); + Arrays.asList("2a31f997f9c07b6259cf3cdf2459c74b")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("724bc2b640e111df82b9ebd261ddb5d9")); + Arrays.asList("2ab8e48cd7a3d0474e187c2af9694628")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("304ec09a459705f5738a9a82b603ae1f")); + Arrays.asList("ed35f375beb0ac849bfe98daffc1cee2")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testParallelization() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1, - Arrays.asList("33e9fe3b8c1ed729c22196d5db3e0d11")); + Arrays.asList("75488092bb229d6345b84d9b793e4e55")); executeTest("test parallelization", spec); } @@ -77,11 +77,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "fb3ffa0f101cf9f8ffc6892b0acab414" ); - e.put( "-all_bases", "3888d0856370f9a5b18c078e2caaec2a" ); - e.put( "--min_base_quality_score 26", "66f729d1948dc057486832731278c226" ); - e.put( "--min_mapping_quality_score 26", "80a7fca199b899a3d0bc1293eb7bf7e5" ); - e.put( "--max_mismatches_in_40bp_window 5", "c6f8846865dcd9021372df917f6c962b" ); + e.put( "-genotype", "02316b8892d439e23d6fbbc65232f921" ); + e.put( "-all_bases", "bcce54904e4c3352168bbfb39f2b9a2f" ); + e.put( "--min_base_quality_score 26", "a8a286e61af8a6b9ba72c2eab19573bb" ); + e.put( "--min_mapping_quality_score 26", "c599b0c3a1e42772e9d1f9ff7112d1e4" ); + e.put( "--max_mismatches_in_40bp_window 5", "46129c47e7d283d8950e029ed39dc1e8" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -95,7 +95,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1, - Arrays.asList("7854c02fcc0c8fcc879f6e35fef2e11f")); + Arrays.asList("64c8fffc735f72663f27b2257fff5583")); executeTest("testConfidence", spec); } @@ -106,7 +106,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testOtherOutput() { - String[] md5s = {"5f3b9abe1b2c30c2ede0007c43e1934c", "8cba0b8752f18fc620b4697840bc7291"}; + String[] md5s = {"e9d7c27ed63d2e92a17ba44501ab1138", "8cba0b8752f18fc620b4697840bc7291"}; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper" + " -R " + oneKGLocation + "reference/human_b36_both.fasta" +