From 5a20bf0e6466e3e2a34e41f4f7bb95ef1bde9206 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 9 Mar 2010 17:16:47 +0000 Subject: [PATCH] 3 changes to UG which break integration tests: 1. emit AA,AB,BB likelihoods in the FORMAT field for Mark 2. remove constraint that genotype alleles (in the GT field) need to be lexigraphically sorted. 3. Add bam file(s) used by genotyper to header for Kiran git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2963 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/VariantContextAdaptors.java | 13 ++-------- .../DiploidGenotypeCalculationModel.java | 16 +++++++++---- .../walkers/genotyper/UnifiedGenotyper.java | 6 ++++- .../sting/utils/genotype/CalledGenotype.java | 4 +++- .../utils/genotype/vcf/VCFGenotypeRecord.java | 4 ++++ .../vcf/VCFGenotypeWriterAdapter.java | 5 ++-- .../UnifiedGenotyperIntegrationTest.java | 24 +++++++++---------- 7 files changed, 41 insertions(+), 31 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index f015ab441..2879cd847 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -162,8 +162,7 @@ public class VariantContextAdaptors { alleles.add(refAllele); for ( String alt : vcf.getAlternateAlleleList() ) { if ( ! Allele.acceptableAlleleBases(alt) ) { - // todo -- cleanup - System.out.printf("Excluding vcf record %s%n", vcf); + //System.out.printf("Excluding vcf record %s%n", vcf); return null; } alleles.add(new Allele(alt, false)); @@ -288,15 +287,7 @@ public class VariantContextAdaptors { for ( Genotype g : vc.getGenotypesSortedByName() ) { List encodings = new ArrayList(g.getPloidy()); - // TODO -- REMOVE ME ONCE INTEGRATION TESTS PASS!!! - ArrayList temporaryList = new ArrayList(g.getAlleles()); - Collections.sort(temporaryList, new Comparator() { - public int compare(Allele a1, Allele a2) { - return a1.toString().charAt(0) - a2.toString().charAt(0); - } - }); - for ( Allele a : temporaryList ) { - //for ( Allele a : g.getAlleles() ) { + for ( Allele a : g.getAlleles() ) { encodings.add(alleleMap.get(a.toString())); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java index dda4f8c95..e57eebe1d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.genotype.CalledGenotype; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; @@ -93,6 +94,9 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt); Allele refAllele = new Allele(Character.toString(ref), true); Allele altAllele = new Allele(Character.toString(alt), false); + DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); + DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt); + DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt); for ( String sample : GLs.keySet() ) { @@ -113,17 +117,21 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul CalledGenotype cg = new CalledGenotype(sample, myAlleles, AFbasedGenotype.second); cg.setLikelihoods(GLs.get(sample).getLikelihoods()); - cg.setPosteriors(GLs.get(sample).getPosteriors()); cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()); + double[] posteriors = GLs.get(sample).getPosteriors(); + cg.setPosteriors(posteriors); + String GL = String.format("%.2f,%.2f,%.2f", + posteriors[refGenotype.ordinal()], + posteriors[hetGenotype.ordinal()], + posteriors[homGenotype.ordinal()]); + cg.putAttribute(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY, GL); + calls.put(sample, cg); } // output to beagle file if requested if ( beagleWriter != null ) { - DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); - DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt); - DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt); for ( String sample : samples ) { GenotypeLikelihoods gl = GLs.get(sample); if ( gl == null ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index d8662b6ac..b02b1649a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.*; import java.util.*; import java.io.PrintStream; +import java.io.File; /** @@ -139,7 +140,7 @@ public class UnifiedGenotyper extends LocusWalker commandLineArgs = CommandLineUtils.getApproximateCommandLineArguments(args); for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) headerInfo.add(new VCFHeaderLine(String.format("UG_%s", commandLineArg.getKey()), commandLineArg.getValue())); + // also, the list of input bams + for ( File file : getToolkit().getArguments().samFiles ) + headerInfo.add(new VCFHeaderLine("UG_bam_file_used", file.getName())); return headerInfo; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/CalledGenotype.java b/java/src/org/broadinstitute/sting/utils/genotype/CalledGenotype.java index 809caee1c..b36abc6fb 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/CalledGenotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/CalledGenotype.java @@ -6,16 +6,18 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.*; /** - * This class emcompasses all the basic information about a genotype. It is immutable. + * This class emcompasses all the basic information about a called genotype. * * @author ebanks */ public class CalledGenotype extends MutableGenotype { + // key names for standard genotype attributes public static final String LIKELIHOODS_ATTRIBUTE_KEY = "Likelihoods"; public static final String POSTERIORS_ATTRIBUTE_KEY = "Posteriors"; public static final String READBACKEDPILEUP_ATTRIBUTE_KEY = "ReadBackedPileup"; + public CalledGenotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 1975f1732..2a9050f4d 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -19,6 +19,7 @@ public class VCFGenotypeRecord { public static final String DEPTH_KEY = "DP"; public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; public static final String GENOTYPE_FILTER_KEY = "FT"; + public static final String GENOTYPE_POSTERIORS_TRIPLET_KEY = "GL"; public static final String OLD_DEPTH_KEY = "RD"; // the values for empty fields @@ -313,6 +314,8 @@ public class VCFGenotypeRecord { result = String.valueOf(MISSING_DEPTH); else if ( field.equals(GENOTYPE_FILTER_KEY) ) result = UNFILTERED; + else if ( field.equals(GENOTYPE_POSTERIORS_TRIPLET_KEY) ) + result = "0,0,0"; // TODO -- support haplotype quality //else if ( field.equals(HAPLOTYPE_QUALITY_KEY) ) // result = String.valueOf(MISSING_HAPLOTYPE_QUALITY); @@ -324,6 +327,7 @@ public class VCFGenotypeRecord { result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype")); result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Float, "Genotype Quality")); result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.INFO_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); //result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality")); return result; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 0c3654da5..cafc561cd 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -11,7 +11,7 @@ import java.util.*; /** - * @author aaron + * @author aaron, ebanks *

* Class VCFGenotypeWriterAdapter *

@@ -33,7 +33,8 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { // standard genotype format strings private static String[] standardGenotypeFormatStrings = { VCFGenotypeRecord.GENOTYPE_KEY, VCFGenotypeRecord.DEPTH_KEY, - VCFGenotypeRecord.GENOTYPE_QUALITY_KEY }; + VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, + VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY }; public VCFGenotypeWriterAdapter(File writeTo) { if (writeTo == null) throw new RuntimeException("VCF output file must not be null"); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 15cf965f9..926917f69 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testPooled1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1, - Arrays.asList("5a86c53e8f0897a71ff74662c5696dc4")); + Arrays.asList("c30af5d192661abd77b05a316f1d8923")); executeTest("testPooled1", spec); } @@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("2f928b1261963044ca1781601cae4bf7")); + Arrays.asList("882b2fae1cd1ba65cac3cadacec0ce2b")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("10fe265d0140243b52f500c3882230f2")); + Arrays.asList("aa0cff414e6623c36465726a987a645d")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("5c455e1a33e3f82d13898b20ee71ac69")); + Arrays.asList("53df224164083cc7d8ad85f3d16ba38f")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testParallelization() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1, - Arrays.asList("c827c74e59263b0f6b526089b050c100")); + Arrays.asList("3ade750c0d261594ea549db7b127a1e3")); executeTest("test parallelization", spec); } @@ -77,11 +77,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "8818584bf7053ecf52844f3b404ab630" ); - e.put( "-all_bases", "bf6ea1c04bf52002e3365c1f47103d4a" ); - e.put( "--min_base_quality_score 26", "3f192cd301f057698d0bc6c41841ce81" ); - e.put( "--min_mapping_quality_score 26", "2631025243ac6b52df852309235ec8d3" ); - e.put( "--max_mismatches_in_40bp_window 5", "e540a2057164e3b05a5d635805f1167e" ); + e.put( "-genotype", "bee9fa71d70fdde094ab30785d4fa84e" ); + e.put( "-all_bases", "410cff9d97cd017becd1f6260c7abeeb" ); + e.put( "--min_base_quality_score 26", "85e1c35d3926afc68761aefea3f41332" ); + e.put( "--min_mapping_quality_score 26", "1c49a7d5e6ad295c0450b8a35053050f" ); + e.put( "--max_mismatches_in_40bp_window 5", "7e7db5a0d859704e12a4b89d35065682" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -95,7 +95,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1, - Arrays.asList("bfa93ee89aa0807b9c4e4793363452b4")); + Arrays.asList("c67dd3e97cb188b117074d2c4692fcfa")); executeTest("testConfidence", spec); } @@ -106,7 +106,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testOtherOutput() { - String[] md5s = {"8c7dd53a402b727753002ebcd76168ac", "8cba0b8752f18fc620b4697840bc7291"}; + String[] md5s = {"ce0024816a092af9f998a7561ffb4fb2", "8cba0b8752f18fc620b4697840bc7291"}; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper" + " -R " + oneKGLocation + "reference/human_b36_both.fasta" +