Thanks to Guillermo, I found a bug in the Unified Genotyper output: GL was posteriors instead of likelihoods. Not a huge deal because the

priors were flat, but fixed nonetheless.
Also, needed to update Tribble.
Minor updates to the Beagle input maker.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3461 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-05-28 19:28:26 +00:00
parent 4e268ef6ac
commit ffeb3fd80d
8 changed files with 44 additions and 51 deletions

View File

@ -58,7 +58,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// Don't allow mixed types for now // Don't allow mixed types for now
private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL); private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL);
private String[] ALLOWED_FORMAT_FIELDS = {VCFGenotypeRecord.GENOTYPE_KEY, VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, VCFGenotypeRecord.DEPTH_KEY, VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY }; private String[] ALLOWED_FORMAT_FIELDS = {VCFGenotypeRecord.GENOTYPE_KEY, VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, VCFGenotypeRecord.DEPTH_KEY, VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY };
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) ) if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )

View File

@ -156,13 +156,13 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()); cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup());
cg.putAttribute(VCFGenotypeRecord.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); cg.putAttribute(VCFGenotypeRecord.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
double[] posteriors = GLs.get(sample).getPosteriors(); cg.setPosteriors(GLs.get(sample).getPosteriors());
cg.setPosteriors(posteriors); double[] likelihoods = GLs.get(sample).getLikelihoods();
String GL = String.format("%.2f,%.2f,%.2f", String GL = String.format("%.2f,%.2f,%.2f",
posteriors[refGenotype.ordinal()], likelihoods[refGenotype.ordinal()],
posteriors[hetGenotype.ordinal()], likelihoods[hetGenotype.ordinal()],
posteriors[homGenotype.ordinal()]); likelihoods[homGenotype.ordinal()]);
cg.putAttribute(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY, GL); cg.putAttribute(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY, GL);
calls.put(sample, cg); calls.put(sample, cg);
} }

View File

@ -25,10 +25,9 @@
package org.broadinstitute.sting.oneoffprojects.walkers; package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.vcf.VCFCodec; import org.broad.tribble.vcf.VCFRecord;
import org.broad.tribble.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
@ -37,34 +36,32 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.*; import java.util.*;
/** /**
* Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF fiel * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file
*/ */
@Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFRecord.class))
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> { public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "beagle_file", shortName = "beagle", doc = "File to print BEAGLE-specific data for use with imputation", required = false) @Argument(fullName = "beagle_file", shortName = "beagle", doc = "File to print BEAGLE-specific data for use with imputation", required = true)
public PrintStream beagleWriter = null; public PrintStream beagleWriter = null;
final TreeSet<String> samples = new TreeSet<String>(); final TreeSet<String> samples = new TreeSet<String>();
public void initialize() { public void initialize() {
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources(); final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for ( final ReferenceOrderedDataSource source : dataSources ) { for ( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData(); final RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getType().equals(VCFCodec.class) ) { if ( rod.getType().equals(VCFRecord.class) ) {
final VCFReader reader = new VCFReader(rod.getFile()); final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples(); final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples); samples.addAll(vcfSamples);
@ -72,15 +69,11 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
} }
beagleWriter.print("marker alleleA alleleB");
for ( String sample : samples )
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
if ( beagleWriter != null ) { beagleWriter.println();
beagleWriter.print("marker alleleA alleleB");
for ( String sample : samples )
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
beagleWriter.println();
}
} }
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
@ -91,7 +84,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
try { try {
vc_eval = tracker.getVariantContext(ref,"eval", vc, loc, true); vc_eval = tracker.getVariantContext(ref,"vcf", vc, loc, true);
} catch (java.util.NoSuchElementException e) { } catch (java.util.NoSuchElementException e) {
return 0; return 0;
} }
@ -110,23 +103,23 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
beagleWriter.print(String.format("%s ", bglPrintString)); beagleWriter.print(String.format("%s ", bglPrintString));
} }
Map<String, Genotype> genotypes = vc_eval.getGenotypes(); if ( !vc_eval.hasGenotypes() )
if ( genotypes == null || genotypes.size() == 0 )
return null; return null;
Map<String, Genotype> genotypes = vc_eval.getGenotypes();
for ( String sample : samples ) { for ( String sample : samples ) {
// use sample as key into genotypes structure // use sample as key into genotypes structure
Genotype genotype = genotypes.get(sample); Genotype genotype = genotypes.get(sample);
String gls = "0 0 0 "; if (genotype.isCalled() && genotype.hasAttribute(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY)) {
if (genotype.isCalled()) { String[] glArray = genotype.getAttributeAsString(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY).split(",");
String[] glArray = genotype.getAttributeAsString("GL").split(",");
for (String gl : glArray) { for (String gl : glArray) {
Double d_gl = -Double.valueOf(gl); Double d_gl = Math.pow(10, Double.valueOf(gl));
beagleWriter.print(String.format("%5.2f ",d_gl)); beagleWriter.print(String.format("%5.2f ",d_gl));
} }
} }
else else
beagleWriter.print(gls); // write 0 likelihoods for uncalled genotypes. beagleWriter.print("0 0 0 "); // write 0 likelihoods for uncalled genotypes.
} }

View File

@ -159,9 +159,9 @@ public class TrioGenotyperWalker extends RefWalker<VariantContext, Integer>{
* @return * @return
*/ */
private double genotypeL( List<Allele> alleles, Genotype genotypeCall ) { private double genotypeL( List<Allele> alleles, Genotype genotypeCall ) {
String postTriplet = (String)genotypeCall.getAttribute(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY); String postTriplet = (String)genotypeCall.getAttribute(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
if ( postTriplet == null ) if ( postTriplet == null )
throw new StingException("BUG: TrioGenotyperWalker expected genotype likelihood triplets " + VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY); throw new StingException("BUG: TrioGenotyperWalker expected genotype likelihood triplets " + VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
// calculate the offset -- AA => 0, AB => 1, BB => 2 // calculate the offset -- AA => 0, AB => 1, BB => 2
int i = 0; int i = 0;

View File

@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1Joint() { public void testMultiSamplePilot1Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000", 1, "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("3f919977584b2c15273cb7de948dd1ac")); Arrays.asList("d9d99c7ce4ea63a907183893de1dd905"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec); executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
} }
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2Joint() { public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000", 1, "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000", 1,
Arrays.asList("d45934ae6537a84bb123db2288ad235e")); Arrays.asList("23dfd7747ec6149e59abd753d2a8b00c"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec); executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
} }
@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2Joint() { public void testSingleSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000", 1, "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("96245f0842534527b66a9daad53134bd")); Arrays.asList("5ad0d5e8180f5290f511723392a54d03"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec); executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
} }
@ -63,7 +63,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test @Test
public void testParallelization() { public void testParallelization() {
String md5 = "638911a5d7b155076afc79a3d3f50548"; String md5 = "c7bfe0500e933d1546cb40448d7b4c61";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,075,000", 1, "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,075,000", 1,
@ -85,11 +85,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test @Test
public void testParameter() { public void testParameter() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( "-genotype", "d301500f0cd5ef1929a1adce7e857f9c" ); e.put( "-genotype", "0ac7ab893a3f550cb1b8c34f28baedf6" );
e.put( "-all_bases", "70e94c6671a3e5122cb9ad04c0ad138e" ); e.put( "-all_bases", "40520c3020f6abcb60e8b632d9614554" );
e.put( "--min_base_quality_score 26", "c508728f9d26cc3a8b73f3a5ee4275f2" ); e.put( "--min_base_quality_score 26", "ecc1b0dd618eae9b9f62db2742ac3306" );
e.put( "--min_mapping_quality_score 26", "34634ea8f196eaf40dac6e069ebcf09a" ); e.put( "--min_mapping_quality_score 26", "75bd53d5070f1146350d633321a165e3" );
e.put( "--max_mismatches_in_40bp_window 5", "92903b97b43145c24f19a899ce127aa3" ); e.put( "--max_mismatches_in_40bp_window 5", "8e1236b7f0f6c19a0be808b2d44e3255" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -103,12 +103,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() { public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("cdcc3977ce78e15d7c8153eaf5f9a2df")); Arrays.asList("ff8a1008ddfd342a20b3e032bade61bc"));
executeTest("testConfidence1", spec1); executeTest("testConfidence1", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("2413aafdcfd3d8ac1169662f72524035")); Arrays.asList("d469b4367188dd2b9be37ab8effc65c4"));
executeTest("testConfidence2", spec2); executeTest("testConfidence2", spec2);
} }

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="org.broad" module="tribble" revision="86" status="integration" publication="20100526124200" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broad" module="tribble" revision="87" status="integration" publication="20100528124200" />
</ivy-module>