From 365f1d2429361ad3a3e5c6148c7569fa5dea8d63 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Sat, 29 Sep 2012 00:55:31 -0400 Subject: [PATCH] hmk123's error on the forum came from the reference context occasionally lacking bases needed for validating the reference bases in the variant context. (no @Window for VariantsToBinaryPed). This bugfix adresses this and other minor items: 1) ValidateVariants removed in favor of direct validation VariantContexts. Integration test added to test broken contexts. 2) Enabling indel and SV output. Still bi-allelic sites only. Integration tests added for these cases. 3) Found a bug where GQ recalculation (if a genotype has PLs but no GQ) would only happen for flipped encoding. Fixed. Integration test added. --- .../variantutils/VariantsToBinaryPed.java | 110 +++++++++++++----- .../VariantsToBinaryPedIntegrationTest.java | 45 +++++++ 2 files changed, 124 insertions(+), 31 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 37fc96681..b7ef85a04 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -30,6 +32,7 @@ import java.util.*; * produces a binary ped file in individual major mode. */ @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=0,stop=100)) public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; - private ValidateVariants vv = new ValidateVariants(); - private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static final byte HOM_REF = 0x0; @@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker { private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples + private static final String PLINK_DELETION_MARKER = "-"; + // note that HET and NO_CALL are flipped from the documentation: that's because // plink actually reads these in backwards; and we want to use a shift operator // to put these in the appropriate location @@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - initializeValidator(); writeBedHeader(); Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers @@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker { } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null || ! tracker.hasValues(variantCollection.variants) || - tracker.getFirstValue(variantCollection.variants).isFiltered() || - ! tracker.getFirstValue(variantCollection.variants).isSNP() || - ! tracker.getFirstValue(variantCollection.variants).isBiallelic()) { + if ( tracker == null ) { + return 0; + } + + VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation()); + if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) { return 0; } try { - vv.map(tracker,ref,context); - } catch (UserException e) { + validateVariantSite(vc,ref,context); + } catch (TribbleException e) { throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ - "Please run ValidateVariants for more detailed information."); + "Please run ValidateVariants for more detailed information. This error is: "+e.getMessage()); } - VariantContext vc = tracker.getFirstValue(variantCollection.variants); String refOut; String altOut; + String vcRef = getReferenceAllele(vc); + String vcAlt = getAlternateAllele(vc); boolean altMajor; if ( majorAlleleFirst ) { // want to use the major allele as ref @@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker { VariantContextUtils.calculateChromosomeCounts(vc,ats,true); } if ( getAF(ats.get("AF")) > 0.5 ) { - refOut = vc.getAlternateAllele(0).getBaseString(); - altOut = vc.getReference().getBaseString(); + refOut = vcAlt; + altOut = vcRef; altMajor = true; } else { - refOut = vc.getReference().getBaseString(); - altOut = vc.getAlternateAllele(0).getBaseString(); + refOut = vcRef; + altOut = vcAlt; altMajor = false; } } else { - refOut = vc.getReference().getBaseString(); - altOut = vc.getAlternateAllele(0).getBaseString(); + refOut = vcRef; + altOut = vcAlt; altMajor = false; } // write an entry into the map file @@ -286,8 +291,8 @@ public class VariantsToBinaryPed extends RodWalker { private byte getStandardEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { - b = NO_CALL; + if ( ! checkGQIsGood(g) ) { + b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_REF; } else if ( g.isHomVar() ) { @@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker { if ( genotype.hasGQ() ) { return genotype.getGQ() >= minGenotypeQuality; } else if ( genotype.hasLikelihoods() ) { - return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()); + return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality; } return false; @@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker { } } - private void initializeValidator() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; - } - private void writeBedHeader() { // write magic bits into the ped file try { @@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker { return metaValues; } + + private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) { + final Allele reportedRefAllele = vc.getReference(); + final int refLength = reportedRefAllele.length(); + if ( refLength > 100 ) { + logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart())); + return; + } + + final byte[] observedRefBases = new byte[refLength]; + System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength); + final Allele observedRefAllele = Allele.create(observedRefBases); + vc.validateReferenceBases(reportedRefAllele, observedRefAllele); + vc.validateAlternateAlleles(); + } + + private String getReferenceAllele(VariantContext vc) { + if ( vc.isSimpleInsertion() ) { + // bi-allelic, so we just have "-" for ped output + return PLINK_DELETION_MARKER; + } + if ( vc.isSymbolic() ) { + // either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1. + return "1"; + } + if ( vc.isSimpleDeletion() ) { + // bi-allelic. Want to take the standard representation and strip off the leading base. + return vc.getReference().getBaseString().substring(1); + } + // snp or mnp + return vc.getReference().getBaseString(); + } + + private String getAlternateAllele(VariantContext vc ) { + if ( vc.isSimpleInsertion() ) { + // bi-allelic. Want to take the standard representation and strip off the leading base. + return vc.getAlternateAllele(0).getBaseString().substring(1); + } + if ( vc.isSymbolic() ) { + // either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2. + return "2"; + } + if ( vc.isSimpleDeletion() ) { + // bi-allelic, so we just have "-" for ped output + return PLINK_DELETION_MARKER; + } + // snp or mnp + return vc.getAlternateAllele(0).getBaseString(); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index a75da6cf9..3e59508bc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + @Test + public void testNA12878HighGQ() { + String testName = "testNA12878HighGQ"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959") + ); + + executeTest(testName, spec); + } + + @Test + public void testVCFMismatchReference() { + String testName = "testVCFMismatchReference"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80), + 3, + UserException.class + ); + + executeTest(testName, spec); + } + + @Test + public void test1000GWithIndels() { + String testName = "test1000GWithIndels"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0), + 3, + Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af") + ); + } + + @Test + public void test1000G_Symbolic() { + String testName = "test1000G_Symbolic"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0), + 3, + Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8") + ); + } + @Test public void testCEUTrio() { String testName = "testCEUTrio"; @@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + }