hmk123's error on the forum came from the reference context occasionally lacking bases needed for validating the reference bases in the variant context. (no @Window for VariantsToBinaryPed). This bugfix adresses this and other minor items:

1) ValidateVariants removed in favor of direct validation VariantContexts. Integration test added to test broken contexts.
 2) Enabling indel and SV output. Still bi-allelic sites only. Integration tests added for these cases.
 3) Found a bug where GQ recalculation (if a genotype has PLs but no GQ) would only happen for flipped encoding. Fixed. Integration test added.
This commit is contained in:
Christopher Hartl 2012-09-29 00:55:31 -04:00
parent 55cdf4f9b7
commit 365f1d2429
2 changed files with 124 additions and 31 deletions

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils; package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
@ -30,6 +32,7 @@ import java.util.*;
* produces a binary ped file in individual major mode. * produces a binary ped file in individual major mode.
*/ */
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100))
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> { public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@ArgumentCollection @ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
boolean majorAlleleFirst = false; boolean majorAlleleFirst = false;
private ValidateVariants vv = new ValidateVariants();
private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
private static final byte HOM_REF = 0x0; private static final byte HOM_REF = 0x0;
@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
private static final String PLINK_DELETION_MARKER = "-";
// note that HET and NO_CALL are flipped from the documentation: that's because // note that HET and NO_CALL are flipped from the documentation: that's because
// plink actually reads these in backwards; and we want to use a shift operator // plink actually reads these in backwards; and we want to use a shift operator
// to put these in the appropriate location // to put these in the appropriate location
@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private List<String> famOrder = new ArrayList<String>(); private List<String> famOrder = new ArrayList<String>();
public void initialize() { public void initialize() {
initializeValidator();
writeBedHeader(); writeBedHeader();
Map<String,Map<String,String>> sampleMetaValues = parseMetaData(); Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
// create temporary output streams and buffers // create temporary output streams and buffers
@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
} }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) || if ( tracker == null ) {
tracker.getFirstValue(variantCollection.variants).isFiltered() || return 0;
! tracker.getFirstValue(variantCollection.variants).isSNP() || }
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
return 0; return 0;
} }
try { try {
vv.map(tracker,ref,context); validateVariantSite(vc,ref,context);
} catch (UserException e) { } catch (TribbleException e) {
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
"Please run ValidateVariants for more detailed information."); "Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
} }
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
String refOut; String refOut;
String altOut; String altOut;
String vcRef = getReferenceAllele(vc);
String vcAlt = getAlternateAllele(vc);
boolean altMajor; boolean altMajor;
if ( majorAlleleFirst ) { if ( majorAlleleFirst ) {
// want to use the major allele as ref // want to use the major allele as ref
@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
VariantContextUtils.calculateChromosomeCounts(vc,ats,true); VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
} }
if ( getAF(ats.get("AF")) > 0.5 ) { if ( getAF(ats.get("AF")) > 0.5 ) {
refOut = vc.getAlternateAllele(0).getBaseString(); refOut = vcAlt;
altOut = vc.getReference().getBaseString(); altOut = vcRef;
altMajor = true; altMajor = true;
} else { } else {
refOut = vc.getReference().getBaseString(); refOut = vcRef;
altOut = vc.getAlternateAllele(0).getBaseString(); altOut = vcAlt;
altMajor = false; altMajor = false;
} }
} else { } else {
refOut = vc.getReference().getBaseString(); refOut = vcRef;
altOut = vc.getAlternateAllele(0).getBaseString(); altOut = vcAlt;
altMajor = false; altMajor = false;
} }
// write an entry into the map file // write an entry into the map file
@ -286,8 +291,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getStandardEncoding(Genotype g, int offset) { private byte getStandardEncoding(Genotype g, int offset) {
byte b; byte b;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { if ( ! checkGQIsGood(g) ) {
b = NO_CALL; b = NO_CALL;
} else if ( g.isHomRef() ) { } else if ( g.isHomRef() ) {
b = HOM_REF; b = HOM_REF;
} else if ( g.isHomVar() ) { } else if ( g.isHomVar() ) {
@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
if ( genotype.hasGQ() ) { if ( genotype.hasGQ() ) {
return genotype.getGQ() >= minGenotypeQuality; return genotype.getGQ() >= minGenotypeQuality;
} else if ( genotype.hasLikelihoods() ) { } else if ( genotype.hasLikelihoods() ) {
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
} }
return false; return false;
@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
} }
} }
private void initializeValidator() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
}
private void writeBedHeader() { private void writeBedHeader() {
// write magic bits into the ped file // write magic bits into the ped file
try { try {
@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return metaValues; return metaValues;
} }
private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
final Allele reportedRefAllele = vc.getReference();
final int refLength = reportedRefAllele.length();
if ( refLength > 100 ) {
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
return;
}
final byte[] observedRefBases = new byte[refLength];
System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);
final Allele observedRefAllele = Allele.create(observedRefBases);
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
vc.validateAlternateAlleles();
}
private String getReferenceAllele(VariantContext vc) {
if ( vc.isSimpleInsertion() ) {
// bi-allelic, so we just have "-" for ped output
return PLINK_DELETION_MARKER;
}
if ( vc.isSymbolic() ) {
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1.
return "1";
}
if ( vc.isSimpleDeletion() ) {
// bi-allelic. Want to take the standard representation and strip off the leading base.
return vc.getReference().getBaseString().substring(1);
}
// snp or mnp
return vc.getReference().getBaseString();
}
private String getAlternateAllele(VariantContext vc ) {
if ( vc.isSimpleInsertion() ) {
// bi-allelic. Want to take the standard representation and strip off the leading base.
return vc.getAlternateAllele(0).getBaseString().substring(1);
}
if ( vc.isSymbolic() ) {
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2.
return "2";
}
if ( vc.isSimpleDeletion() ) {
// bi-allelic, so we just have "-" for ped output
return PLINK_DELETION_MARKER;
}
// snp or mnp
return vc.getAlternateAllele(0).getBaseString();
}
} }

View File

@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec); executeTest(testName, spec);
} }
@Test
public void testNA12878HighGQ() {
String testName = "testNA12878HighGQ";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
);
executeTest(testName, spec);
}
@Test
public void testVCFMismatchReference() {
String testName = "testVCFMismatchReference";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void test1000GWithIndels() {
String testName = "test1000GWithIndels";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
3,
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
);
}
@Test
public void test1000G_Symbolic() {
String testName = "test1000G_Symbolic";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
3,
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
);
}
@Test @Test
public void testCEUTrio() { public void testCEUTrio() {
String testName = "testCEUTrio"; String testName = "testCEUTrio";
@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec); executeTest(testName, spec);
} }
} }