hmk123's error on the forum came from the reference context occasionally lacking bases needed for validating the reference bases in the variant context. (no @Window for VariantsToBinaryPed). This bugfix adresses this and other minor items:
1) ValidateVariants removed in favor of direct validation VariantContexts. Integration test added to test broken contexts. 2) Enabling indel and SV output. Still bi-allelic sites only. Integration tests added for these cases. 3) Found a bug where GQ recalculation (if a genotype has PLs but no GQ) would only happen for flipped encoding. Fixed. Integration test added.
This commit is contained in:
parent
55cdf4f9b7
commit
365f1d2429
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -30,6 +32,7 @@ import java.util.*;
|
|||
* produces a binary ped file in individual major mode.
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=0,stop=100))
|
||||
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
||||
boolean majorAlleleFirst = false;
|
||||
|
||||
private ValidateVariants vv = new ValidateVariants();
|
||||
|
||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||
|
||||
private static final byte HOM_REF = 0x0;
|
||||
|
|
@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
|
||||
|
||||
private static final String PLINK_DELETION_MARKER = "-";
|
||||
|
||||
// note that HET and NO_CALL are flipped from the documentation: that's because
|
||||
// plink actually reads these in backwards; and we want to use a shift operator
|
||||
// to put these in the appropriate location
|
||||
|
|
@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
private List<String> famOrder = new ArrayList<String>();
|
||||
|
||||
public void initialize() {
|
||||
initializeValidator();
|
||||
writeBedHeader();
|
||||
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
|
||||
// create temporary output streams and buffers
|
||||
|
|
@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
|
||||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
|
||||
if ( tracker == null ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
|
||||
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
|
||||
return 0;
|
||||
}
|
||||
try {
|
||||
vv.map(tracker,ref,context);
|
||||
} catch (UserException e) {
|
||||
validateVariantSite(vc,ref,context);
|
||||
} catch (TribbleException e) {
|
||||
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
||||
"Please run ValidateVariants for more detailed information.");
|
||||
"Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
||||
String refOut;
|
||||
String altOut;
|
||||
String vcRef = getReferenceAllele(vc);
|
||||
String vcAlt = getAlternateAllele(vc);
|
||||
boolean altMajor;
|
||||
if ( majorAlleleFirst ) {
|
||||
// want to use the major allele as ref
|
||||
|
|
@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
|
||||
}
|
||||
if ( getAF(ats.get("AF")) > 0.5 ) {
|
||||
refOut = vc.getAlternateAllele(0).getBaseString();
|
||||
altOut = vc.getReference().getBaseString();
|
||||
refOut = vcAlt;
|
||||
altOut = vcRef;
|
||||
altMajor = true;
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
refOut = vcRef;
|
||||
altOut = vcAlt;
|
||||
altMajor = false;
|
||||
}
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
refOut = vcRef;
|
||||
altOut = vcAlt;
|
||||
altMajor = false;
|
||||
}
|
||||
// write an entry into the map file
|
||||
|
|
@ -286,8 +291,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
private byte getStandardEncoding(Genotype g, int offset) {
|
||||
byte b;
|
||||
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
|
||||
b = NO_CALL;
|
||||
if ( ! checkGQIsGood(g) ) {
|
||||
b = NO_CALL;
|
||||
} else if ( g.isHomRef() ) {
|
||||
b = HOM_REF;
|
||||
} else if ( g.isHomVar() ) {
|
||||
|
|
@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
if ( genotype.hasGQ() ) {
|
||||
return genotype.getGQ() >= minGenotypeQuality;
|
||||
} else if ( genotype.hasLikelihoods() ) {
|
||||
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
|
||||
double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
|
||||
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private void initializeValidator() {
|
||||
vv.variantCollection = variantCollection;
|
||||
vv.dbsnp = dbsnp;
|
||||
vv.DO_NOT_VALIDATE_FILTERED = true;
|
||||
vv.type = ValidateVariants.ValidationType.REF;
|
||||
}
|
||||
|
||||
private void writeBedHeader() {
|
||||
// write magic bits into the ped file
|
||||
try {
|
||||
|
|
@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
return metaValues;
|
||||
}
|
||||
|
||||
private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
|
||||
final Allele reportedRefAllele = vc.getReference();
|
||||
final int refLength = reportedRefAllele.length();
|
||||
if ( refLength > 100 ) {
|
||||
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
|
||||
return;
|
||||
}
|
||||
|
||||
final byte[] observedRefBases = new byte[refLength];
|
||||
System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);
|
||||
final Allele observedRefAllele = Allele.create(observedRefBases);
|
||||
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
|
||||
vc.validateAlternateAlleles();
|
||||
}
|
||||
|
||||
private String getReferenceAllele(VariantContext vc) {
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
// bi-allelic, so we just have "-" for ped output
|
||||
return PLINK_DELETION_MARKER;
|
||||
}
|
||||
if ( vc.isSymbolic() ) {
|
||||
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1.
|
||||
return "1";
|
||||
}
|
||||
if ( vc.isSimpleDeletion() ) {
|
||||
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||
return vc.getReference().getBaseString().substring(1);
|
||||
}
|
||||
// snp or mnp
|
||||
return vc.getReference().getBaseString();
|
||||
}
|
||||
|
||||
private String getAlternateAllele(VariantContext vc ) {
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||
return vc.getAlternateAllele(0).getBaseString().substring(1);
|
||||
}
|
||||
if ( vc.isSymbolic() ) {
|
||||
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2.
|
||||
return "2";
|
||||
}
|
||||
if ( vc.isSimpleDeletion() ) {
|
||||
// bi-allelic, so we just have "-" for ped output
|
||||
return PLINK_DELETION_MARKER;
|
||||
}
|
||||
// snp or mnp
|
||||
return vc.getAlternateAllele(0).getBaseString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878HighGQ() {
|
||||
String testName = "testNA12878HighGQ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||
3,
|
||||
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCFMismatchReference() {
|
||||
String testName = "testVCFMismatchReference";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||
3,
|
||||
UserException.class
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000GWithIndels() {
|
||||
String testName = "test1000GWithIndels";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||
3,
|
||||
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000G_Symbolic() {
|
||||
String testName = "test1000G_Symbolic";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||
3,
|
||||
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCEUTrio() {
|
||||
String testName = "testCEUTrio";
|
||||
|
|
@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue