hmk123's error on the forum came from the reference context occasionally lacking bases needed for validating the reference bases in the variant context. (no @Window for VariantsToBinaryPed). This bugfix adresses this and other minor items:
1) ValidateVariants removed in favor of direct validation VariantContexts. Integration test added to test broken contexts. 2) Enabling indel and SV output. Still bi-allelic sites only. Integration tests added for these cases. 3) Found a bug where GQ recalculation (if a genotype has PLs but no GQ) would only happen for flipped encoding. Fixed. Integration test added.
This commit is contained in:
parent
55cdf4f9b7
commit
365f1d2429
|
|
@ -1,5 +1,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broad.tribble.TribbleException;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
|
||||||
import org.broadinstitute.sting.commandline.*;
|
import org.broadinstitute.sting.commandline.*;
|
||||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||||
|
|
@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -30,6 +32,7 @@ import java.util.*;
|
||||||
* produces a binary ped file in individual major mode.
|
* produces a binary ped file in individual major mode.
|
||||||
*/
|
*/
|
||||||
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
|
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
|
||||||
|
@Reference(window=@Window(start=0,stop=100))
|
||||||
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
@ArgumentCollection
|
@ArgumentCollection
|
||||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||||
|
|
@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
||||||
boolean majorAlleleFirst = false;
|
boolean majorAlleleFirst = false;
|
||||||
|
|
||||||
private ValidateVariants vv = new ValidateVariants();
|
|
||||||
|
|
||||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||||
|
|
||||||
private static final byte HOM_REF = 0x0;
|
private static final byte HOM_REF = 0x0;
|
||||||
|
|
@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
|
|
||||||
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
|
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
|
||||||
|
|
||||||
|
private static final String PLINK_DELETION_MARKER = "-";
|
||||||
|
|
||||||
// note that HET and NO_CALL are flipped from the documentation: that's because
|
// note that HET and NO_CALL are flipped from the documentation: that's because
|
||||||
// plink actually reads these in backwards; and we want to use a shift operator
|
// plink actually reads these in backwards; and we want to use a shift operator
|
||||||
// to put these in the appropriate location
|
// to put these in the appropriate location
|
||||||
|
|
@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
private List<String> famOrder = new ArrayList<String>();
|
private List<String> famOrder = new ArrayList<String>();
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
initializeValidator();
|
|
||||||
writeBedHeader();
|
writeBedHeader();
|
||||||
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
|
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
|
||||||
// create temporary output streams and buffers
|
// create temporary output streams and buffers
|
||||||
|
|
@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
|
if ( tracker == null ) {
|
||||||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
|
return 0;
|
||||||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
|
}
|
||||||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
|
|
||||||
|
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
|
||||||
|
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
vv.map(tracker,ref,context);
|
validateVariantSite(vc,ref,context);
|
||||||
} catch (UserException e) {
|
} catch (TribbleException e) {
|
||||||
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
||||||
"Please run ValidateVariants for more detailed information.");
|
"Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
|
||||||
String refOut;
|
String refOut;
|
||||||
String altOut;
|
String altOut;
|
||||||
|
String vcRef = getReferenceAllele(vc);
|
||||||
|
String vcAlt = getAlternateAllele(vc);
|
||||||
boolean altMajor;
|
boolean altMajor;
|
||||||
if ( majorAlleleFirst ) {
|
if ( majorAlleleFirst ) {
|
||||||
// want to use the major allele as ref
|
// want to use the major allele as ref
|
||||||
|
|
@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
|
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
|
||||||
}
|
}
|
||||||
if ( getAF(ats.get("AF")) > 0.5 ) {
|
if ( getAF(ats.get("AF")) > 0.5 ) {
|
||||||
refOut = vc.getAlternateAllele(0).getBaseString();
|
refOut = vcAlt;
|
||||||
altOut = vc.getReference().getBaseString();
|
altOut = vcRef;
|
||||||
altMajor = true;
|
altMajor = true;
|
||||||
} else {
|
} else {
|
||||||
refOut = vc.getReference().getBaseString();
|
refOut = vcRef;
|
||||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
altOut = vcAlt;
|
||||||
altMajor = false;
|
altMajor = false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
refOut = vc.getReference().getBaseString();
|
refOut = vcRef;
|
||||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
altOut = vcAlt;
|
||||||
altMajor = false;
|
altMajor = false;
|
||||||
}
|
}
|
||||||
// write an entry into the map file
|
// write an entry into the map file
|
||||||
|
|
@ -286,7 +291,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
|
|
||||||
private byte getStandardEncoding(Genotype g, int offset) {
|
private byte getStandardEncoding(Genotype g, int offset) {
|
||||||
byte b;
|
byte b;
|
||||||
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
|
if ( ! checkGQIsGood(g) ) {
|
||||||
b = NO_CALL;
|
b = NO_CALL;
|
||||||
} else if ( g.isHomRef() ) {
|
} else if ( g.isHomRef() ) {
|
||||||
b = HOM_REF;
|
b = HOM_REF;
|
||||||
|
|
@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
if ( genotype.hasGQ() ) {
|
if ( genotype.hasGQ() ) {
|
||||||
return genotype.getGQ() >= minGenotypeQuality;
|
return genotype.getGQ() >= minGenotypeQuality;
|
||||||
} else if ( genotype.hasLikelihoods() ) {
|
} else if ( genotype.hasLikelihoods() ) {
|
||||||
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
|
double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
|
||||||
|
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initializeValidator() {
|
|
||||||
vv.variantCollection = variantCollection;
|
|
||||||
vv.dbsnp = dbsnp;
|
|
||||||
vv.DO_NOT_VALIDATE_FILTERED = true;
|
|
||||||
vv.type = ValidateVariants.ValidationType.REF;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writeBedHeader() {
|
private void writeBedHeader() {
|
||||||
// write magic bits into the ped file
|
// write magic bits into the ped file
|
||||||
try {
|
try {
|
||||||
|
|
@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||||
|
|
||||||
return metaValues;
|
return metaValues;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
final Allele reportedRefAllele = vc.getReference();
|
||||||
|
final int refLength = reportedRefAllele.length();
|
||||||
|
if ( refLength > 100 ) {
|
||||||
|
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final byte[] observedRefBases = new byte[refLength];
|
||||||
|
System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);
|
||||||
|
final Allele observedRefAllele = Allele.create(observedRefBases);
|
||||||
|
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
|
||||||
|
vc.validateAlternateAlleles();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getReferenceAllele(VariantContext vc) {
|
||||||
|
if ( vc.isSimpleInsertion() ) {
|
||||||
|
// bi-allelic, so we just have "-" for ped output
|
||||||
|
return PLINK_DELETION_MARKER;
|
||||||
|
}
|
||||||
|
if ( vc.isSymbolic() ) {
|
||||||
|
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1.
|
||||||
|
return "1";
|
||||||
|
}
|
||||||
|
if ( vc.isSimpleDeletion() ) {
|
||||||
|
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||||
|
return vc.getReference().getBaseString().substring(1);
|
||||||
|
}
|
||||||
|
// snp or mnp
|
||||||
|
return vc.getReference().getBaseString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getAlternateAllele(VariantContext vc ) {
|
||||||
|
if ( vc.isSimpleInsertion() ) {
|
||||||
|
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||||
|
return vc.getAlternateAllele(0).getBaseString().substring(1);
|
||||||
|
}
|
||||||
|
if ( vc.isSymbolic() ) {
|
||||||
|
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2.
|
||||||
|
return "2";
|
||||||
|
}
|
||||||
|
if ( vc.isSimpleDeletion() ) {
|
||||||
|
// bi-allelic, so we just have "-" for ped output
|
||||||
|
return PLINK_DELETION_MARKER;
|
||||||
|
}
|
||||||
|
// snp or mnp
|
||||||
|
return vc.getAlternateAllele(0).getBaseString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
||||||
executeTest(testName, spec);
|
executeTest(testName, spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNA12878HighGQ() {
|
||||||
|
String testName = "testNA12878HighGQ";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||||
|
3,
|
||||||
|
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest(testName, spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testVCFMismatchReference() {
|
||||||
|
String testName = "testVCFMismatchReference";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||||
|
3,
|
||||||
|
UserException.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest(testName, spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test1000GWithIndels() {
|
||||||
|
String testName = "test1000GWithIndels";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||||
|
3,
|
||||||
|
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test1000G_Symbolic() {
|
||||||
|
String testName = "test1000G_Symbolic";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||||
|
3,
|
||||||
|
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCEUTrio() {
|
public void testCEUTrio() {
|
||||||
String testName = "testCEUTrio";
|
String testName = "testCEUTrio";
|
||||||
|
|
@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
||||||
executeTest(testName, spec);
|
executeTest(testName, spec);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue