Major (and useful) changes to MultiSampleConcordance:
1) Now cares about Genotype filtering. If it is flagged as filtered, it can count as a FP/FN/TP; but goes into a "non-confident genotype" bin, rather than het/hom.
2) Can give it a Genotype Confidence flag (-GC) which will automatically filter genotypes in the way above for quality > Q for "-GC Q"
3) Can give it an -assumeRef flag. For sites only in the truth VCF (that don't even appear in the variant VCF), that locus will be treated as confident
ref calls for all individuals in the variant VCF; and the calculators updated accordingly.
*** Important: Default behavior is that sites unique to the truth VCF are considered no-call sites for the variant. This flag can help get aroudn that;
however the safest way to run this is to have a variant VCF with calls at each and every locus, if that is possible.
VCFGenotypeRecord -- added an isFiltered() call to automate looking up the FILTERED flag for VCF v3.3
SimpleVCFIntersectWalker - basic outline for a walker I'm working on tonight.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2747 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ea7e737441
commit
236764b249
|
|
@ -11,6 +11,7 @@ import java.util.Set;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class MultiSampleConcordanceSet {
|
||||
private boolean treatTruthOnlyAsFalseNegative;
|
||||
private int minimumDepthForTest;
|
||||
private HashSet<VCFConcordanceCalculator> concordanceSet;
|
||||
private Set<String> cachedSampleNames;
|
||||
|
|
@ -18,14 +19,17 @@ class MultiSampleConcordanceSet {
|
|||
private long truthOnlyVariantSites;
|
||||
private long variantOnlySites;
|
||||
private long overlappingSites;
|
||||
private int genotypeQuality;
|
||||
|
||||
public MultiSampleConcordanceSet(int minDepth) {
|
||||
public MultiSampleConcordanceSet(int minDepth, boolean assumeRef, int genotypeQuality) {
|
||||
concordanceSet = new HashSet<VCFConcordanceCalculator>();
|
||||
truthOnlySites = 0l;
|
||||
truthOnlyVariantSites = 0l;
|
||||
variantOnlySites = 0l;
|
||||
overlappingSites = 0l;
|
||||
minimumDepthForTest = minDepth;
|
||||
treatTruthOnlyAsFalseNegative = assumeRef;
|
||||
this.genotypeQuality = genotypeQuality;
|
||||
}
|
||||
|
||||
public boolean hasBeenInstantiated() {
|
||||
|
|
@ -35,7 +39,7 @@ class MultiSampleConcordanceSet {
|
|||
public void instantiate(Set<String> samples) {
|
||||
cachedSampleNames = samples;
|
||||
for ( String s : samples ) {
|
||||
concordanceSet.add(new VCFConcordanceCalculator(s,minimumDepthForTest));
|
||||
concordanceSet.add(new VCFConcordanceCalculator(s,minimumDepthForTest,genotypeQuality));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -49,6 +53,11 @@ class MultiSampleConcordanceSet {
|
|||
truthOnlySites++;
|
||||
if ( info.isVariantSite() ) {
|
||||
truthOnlyVariantSites++;
|
||||
if ( treatTruthOnlyAsFalseNegative ) {
|
||||
for ( VCFConcordanceCalculator concordance : concordanceSet ) {
|
||||
concordance.updateTruthOnly(info);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
variantOnlySites++;
|
||||
|
|
|
|||
|
|
@ -28,15 +28,20 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|||
public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInfo, MultiSampleConcordanceSet > {
|
||||
@Argument(fullName="noLowDepthLoci", shortName="NLD", doc="Do not use loci in analysis where the variant depth (as specified in the VCF) is less than the given number; "+
|
||||
"DO NOT USE THIS IF YOUR VCF DOES NOT HAVE 'DP' IN THE FORMAT FIELD", required=false) private int minDepth = -1;
|
||||
@Argument(fullName="genotypeConfidence", shortName="GC", doc="The quality score for genotypes below which to count genotyping as a no-call", required=false)
|
||||
int genotypeQuality = 0;
|
||||
@Argument(fullName = "ignoreKnownSites", shortName = "novel", doc="Only run concordance over novel sites (sites marked in the VCF as being in dbSNP or Hapmap 2 or 3)", required=false )
|
||||
boolean ignoreKnownSites = false;
|
||||
@Argument(fullName="missingLocusAsConfidentRef", shortName="assumeRef", doc="Assume a missing locus in the variant VCF is a confident ref call with sufficient depth"+
|
||||
"across all samples. Default: Missing locus = no call", required=false)
|
||||
boolean assumeRef = false;
|
||||
|
||||
public void initialize() {
|
||||
|
||||
}
|
||||
|
||||
public MultiSampleConcordanceSet reduceInit() {
|
||||
return new MultiSampleConcordanceSet(minDepth);
|
||||
return new MultiSampleConcordanceSet(minDepth,assumeRef,genotypeQuality);
|
||||
}
|
||||
|
||||
public LocusConcordanceInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext c) {
|
||||
|
|
@ -97,7 +102,7 @@ public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInf
|
|||
}
|
||||
|
||||
public void onTraversalDone(MultiSampleConcordanceSet cSet) {
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Sample_ID","Ignored_due_to_depth","Concordant_Refs","Concordant_Homs","Concordant_Hets","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call");
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Sample_ID","Ignored_due_to_depth","Concordant_Refs","Concordant_Homs","Concordant_Hets","Correct_But_Low_Genotype_Qual","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call");
|
||||
for ( VCFConcordanceCalculator sample : cSet.getConcordanceSet() ) {
|
||||
out.print(String.format("%s%n",sample));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,10 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval.multisample;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
|
@ -14,40 +16,56 @@ import java.util.Set;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class VCFConcordanceCalculator {
|
||||
|
||||
private int minimumDepthForUpdate;
|
||||
private int minimumGenotypeQuality;
|
||||
private String name;
|
||||
private Set<GenomeLoc> falsePositiveLoci;
|
||||
private Set<GenomeLoc> falseNegativeLoci;
|
||||
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
|
||||
private Set<GenomeLoc> hetsCalledHoms;
|
||||
private Set<GenomeLoc> homsCalledHets;
|
||||
private Set<GenomeLoc> nonConfidentGenotypeCalls;
|
||||
private Set<GenomeLoc> concordantHomCalls;
|
||||
private Set<GenomeLoc> concordantHetCalls;
|
||||
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
|
||||
private Set<GenomeLoc> chipNoCalls;
|
||||
private Set<GenomeLoc> ignoredDueToDepth;
|
||||
|
||||
public VCFConcordanceCalculator(String sampleName, int minimumDepth) {
|
||||
public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) {
|
||||
name = sampleName;
|
||||
falseNegativeLoci = new HashSet<GenomeLoc>();
|
||||
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
|
||||
falsePositiveLoci = new HashSet<GenomeLoc>();
|
||||
hetsCalledHoms = new HashSet<GenomeLoc>();
|
||||
homsCalledHets = new HashSet<GenomeLoc>();
|
||||
nonConfidentGenotypeCalls = new HashSet<GenomeLoc>();
|
||||
concordantHomCalls = new HashSet<GenomeLoc>();
|
||||
concordantHetCalls = new HashSet<GenomeLoc>();
|
||||
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
|
||||
chipNoCalls = new HashSet<GenomeLoc>();
|
||||
ignoredDueToDepth = new HashSet<GenomeLoc>();
|
||||
minimumDepthForUpdate = minimumDepth;
|
||||
minimumGenotypeQuality = minGenQual;
|
||||
}
|
||||
|
||||
public void update(LocusConcordanceInfo info) {
|
||||
compareGenotypes(info.getTruthGenotype(name), info.getVariantGenotype(name), info.getLoc(), info.getReferenceBase() );
|
||||
}
|
||||
|
||||
public void updateTruthOnly(LocusConcordanceInfo info) {
|
||||
if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) {
|
||||
falseNegativeLoci.add(info.getLoc());
|
||||
} else {
|
||||
concordantGenotypeReferenceCalls.add(info.getLoc());
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),falseNegativeLociDueToNoCall.size());
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),
|
||||
concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),nonConfidentGenotypeCalls.size(),
|
||||
homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),
|
||||
falseNegativeLociDueToNoCall.size());
|
||||
}
|
||||
|
||||
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
||||
|
|
@ -78,14 +96,21 @@ class VCFConcordanceCalculator {
|
|||
}
|
||||
|
||||
private void checkGenotypeCall( VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc ) {
|
||||
if ( truth.isHet() && call.isHom() ) {
|
||||
hetsCalledHoms.add(loc);
|
||||
} else if ( truth.isHom() && call.isHet() ) {
|
||||
homsCalledHets.add(loc);
|
||||
} else if ( ( truth.isHet() && call.isHet() ) ) {
|
||||
concordantHetCalls.add(loc);
|
||||
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
|
||||
concordantHomCalls.add(loc);
|
||||
if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) {
|
||||
|
||||
if ( truth.isHet() && call.isHom() ) {
|
||||
hetsCalledHoms.add(loc);
|
||||
} else if ( truth.isHom() && call.isHet() ) {
|
||||
homsCalledHets.add(loc);
|
||||
} else if ( ( truth.isHet() && call.isHet() ) ) {
|
||||
concordantHetCalls.add(loc);
|
||||
} else if ( truth.isHom() && call.isHom() ) { // be extra careful
|
||||
concordantHomCalls.add(loc);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
nonConfidentGenotypeCalls.add(loc);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,42 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
import java.lang.Long;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Jan 29, 2010
|
||||
*/
|
||||
public abstract class SimpleVCFIntersectWalker extends RodWalker<VCFRecordPair,Long>{
|
||||
|
||||
public void initialize() {
|
||||
|
||||
}
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0l;
|
||||
}
|
||||
|
||||
public VCFRecordPair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Long reduce(VCFRecordPair records, long prevReduce) {
|
||||
return 0l;
|
||||
}
|
||||
|
||||
public void onTraversalDone(long finalReduce) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
class VCFRecordPair {
|
||||
public VCFRecord rec1;
|
||||
public VCFRecord rec2;
|
||||
}
|
||||
|
|
@ -193,6 +193,10 @@ public class VCFGenotypeRecord implements Genotype, SampleBacked {
|
|||
return true;
|
||||
}
|
||||
|
||||
public boolean isFiltered() {
|
||||
return ( mFields.get(FILTER_KEY) != null && ! mFields.get(FILTER_KEY).equals("0"));
|
||||
}
|
||||
|
||||
public int getPloidy() {
|
||||
return 2;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue