From 236764b249c5759476a6cb43eda657ba12c6a5c7 Mon Sep 17 00:00:00 2001 From: chartl Date: Sat, 30 Jan 2010 01:18:31 +0000 Subject: [PATCH] Major (and useful) changes to MultiSampleConcordance: 1) Now cares about Genotype filtering. If it is flagged as filtered, it can count as a FP/FN/TP; but goes into a "non-confident genotype" bin, rather than het/hom. 2) Can give it a Genotype Confidence flag (-GC) which will automatically filter genotypes in the way above for quality > Q for "-GC Q" 3) Can give it an -assumeRef flag. For sites only in the truth VCF (that don't even appear in the variant VCF), that locus will be treated as confident ref calls for all individuals in the variant VCF; and the calculators updated accordingly. *** Important: Default behavior is that sites unique to the truth VCF are considered no-call sites for the variant. This flag can help get aroudn that; however the safest way to run this is to have a variant VCF with calls at each and every locus, if that is possible. VCFGenotypeRecord -- added an isFiltered() call to automate looking up the FILTERED flag for VCF v3.3 SimpleVCFIntersectWalker - basic outline for a walker I'm working on tonight. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2747 348d0f76-0448-11de-a6fe-93d51630548a --- .../MultiSampleConcordanceSet.java | 13 +++++- .../MultiSampleConcordanceWalker.java | 9 +++- .../multisample/VCFConcordanceCalculator.java | 45 ++++++++++++++----- .../vcftools/SimpleVCFIntersectWalker.java | 42 +++++++++++++++++ .../utils/genotype/vcf/VCFGenotypeRecord.java | 4 ++ 5 files changed, 99 insertions(+), 14 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/SimpleVCFIntersectWalker.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceSet.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceSet.java index 7bfd72155..6a6ac9032 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceSet.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceSet.java @@ -11,6 +11,7 @@ import java.util.Set; * To change this template use File | Settings | File Templates. */ class MultiSampleConcordanceSet { + private boolean treatTruthOnlyAsFalseNegative; private int minimumDepthForTest; private HashSet concordanceSet; private Set cachedSampleNames; @@ -18,14 +19,17 @@ class MultiSampleConcordanceSet { private long truthOnlyVariantSites; private long variantOnlySites; private long overlappingSites; + private int genotypeQuality; - public MultiSampleConcordanceSet(int minDepth) { + public MultiSampleConcordanceSet(int minDepth, boolean assumeRef, int genotypeQuality) { concordanceSet = new HashSet(); truthOnlySites = 0l; truthOnlyVariantSites = 0l; variantOnlySites = 0l; overlappingSites = 0l; minimumDepthForTest = minDepth; + treatTruthOnlyAsFalseNegative = assumeRef; + this.genotypeQuality = genotypeQuality; } public boolean hasBeenInstantiated() { @@ -35,7 +39,7 @@ class MultiSampleConcordanceSet { public void instantiate(Set samples) { cachedSampleNames = samples; for ( String s : samples ) { - concordanceSet.add(new VCFConcordanceCalculator(s,minimumDepthForTest)); + concordanceSet.add(new VCFConcordanceCalculator(s,minimumDepthForTest,genotypeQuality)); } } @@ -49,6 +53,11 @@ class MultiSampleConcordanceSet { truthOnlySites++; if ( info.isVariantSite() ) { truthOnlyVariantSites++; + if ( treatTruthOnlyAsFalseNegative ) { + for ( VCFConcordanceCalculator concordance : concordanceSet ) { + concordance.updateTruthOnly(info); + } + } } } else { variantOnlySites++; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java index 4e6588805..5922055cb 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java @@ -28,15 +28,20 @@ import org.broadinstitute.sting.utils.cmdLine.Argument; public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInfo, MultiSampleConcordanceSet > { @Argument(fullName="noLowDepthLoci", shortName="NLD", doc="Do not use loci in analysis where the variant depth (as specified in the VCF) is less than the given number; "+ "DO NOT USE THIS IF YOUR VCF DOES NOT HAVE 'DP' IN THE FORMAT FIELD", required=false) private int minDepth = -1; + @Argument(fullName="genotypeConfidence", shortName="GC", doc="The quality score for genotypes below which to count genotyping as a no-call", required=false) + int genotypeQuality = 0; @Argument(fullName = "ignoreKnownSites", shortName = "novel", doc="Only run concordance over novel sites (sites marked in the VCF as being in dbSNP or Hapmap 2 or 3)", required=false ) boolean ignoreKnownSites = false; + @Argument(fullName="missingLocusAsConfidentRef", shortName="assumeRef", doc="Assume a missing locus in the variant VCF is a confident ref call with sufficient depth"+ + "across all samples. Default: Missing locus = no call", required=false) + boolean assumeRef = false; public void initialize() { } public MultiSampleConcordanceSet reduceInit() { - return new MultiSampleConcordanceSet(minDepth); + return new MultiSampleConcordanceSet(minDepth,assumeRef,genotypeQuality); } public LocusConcordanceInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext c) { @@ -97,7 +102,7 @@ public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInf } public void onTraversalDone(MultiSampleConcordanceSet cSet) { - out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Sample_ID","Ignored_due_to_depth","Concordant_Refs","Concordant_Homs","Concordant_Hets","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call"); + out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Sample_ID","Ignored_due_to_depth","Concordant_Refs","Concordant_Homs","Concordant_Hets","Correct_But_Low_Genotype_Qual","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call"); for ( VCFConcordanceCalculator sample : cSet.getConcordanceSet() ) { out.print(String.format("%s%n",sample)); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/VCFConcordanceCalculator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/VCFConcordanceCalculator.java index 5f9d401bb..e5c0ae3a4 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/VCFConcordanceCalculator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/multisample/VCFConcordanceCalculator.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.oneoffprojects.walkers.varianteval.multisample; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -14,40 +16,56 @@ import java.util.Set; * To change this template use File | Settings | File Templates. */ class VCFConcordanceCalculator { + private int minimumDepthForUpdate; + private int minimumGenotypeQuality; private String name; private Set falsePositiveLoci; private Set falseNegativeLoci; private Set falseNegativeLociDueToNoCall; private Set hetsCalledHoms; private Set homsCalledHets; + private Set nonConfidentGenotypeCalls; private Set concordantHomCalls; private Set concordantHetCalls; private Set concordantGenotypeReferenceCalls; private Set chipNoCalls; private Set ignoredDueToDepth; - public VCFConcordanceCalculator(String sampleName, int minimumDepth) { + public VCFConcordanceCalculator(String sampleName, int minimumDepth, int minGenQual) { name = sampleName; falseNegativeLoci = new HashSet(); falseNegativeLociDueToNoCall = new HashSet(); falsePositiveLoci = new HashSet(); hetsCalledHoms = new HashSet(); homsCalledHets = new HashSet(); + nonConfidentGenotypeCalls = new HashSet(); concordantHomCalls = new HashSet(); concordantHetCalls = new HashSet(); concordantGenotypeReferenceCalls = new HashSet(); chipNoCalls = new HashSet(); ignoredDueToDepth = new HashSet(); minimumDepthForUpdate = minimumDepth; + minimumGenotypeQuality = minGenQual; } public void update(LocusConcordanceInfo info) { compareGenotypes(info.getTruthGenotype(name), info.getVariantGenotype(name), info.getLoc(), info.getReferenceBase() ); } + public void updateTruthOnly(LocusConcordanceInfo info) { + if ( info.getTruthGenotype(name).isVariant( (char) info.getReferenceBase() ) ) { + falseNegativeLoci.add(info.getLoc()); + } else { + concordantGenotypeReferenceCalls.add(info.getLoc()); + } + } + public String toString() { - return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(),concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),falseNegativeLociDueToNoCall.size()); + return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,ignoredDueToDepth.size(), + concordantGenotypeReferenceCalls.size(),concordantHomCalls.size(),concordantHetCalls.size(),nonConfidentGenotypeCalls.size(), + homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(), + falseNegativeLociDueToNoCall.size()); } private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) { @@ -78,14 +96,21 @@ class VCFConcordanceCalculator { } private void checkGenotypeCall( VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc ) { - if ( truth.isHet() && call.isHom() ) { - hetsCalledHoms.add(loc); - } else if ( truth.isHom() && call.isHet() ) { - homsCalledHets.add(loc); - } else if ( ( truth.isHet() && call.isHet() ) ) { - concordantHetCalls.add(loc); - } else if ( truth.isHom() && call.isHom() ) { // be extra careful - concordantHomCalls.add(loc); + if ( ! call.isFiltered() && 10*call.getNegLog10PError() > minimumGenotypeQuality) { + + if ( truth.isHet() && call.isHom() ) { + hetsCalledHoms.add(loc); + } else if ( truth.isHom() && call.isHet() ) { + homsCalledHets.add(loc); + } else if ( ( truth.isHet() && call.isHet() ) ) { + concordantHetCalls.add(loc); + } else if ( truth.isHom() && call.isHom() ) { // be extra careful + concordantHomCalls.add(loc); + } + + } else { + + nonConfidentGenotypeCalls.add(loc); } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/SimpleVCFIntersectWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/SimpleVCFIntersectWalker.java new file mode 100644 index 000000000..d398de5eb --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/SimpleVCFIntersectWalker.java @@ -0,0 +1,42 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import java.lang.Long; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Jan 29, 2010 + */ +public abstract class SimpleVCFIntersectWalker extends RodWalker{ + + public void initialize() { + + } + + public Long reduceInit() { + return 0l; + } + + public VCFRecordPair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return null; + } + + public Long reduce(VCFRecordPair records, long prevReduce) { + return 0l; + } + + public void onTraversalDone(long finalReduce) { + return; + } +} + +class VCFRecordPair { + public VCFRecord rec1; + public VCFRecord rec2; +} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 02c55858c..814b02542 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -193,6 +193,10 @@ public class VCFGenotypeRecord implements Genotype, SampleBacked { return true; } + public boolean isFiltered() { + return ( mFields.get(FILTER_KEY) != null && ! mFields.get(FILTER_KEY).equals("0")); + } + public int getPloidy() { return 2; }