diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/LocusConcordanceInfo.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/LocusConcordanceInfo.java new file mode 100644 index 000000000..06b6f11f2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/LocusConcordanceInfo.java @@ -0,0 +1,81 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Jan 27, 2010 + * Time: 5:48:36 PM + * To change this template use File | Settings | File Templates. + */ +class LocusConcordanceInfo { + + public enum ConcordanceType { + TRUTH_SET,VARIANT_SET,BOTH_SETS + } + + private ConcordanceType concordanceType; + private VCFRecord variantVCFRecord; + private VCFRecord truthVCFRecord; + private ReferenceContext reference; + + public LocusConcordanceInfo(ConcordanceType type, VCFRecord truthRecord, VCFRecord variantRecord, ReferenceContext ref) { + concordanceType = type; + variantVCFRecord = variantRecord; + truthVCFRecord = truthRecord; + reference = ref; + } + + public boolean concordanceIsCheckable() { + return concordanceType == ConcordanceType.BOTH_SETS; + } + + public VCFGenotypeRecord getTruthGenotype(String sample) { + return truthVCFRecord.getGenotype(sample); + } + + public VCFGenotypeRecord getVariantGenotype(String sample) { + return variantVCFRecord.getGenotype(sample); + } + + public Set getOverlappingSamples() { + Set variantSamples = new HashSet( Arrays.asList(variantVCFRecord.getSampleNames()) ); + variantSamples.retainAll(Arrays.asList(truthVCFRecord.getSampleNames())); + return variantSamples; + } + + public byte getReferenceBase() { + return (byte) reference.getBase(); + } + + public boolean isTruthOnly () { + return concordanceType == ConcordanceType.TRUTH_SET; + } + + public boolean isVariantSite() { + for ( VCFGenotypeRecord g : truthVCFRecord.getVCFGenotypeRecords() ) { + if ( g.isVariant(reference.getBase()) ) { + return true; + } + } + + return false; + } + + public GenomeLoc getLoc() { + if ( concordanceType == ConcordanceType.TRUTH_SET || concordanceType == ConcordanceType.BOTH_SETS) { + return truthVCFRecord.getLocation(); + } else { + return variantVCFRecord.getLocation(); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/MultiSampleConcordanceSet.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/MultiSampleConcordanceSet.java new file mode 100644 index 000000000..47d04b62e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/MultiSampleConcordanceSet.java @@ -0,0 +1,75 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample; + +import java.util.HashSet; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Jan 27, 2010 + * Time: 5:47:27 PM + * To change this template use File | Settings | File Templates. + */ +class MultiSampleConcordanceSet { + private HashSet concordanceSet; + private Set cachedSampleNames; + private long truthOnlySites; + private long truthOnlyVariantSites; + private long variantOnlySites; + private long overlappingSites; + + public MultiSampleConcordanceSet() { + concordanceSet = new HashSet(); + truthOnlySites = 0l; + truthOnlyVariantSites = 0l; + variantOnlySites = 0l; + overlappingSites = 0l; + } + + public boolean hasBeenInstantiated() { + return cachedSampleNames != null; + } + + public void instantiate(Set samples) { + cachedSampleNames = samples; + for ( String s : samples ) { + concordanceSet.add(new VCFConcordanceCalculator(s)); + } + } + + public void update(LocusConcordanceInfo info) { + if ( info.concordanceIsCheckable() ) { + overlappingSites++; + for ( VCFConcordanceCalculator concordance : concordanceSet ) { + concordance.update(info); + } + } else if ( info.isTruthOnly() ) { + truthOnlySites++; + if ( info.isVariantSite() ) { + truthOnlyVariantSites++; + } + } else { + variantOnlySites++; + } + } + + public Set getConcordanceSet() { + return concordanceSet; + } + + public long numberOfTruthOnlySites() { + return truthOnlySites; + } + + public long numberOfTruthOnlyVariantSites() { + return truthOnlyVariantSites; + } + + public long numberOfVariantOnlySites() { + return variantOnlySites; + } + + public long numberOfOverlappingSites() { + return overlappingSites; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java new file mode 100644 index 000000000..7f132071a --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/MultiSampleConcordanceWalker.java @@ -0,0 +1,105 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Jan 27, 2010 + * Time: 10:40:44 AM + * To change this template use File | Settings | File Templates. + */ +/* + * Calculates per-sample concordance metrics across two multi-sample VCF files; outputs simple counts of concordant + * variant and genotype calls, genotyping errors, and call errors. Requires a VCF binding with the name 'truth' and + * a VCF binding with the name 'variants'. + * @Author: Chris Hartl + */ +@Requires(value= DataSource.REFERENCE,referenceMetaData = {@RMD(name="truth",type= RodVCF.class),@RMD(name="variants",type= RodVCF.class)}) +public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInfo, MultiSampleConcordanceSet > { + + public void initialize() { + + } + + public MultiSampleConcordanceSet reduceInit() { + return new MultiSampleConcordanceSet(); + } + + public LocusConcordanceInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext c) { + if ( tracker == null ) { + return null; + } + ReferenceOrderedDatum truthData = tracker.lookup("truth", null); + ReferenceOrderedDatum variantData = tracker.lookup("variants",null); + LocusConcordanceInfo concordance; + if ( truthData == null && variantData == null) { + concordance = null; + } else if ( truthData == null ) { + // not in the truth set + if ( ( (RodVCF) variantData ).isFiltered() ) { + concordance = null; + } else { + concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref); + } + } else if ( variantData == null ) { + // not in the variant set + if ( ( (RodVCF) truthData).isFiltered() ) { + concordance = null; + } else { + concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref); + } + } else { + // in both + // check for filtering + boolean truth_filter = ((RodVCF) truthData).isFiltered(); + boolean call_filter = ((RodVCF) variantData).isFiltered(); + if ( truth_filter && call_filter ) { + concordance = null; + } else if ( truth_filter ) { + concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref); + } else if ( call_filter ) { + concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref); + } else { + concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.BOTH_SETS,( (RodVCF) truthData).getRecord(),( (RodVCF) variantData).getRecord(),ref); + } + } + + return concordance; + } + + public MultiSampleConcordanceSet reduce(LocusConcordanceInfo info, MultiSampleConcordanceSet concordanceSet) { + if ( info != null ) { + if ( concordanceSet.hasBeenInstantiated() ) { + concordanceSet.update(info); + } else if ( info.concordanceIsCheckable() ) { + concordanceSet.instantiate(info.getOverlappingSamples()); + concordanceSet.update(info); + } else { + concordanceSet.update(info); + } + } + + return concordanceSet; + } + + public void onTraversalDone(MultiSampleConcordanceSet cSet) { + String[] header = {"Sample_ID","Concordant_Refs","Concordant_Vars","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call"}; + out.print(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n",header)); + for ( VCFConcordanceCalculator sample : cSet.getConcordanceSet() ) { + out.print(String.format("%s%n",sample)); + } + logger.info("Overlapping="+cSet.numberOfOverlappingSites()+"\tTruthOnly="+cSet.numberOfTruthOnlySites()+"\tTruthOnlyVariantSites="+ + cSet.numberOfTruthOnlyVariantSites()+"\tVariantOnly="+cSet.numberOfVariantOnlySites()); + } + +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/VCFConcordanceCalculator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/VCFConcordanceCalculator.java new file mode 100644 index 000000000..8433487f2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/multisample/VCFConcordanceCalculator.java @@ -0,0 +1,79 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; + +import java.util.HashSet; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Jan 27, 2010 + * Time: 5:48:08 PM + * To change this template use File | Settings | File Templates. + */ +class VCFConcordanceCalculator { + private String name; + private Set falsePositiveLoci; + private Set falseNegativeLoci; + private Set falseNegativeLociDueToNoCall; + private Set hetsCalledHoms; + private Set homsCalledHets; + private Set concordantCalls; + private Set concordantGenotypeReferenceCalls; + private Set chipNoCalls; + + public VCFConcordanceCalculator(String sampleName) { + name = sampleName; + falseNegativeLoci = new HashSet(); + falseNegativeLociDueToNoCall = new HashSet(); + falsePositiveLoci = new HashSet(); + hetsCalledHoms = new HashSet(); + homsCalledHets = new HashSet(); + concordantCalls = new HashSet(); + concordantGenotypeReferenceCalls = new HashSet(); + chipNoCalls = new HashSet(); + } + + public void update(LocusConcordanceInfo info) { + compareGenotypes(info.getTruthGenotype(name), info.getVariantGenotype(name), info.getLoc(), info.getReferenceBase() ); + } + + public String toString() { + return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,concordantGenotypeReferenceCalls.size(),concordantCalls.size(),homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),falseNegativeLociDueToNoCall.size()); + } + + private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) { + if ( truth.isNoCall() ) { + chipNoCalls.add(loc); + } else if ( truth.isVariant(( char) ref) ) { + if ( call.isNoCall() ) { + falseNegativeLociDueToNoCall.add(loc); + } else if ( ! call.isVariant( (char) ref ) ) { + falseNegativeLoci.add(loc); + } else if ( call.isVariant((char) ref) ) { + // check het vs hom + checkGenotypeCall(truth,call, loc); + } + + } else if ( ! truth.isVariant( (char) ref ) ) { + + if ( call.isVariant((char) ref) ) { + falsePositiveLoci.add(loc); + } else { + concordantGenotypeReferenceCalls.add(loc); + } + } + } + + private void checkGenotypeCall( VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc ) { + if ( truth.isHet() && call.isHom() ) { + hetsCalledHoms.add(loc); + } else if ( truth.isHom() && call.isHet() ) { + homsCalledHets.add(loc); + } else if ( ( truth.isHet() && call.isHet() ) || ( truth.isHom() && call.isHom() ) ) { // be extra careful + concordantCalls.add(loc); + } + } +}