Added a multisample concordance walker which takes the place of the VCF python library I've been using. Takes a truth VCF and a variant VCF and outputs A TSV that looks like this:

Sample_ID       Concordant_Refs Concordant_Vars Homs_called_het Het_called_homs False_Positives False_Negatives_Due_To_Ref_Call False_Negatives_Due_To_No_Call
NA19381 491     294     2       0       0       0       1
NA19451 489     298     1       0       0       0       0
NA19463 486     289     2       3       1       4       3
NA19376 488     296     1       0       2       0       1
NA19317 489     284     5       3       3       3       1


This walker will be merged with GenotypeConcordance once it's clear how to do so. 



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2715 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-01-27 22:59:17 +00:00
parent eccf40b17d
commit 1b9184a1c7
4 changed files with 340 additions and 0 deletions

View File

@ -0,0 +1,81 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Jan 27, 2010
* Time: 5:48:36 PM
* To change this template use File | Settings | File Templates.
*/
class LocusConcordanceInfo {
public enum ConcordanceType {
TRUTH_SET,VARIANT_SET,BOTH_SETS
}
private ConcordanceType concordanceType;
private VCFRecord variantVCFRecord;
private VCFRecord truthVCFRecord;
private ReferenceContext reference;
public LocusConcordanceInfo(ConcordanceType type, VCFRecord truthRecord, VCFRecord variantRecord, ReferenceContext ref) {
concordanceType = type;
variantVCFRecord = variantRecord;
truthVCFRecord = truthRecord;
reference = ref;
}
public boolean concordanceIsCheckable() {
return concordanceType == ConcordanceType.BOTH_SETS;
}
public VCFGenotypeRecord getTruthGenotype(String sample) {
return truthVCFRecord.getGenotype(sample);
}
public VCFGenotypeRecord getVariantGenotype(String sample) {
return variantVCFRecord.getGenotype(sample);
}
public Set<String> getOverlappingSamples() {
Set<String> variantSamples = new HashSet<String>( Arrays.asList(variantVCFRecord.getSampleNames()) );
variantSamples.retainAll(Arrays.asList(truthVCFRecord.getSampleNames()));
return variantSamples;
}
public byte getReferenceBase() {
return (byte) reference.getBase();
}
public boolean isTruthOnly () {
return concordanceType == ConcordanceType.TRUTH_SET;
}
public boolean isVariantSite() {
for ( VCFGenotypeRecord g : truthVCFRecord.getVCFGenotypeRecords() ) {
if ( g.isVariant(reference.getBase()) ) {
return true;
}
}
return false;
}
public GenomeLoc getLoc() {
if ( concordanceType == ConcordanceType.TRUTH_SET || concordanceType == ConcordanceType.BOTH_SETS) {
return truthVCFRecord.getLocation();
} else {
return variantVCFRecord.getLocation();
}
}
}

View File

@ -0,0 +1,75 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
import java.util.HashSet;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Jan 27, 2010
* Time: 5:47:27 PM
* To change this template use File | Settings | File Templates.
*/
class MultiSampleConcordanceSet {
private HashSet<VCFConcordanceCalculator> concordanceSet;
private Set<String> cachedSampleNames;
private long truthOnlySites;
private long truthOnlyVariantSites;
private long variantOnlySites;
private long overlappingSites;
public MultiSampleConcordanceSet() {
concordanceSet = new HashSet<VCFConcordanceCalculator>();
truthOnlySites = 0l;
truthOnlyVariantSites = 0l;
variantOnlySites = 0l;
overlappingSites = 0l;
}
public boolean hasBeenInstantiated() {
return cachedSampleNames != null;
}
public void instantiate(Set<String> samples) {
cachedSampleNames = samples;
for ( String s : samples ) {
concordanceSet.add(new VCFConcordanceCalculator(s));
}
}
public void update(LocusConcordanceInfo info) {
if ( info.concordanceIsCheckable() ) {
overlappingSites++;
for ( VCFConcordanceCalculator concordance : concordanceSet ) {
concordance.update(info);
}
} else if ( info.isTruthOnly() ) {
truthOnlySites++;
if ( info.isVariantSite() ) {
truthOnlyVariantSites++;
}
} else {
variantOnlySites++;
}
}
public Set<VCFConcordanceCalculator> getConcordanceSet() {
return concordanceSet;
}
public long numberOfTruthOnlySites() {
return truthOnlySites;
}
public long numberOfTruthOnlyVariantSites() {
return truthOnlyVariantSites;
}
public long numberOfVariantOnlySites() {
return variantOnlySites;
}
public long numberOfOverlappingSites() {
return overlappingSites;
}
}

View File

@ -0,0 +1,105 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Jan 27, 2010
* Time: 10:40:44 AM
* To change this template use File | Settings | File Templates.
*/
/*
* Calculates per-sample concordance metrics across two multi-sample VCF files; outputs simple counts of concordant
* variant and genotype calls, genotyping errors, and call errors. Requires a VCF binding with the name 'truth' and
* a VCF binding with the name 'variants'.
* @Author: Chris Hartl
*/
@Requires(value= DataSource.REFERENCE,referenceMetaData = {@RMD(name="truth",type= RodVCF.class),@RMD(name="variants",type= RodVCF.class)})
public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInfo, MultiSampleConcordanceSet > {
public void initialize() {
}
public MultiSampleConcordanceSet reduceInit() {
return new MultiSampleConcordanceSet();
}
public LocusConcordanceInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext c) {
if ( tracker == null ) {
return null;
}
ReferenceOrderedDatum truthData = tracker.lookup("truth", null);
ReferenceOrderedDatum variantData = tracker.lookup("variants",null);
LocusConcordanceInfo concordance;
if ( truthData == null && variantData == null) {
concordance = null;
} else if ( truthData == null ) {
// not in the truth set
if ( ( (RodVCF) variantData ).isFiltered() ) {
concordance = null;
} else {
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
}
} else if ( variantData == null ) {
// not in the variant set
if ( ( (RodVCF) truthData).isFiltered() ) {
concordance = null;
} else {
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
}
} else {
// in both
// check for filtering
boolean truth_filter = ((RodVCF) truthData).isFiltered();
boolean call_filter = ((RodVCF) variantData).isFiltered();
if ( truth_filter && call_filter ) {
concordance = null;
} else if ( truth_filter ) {
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
} else if ( call_filter ) {
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
} else {
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.BOTH_SETS,( (RodVCF) truthData).getRecord(),( (RodVCF) variantData).getRecord(),ref);
}
}
return concordance;
}
public MultiSampleConcordanceSet reduce(LocusConcordanceInfo info, MultiSampleConcordanceSet concordanceSet) {
if ( info != null ) {
if ( concordanceSet.hasBeenInstantiated() ) {
concordanceSet.update(info);
} else if ( info.concordanceIsCheckable() ) {
concordanceSet.instantiate(info.getOverlappingSamples());
concordanceSet.update(info);
} else {
concordanceSet.update(info);
}
}
return concordanceSet;
}
public void onTraversalDone(MultiSampleConcordanceSet cSet) {
String[] header = {"Sample_ID","Concordant_Refs","Concordant_Vars","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call"};
out.print(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n",header));
for ( VCFConcordanceCalculator sample : cSet.getConcordanceSet() ) {
out.print(String.format("%s%n",sample));
}
logger.info("Overlapping="+cSet.numberOfOverlappingSites()+"\tTruthOnly="+cSet.numberOfTruthOnlySites()+"\tTruthOnlyVariantSites="+
cSet.numberOfTruthOnlyVariantSites()+"\tVariantOnly="+cSet.numberOfVariantOnlySites());
}
}

View File

@ -0,0 +1,79 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import java.util.HashSet;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Jan 27, 2010
* Time: 5:48:08 PM
* To change this template use File | Settings | File Templates.
*/
class VCFConcordanceCalculator {
private String name;
private Set<GenomeLoc> falsePositiveLoci;
private Set<GenomeLoc> falseNegativeLoci;
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
private Set<GenomeLoc> hetsCalledHoms;
private Set<GenomeLoc> homsCalledHets;
private Set<GenomeLoc> concordantCalls;
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
private Set<GenomeLoc> chipNoCalls;
public VCFConcordanceCalculator(String sampleName) {
name = sampleName;
falseNegativeLoci = new HashSet<GenomeLoc>();
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
falsePositiveLoci = new HashSet<GenomeLoc>();
hetsCalledHoms = new HashSet<GenomeLoc>();
homsCalledHets = new HashSet<GenomeLoc>();
concordantCalls = new HashSet<GenomeLoc>();
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
chipNoCalls = new HashSet<GenomeLoc>();
}
public void update(LocusConcordanceInfo info) {
compareGenotypes(info.getTruthGenotype(name), info.getVariantGenotype(name), info.getLoc(), info.getReferenceBase() );
}
public String toString() {
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,concordantGenotypeReferenceCalls.size(),concordantCalls.size(),homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),falseNegativeLociDueToNoCall.size());
}
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
if ( truth.isNoCall() ) {
chipNoCalls.add(loc);
} else if ( truth.isVariant(( char) ref) ) {
if ( call.isNoCall() ) {
falseNegativeLociDueToNoCall.add(loc);
} else if ( ! call.isVariant( (char) ref ) ) {
falseNegativeLoci.add(loc);
} else if ( call.isVariant((char) ref) ) {
// check het vs hom
checkGenotypeCall(truth,call, loc);
}
} else if ( ! truth.isVariant( (char) ref ) ) {
if ( call.isVariant((char) ref) ) {
falsePositiveLoci.add(loc);
} else {
concordantGenotypeReferenceCalls.add(loc);
}
}
}
private void checkGenotypeCall( VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc ) {
if ( truth.isHet() && call.isHom() ) {
hetsCalledHoms.add(loc);
} else if ( truth.isHom() && call.isHet() ) {
homsCalledHets.add(loc);
} else if ( ( truth.isHet() && call.isHet() ) || ( truth.isHom() && call.isHom() ) ) { // be extra careful
concordantCalls.add(loc);
}
}
}