Added a multisample concordance walker which takes the place of the VCF Python library I've been using. Takes a truth VCF and a variant VCF and outputs a TSV that looks like this:

Sample_ID Concordant_Refs Concordant_Vars Homs_called_het Het_called_homs False_Positives False_Negatives_Due_To_Ref_Call False_Negatives_Due_To_No_Call
NA19381 491 294 2 0 0 0 1
NA19451 489 298 1 0 0 0 0
NA19463 486 289 2 3 1 4 3
NA19376 488 296 1 0 2 0 1
NA19317 489 284 5 3 3 3 1

This walker will be merged with GenotypeConcordance once it's clear how to do so.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2715 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
eccf40b17d
commit
1b9184a1c7
|
|
@ -0,0 +1,81 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: Jan 27, 2010
|
||||||
|
* Time: 5:48:36 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
class LocusConcordanceInfo {
|
||||||
|
|
||||||
|
public enum ConcordanceType {
|
||||||
|
TRUTH_SET,VARIANT_SET,BOTH_SETS
|
||||||
|
}
|
||||||
|
|
||||||
|
private ConcordanceType concordanceType;
|
||||||
|
private VCFRecord variantVCFRecord;
|
||||||
|
private VCFRecord truthVCFRecord;
|
||||||
|
private ReferenceContext reference;
|
||||||
|
|
||||||
|
public LocusConcordanceInfo(ConcordanceType type, VCFRecord truthRecord, VCFRecord variantRecord, ReferenceContext ref) {
|
||||||
|
concordanceType = type;
|
||||||
|
variantVCFRecord = variantRecord;
|
||||||
|
truthVCFRecord = truthRecord;
|
||||||
|
reference = ref;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean concordanceIsCheckable() {
|
||||||
|
return concordanceType == ConcordanceType.BOTH_SETS;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFGenotypeRecord getTruthGenotype(String sample) {
|
||||||
|
return truthVCFRecord.getGenotype(sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFGenotypeRecord getVariantGenotype(String sample) {
|
||||||
|
return variantVCFRecord.getGenotype(sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getOverlappingSamples() {
|
||||||
|
Set<String> variantSamples = new HashSet<String>( Arrays.asList(variantVCFRecord.getSampleNames()) );
|
||||||
|
variantSamples.retainAll(Arrays.asList(truthVCFRecord.getSampleNames()));
|
||||||
|
return variantSamples;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte getReferenceBase() {
|
||||||
|
return (byte) reference.getBase();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isTruthOnly () {
|
||||||
|
return concordanceType == ConcordanceType.TRUTH_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isVariantSite() {
|
||||||
|
for ( VCFGenotypeRecord g : truthVCFRecord.getVCFGenotypeRecords() ) {
|
||||||
|
if ( g.isVariant(reference.getBase()) ) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public GenomeLoc getLoc() {
|
||||||
|
if ( concordanceType == ConcordanceType.TRUTH_SET || concordanceType == ConcordanceType.BOTH_SETS) {
|
||||||
|
return truthVCFRecord.getLocation();
|
||||||
|
} else {
|
||||||
|
return variantVCFRecord.getLocation();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: Jan 27, 2010
|
||||||
|
* Time: 5:47:27 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
class MultiSampleConcordanceSet {
|
||||||
|
private HashSet<VCFConcordanceCalculator> concordanceSet;
|
||||||
|
private Set<String> cachedSampleNames;
|
||||||
|
private long truthOnlySites;
|
||||||
|
private long truthOnlyVariantSites;
|
||||||
|
private long variantOnlySites;
|
||||||
|
private long overlappingSites;
|
||||||
|
|
||||||
|
public MultiSampleConcordanceSet() {
|
||||||
|
concordanceSet = new HashSet<VCFConcordanceCalculator>();
|
||||||
|
truthOnlySites = 0l;
|
||||||
|
truthOnlyVariantSites = 0l;
|
||||||
|
variantOnlySites = 0l;
|
||||||
|
overlappingSites = 0l;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasBeenInstantiated() {
|
||||||
|
return cachedSampleNames != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void instantiate(Set<String> samples) {
|
||||||
|
cachedSampleNames = samples;
|
||||||
|
for ( String s : samples ) {
|
||||||
|
concordanceSet.add(new VCFConcordanceCalculator(s));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void update(LocusConcordanceInfo info) {
|
||||||
|
if ( info.concordanceIsCheckable() ) {
|
||||||
|
overlappingSites++;
|
||||||
|
for ( VCFConcordanceCalculator concordance : concordanceSet ) {
|
||||||
|
concordance.update(info);
|
||||||
|
}
|
||||||
|
} else if ( info.isTruthOnly() ) {
|
||||||
|
truthOnlySites++;
|
||||||
|
if ( info.isVariantSite() ) {
|
||||||
|
truthOnlyVariantSites++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
variantOnlySites++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<VCFConcordanceCalculator> getConcordanceSet() {
|
||||||
|
return concordanceSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long numberOfTruthOnlySites() {
|
||||||
|
return truthOnlySites;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long numberOfTruthOnlyVariantSites() {
|
||||||
|
return truthOnlyVariantSites;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long numberOfVariantOnlySites() {
|
||||||
|
return variantOnlySites;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long numberOfOverlappingSites() {
|
||||||
|
return overlappingSites;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,105 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: Jan 27, 2010
|
||||||
|
* Time: 10:40:44 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* Calculates per-sample concordance metrics across two multi-sample VCF files; outputs simple counts of concordant
|
||||||
|
* variant and genotype calls, genotyping errors, and call errors. Requires a VCF binding with the name 'truth' and
|
||||||
|
* a VCF binding with the name 'variants'.
|
||||||
|
* @Author: Chris Hartl
|
||||||
|
*/
|
||||||
|
@Requires(value= DataSource.REFERENCE,referenceMetaData = {@RMD(name="truth",type= RodVCF.class),@RMD(name="variants",type= RodVCF.class)})
|
||||||
|
public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInfo, MultiSampleConcordanceSet > {
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public MultiSampleConcordanceSet reduceInit() {
|
||||||
|
return new MultiSampleConcordanceSet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public LocusConcordanceInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext c) {
|
||||||
|
if ( tracker == null ) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
ReferenceOrderedDatum truthData = tracker.lookup("truth", null);
|
||||||
|
ReferenceOrderedDatum variantData = tracker.lookup("variants",null);
|
||||||
|
LocusConcordanceInfo concordance;
|
||||||
|
if ( truthData == null && variantData == null) {
|
||||||
|
concordance = null;
|
||||||
|
} else if ( truthData == null ) {
|
||||||
|
// not in the truth set
|
||||||
|
if ( ( (RodVCF) variantData ).isFiltered() ) {
|
||||||
|
concordance = null;
|
||||||
|
} else {
|
||||||
|
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
|
||||||
|
}
|
||||||
|
} else if ( variantData == null ) {
|
||||||
|
// not in the variant set
|
||||||
|
if ( ( (RodVCF) truthData).isFiltered() ) {
|
||||||
|
concordance = null;
|
||||||
|
} else {
|
||||||
|
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// in both
|
||||||
|
// check for filtering
|
||||||
|
boolean truth_filter = ((RodVCF) truthData).isFiltered();
|
||||||
|
boolean call_filter = ((RodVCF) variantData).isFiltered();
|
||||||
|
if ( truth_filter && call_filter ) {
|
||||||
|
concordance = null;
|
||||||
|
} else if ( truth_filter ) {
|
||||||
|
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
|
||||||
|
} else if ( call_filter ) {
|
||||||
|
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
|
||||||
|
} else {
|
||||||
|
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.BOTH_SETS,( (RodVCF) truthData).getRecord(),( (RodVCF) variantData).getRecord(),ref);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return concordance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public MultiSampleConcordanceSet reduce(LocusConcordanceInfo info, MultiSampleConcordanceSet concordanceSet) {
|
||||||
|
if ( info != null ) {
|
||||||
|
if ( concordanceSet.hasBeenInstantiated() ) {
|
||||||
|
concordanceSet.update(info);
|
||||||
|
} else if ( info.concordanceIsCheckable() ) {
|
||||||
|
concordanceSet.instantiate(info.getOverlappingSamples());
|
||||||
|
concordanceSet.update(info);
|
||||||
|
} else {
|
||||||
|
concordanceSet.update(info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return concordanceSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(MultiSampleConcordanceSet cSet) {
|
||||||
|
String[] header = {"Sample_ID","Concordant_Refs","Concordant_Vars","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call"};
|
||||||
|
out.print(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n",header));
|
||||||
|
for ( VCFConcordanceCalculator sample : cSet.getConcordanceSet() ) {
|
||||||
|
out.print(String.format("%s%n",sample));
|
||||||
|
}
|
||||||
|
logger.info("Overlapping="+cSet.numberOfOverlappingSites()+"\tTruthOnly="+cSet.numberOfTruthOnlySites()+"\tTruthOnlyVariantSites="+
|
||||||
|
cSet.numberOfTruthOnlyVariantSites()+"\tVariantOnly="+cSet.numberOfVariantOnlySites());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,79 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: Jan 27, 2010
|
||||||
|
* Time: 5:48:08 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
class VCFConcordanceCalculator {
|
||||||
|
private String name;
|
||||||
|
private Set<GenomeLoc> falsePositiveLoci;
|
||||||
|
private Set<GenomeLoc> falseNegativeLoci;
|
||||||
|
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
|
||||||
|
private Set<GenomeLoc> hetsCalledHoms;
|
||||||
|
private Set<GenomeLoc> homsCalledHets;
|
||||||
|
private Set<GenomeLoc> concordantCalls;
|
||||||
|
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
|
||||||
|
private Set<GenomeLoc> chipNoCalls;
|
||||||
|
|
||||||
|
public VCFConcordanceCalculator(String sampleName) {
|
||||||
|
name = sampleName;
|
||||||
|
falseNegativeLoci = new HashSet<GenomeLoc>();
|
||||||
|
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
|
||||||
|
falsePositiveLoci = new HashSet<GenomeLoc>();
|
||||||
|
hetsCalledHoms = new HashSet<GenomeLoc>();
|
||||||
|
homsCalledHets = new HashSet<GenomeLoc>();
|
||||||
|
concordantCalls = new HashSet<GenomeLoc>();
|
||||||
|
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
|
||||||
|
chipNoCalls = new HashSet<GenomeLoc>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void update(LocusConcordanceInfo info) {
|
||||||
|
compareGenotypes(info.getTruthGenotype(name), info.getVariantGenotype(name), info.getLoc(), info.getReferenceBase() );
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,concordantGenotypeReferenceCalls.size(),concordantCalls.size(),homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),falseNegativeLociDueToNoCall.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
||||||
|
if ( truth.isNoCall() ) {
|
||||||
|
chipNoCalls.add(loc);
|
||||||
|
} else if ( truth.isVariant(( char) ref) ) {
|
||||||
|
if ( call.isNoCall() ) {
|
||||||
|
falseNegativeLociDueToNoCall.add(loc);
|
||||||
|
} else if ( ! call.isVariant( (char) ref ) ) {
|
||||||
|
falseNegativeLoci.add(loc);
|
||||||
|
} else if ( call.isVariant((char) ref) ) {
|
||||||
|
// check het vs hom
|
||||||
|
checkGenotypeCall(truth,call, loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if ( ! truth.isVariant( (char) ref ) ) {
|
||||||
|
|
||||||
|
if ( call.isVariant((char) ref) ) {
|
||||||
|
falsePositiveLoci.add(loc);
|
||||||
|
} else {
|
||||||
|
concordantGenotypeReferenceCalls.add(loc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkGenotypeCall( VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc ) {
|
||||||
|
if ( truth.isHet() && call.isHom() ) {
|
||||||
|
hetsCalledHoms.add(loc);
|
||||||
|
} else if ( truth.isHom() && call.isHet() ) {
|
||||||
|
homsCalledHets.add(loc);
|
||||||
|
} else if ( ( truth.isHet() && call.isHet() ) || ( truth.isHom() && call.isHom() ) ) { // be extra careful
|
||||||
|
concordantCalls.add(loc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue