Added a multisample concordance walker which takes the place of the VCF python library I've been using. Takes a truth VCF and a variant VCF and outputs A TSV that looks like this:
Sample_ID Concordant_Refs Concordant_Vars Homs_called_het Het_called_homs False_Positives False_Negatives_Due_To_Ref_Call False_Negatives_Due_To_No_Call NA19381 491 294 2 0 0 0 1 NA19451 489 298 1 0 0 0 0 NA19463 486 289 2 3 1 4 3 NA19376 488 296 1 0 2 0 1 NA19317 489 284 5 3 3 3 1 This walker will be merged with GenotypeConcordance once it's clear how to do so. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2715 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
eccf40b17d
commit
1b9184a1c7
|
|
@ -0,0 +1,81 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 27, 2010
|
||||
* Time: 5:48:36 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class LocusConcordanceInfo {
|
||||
|
||||
public enum ConcordanceType {
|
||||
TRUTH_SET,VARIANT_SET,BOTH_SETS
|
||||
}
|
||||
|
||||
private ConcordanceType concordanceType;
|
||||
private VCFRecord variantVCFRecord;
|
||||
private VCFRecord truthVCFRecord;
|
||||
private ReferenceContext reference;
|
||||
|
||||
public LocusConcordanceInfo(ConcordanceType type, VCFRecord truthRecord, VCFRecord variantRecord, ReferenceContext ref) {
|
||||
concordanceType = type;
|
||||
variantVCFRecord = variantRecord;
|
||||
truthVCFRecord = truthRecord;
|
||||
reference = ref;
|
||||
}
|
||||
|
||||
public boolean concordanceIsCheckable() {
|
||||
return concordanceType == ConcordanceType.BOTH_SETS;
|
||||
}
|
||||
|
||||
public VCFGenotypeRecord getTruthGenotype(String sample) {
|
||||
return truthVCFRecord.getGenotype(sample);
|
||||
}
|
||||
|
||||
public VCFGenotypeRecord getVariantGenotype(String sample) {
|
||||
return variantVCFRecord.getGenotype(sample);
|
||||
}
|
||||
|
||||
public Set<String> getOverlappingSamples() {
|
||||
Set<String> variantSamples = new HashSet<String>( Arrays.asList(variantVCFRecord.getSampleNames()) );
|
||||
variantSamples.retainAll(Arrays.asList(truthVCFRecord.getSampleNames()));
|
||||
return variantSamples;
|
||||
}
|
||||
|
||||
public byte getReferenceBase() {
|
||||
return (byte) reference.getBase();
|
||||
}
|
||||
|
||||
public boolean isTruthOnly () {
|
||||
return concordanceType == ConcordanceType.TRUTH_SET;
|
||||
}
|
||||
|
||||
public boolean isVariantSite() {
|
||||
for ( VCFGenotypeRecord g : truthVCFRecord.getVCFGenotypeRecords() ) {
|
||||
if ( g.isVariant(reference.getBase()) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public GenomeLoc getLoc() {
|
||||
if ( concordanceType == ConcordanceType.TRUTH_SET || concordanceType == ConcordanceType.BOTH_SETS) {
|
||||
return truthVCFRecord.getLocation();
|
||||
} else {
|
||||
return variantVCFRecord.getLocation();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 27, 2010
|
||||
* Time: 5:47:27 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class MultiSampleConcordanceSet {
|
||||
private HashSet<VCFConcordanceCalculator> concordanceSet;
|
||||
private Set<String> cachedSampleNames;
|
||||
private long truthOnlySites;
|
||||
private long truthOnlyVariantSites;
|
||||
private long variantOnlySites;
|
||||
private long overlappingSites;
|
||||
|
||||
public MultiSampleConcordanceSet() {
|
||||
concordanceSet = new HashSet<VCFConcordanceCalculator>();
|
||||
truthOnlySites = 0l;
|
||||
truthOnlyVariantSites = 0l;
|
||||
variantOnlySites = 0l;
|
||||
overlappingSites = 0l;
|
||||
}
|
||||
|
||||
public boolean hasBeenInstantiated() {
|
||||
return cachedSampleNames != null;
|
||||
}
|
||||
|
||||
public void instantiate(Set<String> samples) {
|
||||
cachedSampleNames = samples;
|
||||
for ( String s : samples ) {
|
||||
concordanceSet.add(new VCFConcordanceCalculator(s));
|
||||
}
|
||||
}
|
||||
|
||||
public void update(LocusConcordanceInfo info) {
|
||||
if ( info.concordanceIsCheckable() ) {
|
||||
overlappingSites++;
|
||||
for ( VCFConcordanceCalculator concordance : concordanceSet ) {
|
||||
concordance.update(info);
|
||||
}
|
||||
} else if ( info.isTruthOnly() ) {
|
||||
truthOnlySites++;
|
||||
if ( info.isVariantSite() ) {
|
||||
truthOnlyVariantSites++;
|
||||
}
|
||||
} else {
|
||||
variantOnlySites++;
|
||||
}
|
||||
}
|
||||
|
||||
public Set<VCFConcordanceCalculator> getConcordanceSet() {
|
||||
return concordanceSet;
|
||||
}
|
||||
|
||||
public long numberOfTruthOnlySites() {
|
||||
return truthOnlySites;
|
||||
}
|
||||
|
||||
public long numberOfTruthOnlyVariantSites() {
|
||||
return truthOnlyVariantSites;
|
||||
}
|
||||
|
||||
public long numberOfVariantOnlySites() {
|
||||
return variantOnlySites;
|
||||
}
|
||||
|
||||
public long numberOfOverlappingSites() {
|
||||
return overlappingSites;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 27, 2010
|
||||
* Time: 10:40:44 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
/*
|
||||
* Calculates per-sample concordance metrics across two multi-sample VCF files; outputs simple counts of concordant
|
||||
* variant and genotype calls, genotyping errors, and call errors. Requires a VCF binding with the name 'truth' and
|
||||
* a VCF binding with the name 'variants'.
|
||||
* @Author: Chris Hartl
|
||||
*/
|
||||
@Requires(value= DataSource.REFERENCE,referenceMetaData = {@RMD(name="truth",type= RodVCF.class),@RMD(name="variants",type= RodVCF.class)})
|
||||
public class MultiSampleConcordanceWalker extends RodWalker< LocusConcordanceInfo, MultiSampleConcordanceSet > {
|
||||
|
||||
public void initialize() {
|
||||
|
||||
}
|
||||
|
||||
public MultiSampleConcordanceSet reduceInit() {
|
||||
return new MultiSampleConcordanceSet();
|
||||
}
|
||||
|
||||
public LocusConcordanceInfo map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext c) {
|
||||
if ( tracker == null ) {
|
||||
return null;
|
||||
}
|
||||
ReferenceOrderedDatum truthData = tracker.lookup("truth", null);
|
||||
ReferenceOrderedDatum variantData = tracker.lookup("variants",null);
|
||||
LocusConcordanceInfo concordance;
|
||||
if ( truthData == null && variantData == null) {
|
||||
concordance = null;
|
||||
} else if ( truthData == null ) {
|
||||
// not in the truth set
|
||||
if ( ( (RodVCF) variantData ).isFiltered() ) {
|
||||
concordance = null;
|
||||
} else {
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
|
||||
}
|
||||
} else if ( variantData == null ) {
|
||||
// not in the variant set
|
||||
if ( ( (RodVCF) truthData).isFiltered() ) {
|
||||
concordance = null;
|
||||
} else {
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
|
||||
}
|
||||
} else {
|
||||
// in both
|
||||
// check for filtering
|
||||
boolean truth_filter = ((RodVCF) truthData).isFiltered();
|
||||
boolean call_filter = ((RodVCF) variantData).isFiltered();
|
||||
if ( truth_filter && call_filter ) {
|
||||
concordance = null;
|
||||
} else if ( truth_filter ) {
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.VARIANT_SET,null, ( (RodVCF) variantData ).getRecord(),ref);
|
||||
} else if ( call_filter ) {
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.TRUTH_SET,( (RodVCF) truthData).getRecord(),null,ref);
|
||||
} else {
|
||||
concordance = new LocusConcordanceInfo(LocusConcordanceInfo.ConcordanceType.BOTH_SETS,( (RodVCF) truthData).getRecord(),( (RodVCF) variantData).getRecord(),ref);
|
||||
}
|
||||
}
|
||||
|
||||
return concordance;
|
||||
}
|
||||
|
||||
public MultiSampleConcordanceSet reduce(LocusConcordanceInfo info, MultiSampleConcordanceSet concordanceSet) {
|
||||
if ( info != null ) {
|
||||
if ( concordanceSet.hasBeenInstantiated() ) {
|
||||
concordanceSet.update(info);
|
||||
} else if ( info.concordanceIsCheckable() ) {
|
||||
concordanceSet.instantiate(info.getOverlappingSamples());
|
||||
concordanceSet.update(info);
|
||||
} else {
|
||||
concordanceSet.update(info);
|
||||
}
|
||||
}
|
||||
|
||||
return concordanceSet;
|
||||
}
|
||||
|
||||
public void onTraversalDone(MultiSampleConcordanceSet cSet) {
|
||||
String[] header = {"Sample_ID","Concordant_Refs","Concordant_Vars","Homs_called_het","Het_called_homs","False_Positives","False_Negatives_Due_To_Ref_Call","False_Negatives_Due_To_No_Call"};
|
||||
out.print(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n",header));
|
||||
for ( VCFConcordanceCalculator sample : cSet.getConcordanceSet() ) {
|
||||
out.print(String.format("%s%n",sample));
|
||||
}
|
||||
logger.info("Overlapping="+cSet.numberOfOverlappingSites()+"\tTruthOnly="+cSet.numberOfTruthOnlySites()+"\tTruthOnlyVariantSites="+
|
||||
cSet.numberOfTruthOnlyVariantSites()+"\tVariantOnly="+cSet.numberOfVariantOnlySites());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval.multisample;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 27, 2010
|
||||
* Time: 5:48:08 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class VCFConcordanceCalculator {
|
||||
private String name;
|
||||
private Set<GenomeLoc> falsePositiveLoci;
|
||||
private Set<GenomeLoc> falseNegativeLoci;
|
||||
private Set<GenomeLoc> falseNegativeLociDueToNoCall;
|
||||
private Set<GenomeLoc> hetsCalledHoms;
|
||||
private Set<GenomeLoc> homsCalledHets;
|
||||
private Set<GenomeLoc> concordantCalls;
|
||||
private Set<GenomeLoc> concordantGenotypeReferenceCalls;
|
||||
private Set<GenomeLoc> chipNoCalls;
|
||||
|
||||
public VCFConcordanceCalculator(String sampleName) {
|
||||
name = sampleName;
|
||||
falseNegativeLoci = new HashSet<GenomeLoc>();
|
||||
falseNegativeLociDueToNoCall = new HashSet<GenomeLoc>();
|
||||
falsePositiveLoci = new HashSet<GenomeLoc>();
|
||||
hetsCalledHoms = new HashSet<GenomeLoc>();
|
||||
homsCalledHets = new HashSet<GenomeLoc>();
|
||||
concordantCalls = new HashSet<GenomeLoc>();
|
||||
concordantGenotypeReferenceCalls = new HashSet<GenomeLoc>();
|
||||
chipNoCalls = new HashSet<GenomeLoc>();
|
||||
}
|
||||
|
||||
public void update(LocusConcordanceInfo info) {
|
||||
compareGenotypes(info.getTruthGenotype(name), info.getVariantGenotype(name), info.getLoc(), info.getReferenceBase() );
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d",name,concordantGenotypeReferenceCalls.size(),concordantCalls.size(),homsCalledHets.size(),hetsCalledHoms.size(),falsePositiveLoci.size(),falseNegativeLoci.size(),falseNegativeLociDueToNoCall.size());
|
||||
}
|
||||
|
||||
private void compareGenotypes(VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc, byte ref) {
|
||||
if ( truth.isNoCall() ) {
|
||||
chipNoCalls.add(loc);
|
||||
} else if ( truth.isVariant(( char) ref) ) {
|
||||
if ( call.isNoCall() ) {
|
||||
falseNegativeLociDueToNoCall.add(loc);
|
||||
} else if ( ! call.isVariant( (char) ref ) ) {
|
||||
falseNegativeLoci.add(loc);
|
||||
} else if ( call.isVariant((char) ref) ) {
|
||||
// check het vs hom
|
||||
checkGenotypeCall(truth,call, loc);
|
||||
}
|
||||
|
||||
} else if ( ! truth.isVariant( (char) ref ) ) {
|
||||
|
||||
if ( call.isVariant((char) ref) ) {
|
||||
falsePositiveLoci.add(loc);
|
||||
} else {
|
||||
concordantGenotypeReferenceCalls.add(loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkGenotypeCall( VCFGenotypeRecord truth, VCFGenotypeRecord call, GenomeLoc loc ) {
|
||||
if ( truth.isHet() && call.isHom() ) {
|
||||
hetsCalledHoms.add(loc);
|
||||
} else if ( truth.isHom() && call.isHet() ) {
|
||||
homsCalledHets.add(loc);
|
||||
} else if ( ( truth.isHet() && call.isHet() ) || ( truth.isHom() && call.isHom() ) ) { // be extra careful
|
||||
concordantCalls.add(loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue