Merge pull request #810 from broadinstitute/pd_monoallelic_concordance

GenotypeConcordance: monomorphic sites in truth are no longer...
This commit is contained in:
rpoplin 2015-02-10 15:42:40 -05:00
commit 893e8ff9c4
3 changed files with 82 additions and 51 deletions

View File

@ -742,4 +742,45 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH),1);
}
private Pair<VariantContext,VariantContext> getMonoallelicData() {
final Allele ref = Allele.create(BaseUtils.Base.T.base,true);
final Allele alt = Allele.create(BaseUtils.Base.C.base);
//Site in eval is monoallelic, both samples are HOM_REF
//sample1 in comp is HOM_VAR, sample2 is NO_CALL
//None of these should trigger mismatching alleles
final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1",1,1);
final VariantContextBuilder site1Comp = new VariantContextBuilder();
final VariantContextBuilder site1Eval = new VariantContextBuilder();
site1Comp.loc(loc.getContig(), loc.getStart(), loc.getStop());
site1Eval.loc(loc.getContig(), loc.getStart(), loc.getStop());
site1Comp.alleles(Arrays.asList(ref));
site1Eval.alleles(Arrays.asList(ref, alt));
site1Comp.genotypes(GenotypeBuilder.create("test2_sample1", Arrays.asList(ref, ref)),
GenotypeBuilder.create("test2_sample2", Arrays.asList(ref, ref)));
site1Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(alt,alt)),
GenotypeBuilder.create("test2_sample2",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)));
return new Pair<>(site1Eval.make(), site1Comp.make());
}
@Test
public void testMonoallelicSite() {
final Pair<VariantContext,VariantContext> data = getMonoallelicData();
final VariantContext eval = data.getFirst();
final VariantContext truth = data.getSecond();
final VCFCodec codec = new VCFCodec();
final VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER))));
final VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER))));
final ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,null);
metrics.update(eval,truth);
Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample1").getnMismatchingAlt(),0);
Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample2").getnMismatchingAlt(),0);
Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample1").getTable()[3][1],1);
Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample2").getTable()[0][1],1);
}
}

View File

@ -46,9 +46,9 @@ public class ConcordanceMetrics {
final PrintStream sitesFile;
public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth, PrintStream inputSitesFile) {
HashSet<String> overlappingSamples = new HashSet<String>(evaluate.getGenotypeSamples());
HashSet<String> overlappingSamples = new HashSet<>(evaluate.getGenotypeSamples());
overlappingSamples.retainAll(truth.getGenotypeSamples());
perSampleGenotypeConcordance = new HashMap<String, GenotypeConcordanceTable>(overlappingSamples.size());
perSampleGenotypeConcordance = new HashMap<>(overlappingSamples.size());
for ( String sample : overlappingSamples ) {
perSampleGenotypeConcordance.put(sample,new GenotypeConcordanceTable());
}
@ -82,7 +82,7 @@ public class ConcordanceMetrics {
}
public Map<String,Double> getPerSampleNRD() {
Map<String,Double> nrd = new HashMap<String,Double>(perSampleGenotypeConcordance.size());
Map<String,Double> nrd = new HashMap<>(perSampleGenotypeConcordance.size());
for ( Map.Entry<String,GenotypeConcordanceTable> sampleTable : perSampleGenotypeConcordance.entrySet() ) {
nrd.put(sampleTable.getKey(),calculateNRD(sampleTable.getValue()));
}
@ -91,7 +91,7 @@ public class ConcordanceMetrics {
}
public Map<String,Double> getPerSampleOGC() {
Map<String,Double> ogc = new HashMap<String,Double>(perSampleGenotypeConcordance.size());
Map<String,Double> ogc = new HashMap<>(perSampleGenotypeConcordance.size());
for ( Map.Entry<String,GenotypeConcordanceTable> sampleTable : perSampleGenotypeConcordance.entrySet() ) {
ogc.put(sampleTable.getKey(),calculateOGC(sampleTable.getValue()));
}
@ -108,7 +108,7 @@ public class ConcordanceMetrics {
}
public Map<String,Double> getPerSampleNRS() {
Map<String,Double> nrs = new HashMap<String,Double>(perSampleGenotypeConcordance.size());
Map<String,Double> nrs = new HashMap<>(perSampleGenotypeConcordance.size());
for ( Map.Entry<String,GenotypeConcordanceTable> sampleTable : perSampleGenotypeConcordance.entrySet() ) {
nrs.put(sampleTable.getKey(),calculateNRS(sampleTable.getValue()));
}
@ -121,25 +121,20 @@ public class ConcordanceMetrics {
}
@Requires({"eval != null","truth != null"})
public void update(VariantContext eval, VariantContext truth) {
boolean doPrint = false;
public void update(final VariantContext eval, final VariantContext truth) {
overallSiteConcordance.update(eval,truth);
Set<String> alleleTruth = new HashSet<String>(8);
String truthRef = truth.getReference().getBaseString();
alleleTruth.add(truthRef);
for ( Allele a : truth.getAlternateAlleles() ) {
alleleTruth.add(a.getBaseString());
}
for ( String sample : perSampleGenotypeConcordance.keySet() ) {
Genotype evalGenotype = eval.getGenotype(sample);
Genotype truthGenotype = truth.getGenotype(sample);
final Set<Allele> truthAlleles = new HashSet<>(truth.getAlleles());
for ( final String sample : perSampleGenotypeConcordance.keySet() ) {
final Genotype evalGenotype = eval.getGenotype(sample);
final Genotype truthGenotype = truth.getGenotype(sample);
// ensure genotypes are either no-call ("."), missing (empty alleles), or diploid
if ( ( ! evalGenotype.isNoCall() && evalGenotype.getPloidy() != 2 && evalGenotype.getPloidy() > 0) ||
( ! truthGenotype.isNoCall() && truthGenotype.getPloidy() != 2 && truthGenotype.getPloidy() > 0) ) {
throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy()));
}
perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef);
doPrint = overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef);
final boolean allelesMatch = doAllelesMatch(evalGenotype, truthGenotype, truth.getReference(), truthAlleles);
perSampleGenotypeConcordance.get(sample).update(allelesMatch, evalGenotype, truthGenotype);
final boolean doPrint = overallGenotypeConcordance.update(allelesMatch, evalGenotype, truthGenotype);
if(sitesFile != null && doPrint)
sitesFile.println(eval.getChr() + ":" + eval.getStart() + "\t" + sample + "\t" + truthGenotype.getType() + "\t" + evalGenotype.getType());
}
@ -211,6 +206,29 @@ public class ConcordanceMetrics {
return total == 0l ? 0.0 : ( (double) confirmedVariant ) / ( (double) ( total ) );
}
private boolean doAllelesMatch(final Genotype eval, final Genotype truth,
final Allele truthRef, final Set<Allele> truthSiteAlleles) {
// When determining if alleles match, there are a number of cases to consider. In order:
// 1) If either genotype is uncalled or unavailable, the alleles MATCH
// 2) If the truth genotype is hom ref, then:
// a) If the truth variant is mononallelic (no alternate alleles), the alleles MATCH
// b) Otherwise, the alleles match IFF the alleles in the eval genotype are a subset
// of the alleles in the truth VARIANT
// 3) Otherwise, the alleles match IFF the alleles in the eval genotype are a subset
// of the alleles in (the truth GENOTYPE + the truth REF allele)
boolean matching = true;
if (eval.isCalled() && truth.isCalled()) { // Case 1
if (truth.isHomRef()) { // Case 2
matching = truthSiteAlleles.size() == 1 || truthSiteAlleles.containsAll(eval.getAlleles());
} else { // Case 3
final Set<Allele> truthAlleles = new HashSet<>(truth.getAlleles());
truthAlleles.add(truthRef);
matching = truthAlleles.containsAll(eval.getAlleles());
}
}
return matching;
}
class GenotypeConcordanceTable {
@ -223,38 +241,10 @@ public class ConcordanceMetrics {
}
@Requires({"eval!=null","truth != null","truthAlleles != null"})
public Boolean update(Genotype eval, Genotype truth, Set<String> truthAlleles, String truthRef) {
// this is slow but correct.
// NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles
// that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will
// all match, so long as A and T are alleles in the truth callset.
boolean matchingAlt = true;
int evalGT, truthGT;
if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) {
// by default, no-calls "match" between alleles, so if
// one or both sites are no-call or unavailable, the alt alleles match
// otherwise, check explicitly: if the eval has an allele that's not ref, no-call, or present in truth
// the alt allele is mismatching - regardless of whether the genotype is correct.
for ( Allele evalAllele : eval.getAlleles() ) {
matchingAlt &= truthAlleles.contains(evalAllele.getBaseString());
}
} else if ( eval.isCalled() && truth.isCalled() ) {
// otherwise, the eval genotype has to match either the alleles in the truth genotype, or the truth reference allele
// todo -- this can be sped up by caching the truth allele sets
Set<String> genoAlleles = new HashSet<String>(3);
genoAlleles.add(truthRef);
for ( Allele truthGenoAl : truth.getAlleles() ) {
genoAlleles.add(truthGenoAl.getBaseString());
}
for ( Allele evalAllele : eval.getAlleles() ) {
matchingAlt &= genoAlleles.contains(evalAllele.getBaseString());
}
}
public Boolean update(final boolean matchingAlt, final Genotype eval, final Genotype truth) {
if ( matchingAlt ) {
evalGT = eval.getType().ordinal();
truthGT = truth.getType().ordinal();
final int evalGT = eval.getType().ordinal();
final int truthGT = truth.getType().ordinal();
genotypeCounts[evalGT][truthGT]++;
if(evalGT != truthGT) //report variants where genotypes don't match
return true;

View File

@ -74,8 +74,8 @@ import java.util.*;
* <h4>Tables</h4>
* <p>
* Headers for the (non-moltenized -- see below) GenotypeConcordance counts and proportions tables give the genotype of
* the COMP callset followed by the genotype of the EVAL callset. For example the value corresponding to HOM_REF_HET
* reflects variants called HOM_REF in the COMP callset and HET in the EVAL callset. Variants for which the alternate
* the EVAL callset followed by the genotype of the COMP callset. For example the value corresponding to HOM_REF_HET
* reflects variants called HOM_REF in the EVAL callset and HET in the COMP callset. Variants for which the alternate
* alleles between the EVAL and COMP sample did not match are excluded from genotype comparisons and given in the
* "Mismatching_Alleles" field.
* </p>