Added hidden flag to GenotypeConcordance to output sites of discordant genotypes (to System.out)

Revised ConcondanceMetrics tests to adapt to change
Added comments to PosteriorLikelihoodsUtils
This commit is contained in:
Laura Gauthier 2014-02-27 15:48:40 -05:00
parent 1a1fb8cf6f
commit 7f9f58dbd1
4 changed files with 75 additions and 25 deletions

View File

@ -67,14 +67,18 @@ public class PosteriorLikelihoodsUtils {
throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented");
final Map<Allele,Integer> totalAlleleCounts = new HashMap<>();
//store the allele counts for each allele in the variant priors
for ( final VariantContext resource : resources ) {
addAlleleCounts(totalAlleleCounts,resource,useAC);
}
//add the allele counts from the input samples (if applicable)
if ( useInputSamples ) {
addAlleleCounts(totalAlleleCounts,vc1,useAC);
}
//add zero allele counts for any reference alleles not seen in priors (if applicable)
totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources);
// now extract the counts of the alleles present within vc1, and in order
@ -86,6 +90,7 @@ public class PosteriorLikelihoodsUtils {
totalAlleleCounts.get(allele) : 0 );
}
//parse the likelihoods for each sample's genotype
final List<double[]> likelihoods = new ArrayList<>(vc1.getNSamples());
for ( final Genotype genotype : vc1.getGenotypes() ) {
likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null );
@ -196,13 +201,24 @@ public class PosteriorLikelihoodsUtils {
return priors;
}
/**
* Parse counts for each allele
* @param counts - Map to store and return data
* @param context - line to be parsed from the input VCF file
* @param useAC - use allele count annotation value from VariantContext (vs. MLEAC)
*/
private static void addAlleleCounts(final Map<Allele,Integer> counts, final VariantContext context, final boolean useAC) {
final int[] ac;
//use MLEAC value...
if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! useAC ) {
ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
} else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
}
//...unless specified by the user in useAC
else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
} else {
}
//if VariantContext annotation doesn't contain AC/MLEAC then get the data from another field
else {
ac = new int[context.getAlternateAlleles().size()];
int idx = 0;
for ( final Allele allele : context.getAlternateAlleles() ) {
@ -210,24 +226,33 @@ public class PosteriorLikelihoodsUtils {
}
}
//since the allele count for the reference allele is not given in the VCF format,
//calculate it from the allele number minus the total counts for alternate alleles
for ( final Allele allele : context.getAlleles() ) {
final int count;
if ( allele.isReference() ) {
if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) {
count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac);
count = Math.max(context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac),0); //occasionally an MLEAC value will sneak in that's greater than the AN
} else {
count = context.getCalledChrCount() - (int) MathUtils.sum(ac);
count = Math.max(context.getCalledChrCount() - (int) MathUtils.sum(ac),0);
}
} else {
count = ac[context.getAlternateAlleles().indexOf(allele)];
}
//if this allele isn't in the map yet, add it
if ( ! counts.containsKey(allele) ) {
counts.put(allele,0);
}
//add the count for the current allele to the existing value in the map
counts.put(allele,count + counts.get(allele));
}
}
/**
* Check the formatting on the Object returned by a call to VariantContext::getAttribute() and parse appropriately
* @param integerListContainingVCField - Object returned by a call to VariantContext::getAttribute()
* @return - array of ints
*/
public static int[] extractInts(final Object integerListContainingVCField) {
List<Integer> mleList = null;
if ( integerListContainingVCField instanceof List ) {

View File

@ -135,7 +135,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
metrics.update(eval,truth);
Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2);
Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),1);
@ -185,7 +185,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
metrics.update(eval,truth);
Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2);
Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2);
@ -205,7 +205,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
codec = new VCFCodec();
evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
metrics = new ConcordanceMetrics(evalHeader,compHeader);
metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
metrics.update(eval,truth);
Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2);
Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2);
@ -260,7 +260,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
metrics.update(eval,truth);
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample1").getnMismatchingAlt(),1);
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0);
@ -313,7 +313,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
metrics.update(eval,truth);
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0);
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0);
@ -362,7 +362,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
metrics.update(eval,truth);
Assert.assertTrue(eval.getGenotype("test1_sample2").getType().equals(GenotypeType.UNAVAILABLE));
Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0);
@ -516,7 +516,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
for ( Pair<VariantContext,VariantContext> contextPair : data ) {
VariantContext eval = contextPair.getFirst();
@ -550,7 +550,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
int[][] table = metrics.getOverallGenotypeConcordance().getTable();
// set up the table
table[0] = new int[] {30, 12, 7, 5, 6, 0};
@ -585,8 +585,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_1))));
VCFHeader disjointCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_2))));
VCFHeader overlapCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_3))));
ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader);
ConcordanceMetrics overlapMetrics = new ConcordanceMetrics(evalHeader,overlapCompHeader);
ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader,false);
ConcordanceMetrics overlapMetrics = new ConcordanceMetrics(evalHeader,overlapCompHeader,false);
// test what happens if you put in disjoint sets and start making requests
Assert.assertEquals(0,disjointMetrics.getPerSampleGenotypeConcordance().size());
@ -716,7 +716,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest {
VCFCodec codec = new VCFCodec();
VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER))));
VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER))));
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader);
ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false);
List<Pair<VariantContext,VariantContext>> data = getData7();

View File

@ -42,8 +42,9 @@ public class ConcordanceMetrics {
private Map<String,GenotypeConcordanceTable> perSampleGenotypeConcordance;
private GenotypeConcordanceTable overallGenotypeConcordance;
private SiteConcordanceTable overallSiteConcordance;
private Boolean printInterestingSites;
public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth) {
public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth, Boolean printSitesEnabled) {
HashSet<String> overlappingSamples = new HashSet<String>(evaluate.getGenotypeSamples());
overlappingSamples.retainAll(truth.getGenotypeSamples());
perSampleGenotypeConcordance = new HashMap<String, GenotypeConcordanceTable>(overlappingSamples.size());
@ -52,6 +53,7 @@ public class ConcordanceMetrics {
}
overallGenotypeConcordance = new GenotypeConcordanceTable();
overallSiteConcordance = new SiteConcordanceTable();
printInterestingSites = printSitesEnabled;
}
public GenotypeConcordanceTable getOverallGenotypeConcordance() {
@ -114,6 +116,7 @@ public class ConcordanceMetrics {
@Requires({"eval != null","truth != null"})
public void update(VariantContext eval, VariantContext truth) {
Boolean doPrint = false;
overallSiteConcordance.update(eval,truth);
Set<String> alleleTruth = new HashSet<String>(8);
String truthRef = truth.getReference().getBaseString();
@ -130,7 +133,12 @@ public class ConcordanceMetrics {
throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy()));
}
perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef);
overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef);
doPrint = overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef);
if(printInterestingSites && doPrint)
System.out.println(eval.getChr() + ":" + eval.getStart() + "\t truth is:" + truthGenotype.getType() + "\t eval is:" + evalGenotype.getType());
//Below is code to print out mismatched alternate alleles
//System.out.println(eval.getChr() + ":" + eval.getStart() + "\t truth is:" + truthGenotype.getAlleles() + "\t eval is:" + evalGenotype.getAlleles());
}
}
@ -212,13 +220,14 @@ public class ConcordanceMetrics {
}
@Requires({"eval!=null","truth != null","truthAlleles != null"})
public void update(Genotype eval, Genotype truth, Set<String> truthAlleles, String truthRef) {
public Boolean update(Genotype eval, Genotype truth, Set<String> truthAlleles, String truthRef) {
// this is slow but correct.
// NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles
// that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will
// all match, so long as A and T are alleles in the truth callset.
boolean matchingAlt = true;
int evalGT, truthGT;
if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) {
// by default, no-calls "match" between alleles, so if
// one or both sites are no-call or unavailable, the alt alleles match
@ -241,10 +250,17 @@ public class ConcordanceMetrics {
}
if ( matchingAlt ) {
genotypeCounts[eval.getType().ordinal()][truth.getType().ordinal()]++;
evalGT = eval.getType().ordinal();
truthGT = truth.getType().ordinal();
genotypeCounts[evalGT][truthGT]++;
if(evalGT != truthGT) //report variants where genotypes don't match
return true;
} else {
nMismatchingAlt++;
return false;
//return true; //alternatively, report variants where alt alleles don't match
}
return false;
}
public int[][] getTable() {

View File

@ -25,10 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -213,6 +210,16 @@ public class GenotypeConcordance extends RodWalker<List<Pair<VariantContext,Vari
@Argument(shortName="moltenize",fullName="moltenize",doc="Molten rather than tabular output")
public boolean moltenize = false;
/**
* Print sites where genotypes are mismatched between callsets along with annotations giving the genotype of each callset
* Outputs directly to System.out. Super classy.
*
* NOTE: doesn't currently differentiate between samples, so there may be repeats
*/
@Hidden
@Argument(shortName="sites", fullName = "printInterestingSites", required=false)
protected boolean printSites = false;
@Output
PrintStream out;
@ -244,7 +251,7 @@ public class GenotypeConcordance extends RodWalker<List<Pair<VariantContext,Vari
evalSamples = evalHeader.getGenotypeSamples();
VCFHeader compHeader = headerMap.get(compBinding.getName());
compSamples = compHeader.getGenotypeSamples();
return new ConcordanceMetrics(evalHeader,compHeader);
return new ConcordanceMetrics(evalHeader,compHeader, printSites);
}
@ -347,8 +354,10 @@ public class GenotypeConcordance extends RodWalker<List<Pair<VariantContext,Vari
}
public ConcordanceMetrics reduce(List<Pair<VariantContext,VariantContext>> evalCompList, ConcordanceMetrics metrics) {
for ( Pair<VariantContext,VariantContext> evalComp : evalCompList)
for ( Pair<VariantContext,VariantContext> evalComp : evalCompList){
metrics.update(evalComp.getFirst(),evalComp.getSecond());
}
return metrics;
}