More changes to Variant Eval and Genotype Concordance (passes all integration tests):

1: -sample can now include a file, which will be parsed for sample-name entries
2: If you request a sample to run analysis on, but it is not present in any of your RODs, VEW will exception out
3: Change added to parse Integer, String, and List<Integer> type Allele Count annotations (error otherwise)
4 [slightly problematic]: The count objects now maintain row-keys in order, as the keys were taking an inordinate amount of time in onTraversalDone (multiple calls to getRowKeys(), so many multiple sorts of the same underlying unsorted object, very bad)

There is a legacy comparison object which is unused which I will strip out soon.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4502 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-10-15 12:40:36 +00:00
parent 3d988576a6
commit c9d473edee
3 changed files with 134 additions and 23 deletions

View File

@ -7,6 +7,7 @@ import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.report.tags.Analysis;
import org.broadinstitute.sting.utils.report.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
@ -264,9 +265,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
if (eval != null) {
// initialize the concordance table
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
alleleCountStats = new ACStats(eval,Genotype.Type.values().length);
alleleCountStats = new ACStats(eval,Genotype.Type.values().length, new CompACNames(veWalker.getLogger()));
sampleSummaryStats = new SampleSummaryStats(eval);
alleleCountSummary = new ACSummaryStats(eval);
alleleCountSummary = new ACSummaryStats(eval, new CompACNames(veWalker.getLogger()));
for (final VariantContext vc : missedValidationData) {
determineStats(null, vc);
}
@ -297,12 +298,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
String interesting = null;
final boolean validationIsValidVC = isValidVC(validation);
final String evalAC = (eval != null && eval.getAlternateAlleles().size() == 1 && eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) ? String.format("evalAC%s",eval.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null ;
final String validationAC = ( validationIsValidVC && validation.getAlternateAlleles().size() == 1 && validation.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) ? String.format("compAC%s",validation.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null;
final String evalAC = ( vcHasGoodAC(eval) ) ? String.format("evalAC%d",getAC(eval)) : null ;
final String validationAC = ( vcHasGoodAC(validation) ) ? String.format("compAC%d",getAC(validation)) : null;
// determine concordance for eval data
if (eval != null) {
for (final String sample : eval.getSampleNames()) {
for (final String sample : eval.getGenotypes().keySet()) {
final Genotype.Type called = eval.getGenotype(sample).getType();
final Genotype.Type truth;
@ -330,7 +331,7 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
else {
final Genotype.Type called = Genotype.Type.NO_CALL;
for (final String sample : validation.getSampleNames()) {
for (final String sample : validation.getGenotypes().keySet()) {
final Genotype.Type truth = validation.getGenotype(sample).getType();
sampleStats.incrValue(sample, truth, called);
if ( evalAC != null ) {
@ -363,8 +364,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
// TP & FP quality score histograms
if( eval != null && eval.isPolymorphic() && validationIsValidVC ) {
if( eval.getSampleNames().size() == 1 ) { // single sample calls
for( final String sample : eval.getSampleNames() ) { // only one sample
if( eval.getGenotypes().keySet().size() == 1 ) { // single sample calls
for( final String sample : eval.getGenotypes().keySet() ) { // only one sample
if( validation.hasGenotype(sample) ) {
final Genotype truth = validation.getGenotype(sample);
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() );
@ -396,6 +397,33 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
alleleCountSummary.generateSampleSummaryStats( alleleCountStats );
}
}
private boolean vcHasGoodAC(VariantContext vc) {
return ( vc != null && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) );
}
private int getAC(VariantContext vc) {
if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(List.class) ) {
return ((List<Integer>) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0);
} else if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(Integer.class)) {
return (Integer) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
} else if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(String.class)) {
// two ways of parsing
String ac = (String) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
if ( ac.startsWith("[") ) {
return Integer.parseInt(ac.replaceAll("[","").replaceAll("]",""));
} else {
try {
return Integer.parseInt(ac);
} catch ( NumberFormatException e ) {
throw new UserException(String.format("The format of the AC field is improperly formatted: AC=%s",ac));
}
}
} else {
throw new UserException(String.format("The format of the AC field does not appear to be of integer-list or String format"));
}
}
}
/**
@ -444,7 +472,7 @@ class SampleStats implements TableType {
public SampleStats(VariantContext vc, int nGenotypeTypes) {
this.nGenotypeTypes = nGenotypeTypes;
for (String sample : vc.getSampleNames())
for (String sample : vc.getGenotypes().keySet())
concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]);
}
@ -482,12 +510,24 @@ class SampleStats implements TableType {
* Sample stats, but for AC
*/
class ACStats extends SampleStats {
public ACStats(VariantContext vc, int nGenotypeTypes) {
private final CompACNames myComp;
private String[] rowKeys;
public ACStats(VariantContext vc, int nGenotypeTypes, CompACNames comp) {
super(nGenotypeTypes);
for ( int i = 0; i <= 2*vc.getNSamples(); i++ ) { // todo -- assuming ploidy 2 here...
rowKeys = new String[2+4*vc.getGenotypes().size()];
for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here...
concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]);
concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]);
rowKeys[i] = String.format("evalAC%d",i);
}
for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) {
concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]);
rowKeys[1+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i);
}
myComp = comp;
}
public String getName() {
@ -495,9 +535,10 @@ class ACStats extends SampleStats {
}
public Object[] getRowKeys() {
String[] acNames = (String[]) super.getRowKeys();
Arrays.sort(acNames,new CompACNames());
return acNames;
if ( rowKeys == null ) {
throw new StingException("RowKeys is null!");
}
return rowKeys;
}
}
@ -537,7 +578,7 @@ class SampleSummaryStats implements TableType {
public SampleSummaryStats(final VariantContext vc) {
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
for( final String sample : vc.getSampleNames() ) {
for( final String sample : vc.getGenotypes().keySet() ) {
concordanceSummary.put(sample, new double[COLUMN_KEYS.length]);
}
}
@ -676,12 +717,23 @@ class SampleSummaryStats implements TableType {
* SampleSummaryStats .. but for allele counts
*/
class ACSummaryStats extends SampleSummaryStats {
public ACSummaryStats (final VariantContext vc) {
final private CompACNames myComp;
private String[] rowKeys;
public ACSummaryStats (final VariantContext vc, CompACNames comp) {
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
for( int i = 0; i <= 2*vc.getNSamples() ; i ++ ) {
rowKeys = new String[3+4*vc.getGenotypes().size()];
rowKeys[0] = ALL_SAMPLES_KEY;
for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) {
concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]);
concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]);
rowKeys[i+1] = String.format("evalAC%d",i);
}
for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) {
concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]);
rowKeys[2+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i);
}
myComp = comp;
}
public String getName() {
@ -689,19 +741,31 @@ class ACSummaryStats extends SampleSummaryStats {
}
public Object[] getRowKeys() {
String[] acNames = (String[]) super.getRowKeys();
Arrays.sort(acNames,new CompACNames());
return acNames;
if ( rowKeys == null) {
throw new StingException("rowKeys is null!!");
}
return rowKeys;
}
}
class CompACNames implements Comparator{
final Logger myLogger;
private boolean info = true;
public CompACNames(Logger l) {
myLogger = l;
}
public boolean equals(Object o) {
return ( o.getClass() == CompACNames.class );
}
public int compare(Object o1, Object o2) {
if ( info ) {
myLogger.info("Sorting AC names");
info = false;
}
//System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2));
return getRank(o1) - getRank(o2);
}

View File

@ -29,6 +29,7 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.MutableVariantContext;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.StandardVCFWriter;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -41,6 +42,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.ApplyVariantCuts;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.report.ReportMarshaller;
import org.broadinstitute.sting.utils.report.VE2ReportFactory;
import org.broadinstitute.sting.utils.report.templates.ReportFormat;
@ -311,7 +313,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
if ( LIST )
listModulesAndExit();
SAMPLES_LIST = Arrays.asList(SAMPLES);
SAMPLES_LIST = SampleUtils.getSamplesFromCommandLineInput(Arrays.asList(SAMPLES));
determineEvalations();
@ -676,13 +678,28 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
HashMap<String,Object> newAts = new HashMap<String,Object>(vc.getAttributes());
VariantContextUtils.calculateChromosomeCounts(vc,newAts,true);
vc = VariantContext.modifyAttributes(vc,newAts);
logger.debug(String.format("VC %s subset to %s AC%n",vc.getName(),vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)));
//if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc));
} else if ( vc != null && ! vc.hasGenotypes(SAMPLES_LIST) ) {
throw new UserException(String.format("Genotypes for the variant context %s do not contain all the provided samples %s",vc.getName(), getMissingSamples(SAMPLES_LIST,vc)));
}
map.put(name, allowExcludes && excludeComp(vc) ? null : vc);
}
}
private static String getMissingSamples(Collection<String> soughtSamples, VariantContext vc) {
StringBuffer buf = new StringBuffer();
buf.append("Missing samples are:");
for ( String s : soughtSamples ) {
if ( ! vc.getGenotypes().keySet().contains(s) ) {
buf.append(String.format("%n%s",s));
}
}
return buf.toString();
}
// --------------------------------------------------------------------------------------------------------------
//
// reduce

View File

@ -31,9 +31,14 @@ import org.broad.tribble.vcf.VCFHeader;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@ -185,5 +190,30 @@ public class SampleUtils {
}
public static List<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
if (sampleArgs != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
List<String> samplesFromFiles = new ArrayList<String>();
for (String SAMPLE_EXPRESSION : sampleArgs) {
File sampleFile = new File(SAMPLE_EXPRESSION);
try {
XReadLines reader = new XReadLines(sampleFile);
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line);
}
} catch (FileNotFoundException e) {
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
}
}
return samplesFromFiles;
}
return new ArrayList<String>();
}
}