More changes to Variant Eval and Genotype Concordance (passes all integration tests):
1: -sample can now include a file, which will be parsed for sample-name entries 2: If you request a sample to run analysis on, but it is not present in any of your RODs, VEW will exception out 3: Change added to parse Integer, String, and List<Integer> type Allele Count annotations (error otherwise) 4 [slightly problematic]: The count objects now maintain row-keys in order, as the keys were taking an inordinate amount of time in onTraversalDone (multiple calls to getRowKeys(), so many multiple sorts of the same underlying unsorted object, very bad) There is a legacy comparison object which is unused which I will strip out soon. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4502 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3d988576a6
commit
c9d473edee
|
|
@ -7,6 +7,7 @@ import org.broad.tribble.util.variantcontext.VariantContext;
|
|||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.report.tags.Analysis;
|
||||
import org.broadinstitute.sting.utils.report.tags.DataPoint;
|
||||
import org.broadinstitute.sting.utils.report.utils.TableType;
|
||||
|
|
@ -264,9 +265,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
|
|||
if (eval != null) {
|
||||
// initialize the concordance table
|
||||
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
|
||||
alleleCountStats = new ACStats(eval,Genotype.Type.values().length);
|
||||
alleleCountStats = new ACStats(eval,Genotype.Type.values().length, new CompACNames(veWalker.getLogger()));
|
||||
sampleSummaryStats = new SampleSummaryStats(eval);
|
||||
alleleCountSummary = new ACSummaryStats(eval);
|
||||
alleleCountSummary = new ACSummaryStats(eval, new CompACNames(veWalker.getLogger()));
|
||||
for (final VariantContext vc : missedValidationData) {
|
||||
determineStats(null, vc);
|
||||
}
|
||||
|
|
@ -297,12 +298,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
|
|||
String interesting = null;
|
||||
|
||||
final boolean validationIsValidVC = isValidVC(validation);
|
||||
final String evalAC = (eval != null && eval.getAlternateAlleles().size() == 1 && eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) ? String.format("evalAC%s",eval.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null ;
|
||||
final String validationAC = ( validationIsValidVC && validation.getAlternateAlleles().size() == 1 && validation.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) ? String.format("compAC%s",validation.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null;
|
||||
final String evalAC = ( vcHasGoodAC(eval) ) ? String.format("evalAC%d",getAC(eval)) : null ;
|
||||
final String validationAC = ( vcHasGoodAC(validation) ) ? String.format("compAC%d",getAC(validation)) : null;
|
||||
|
||||
// determine concordance for eval data
|
||||
if (eval != null) {
|
||||
for (final String sample : eval.getSampleNames()) {
|
||||
for (final String sample : eval.getGenotypes().keySet()) {
|
||||
final Genotype.Type called = eval.getGenotype(sample).getType();
|
||||
final Genotype.Type truth;
|
||||
|
||||
|
|
@ -330,7 +331,7 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
|
|||
else {
|
||||
final Genotype.Type called = Genotype.Type.NO_CALL;
|
||||
|
||||
for (final String sample : validation.getSampleNames()) {
|
||||
for (final String sample : validation.getGenotypes().keySet()) {
|
||||
final Genotype.Type truth = validation.getGenotype(sample).getType();
|
||||
sampleStats.incrValue(sample, truth, called);
|
||||
if ( evalAC != null ) {
|
||||
|
|
@ -363,8 +364,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
|
|||
|
||||
// TP & FP quality score histograms
|
||||
if( eval != null && eval.isPolymorphic() && validationIsValidVC ) {
|
||||
if( eval.getSampleNames().size() == 1 ) { // single sample calls
|
||||
for( final String sample : eval.getSampleNames() ) { // only one sample
|
||||
if( eval.getGenotypes().keySet().size() == 1 ) { // single sample calls
|
||||
for( final String sample : eval.getGenotypes().keySet() ) { // only one sample
|
||||
if( validation.hasGenotype(sample) ) {
|
||||
final Genotype truth = validation.getGenotype(sample);
|
||||
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() );
|
||||
|
|
@ -396,6 +397,33 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
|
|||
alleleCountSummary.generateSampleSummaryStats( alleleCountStats );
|
||||
}
|
||||
}
|
||||
|
||||
private boolean vcHasGoodAC(VariantContext vc) {
|
||||
return ( vc != null && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) );
|
||||
|
||||
}
|
||||
|
||||
private int getAC(VariantContext vc) {
|
||||
if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(List.class) ) {
|
||||
return ((List<Integer>) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0);
|
||||
} else if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(Integer.class)) {
|
||||
return (Integer) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
|
||||
} else if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(String.class)) {
|
||||
// two ways of parsing
|
||||
String ac = (String) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
|
||||
if ( ac.startsWith("[") ) {
|
||||
return Integer.parseInt(ac.replaceAll("[","").replaceAll("]",""));
|
||||
} else {
|
||||
try {
|
||||
return Integer.parseInt(ac);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new UserException(String.format("The format of the AC field is improperly formatted: AC=%s",ac));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new UserException(String.format("The format of the AC field does not appear to be of integer-list or String format"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -444,7 +472,7 @@ class SampleStats implements TableType {
|
|||
|
||||
public SampleStats(VariantContext vc, int nGenotypeTypes) {
|
||||
this.nGenotypeTypes = nGenotypeTypes;
|
||||
for (String sample : vc.getSampleNames())
|
||||
for (String sample : vc.getGenotypes().keySet())
|
||||
concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]);
|
||||
}
|
||||
|
||||
|
|
@ -482,12 +510,24 @@ class SampleStats implements TableType {
|
|||
* Sample stats, but for AC
|
||||
*/
|
||||
class ACStats extends SampleStats {
|
||||
public ACStats(VariantContext vc, int nGenotypeTypes) {
|
||||
private final CompACNames myComp;
|
||||
private String[] rowKeys;
|
||||
|
||||
public ACStats(VariantContext vc, int nGenotypeTypes, CompACNames comp) {
|
||||
super(nGenotypeTypes);
|
||||
for ( int i = 0; i <= 2*vc.getNSamples(); i++ ) { // todo -- assuming ploidy 2 here...
|
||||
rowKeys = new String[2+4*vc.getGenotypes().size()];
|
||||
for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here...
|
||||
concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]);
|
||||
concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]);
|
||||
rowKeys[i] = String.format("evalAC%d",i);
|
||||
|
||||
}
|
||||
|
||||
for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) {
|
||||
concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]);
|
||||
rowKeys[1+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i);
|
||||
}
|
||||
|
||||
myComp = comp;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
|
|
@ -495,9 +535,10 @@ class ACStats extends SampleStats {
|
|||
}
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
String[] acNames = (String[]) super.getRowKeys();
|
||||
Arrays.sort(acNames,new CompACNames());
|
||||
return acNames;
|
||||
if ( rowKeys == null ) {
|
||||
throw new StingException("RowKeys is null!");
|
||||
}
|
||||
return rowKeys;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -537,7 +578,7 @@ class SampleSummaryStats implements TableType {
|
|||
|
||||
public SampleSummaryStats(final VariantContext vc) {
|
||||
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
|
||||
for( final String sample : vc.getSampleNames() ) {
|
||||
for( final String sample : vc.getGenotypes().keySet() ) {
|
||||
concordanceSummary.put(sample, new double[COLUMN_KEYS.length]);
|
||||
}
|
||||
}
|
||||
|
|
@ -676,12 +717,23 @@ class SampleSummaryStats implements TableType {
|
|||
* SampleSummaryStats .. but for allele counts
|
||||
*/
|
||||
class ACSummaryStats extends SampleSummaryStats {
|
||||
public ACSummaryStats (final VariantContext vc) {
|
||||
final private CompACNames myComp;
|
||||
private String[] rowKeys;
|
||||
|
||||
public ACSummaryStats (final VariantContext vc, CompACNames comp) {
|
||||
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
|
||||
for( int i = 0; i <= 2*vc.getNSamples() ; i ++ ) {
|
||||
rowKeys = new String[3+4*vc.getGenotypes().size()];
|
||||
rowKeys[0] = ALL_SAMPLES_KEY;
|
||||
for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) {
|
||||
concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]);
|
||||
concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]);
|
||||
rowKeys[i+1] = String.format("evalAC%d",i);
|
||||
}
|
||||
for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) {
|
||||
concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]);
|
||||
rowKeys[2+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i);
|
||||
}
|
||||
|
||||
myComp = comp;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
|
|
@ -689,19 +741,31 @@ class ACSummaryStats extends SampleSummaryStats {
|
|||
}
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
String[] acNames = (String[]) super.getRowKeys();
|
||||
Arrays.sort(acNames,new CompACNames());
|
||||
return acNames;
|
||||
if ( rowKeys == null) {
|
||||
throw new StingException("rowKeys is null!!");
|
||||
}
|
||||
return rowKeys;
|
||||
}
|
||||
}
|
||||
|
||||
class CompACNames implements Comparator{
|
||||
|
||||
final Logger myLogger;
|
||||
private boolean info = true;
|
||||
|
||||
public CompACNames(Logger l) {
|
||||
myLogger = l;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
return ( o.getClass() == CompACNames.class );
|
||||
}
|
||||
|
||||
public int compare(Object o1, Object o2) {
|
||||
if ( info ) {
|
||||
myLogger.info("Sorting AC names");
|
||||
info = false;
|
||||
}
|
||||
//System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2));
|
||||
return getRank(o1) - getRank(o2);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import org.apache.log4j.Logger;
|
|||
import org.broad.tribble.util.variantcontext.MutableVariantContext;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.StandardVCFWriter;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -41,6 +42,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
|
|||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.ApplyVariantCuts;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.report.ReportMarshaller;
|
||||
import org.broadinstitute.sting.utils.report.VE2ReportFactory;
|
||||
import org.broadinstitute.sting.utils.report.templates.ReportFormat;
|
||||
|
|
@ -311,7 +313,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
|||
if ( LIST )
|
||||
listModulesAndExit();
|
||||
|
||||
SAMPLES_LIST = Arrays.asList(SAMPLES);
|
||||
SAMPLES_LIST = SampleUtils.getSamplesFromCommandLineInput(Arrays.asList(SAMPLES));
|
||||
|
||||
determineEvalations();
|
||||
|
||||
|
|
@ -676,13 +678,28 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
|||
HashMap<String,Object> newAts = new HashMap<String,Object>(vc.getAttributes());
|
||||
VariantContextUtils.calculateChromosomeCounts(vc,newAts,true);
|
||||
vc = VariantContext.modifyAttributes(vc,newAts);
|
||||
logger.debug(String.format("VC %s subset to %s AC%n",vc.getName(),vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)));
|
||||
//if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc));
|
||||
} else if ( vc != null && ! vc.hasGenotypes(SAMPLES_LIST) ) {
|
||||
throw new UserException(String.format("Genotypes for the variant context %s do not contain all the provided samples %s",vc.getName(), getMissingSamples(SAMPLES_LIST,vc)));
|
||||
}
|
||||
|
||||
map.put(name, allowExcludes && excludeComp(vc) ? null : vc);
|
||||
}
|
||||
}
|
||||
|
||||
private static String getMissingSamples(Collection<String> soughtSamples, VariantContext vc) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append("Missing samples are:");
|
||||
for ( String s : soughtSamples ) {
|
||||
if ( ! vc.getGenotypes().keySet().contains(s) ) {
|
||||
buf.append(String.format("%n%s",s));
|
||||
}
|
||||
}
|
||||
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
|
|||
|
|
@ -31,9 +31,14 @@ import org.broad.tribble.vcf.VCFHeader;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -185,5 +190,30 @@ public class SampleUtils {
|
|||
|
||||
}
|
||||
|
||||
public static List<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
|
||||
if (sampleArgs != null) {
|
||||
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
|
||||
// sample list set, and treat the entries as if they had been specified on the command line.
|
||||
List<String> samplesFromFiles = new ArrayList<String>();
|
||||
for (String SAMPLE_EXPRESSION : sampleArgs) {
|
||||
File sampleFile = new File(SAMPLE_EXPRESSION);
|
||||
|
||||
try {
|
||||
XReadLines reader = new XReadLines(sampleFile);
|
||||
|
||||
List<String> lines = reader.readLines();
|
||||
for (String line : lines) {
|
||||
samplesFromFiles.add(line);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
|
||||
}
|
||||
}
|
||||
|
||||
return samplesFromFiles;
|
||||
}
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue