diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java index 6bb5fe16e..7c24a356e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java @@ -7,6 +7,7 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFConstants; import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.report.tags.Analysis; import org.broadinstitute.sting.utils.report.tags.DataPoint; import org.broadinstitute.sting.utils.report.utils.TableType; @@ -264,9 +265,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva if (eval != null) { // initialize the concordance table sampleStats = new SampleStats(eval,Genotype.Type.values().length); - alleleCountStats = new ACStats(eval,Genotype.Type.values().length); + alleleCountStats = new ACStats(eval,Genotype.Type.values().length, new CompACNames(veWalker.getLogger())); sampleSummaryStats = new SampleSummaryStats(eval); - alleleCountSummary = new ACSummaryStats(eval); + alleleCountSummary = new ACSummaryStats(eval, new CompACNames(veWalker.getLogger())); for (final VariantContext vc : missedValidationData) { determineStats(null, vc); } @@ -297,12 +298,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva String interesting = null; final boolean validationIsValidVC = isValidVC(validation); - final String evalAC = (eval != null && eval.getAlternateAlleles().size() == 1 && eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) ? String.format("evalAC%s",eval.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null ; - final String validationAC = ( validationIsValidVC && validation.getAlternateAlleles().size() == 1 && validation.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) ? String.format("compAC%s",validation.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null; + final String evalAC = ( vcHasGoodAC(eval) ) ? String.format("evalAC%d",getAC(eval)) : null ; + final String validationAC = ( vcHasGoodAC(validation) ) ? String.format("compAC%d",getAC(validation)) : null; // determine concordance for eval data if (eval != null) { - for (final String sample : eval.getSampleNames()) { + for (final String sample : eval.getGenotypes().keySet()) { final Genotype.Type called = eval.getGenotype(sample).getType(); final Genotype.Type truth; @@ -330,7 +331,7 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva else { final Genotype.Type called = Genotype.Type.NO_CALL; - for (final String sample : validation.getSampleNames()) { + for (final String sample : validation.getGenotypes().keySet()) { final Genotype.Type truth = validation.getGenotype(sample).getType(); sampleStats.incrValue(sample, truth, called); if ( evalAC != null ) { @@ -363,8 +364,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva // TP & FP quality score histograms if( eval != null && eval.isPolymorphic() && validationIsValidVC ) { - if( eval.getSampleNames().size() == 1 ) { // single sample calls - for( final String sample : eval.getSampleNames() ) { // only one sample + if( eval.getGenotypes().keySet().size() == 1 ) { // single sample calls + for( final String sample : eval.getGenotypes().keySet() ) { // only one sample if( validation.hasGenotype(sample) ) { final Genotype truth = validation.getGenotype(sample); qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() ); @@ -396,6 +397,33 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva alleleCountSummary.generateSampleSummaryStats( alleleCountStats ); } } + + private boolean vcHasGoodAC(VariantContext vc) { + return ( vc != null && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ); + + } + + private int getAC(VariantContext vc) { + if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(List.class) ) { + return ((List) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0); + } else if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(Integer.class)) { + return (Integer) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); + } else if ( vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass().isAssignableFrom(String.class)) { + // two ways of parsing + String ac = (String) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); + if ( ac.startsWith("[") ) { + return Integer.parseInt(ac.replaceAll("[","").replaceAll("]","")); + } else { + try { + return Integer.parseInt(ac); + } catch ( NumberFormatException e ) { + throw new UserException(String.format("The format of the AC field is improperly formatted: AC=%s",ac)); + } + } + } else { + throw new UserException(String.format("The format of the AC field does not appear to be of integer-list or String format")); + } + } } /** @@ -444,7 +472,7 @@ class SampleStats implements TableType { public SampleStats(VariantContext vc, int nGenotypeTypes) { this.nGenotypeTypes = nGenotypeTypes; - for (String sample : vc.getSampleNames()) + for (String sample : vc.getGenotypes().keySet()) concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); } @@ -482,12 +510,24 @@ class SampleStats implements TableType { * Sample stats, but for AC */ class ACStats extends SampleStats { - public ACStats(VariantContext vc, int nGenotypeTypes) { + private final CompACNames myComp; + private String[] rowKeys; + + public ACStats(VariantContext vc, int nGenotypeTypes, CompACNames comp) { super(nGenotypeTypes); - for ( int i = 0; i <= 2*vc.getNSamples(); i++ ) { // todo -- assuming ploidy 2 here... + rowKeys = new String[2+4*vc.getGenotypes().size()]; + for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); - concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); + rowKeys[i] = String.format("evalAC%d",i); + } + + for ( int i = 0; i <= 2*vc.getGenotypes().size(); i++ ) { + concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); + rowKeys[1+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i); + } + + myComp = comp; } public String getName() { @@ -495,9 +535,10 @@ class ACStats extends SampleStats { } public Object[] getRowKeys() { - String[] acNames = (String[]) super.getRowKeys(); - Arrays.sort(acNames,new CompACNames()); - return acNames; + if ( rowKeys == null ) { + throw new StingException("RowKeys is null!"); + } + return rowKeys; } } @@ -537,7 +578,7 @@ class SampleSummaryStats implements TableType { public SampleSummaryStats(final VariantContext vc) { concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( final String sample : vc.getSampleNames() ) { + for( final String sample : vc.getGenotypes().keySet() ) { concordanceSummary.put(sample, new double[COLUMN_KEYS.length]); } } @@ -676,12 +717,23 @@ class SampleSummaryStats implements TableType { * SampleSummaryStats .. but for allele counts */ class ACSummaryStats extends SampleSummaryStats { - public ACSummaryStats (final VariantContext vc) { + final private CompACNames myComp; + private String[] rowKeys; + + public ACSummaryStats (final VariantContext vc, CompACNames comp) { concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( int i = 0; i <= 2*vc.getNSamples() ; i ++ ) { + rowKeys = new String[3+4*vc.getGenotypes().size()]; + rowKeys[0] = ALL_SAMPLES_KEY; + for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) { concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); - concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); + rowKeys[i+1] = String.format("evalAC%d",i); } + for( int i = 0; i <= 2*vc.getGenotypes().size() ; i ++ ) { + concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); + rowKeys[2+2*vc.getGenotypes().size()+i] = String.format("compAC%d",i); + } + + myComp = comp; } public String getName() { @@ -689,19 +741,31 @@ class ACSummaryStats extends SampleSummaryStats { } public Object[] getRowKeys() { - String[] acNames = (String[]) super.getRowKeys(); - Arrays.sort(acNames,new CompACNames()); - return acNames; + if ( rowKeys == null) { + throw new StingException("rowKeys is null!!"); + } + return rowKeys; } } class CompACNames implements Comparator{ + final Logger myLogger; + private boolean info = true; + + public CompACNames(Logger l) { + myLogger = l; + } + public boolean equals(Object o) { return ( o.getClass() == CompACNames.class ); } public int compare(Object o1, Object o2) { + if ( info ) { + myLogger.info("Sorting AC names"); + info = false; + } //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); return getRank(o1) - getRank(o2); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 2997040a6..6b6f956dd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -29,6 +29,7 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.MutableVariantContext; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.StandardVCFWriter; +import org.broad.tribble.vcf.VCFConstants; import org.broad.tribble.vcf.VCFWriter; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -41,6 +42,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.ApplyVariantCuts; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.report.ReportMarshaller; import org.broadinstitute.sting.utils.report.VE2ReportFactory; import org.broadinstitute.sting.utils.report.templates.ReportFormat; @@ -311,7 +313,7 @@ public class VariantEvalWalker extends RodWalker { if ( LIST ) listModulesAndExit(); - SAMPLES_LIST = Arrays.asList(SAMPLES); + SAMPLES_LIST = SampleUtils.getSamplesFromCommandLineInput(Arrays.asList(SAMPLES)); determineEvalations(); @@ -676,13 +678,28 @@ public class VariantEvalWalker extends RodWalker { HashMap newAts = new HashMap(vc.getAttributes()); VariantContextUtils.calculateChromosomeCounts(vc,newAts,true); vc = VariantContext.modifyAttributes(vc,newAts); + logger.debug(String.format("VC %s subset to %s AC%n",vc.getName(),vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY))); //if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc)); + } else if ( vc != null && ! vc.hasGenotypes(SAMPLES_LIST) ) { + throw new UserException(String.format("Genotypes for the variant context %s do not contain all the provided samples %s",vc.getName(), getMissingSamples(SAMPLES_LIST,vc))); } map.put(name, allowExcludes && excludeComp(vc) ? null : vc); } } + private static String getMissingSamples(Collection soughtSamples, VariantContext vc) { + StringBuffer buf = new StringBuffer(); + buf.append("Missing samples are:"); + for ( String s : soughtSamples ) { + if ( ! vc.getGenotypes().keySet().contains(s) ) { + buf.append(String.format("%n%s",s)); + } + } + + return buf.toString(); + } + // -------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index f008a9e5f..de0d7e411 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -31,9 +31,14 @@ import org.broad.tribble.vcf.VCFHeader; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.vcf.VCFUtils; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** @@ -185,5 +190,30 @@ public class SampleUtils { } + public static List getSamplesFromCommandLineInput(Collection sampleArgs) { + if (sampleArgs != null) { + // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our + // sample list set, and treat the entries as if they had been specified on the command line. + List samplesFromFiles = new ArrayList(); + for (String SAMPLE_EXPRESSION : sampleArgs) { + File sampleFile = new File(SAMPLE_EXPRESSION); + + try { + XReadLines reader = new XReadLines(sampleFile); + + List lines = reader.readLines(); + for (String line : lines) { + samplesFromFiles.add(line); + } + } catch (FileNotFoundException e) { + samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample + } + } + + return samplesFromFiles; + } + return new ArrayList(); + } + } \ No newline at end of file