Fix merge issues when incorporating new AF calculations changes

This commit is contained in:
Guillermo del Angel 2012-03-27 15:00:44 -04:00
commit 343a061b1c
26 changed files with 881 additions and 614 deletions

View File

@ -1,4 +1,4 @@
Copyright (c) 2011 The Broad Institute Copyright (c) 2012 The Broad Institute
Permission is hereby granted, free of charge, to any person Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation obtaining a copy of this software and associated documentation

View File

@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.report;
import org.apache.commons.lang.math.NumberUtils; import org.apache.commons.lang.math.NumberUtils;
import java.util.Arrays; import java.util.*;
import java.util.Collection;
import java.util.TreeMap;
/** /**
* Holds values for a column in a GATK report table * Holds values for a column in a GATK report table
*/ */
public class GATKReportColumn extends TreeMap<Object, Object> { public class GATKReportColumn extends LinkedHashMap<Object, Object> {
final private String columnName; final private String columnName;
final private Object defaultValue; final private Object defaultValue;
final private String format; final private String format;

View File

@ -756,10 +756,6 @@ public class GATKReportTable {
*/ */
// Get the column widths for everything // Get the column widths for everything
HashMap<String, GATKReportColumnFormat> columnFormats = new HashMap<String, GATKReportColumnFormat>();
for (String columnName : columns.keySet()) {
columnFormats.put(columnName, columns.get(columnName).getColumnFormat());
}
String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s";
// Emit the table definition // Emit the table definition
@ -787,7 +783,7 @@ public class GATKReportTable {
if (needsPadding) { if (needsPadding) {
out.printf(" "); out.printf(" ");
} }
out.printf(columnFormats.get(columnName).getNameFormat(), columnName); out.printf(columns.get(columnName).getColumnFormat().getNameFormat(), columnName);
needsPadding = true; needsPadding = true;
} }
@ -796,29 +792,31 @@ public class GATKReportTable {
out.printf("%n"); out.printf("%n");
// Emit the table body // Emit the table body
for (Object primaryKey : primaryKeyColumn) { for (final Object primaryKey : primaryKeyColumn) {
needsPadding = false; needsPadding = false;
if (primaryKeyDisplay) { if (primaryKeyDisplay) {
out.printf(primaryKeyFormat, primaryKey); out.printf(primaryKeyFormat, primaryKey);
needsPadding = true; needsPadding = true;
} }
for (String columnName : columns.keySet()) { for (final Map.Entry<String, GATKReportColumn> entry : columns.entrySet()) {
if (columns.get(columnName).isDisplayable()) { final GATKReportColumn column = entry.getValue();
if (column.isDisplayable()) {
if (needsPadding) { if (needsPadding) {
out.printf(" "); out.print(" ");
} }
String value = columns.get(columnName).getStringValue(primaryKey);
out.printf(columnFormats.get(columnName).getValueFormat(), value); final String value = column.getStringValue(primaryKey);
out.printf(column.getColumnFormat().getValueFormat(), value);
needsPadding = true; needsPadding = true;
} }
} }
out.printf("%n"); out.println();
} }
out.printf("%n"); out.println();
} }
public int getNumRows() { public int getNumRows() {

View File

@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -68,6 +69,9 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
@Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events") @Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events")
public boolean SHOW_INDEL_PILEUPS = false; public boolean SHOW_INDEL_PILEUPS = false;
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output")
public boolean SHOW_VERBOSE = false;
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList(); public List<RodBinding<Feature>> rods = Collections.emptyList();
@ -82,7 +86,10 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
if ( context.hasBasePileup() ) { if ( context.hasBasePileup() ) {
ReadBackedPileup basePileup = context.getBasePileup(); ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s%n", basePileup.getPileupString(ref.getBaseAsChar()), rods); out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
} }
if ( context.hasExtendedEventPileup() ) { if ( context.hasExtendedEventPileup() ) {
@ -126,6 +133,24 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
return rodString; return rodString;
} }
private static String createVerboseOutput(final ReadBackedPileup pileup) {
final StringBuilder sb = new StringBuilder();
boolean isFirst = true;
for ( PileupElement p : pileup ) {
if ( isFirst )
isFirst = false;
else
sb.append(",");
sb.append(p.getRead().getReadName());
sb.append(":");
sb.append(p.getOffset());
sb.append(":");
sb.append(p.getRead().getReadLength());
}
return sb.toString();
}
@Override @Override
public void onTraversalDone(Integer result) { public void onTraversalDone(Integer result) {
out.println("[REDUCE RESULT] Traversal result is: " + result); out.println("[REDUCE RESULT] Traversal result is: " + result);

View File

@ -91,6 +91,25 @@ public class ErrorRatePerCycle extends LocusWalker<Integer, Integer> {
this.cycle = cycle; this.cycle = cycle;
} }
// Must overload hashCode and equals to properly work with GATKReportColumn
@Override
public int hashCode() {
return readGroup.hashCode() + 33 * cycle;
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final TableKey oKey = (TableKey) o;
if ( cycle != oKey.cycle ) return false;
if ( !readGroup.equals(oKey.readGroup) ) return false;
return true;
}
@Override @Override
public int compareTo(final TableKey tableKey) { public int compareTo(final TableKey tableKey) {
final int scmp = readGroup.compareTo(tableKey.readGroup); final int scmp = readGroup.compareTo(tableKey.readGroup);

View File

@ -70,6 +70,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @return the alleles used for genotyping * @return the alleles used for genotyping
*/ */
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc, protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[][] log10AlleleFrequencyPriors, final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result); final AlleleFrequencyCalculationResult result);
} }

View File

@ -25,6 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.ArrayList;
import java.util.Arrays;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
* User: ebanks * User: ebanks
@ -34,23 +39,54 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
*/ */
public class AlleleFrequencyCalculationResult { public class AlleleFrequencyCalculationResult {
// IMPORTANT NOTE: // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
// These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N). private double log10MLE;
// For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that private double log10MAP;
// the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may final private int[] alleleCountsOfMLE;
// be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0). final private int[] alleleCountsOfMAP;
// In the bi-allelic case (where there are no other alternate alleles over which to marginalize),
// the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED.
final double[][] log10AlleleFrequencyLikelihoods;
final double[][] log10AlleleFrequencyPosteriors;
// These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) // The posteriors seen, not including that of AF=0
double log10LikelihoodOfAFzero = 0.0; // TODO -- better implementation needed here (see below)
double log10PosteriorOfAFzero = 0.0; private ArrayList<Double> log10PosteriorMatrixValues = new ArrayList<Double>(100000);
private Double log10PosteriorMatrixSum = null;
public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; private double log10LikelihoodOfAFzero;
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; private double log10PosteriorOfAFzero;
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
public double getLog10MLE() {
return log10MLE;
}
public double getLog10MAP() {
return log10MAP;
}
public double getLog10PosteriorMatrixSum() {
if ( log10PosteriorMatrixSum == null ) {
// TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory;
// TODO -- will discuss with the team what the best option is
final double[] tmp = new double[log10PosteriorMatrixValues.size()];
for ( int i = 0; i < tmp.length; i++ )
tmp[i] = log10PosteriorMatrixValues.get(i);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp);
}
return log10PosteriorMatrixSum;
}
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
} }
public double getLog10LikelihoodOfAFzero() { public double getLog10LikelihoodOfAFzero() {
@ -60,4 +96,47 @@ public class AlleleFrequencyCalculationResult {
public double getLog10PosteriorOfAFzero() { public double getLog10PosteriorOfAFzero() {
return log10PosteriorOfAFzero; return log10PosteriorOfAFzero;
} }
public void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
log10PosteriorMatrixValues.clear();
log10PosteriorMatrixSum = null;
}
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
alleleCountsOfMLE[i] = alleleCountsForK[i];
}
}
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
log10PosteriorMatrixValues.add(log10LofK);
if ( log10LofK > log10MAP ) {
log10MAP = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
alleleCountsOfMAP[i] = alleleCountsForK[i];
}
}
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
Arrays.fill(alleleCountsOfMLE, 0);
}
}
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;
Arrays.fill(alleleCountsOfMAP, 0);
}
}
} }

View File

@ -0,0 +1,286 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Code for determining which indels are segregating among the samples.
*
* This code is just a refactor of the original code from Guillermo in the UG.
*
* @author Mark DePristo
* @since 3/26/12
*/
public class ConsensusAlleleCounter {
final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class);
private final int minIndelCountForGenotyping;
private final boolean doMultiAllelicCalls;
private final double minFractionInOneSample;
private final GenomeLocParser locParser;
public ConsensusAlleleCounter(final GenomeLocParser locParser,
final boolean doMultiAllelicCalls,
final int minIndelCountForGenotyping,
final double minFractionInOneSample) {
this.minIndelCountForGenotyping = minIndelCountForGenotyping;
this.doMultiAllelicCalls = doMultiAllelicCalls;
this.minFractionInOneSample = minFractionInOneSample;
this.locParser = locParser;
}
/**
* Returns a list of Alleles at this locus that may be segregating
*
* @param ref
* @param contexts
* @param contextType
* @return
*/
public List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final Map<String, Integer> consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType);
// logger.info("Alleles at " + ref.getLocus());
// for ( Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet() ) {
// logger.info(" " + elt.getValue() + " => " + elt.getKey());
// }
return consensusCountsToAlleles(ref, consensusIndelStrings);
}
//
// TODO -- WARNING DOESN'T WORK WITH REDUCED READS
//
private Map<String, Integer> countConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final GenomeLoc loc = ref.getLocus();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
insCount += indelPileup.getNumberOfInsertions();
delCount += indelPileup.getNumberOfDeletions();
}
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return Collections.emptyMap();
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions();
final int nReadsOverall = indelPileup.getNumberOfElements();
if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) {
// if ( nIndelReads > 0 )
// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
continue;
// } else {
// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
}
for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if (ReadUtils.is454Read(read)) {
continue;
}
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if (p.isInsertion()) {
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (String s : consensusIndelStrings.keySet()) {
cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(indelString)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(indelString)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.endsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
cList.add(new Pair<String, Integer>(indelString,cnt+1));
}
// copy back arrayList into hashMap
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if (p.isDeletion()) {
indelString = String.format("D%d",p.getEventLength());
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
consensusIndelStrings.put(indelString,cnt+1);
}
}
}
return consensusIndelStrings;
}
private List<Allele> consensusCountsToAlleles(final ReferenceContext ref,
final Map<String, Integer> consensusIndelStrings) {
final GenomeLoc loc = ref.getLocus();
final Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
Allele refAllele, altAllele;
for (final Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet()) {
final String s = elt.getKey();
final int curCnt = elt.getValue();
int stop = 0;
// if observed count if above minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
final int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(Arrays.asList(refAllele, altAllele));
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls)
vcs.add(builder.make());
else {
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
}
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
return mergedVC.getAlleles();
}
}

View File

@ -52,11 +52,9 @@ import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE; private final int HAPLOTYPE_SIZE;
private final int minIndelCountForGenotyping;
private final boolean getAlleleListFromVCF; private final boolean getAlleleListFromVCF;
private boolean DEBUG = false; private boolean DEBUG = false;
private final boolean doMultiAllelicCalls = true;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private PairHMMIndelErrorModel pairModel; private PairHMMIndelErrorModel pairModel;
@ -72,217 +70,30 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// gdebug removeme // gdebug removeme
// todo -cleanup // todo -cleanup
private GenomeLoc lastSiteVisited; private GenomeLoc lastSiteVisited;
private ArrayList<Allele> alleleList; private List<Allele> alleleList = new ArrayList<Allele>();
static { static {
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>()); indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
} }
public IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger); super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>(); haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
} }
private List<Allele> computeConsensusAlleles(ReferenceContext ref,
public static ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref, Map<String, AlignmentContext> contexts,
Map<String, AlignmentContext> contexts, AlignmentContextUtils.ReadOrientation contextType,
AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser, GenomeLocParser locParser) {
int minIndelCountForGenotyping, boolean doMultiAllelicCalls) { ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
Allele refAllele = null, altAllele = null; return counter.computeConsensusAlleles(ref, contexts, contextType);
GenomeLoc loc = ref.getLocus();
ArrayList<Allele> aList = new ArrayList<Allele>();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
insCount += indelPileup.getNumberOfInsertions();
delCount += indelPileup.getNumberOfDeletions();
}
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return aList;
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
//SAMRecord read = p.getRead();
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if (ReadUtils.is454Read(read)) {
continue;
}
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if (p.isInsertion()) {
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (String s : consensusIndelStrings.keySet()) {
cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(indelString)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(indelString)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.endsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
cList.add(new Pair<String, Integer>(indelString,cnt+1));
}
// copy back arrayList into hashMap
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if (p.isDeletion()) {
indelString = String.format("D%d",p.getEventLength());
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
consensusIndelStrings.put(indelString,cnt+1);
}
}
}
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
String bestAltAllele = "";
for (String s : consensusIndelStrings.keySet()) {
int curCnt = consensusIndelStrings.get(s), stop = 0;
// if observed count if above minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
ArrayList vcAlleles = new ArrayList<Allele>();
vcAlleles.add(refAllele);
vcAlleles.add(altAllele);
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(vcAlleles);
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls)
vcs.add(builder.make());
else {
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
}
if (vcs.isEmpty())
return aList; // nothing else to do, no alleles passed minimum count criterion
VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
aList = new ArrayList<Allele>(mergedVC.getAlleles());
return aList;
} }
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
@ -338,7 +149,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
} }
} else { } else {
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser, minIndelCountForGenotyping,doMultiAllelicCalls); alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
if (alleleList.isEmpty()) if (alleleList.isEmpty())
return null; return null;
} }

View File

@ -1,209 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.commons.lang.NotImplementedException;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.security.cert.CertificateNotYetValidException;
import java.util.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 8/30/11
* Time: 10:08 AM
* To change this template use File | Settings | File Templates.
*/
public class UGBoundAF extends RodWalker<VariantContext,Integer> {
@Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true)
VCFWriter writer;
@Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true)
List<RodBinding<VariantContext>> variants;
private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0);
private HashMap<Integer,Pair<Double,Double>> epsilonPosteriorCache = new HashMap<Integer,Pair<Double,Double>>(8192);
private HashMap<Integer,Double> logAC0Cache = new HashMap<Integer, Double>(8192);
private int QUANTIZATION_FACTOR = 1000;
public void initialize() {
Set<VCFHeaderLine> allHeaderLines = new HashSet<VCFHeaderLine>(1024);
for ( RodBinding<VariantContext> v : variants ) {
String trackName = v.getName();
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>(vcfHeaders.get(trackName).getMetaData());
}
allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+
"frequency. First value=95% probability AF>x. Second value=95% probability AF<x."));
writer.writeHeader(new VCFHeader(allHeaderLines));
}
public Integer reduceInit() {
return 0;
}
@Override
public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext unused ) {
List<VariantContext> allVariants = tracker.getValues(variants);
if ( allVariants.size() == 0 ) {
return null;
}
List<Allele> alternateAlleles = getAllAlternateAlleles(allVariants);
VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet<String>()));
if ( alternateAlleles.size() > 1 ) {
logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString());
alternateAlleles.add(allVariants.get(0).getReference());
builder.alleles(alternateAlleles);
builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles)));
} else {
// get all the genotype likelihoods
GenotypesContext context = GenotypesContext.create();
int numNoCall = 0;
for ( VariantContext v : allVariants ) {
numNoCall += v.getNoCallCount();
context.addAll(v.getGenotypes());
}
builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context)));
}
return builder.make();
}
private List<Allele> getAllAlternateAlleles(List<VariantContext> variants) {
List<Allele> alleles = new ArrayList<Allele>(3); // some overhead
for ( VariantContext v : variants ) {
alleles.addAll(v.getAlternateAlleles());
}
return alleles;
}
@Override
public Integer reduce(VariantContext value, Integer sum) {
if ( value == null )
return sum;
writer.add(value);
return ++sum;
}
private int N_ITERATIONS = 1;
private double[] getACPosteriors(GenotypesContext gc) {
// note this uses uniform priors (!)
double[][] zeroPriors = new double[1][1+2*gc.size()];
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size());
// todo -- allow multiple alleles here
for ( int i = 0; i < N_ITERATIONS; i ++ ) {
ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false);
}
return result.log10AlleleFrequencyPosteriors[0];
}
private String boundAlleleFrequency(double[] ACLikelihoods) {
// note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs
// thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be
// agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over
// want to pick a lower limit x and upper limit y such that
// int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95
// int_{f=x to y} calculateAFPosterior(f) df = 0.95
// and that (y-x) is minimized
// this is done by quantizing [0,1] into small bins and, since the distribution is
// unimodal, greedily adding them until the probability is >= 0.95
throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented"));
}
private double calculateAFPosterior(double[] likelihoods, double af) {
double[] probLiks = new double[likelihoods.length];
for ( int c = 0; c < likelihoods.length; c++) {
probLiks[c] = calculateAFPosterior(c,likelihoods.length,af);
}
return MathUtils.log10sumLog10(probLiks);
}
private double calculateAFPosterior(int ac, int n, double af) {
// evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes
switch ( ac ) {
case 0:
return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af);
case 1:
return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT);
case 2:
return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT);
default:
return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac);
}
}
private double logAC0Coef(int an) {
if ( ! logAC0Cache.containsKey(an) ) {
double coef = -Math.log10(EPS_LOWER_LIMIT);
for ( int k = 1; k <= an; k++ ) {
// note this should typically just be
// term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k
// but the 1-E term will just be 1, so we do the following to mitigate this problem
double binom = MathUtils.binomialCoefficient(an,k);
double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k);
double term = binom/k - Math.pow(eps_correction,k);
if ( k % 2 == 0 ) {
coef += term;
} else {
coef -= term;
}
}
logAC0Cache.put(an,coef);
}
return logAC0Cache.get(an);
}
private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) {
double mid = (start + stop)/2;
double size = stop-start;
double fa = calculateAFPosterior(likelihoods,start);
double fb = calculateAFPosterior(likelihoods,mid);
double fc = calculateAFPosterior(likelihoods,stop);
double s = (size/6)*(fa + 4*fc + fb);
double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap);
return h;
}
private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){
if ( s == 0 )
return -300.0;
double c = ( a + b )/2;
double h = b-a;
double d = (a + c)/2;
double e = (c + b)/2;
double fd = calculateAFPosterior(likelihoods, d);
double fe = calculateAFPosterior(likelihoods, e);
double s_l = (h/12)*(fa + 4*fd + fc);
double s_r = (h/12)*(fc + 4*fe + fb);
double s_2 = s_l + s_r;
if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){
return Math.log10(s_2 + (s_2 - s)/15.0);
}
return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1));
}
}

View File

@ -118,6 +118,17 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false)
public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5;
/**
* Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute
* to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end
* up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for
* low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over
* come this threshold.
*/
@Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false)
public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25;
/** /**
* This argument informs the prior probability of having an indel at a site. * This argument informs the prior probability of having an indel at a site.
*/ */
@ -165,6 +176,7 @@ public class UnifiedArgumentCollection {
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE;
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;

View File

@ -137,6 +137,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@Output(doc="File to which variants should be written",required=true) @Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null; protected VCFWriter writer = null;
@Hidden
@Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false) @Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null; protected PrintStream verboseWriter = null;
@ -218,7 +219,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
// initialize the verbose writer // initialize the verbose writer
if ( verboseWriter != null ) if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior"); verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, 2*samples.size()); UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, 2*samples.size());

View File

@ -84,8 +84,8 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>(); private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[][] log10AlleleFrequencyPriorsSNPs; private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[][] log10AlleleFrequencyPriorsIndels; private final double[] log10AlleleFrequencyPriorsIndels;
// the priors object // the priors object
private final GenotypePriors genotypePriorsSNPs; private final GenotypePriors genotypePriorsSNPs;
@ -131,9 +131,8 @@ public class UnifiedGenotyperEngine {
this.annotationEngine = engine; this.annotationEngine = engine;
this.N = N; this.N = N;
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
@ -269,8 +268,8 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet // initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) { if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
posteriorsArray.set(new double[N + 2]); posteriorsArray.set(new double[2]);
} }
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
@ -283,9 +282,7 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
} }
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) AFresult.reset();
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles? // is the most likely frequency conformation AC=0 for all alternate alleles?
@ -301,11 +298,10 @@ public class UnifiedGenotyperEngine {
if ( indexOfAllele == -1 ) if ( indexOfAllele == -1 )
continue; continue;
int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1];
// if the most likely AC is not 0, then this is a good alternate allele to use; // if the most likely AC is not 0, then this is a good alternate allele to use
// make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array if ( indexOfBestAC != 0 ) {
if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) {
myAlleles.add(alternateAllele); myAlleles.add(alternateAllele);
bestGuessIsRef = false; bestGuessIsRef = false;
} }
@ -316,7 +312,6 @@ public class UnifiedGenotyperEngine {
} }
// calculate p(f>0): // calculate p(f>0):
// because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
final double PofF = 1.0 - normalizedPosteriors[0]; final double PofF = 1.0 - normalizedPosteriors[0];
@ -324,18 +319,11 @@ public class UnifiedGenotyperEngine {
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) ) if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
} else { } else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) { if ( Double.isInfinite(phredScaledConfidence) ) {
double sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; final double sum = AFresult.getLog10PosteriorMatrixSum();
if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += AFresult.log10AlleleFrequencyPosteriors[0][i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
} }
} }
@ -364,7 +352,7 @@ public class UnifiedGenotyperEngine {
// print out stats if we have a writer // print out stats if we have a writer
if ( verboseWriter != null && !limitedContext ) if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last // *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>(); final HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -378,29 +366,27 @@ public class UnifiedGenotyperEngine {
// the overall lod // the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles(); List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
// the forward lod // the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); AFresult.reset();
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod // the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); AFresult.reset();
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -437,9 +423,9 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
} }
private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum();
return MathUtils.normalizeFromLog10(normalizedPosteriors); return MathUtils.normalizeFromLog10(normalizedPosteriors);
} }
@ -499,14 +485,6 @@ public class UnifiedGenotyperEngine {
return stratifiedContexts; return stratifiedContexts;
} }
protected static void clearAFarray(double[][] AFs) {
for ( int i = 0; i < AFs.length; i++ ) {
for ( int j = 0; j < AFs[i].length; j++ ) {
AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
}
private final static double[] binomialProbabilityDepthCache = new double[10000]; private final static double[] binomialProbabilityDepthCache = new double[10000];
static { static {
for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) {
@ -551,7 +529,7 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false);
} }
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) { protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) {
Allele refAllele = null, altAllele = null; Allele refAllele = null, altAllele = null;
for ( Allele allele : vc.getAlleles() ) { for ( Allele allele : vc.getAlleles() ) {
if ( allele.isReference() ) if ( allele.isReference() )
@ -574,11 +552,8 @@ public class UnifiedGenotyperEngine {
AFline.append(i + "/" + N + "\t"); AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE()));
AFline.append("0.00000000\t"); AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP()));
else
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]));
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
verboseWriter.println(AFline.toString()); verboseWriter.println(AFline.toString());
} }
@ -648,31 +623,29 @@ public class UnifiedGenotyperEngine {
return null; return null;
} }
protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
// the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i double sum = 0.0;
for (int alleles = 1; alleles <= priors.length; alleles++) {
double sum = 0.0;
// for each i // for each i
for (int i = 1; i <= N; i++) { for (int i = 1; i <= N; i++) {
double value = Math.pow(theta, alleles) / (double)i; final double value = theta / (double)i;
priors[alleles-1][i] = Math.log10(value); priors[i] = Math.log10(value);
sum += value; sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[alleles-1][0] = Math.log10(1.0 - sum);
} }
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
} }
protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
if (model.name().toUpperCase().contains("SNP")) if (model.name().toUpperCase().contains("SNP"))
return log10AlleleFrequencyPriorsSNPs; return log10AlleleFrequencyPriorsSNPs;
else if (model.name().toUpperCase().contains("INDEL")) else if (model.name().toUpperCase().contains("INDEL"))
return log10AlleleFrequencyPriorsIndels; return log10AlleleFrequencyPriorsIndels;
else else
throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
} }
private static GenotypePriors createGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) { private static GenotypePriors createGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) {

View File

@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet; import java.util.TreeSet;
public class GLBasedSampleSelector extends SampleSelector { public class GLBasedSampleSelector extends SampleSelector {
Map<Integer,double[][]> numAllelePriorMatrix = new HashMap<Integer,double[][]>(); double[] flatPriors = null;
double referenceLikelihood; double referenceLikelihood;
public GLBasedSampleSelector(TreeSet<String> sm, double refLik) { public GLBasedSampleSelector(TreeSet<String> sm, double refLik) {
super(sm); super(sm);
@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector {
// now check to see (using EXACT model) whether this should be variant // now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec? // do we want to apply a prior? maybe user-spec?
double[][] flatPrior = createFlatPrior(vc.getAlleles()); if ( flatPriors == null ) {
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); flatPriors = new double[1+2*samples.size()];
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); }
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
// do we want to let this qual go up or down? // do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
return true; return true;
@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector {
return false; return false;
} }
private double[][] createFlatPrior(List<Allele> alleles) {
if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) {
numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]);
}
return numAllelePriorMatrix.get(alleles.size());
}
} }

View File

@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// find the comp // find the comp
final VariantContext comp = findMatchingComp(eval, compSet); final VariantContext comp = findMatchingComp(eval, compSet);
HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(); for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) {
for ( VariantStratifier vs : stratificationObjects ) {
List<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
for ( StateKey stateKey : stateKeysHash ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey); NewEvaluationContext nec = evaluationContexts.get(stateKey);
// eval against the comp // eval against the comp
@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return null; return null;
} }
// private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
// final ReferenceContext ref,
// final VariantContext eval,
// final String evalName,
// final VariantContext comp,
// final String compName,
// final String sampleName ) {
// Set<StateKey> oldKeys = new HashSet<StateKey>(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName)));
//
// int n = 0;
// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) {
// n++;
// if ( ! oldKeys.contains(newKey) )
// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm");
// }
//
// if ( n != oldKeys.size() )
// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size());
//
// return oldKeys;
// }
// private Iterable<StateKey> getApplicableStatesNew(final RefMetaDataTracker tracker,
// final ReferenceContext ref,
// final VariantContext eval,
// final String evalName,
// final VariantContext comp,
// final String compName,
// final String sampleName ) {
// // todo -- implement optimized version
// }
/**
* Given specific eval and comp VCs and the sample name, return an iterable
* over all of the applicable state keys.
*
* See header of StateKey for performance problems...
*
* @param tracker
* @param ref
* @param eval
* @param evalName
* @param comp
* @param compName
* @param sampleName
* @return
*/
private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final VariantContext eval,
final String evalName,
final VariantContext comp,
final String compName,
final String sampleName ) {
final HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(stratificationObjects.size());
for ( final VariantStratifier vs : stratificationObjects ) {
List<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
return new HashSet<StateKey>(stateKeys);
}
@Requires({"comp != null", "evals != null"}) @Requires({"comp != null", "evals != null"})
private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) { private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
// find all of the matching comps // find all of the matching comps
@ -482,11 +538,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public void onTraversalDone(Integer result) { public void onTraversalDone(Integer result) {
logger.info("Finalizing variant report"); logger.info("Finalizing variant report");
for ( StateKey stateKey : evaluationContexts.keySet() ) { for ( Map.Entry<StateKey, NewEvaluationContext> ecElt : evaluationContexts.entrySet() ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey); final StateKey stateKey = ecElt.getKey();
final NewEvaluationContext nec = ecElt.getValue();
for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) {
ve.finalizeEvaluation(); ve.finalizeEvaluation();
final String veName = ve.getSimpleName(); // ve.getClass().getSimpleName();
AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve);
Map<Field, DataPoint> datamap = scanner.getData(); Map<Field, DataPoint> datamap = scanner.getData();
@ -498,7 +556,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if (field.get(ve) instanceof TableType) { if (field.get(ve) instanceof TableType) {
TableType t = (TableType) field.get(ve); TableType t = (TableType) field.get(ve);
final String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); final String subTableName = veName + "." + field.getName();
final DataPoint dataPointAnn = datamap.get(field); final DataPoint dataPointAnn = datamap.get(field);
GATKReportTable table; GATKReportTable table;
@ -539,11 +597,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
} }
} }
} else { } else {
GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); GATKReportTable table = report.getTable(veName);
for ( VariantStratifier vs : stratificationObjects ) { for ( VariantStratifier vs : stratificationObjects ) {
String columnName = vs.getName(); final String columnName = vs.getName();
table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); table.set(stateKey.toString(), columnName, stateKey.get(vs.getName()));
} }

View File

@ -8,6 +8,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
public abstract class VariantEvaluator { public abstract class VariantEvaluator {
private VariantEvalWalker walker; private VariantEvalWalker walker;
private final String simpleName;
protected VariantEvaluator() {
this.simpleName = getClass().getSimpleName();
}
public void initialize(VariantEvalWalker walker) { public void initialize(VariantEvalWalker walker) {
this.walker = walker; this.walker = walker;
@ -90,4 +95,8 @@ public abstract class VariantEvaluator {
protected static String formattedRatio(final int num, final int denom) { protected static String formattedRatio(final int num, final int denom) {
return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom));
} }
public String getSimpleName() {
return simpleName;
}
} }

View File

@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.lang.annotation.Annotation; import java.lang.annotation.Annotation;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.util.HashMap;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -40,6 +41,7 @@ import java.util.Map;
* the object, a Mashalling object can serialize or deserialize a analysis module. * the object, a Mashalling object can serialize or deserialize a analysis module.
*/ */
public class AnalysisModuleScanner { public class AnalysisModuleScanner {
final private static Map<String, Annotation[]> annotationCache = new HashMap<String, Annotation[]>();
// what we extracted from the class // what we extracted from the class
private Map<Field, DataPoint> datums = new LinkedHashMap<Field, DataPoint>(); // the data we've discovered private Map<Field, DataPoint> datums = new LinkedHashMap<Field, DataPoint>(); // the data we've discovered
@ -84,13 +86,23 @@ public class AnalysisModuleScanner {
// get the fields from the class, and extract // get the fields from the class, and extract
for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) { for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) {
for (Field f : superCls.getDeclaredFields()) for (Field f : superCls.getDeclaredFields())
for (Annotation annotation : f.getAnnotations()) { for (Annotation annotation : getAnnotations(f)) {
if (annotation.annotationType().equals(DataPoint.class)) if (annotation.annotationType().equals(DataPoint.class))
datums.put(f,(DataPoint) annotation); datums.put(f,(DataPoint) annotation);
} }
} }
} }
private Annotation[] getAnnotations(final Field field) {
final String fieldName = field.toString();
Annotation[] annotations = annotationCache.get(fieldName);
if ( annotations == null ) {
annotations = field.getAnnotations();
annotationCache.put(fieldName, annotations);
}
return annotations;
}
/** /**
* *
* @return a map of the datum annotations found * @return a map of the datum annotations found

View File

@ -36,6 +36,16 @@ public class NewEvaluationContext extends HashMap<VariantStratifier, String> {
return new TreeMap<String, VariantEvaluator>(evaluationInstances); return new TreeMap<String, VariantEvaluator>(evaluationInstances);
} }
public StateKey makeStateKey() {
Map<String, String> map = new HashMap<String, String>(size());
for (Map.Entry<VariantStratifier, String> elt : this.entrySet() ) {
map.put(elt.getKey().getName(), elt.getValue());
}
return new StateKey(map);
}
public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) {
for ( final VariantEvaluator evaluation : evaluationInstances.values() ) { for ( final VariantEvaluator evaluation : evaluationInstances.values() ) {
// the other updateN methods don't see a null context // the other updateN methods don't see a null context

View File

@ -3,25 +3,102 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
public class StateKey extends TreeMap<String, String> { /**
// public int hashCode() { * A final constant class representing the specific state configuration
// int hashCode = 1; * for a VariantEvaluator instance.
// *
// for (final Map.Entry<String,String> pair : this.entrySet()) { * The way this is currently implemented is by a map from the name of a VariantStratification to a
// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); * specific state string. For example, the stratification Novelty has states all, known, novel. A
// } * specific variant and comp would be tagged as "known" by the stratification, and this could be
// * represented here by the map (Novelty -> known).
// return hashCode; *
// } * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
*
* I've been staring at this state key code for a while. It's just not right, and expensive to boot.
* Here are my thoughts for future work. The state key is both a key with specific state values for
* every stratification. For example, (known, sample1, ac=1). This capability is used in some places,
* such as below, to return a set of all states that should be updated given the eval and comp
* VCs. In principle there are a finite set of such combinations (the product of all states for all active
* stratifications at initialization). We could represent such keys as integers into the set of all combinations.
*
* Note that all of the code that manipulates these things is just terrible. It's all string manipulation and
* HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with
* all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism. Instead
* we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from
* specific strat state to this key.
*
* The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in
* the initializeStateKey function. Instead, we should have code that looks like:
*
* init:
* allStates <- initializeCombinationalStateSpace
*
* map:
* for each eval / comp pair:
* for each relevantState based on eval / comp:
* allStates[relevantState].update(eval, comp)
*
*
*/
public final class StateKey {
/** High-performance cache of the toString operation for a constant class */
private final String string;
private final TreeMap<String, String> states;
public String toString() { public StateKey(final Map<String, String> states) {
String value = ""; this.states = new TreeMap<String, String>(states);
this.string = formatString();
}
for ( final String key : this.keySet() ) { public StateKey(final StateKey toOverride, final String keyOverride, final String valueOverride) {
//value += "\tstate " + key + ":" + this.get(key) + "\n"; if ( toOverride == null ) {
value += String.format("%s:%s;", key, this.get(key)); this.states = new TreeMap<String, String>();
} else {
this.states = new TreeMap<String, String>(toOverride.states);
} }
return value; this.states.put(keyOverride, valueOverride);
this.string = formatString();
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final StateKey stateKey = (StateKey) o;
if (states != null ? !states.equals(stateKey.states) : stateKey.states != null) return false;
return true;
}
@Override
public int hashCode() {
return states.hashCode();
}
@Override
public String toString() {
return string;
}
private final String formatString() {
StringBuilder b = new StringBuilder();
for ( Map.Entry<String, String> entry : states.entrySet() ) {
b.append(String.format("%s:%s;", entry.getKey(), entry.getValue()));
}
return b.toString();
}
// TODO -- might be slow because of tree map
public String get(final String key) {
return states.get(key);
} }
} }

View File

@ -214,20 +214,9 @@ public class VariantEvalUtils {
ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec)); ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec));
} }
} else { } else {
HashMap<StateKey, NewEvaluationContext> necs = new HashMap<StateKey, NewEvaluationContext>(); final StateKey stateKey = ec.makeStateKey();
StateKey stateKey = new StateKey();
for (VariantStratifier vs : ec.keySet()) {
String state = ec.get(vs);
stateKey.put(vs.getName(), state);
}
ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects);
return new HashMap<StateKey, NewEvaluationContext>(Collections.singletonMap(stateKey, ec));
necs.put(stateKey, ec);
return necs;
} }
return ecs; return ecs;
@ -428,14 +417,8 @@ public class VariantEvalUtils {
HashMap<VariantStratifier, List<String>> oneSetOfStates = newStateStack.pop(); HashMap<VariantStratifier, List<String>> oneSetOfStates = newStateStack.pop();
VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); VariantStratifier vs = oneSetOfStates.keySet().iterator().next();
for (String state : oneSetOfStates.get(vs)) { for (final String state : oneSetOfStates.get(vs)) {
StateKey newStateKey = new StateKey(); final StateKey newStateKey = new StateKey(stateKey, vs.getName(), state);
if (stateKey != null) {
newStateKey.putAll(stateKey);
}
newStateKey.put(vs.getName(), state);
initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys);
} }
} else { } else {

View File

@ -182,7 +182,6 @@ public class Haplotype {
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref, public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
final int haplotypeSize, final int numPrefBases) { final int haplotypeSize, final int numPrefBases) {
LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>(); LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
Allele refAllele = null; Allele refAllele = null;
@ -215,13 +214,13 @@ public class Haplotype {
// Create location for all haplotypes // Create location for all haplotypes
int startLoc = ref.getWindow().getStart() + startIdxInReference; final int startLoc = ref.getWindow().getStart() + startIdxInReference;
int stopLoc = startLoc + haplotypeSize-1; final int stopLoc = startLoc + haplotypeSize-1;
GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc);
for (Allele a : alleleList) { for (final Allele a : alleleList) {
byte[] alleleBases = a.getBases(); byte[] alleleBases = a.getBases();
// use string concatenation // use string concatenation
@ -315,5 +314,4 @@ public class Haplotype {
return (fallsInsideDeletion ? -1 : readBases); return (fallsInsideDeletion ? -1 : readBases);
} }
} }

View File

@ -234,6 +234,9 @@ public class MathUtils {
double sum = 0.0; double sum = 0.0;
double maxValue = Utils.findMaxEntry(log10p); double maxValue = Utils.findMaxEntry(log10p);
if(maxValue == Double.NEGATIVE_INFINITY)
return sum;
for (int i = start; i < finish; i++) { for (int i = start; i < finish; i++) {
sum += Math.pow(10.0, log10p[i] - maxValue); sum += Math.pow(10.0, log10p[i] - maxValue);
} }
@ -1508,6 +1511,24 @@ public class MathUtils {
return result; return result;
} }
/** Same routine, unboxed types for efficiency
*
* @param x
* @param y
* @return Vector of same length as x and y so that z[k] = x[k]+y[k]
*/
public static double[] vectorSum(double[]x, double[] y) {
if (x.length != y.length)
throw new ReviewedStingException("BUG: Lengths of x and y must be the same");
double[] result = new double[x.length];
for (int k=0; k <x.length; k++)
result[k] = x[k]+y[k];
return result;
}
public static <E extends Number> Double[] scalarTimesVector(E a, E[] v1) { public static <E extends Number> Double[] scalarTimesVector(E a, E[] v1) {
Double result[] = new Double[v1.length]; Double result[] = new Double[v1.length];

View File

@ -703,4 +703,11 @@ public class Utils {
return programRecord; return programRecord;
} }
public static <E> Collection<E> makeCollection(Iterable<E> iter) {
Collection<E> list = new ArrayList<E>();
for (E item : iter) {
list.add(item);
}
return list;
}
} }

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1; static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2; static double[] AA2, AB2, AC2, BB2, BC2, CC2;
static final int numSamples = 3; static final int numSamples = 3;
static double[][] priors = new double[2][2*numSamples+1]; // flat priors static double[] priors = new double[2*numSamples+1]; // flat priors
@BeforeSuite @BeforeSuite
public void before() { public void before() {
@ -83,26 +82,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs") @Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) { public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) {
result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int nameIndex = 1; int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) { Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero);
} else {
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
} }
} }
} }

View File

@ -17,8 +17,8 @@ import java.util.Map;
public class UnifiedGenotyperIntegrationTest extends WalkerTest { public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() { public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232")); Arrays.asList("ac3737b4212f634a03c640c83f670955"));
executeTest("test MultiSample Pilot1", spec); executeTest("test MultiSample Pilot1", spec);
} }
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() { public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); Arrays.asList("6f70dfbaf3bb70c702f9e9dbacd67c17"));
executeTest("test Multiple SNP alleles", spec); executeTest("test Multiple SNP alleles", spec);
} }
@ -138,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() { public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57"));
executeTest("test SLOD", spec); executeTest("test SLOD", spec);
} }
@ -146,8 +146,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testOutputParameter() { public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" );
e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); e.put( "--output_mode EMIT_ALL_SITES", "119c9fcefbc69e0fc10b1dc52f6438e3" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -300,13 +300,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels() { public void testMultiSampleIndels() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("52340d578a708fa709b69ce48987bc9d")); Arrays.asList("fbc48d7d9e622c9af7922f91bc858151"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("9566c7abef5ee5829a516d90445b347f")); Arrays.asList("94c52ef70e44709ccd947d32e9c27da9"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
} }
@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
UserException.class); UserException.class);
executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec);
} }
// --------------------------------------------------------------------------------------------------------------
//
// testing SnpEff
//
// --------------------------------------------------------------------------------------------------------------
final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation
+ "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030";
@Test
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("f08ff07ad49d388198c1887baad05977"));
executeTest("test minIndelFraction 0.0", spec);
}
@Test
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1"));
executeTest("test minIndelFraction 0.25", spec);
}
@Test
public void testMinIndelFraction100() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 1", 1,
Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a"));
executeTest("test minIndelFraction 1.0", spec);
}
} }

View File

@ -0,0 +1,90 @@
package org.broadinstitute.sting.gatk.walkers.validation;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 3/26/12
* Time: 3:29 PM
* To change this template use File | Settings | File Templates.
*/
public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
public static String baseTestString(String args) {
return "-T ValidationSiteSelector -R " + b36KGReference + " -L 1 -o %s -NO_HEADER -numSites 100 " + args;
}
private static String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
private static String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
private static String samplePrefix = " -sf " + samplesFile;
private static String freqUnif = " --frequencySelectionMode UNIFORM ";
private static String freqAF = " --frequencySelectionMode KEEP_AF_SPECTRUM ";
private static String sampleNone = " -sampleMode NONE ";
private static String sampleGT = samplePrefix+" -sampleMode POLY_BASED_ON_GT ";
private static String sampleGL = samplePrefix+" -sampleMode POLY_BASED_ON_GL -samplePNonref 0.95";
@Test(enabled=true)
public void testNoSampleSelectionFreqUniform() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleNone + freqUnif + "--variant " + testfile),
1,
Arrays.asList("d49baeb8000a426c172ce1d81eb37963")
);
executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec);
}
@Test(enabled=true)
public void testNoSampleSelectionFreqAF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleNone + freqAF + "--variant " + testfile),
1,
Arrays.asList("0fb0d015d462c34514fc7e96beea5f56")
);
executeTest("testNoSampleSelectionFreqAF--" + testfile, spec);
}
@Test(enabled=true)
public void testPolyGTFreqUniform() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleGT + freqUnif + "--variant " + testfile),
1,
Arrays.asList("0672854299d42ea8af906976a3849ae6")
);
executeTest("testPolyGTFreqUniform--" + testfile, spec);
}
@Test(enabled=true)
public void testPolyGTFreqAF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleGT + freqAF + "--variant " + testfile),
1,
Arrays.asList("5bdffda1a063d0bddd6b236854ec627d")
);
executeTest("testPolyGTFreqAF--" + testfile, spec);
}
@Test(enabled=true)
public void testPolyGLFreqAF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleGL + freqAF + "--variant " + testfile),
1,
Arrays.asList("35ef16aa41303606a4b94f7b88bd9aa8")
);
executeTest("testPolyGLFreqAF--" + testfile, spec);
}
}